summaryrefslogtreecommitdiffstats
path: root/src/core
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 15:35:18 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 15:35:18 +0000
commitb750101eb236130cf056c675997decbac904cc49 (patch)
treea5df1a06754bdd014cb975c051c83b01c9a97532 /src/core
parentInitial commit. (diff)
downloadsystemd-b750101eb236130cf056c675997decbac904cc49.tar.xz
systemd-b750101eb236130cf056c675997decbac904cc49.zip
Adding upstream version 252.22.upstream/252.22
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/core')
-rw-r--r--src/core/all-units.h15
-rw-r--r--src/core/apparmor-setup.c99
-rw-r--r--src/core/apparmor-setup.h4
-rw-r--r--src/core/audit-fd.c63
-rw-r--r--src/core/audit-fd.h5
-rw-r--r--src/core/automount.c1166
-rw-r--r--src/core/automount.h45
-rw-r--r--src/core/bpf-devices.c531
-rw-r--r--src/core/bpf-devices.h21
-rw-r--r--src/core/bpf-firewall.c969
-rw-r--r--src/core/bpf-firewall.h25
-rw-r--r--src/core/bpf-foreign.c154
-rw-r--r--src/core/bpf-foreign.h15
-rw-r--r--src/core/bpf-lsm.c364
-rw-r--r--src/core/bpf-lsm.h28
-rw-r--r--src/core/bpf-socket-bind.c244
-rw-r--r--src/core/bpf-socket-bind.h15
-rw-r--r--src/core/bpf-util.c35
-rw-r--r--src/core/bpf-util.h5
-rw-r--r--src/core/bpf/meson.build111
-rw-r--r--src/core/bpf/restrict_fs/meson.build24
-rw-r--r--src/core/bpf/restrict_fs/restrict-fs-skel.h14
-rw-r--r--src/core/bpf/restrict_fs/restrict-fs.bpf.c82
-rw-r--r--src/core/bpf/restrict_ifaces/meson.build24
-rw-r--r--src/core/bpf/restrict_ifaces/restrict-ifaces-skel.h14
-rw-r--r--src/core/bpf/restrict_ifaces/restrict-ifaces.bpf.c52
-rw-r--r--src/core/bpf/socket_bind/meson.build24
-rw-r--r--src/core/bpf/socket_bind/socket-bind-api.bpf.h51
-rw-r--r--src/core/bpf/socket_bind/socket-bind-skel.h14
-rw-r--r--src/core/bpf/socket_bind/socket-bind.bpf.c111
-rw-r--r--src/core/cgroup.c4272
-rw-r--r--src/core/cgroup.h339
-rw-r--r--src/core/core-varlink.c630
-rw-r--r--src/core/core-varlink.h16
-rw-r--r--src/core/crash-handler.c166
-rw-r--r--src/core/crash-handler.h7
-rw-r--r--src/core/dbus-automount.c68
-rw-r--r--src/core/dbus-automount.h11
-rw-r--r--src/core/dbus-cgroup.c2073
-rw-r--r--src/core/dbus-cgroup.h14
-rw-r--r--src/core/dbus-device.c11
-rw-r--r--src/core/dbus-device.h6
-rw-r--r--src/core/dbus-execute.c3802
-rw-r--r--src/core/dbus-execute.h33
-rw-r--r--src/core/dbus-job.c371
-rw-r--r--src/core/dbus-job.h20
-rw-r--r--src/core/dbus-kill.c81
-rw-r--r--src/core/dbus-kill.h12
-rw-r--r--src/core/dbus-manager.c3440
-rw-r--r--src/core/dbus-manager.h18
-rw-r--r--src/core/dbus-mount.c155
-rw-r--r--src/core/dbus-mount.h12
-rw-r--r--src/core/dbus-path.c165
-rw-r--r--src/core/dbus-path.h11
-rw-r--r--src/core/dbus-scope.c274
-rw-r--r--src/core/dbus-scope.h19
-rw-r--r--src/core/dbus-service.c604
-rw-r--r--src/core/dbus-service.h14
-rw-r--r--src/core/dbus-slice.c34
-rw-r--r--src/core/dbus-slice.h12
-rw-r--r--src/core/dbus-socket.c482
-rw-r--r--src/core/dbus-socket.h12
-rw-r--r--src/core/dbus-swap.c76
-rw-r--r--src/core/dbus-swap.h16
-rw-r--r--src/core/dbus-target.c9
-rw-r--r--src/core/dbus-target.h6
-rw-r--r--src/core/dbus-timer.c376
-rw-r--r--src/core/dbus-timer.h11
-rw-r--r--src/core/dbus-unit.c2579
-rw-r--r--src/core/dbus-unit.h55
-rw-r--r--src/core/dbus-util.c262
-rw-r--r--src/core/dbus-util.h255
-rw-r--r--src/core/dbus.c1247
-rw-r--r--src/core/dbus.h37
-rw-r--r--src/core/device.c1276
-rw-r--r--src/core/device.h44
-rw-r--r--src/core/dynamic-user.c826
-rw-r--r--src/core/dynamic-user.h40
-rw-r--r--src/core/efi-random.c90
-rw-r--r--src/core/efi-random.h4
-rw-r--r--src/core/emergency-action.c171
-rw-r--r--src/core/emergency-action.h36
-rw-r--r--src/core/execute.c7278
-rw-r--r--src/core/execute.h527
-rw-r--r--src/core/fuzz-unit-file.c89
-rw-r--r--src/core/fuzz-unit-file.options2
-rw-r--r--src/core/generator-setup.c58
-rw-r--r--src/core/generator-setup.h8
-rw-r--r--src/core/ima-setup.c92
-rw-r--r--src/core/ima-setup.h9
-rw-r--r--src/core/import-creds.c716
-rw-r--r--src/core/import-creds.h4
-rw-r--r--src/core/job.c1679
-rw-r--r--src/core/job.h251
-rw-r--r--src/core/kill.c57
-rw-r--r--src/core/kill.h56
-rw-r--r--src/core/kmod-setup.c158
-rw-r--r--src/core/kmod-setup.h4
-rw-r--r--src/core/load-dropin.c126
-rw-r--r--src/core/load-dropin.h20
-rw-r--r--src/core/load-fragment-gperf-nulstr.awk16
-rw-r--r--src/core/load-fragment-gperf.gperf.in564
-rw-r--r--src/core/load-fragment.c6505
-rw-r--r--src/core/load-fragment.h156
-rw-r--r--src/core/main.c3096
-rw-r--r--src/core/main.h9
-rw-r--r--src/core/manager-dump.c111
-rw-r--r--src/core/manager-dump.h12
-rw-r--r--src/core/manager-serialize.c596
-rw-r--r--src/core/manager-serialize.h13
-rw-r--r--src/core/manager.c4676
-rw-r--r--src/core/manager.h596
-rw-r--r--src/core/meson.build255
-rw-r--r--src/core/mount.c2344
-rw-r--r--src/core/mount.h102
-rw-r--r--src/core/namespace.c3018
-rw-r--r--src/core/namespace.h184
-rw-r--r--src/core/org.freedesktop.systemd1.conf428
-rw-r--r--src/core/org.freedesktop.systemd1.policy.in83
-rw-r--r--src/core/org.freedesktop.systemd1.service13
-rw-r--r--src/core/path.c1072
-rw-r--r--src/core/path.h89
-rw-r--r--src/core/restrict-ifaces.c200
-rw-r--r--src/core/restrict-ifaces.h16
-rw-r--r--src/core/scope.c859
-rw-r--r--src/core/scope.h52
-rw-r--r--src/core/selinux-access.c289
-rw-r--r--src/core/selinux-access.h14
-rw-r--r--src/core/selinux-setup.c105
-rw-r--r--src/core/selinux-setup.h6
-rw-r--r--src/core/service.c4825
-rw-r--r--src/core/service.h264
-rw-r--r--src/core/show-status.c129
-rw-r--r--src/core/show-status.h44
-rw-r--r--src/core/slice.c471
-rw-r--r--src/core/slice.h18
-rw-r--r--src/core/smack-setup.c390
-rw-r--r--src/core/smack-setup.h10
-rw-r--r--src/core/socket.c3544
-rw-r--r--src/core/socket.h200
-rw-r--r--src/core/swap.c1693
-rw-r--r--src/core/swap.h99
-rw-r--r--src/core/system.conf.in77
-rw-r--r--src/core/systemd.pc.in103
-rw-r--r--src/core/target.c216
-rw-r--r--src/core/target.h16
-rw-r--r--src/core/timer.c1060
-rw-r--r--src/core/timer.h87
-rw-r--r--src/core/transaction.c1200
-rw-r--r--src/core/transaction.h35
-rw-r--r--src/core/unit-dependency-atom.c246
-rw-r--r--src/core/unit-dependency-atom.h89
-rw-r--r--src/core/unit-printf.c263
-rw-r--r--src/core/unit-printf.h26
-rw-r--r--src/core/unit-serialize.c864
-rw-r--r--src/core/unit-serialize.h13
-rw-r--r--src/core/unit.c6111
-rw-r--r--src/core/unit.h1194
-rw-r--r--src/core/user.conf.in50
159 files changed, 88874 insertions, 0 deletions
diff --git a/src/core/all-units.h b/src/core/all-units.h
new file mode 100644
index 0000000..fad814b
--- /dev/null
+++ b/src/core/all-units.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "unit.h"
+
+#include "automount.h"
+#include "device.h"
+#include "path.h"
+#include "scope.h"
+#include "service.h"
+#include "slice.h"
+#include "socket.h"
+#include "swap.h"
+#include "target.h"
+#include "timer.h"
diff --git a/src/core/apparmor-setup.c b/src/core/apparmor-setup.c
new file mode 100644
index 0000000..3426a10
--- /dev/null
+++ b/src/core/apparmor-setup.c
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#if HAVE_APPARMOR
+# include <sys/apparmor.h>
+#endif
+#include <unistd.h>
+
+#include "apparmor-setup.h"
+#include "apparmor-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "log.h"
+#include "macro.h"
+#include "string-util.h"
+#include "strv.h"
+
+#if HAVE_APPARMOR
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(aa_policy_cache *, aa_policy_cache_unref, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(aa_features *, aa_features_unref, NULL);
+#endif
+
+int mac_apparmor_setup(void) {
+#if HAVE_APPARMOR
+ _cleanup_(aa_policy_cache_unrefp) aa_policy_cache *policy_cache = NULL;
+ _cleanup_(aa_features_unrefp) aa_features *features = NULL;
+ _cleanup_free_ char *current_profile = NULL, *cache_dir_path = NULL;
+ int r;
+
+ if (!mac_apparmor_use()) {
+ log_debug("AppArmor either not supported by the kernel or disabled.");
+ return 0;
+ }
+
+ /* To enable LSM stacking a patch to the kernel is proposed to create a
+ * per-LSM subdirectory to distinguish between the LSMs. Therefore, we
+ * read the file from the LSM specific directory first and only if that
+ * fails the one from the generic directory.
+ */
+ FOREACH_STRING(current_file, "/proc/self/attr/apparmor/current", "/proc/self/attr/current") {
+ r = read_one_line_file(current_file, &current_profile);
+ if (r == -ENOENT)
+ continue;
+ else if (r < 0)
+ log_warning_errno(r, "Failed to read current AppArmor profile from file %s, ignoring: %m", current_file);
+ else
+ break;
+ }
+ if (!current_profile) {
+ log_warning("Failed to get the current AppArmor profile of systemd from /proc/self/attr/apparmor/current or /proc/self/attr/current, ignoring.");
+ return 0;
+ }
+ if (!streq(current_profile, "unconfined")) {
+ log_debug("We are already confined in an AppArmor profile.");
+ return 0;
+ }
+
+ r = aa_features_new_from_kernel(&features);
+ if (r < 0) {
+ log_warning_errno(errno, "Failed to get the AppArmor feature set from the kernel, ignoring: %m");
+ return 0;
+ }
+ cache_dir_path = aa_policy_cache_dir_path_preview(features, AT_FDCWD, "/etc/apparmor/earlypolicy");
+ if (!cache_dir_path) {
+ log_debug_errno(errno, "Failed to get the path of the early AppArmor policy cache directory.");
+ return 0;
+ }
+
+ /* aa_policy_cache_new will internally use the same path as aa_policy_cache_dir_path_preview has returned. */
+ r = aa_policy_cache_new(&policy_cache, features, AT_FDCWD, "/etc/apparmor/earlypolicy", 0);
+ if (r < 0) {
+ if (errno == ENOENT) {
+ log_debug_errno(errno, "The early AppArmor policy cache directory %s does not exist.", cache_dir_path);
+ return 0;
+ }
+ log_warning_errno(errno, "Failed to create a new AppArmor policy cache, ignoring: %m");
+ return 0;
+ }
+ r = aa_policy_cache_replace_all(policy_cache, NULL);
+ if (r < 0) {
+ log_warning_errno(errno, "Failed to load the profiles from the early AppArmor policy cache directory %s, ignoring: %m", cache_dir_path);
+ return 0;
+ }
+
+ log_info("Successfully loaded all binary profiles from AppArmor early policy cache at %s.", cache_dir_path);
+
+ r = aa_change_profile("systemd");
+ if (r < 0) {
+ if (errno == ENOENT)
+ log_debug_errno(errno, "Failed to change to AppArmor profile 'systemd'. Please ensure that one of the binary profile files in policy cache directory %s contains a profile with that name.", cache_dir_path);
+ else
+ log_error_errno(errno, "Failed to change to AppArmor profile 'systemd': %m");
+ return 0;
+ }
+
+ log_info("Changed to AppArmor profile systemd.");
+#endif
+ return 0;
+}
diff --git a/src/core/apparmor-setup.h b/src/core/apparmor-setup.h
new file mode 100644
index 0000000..f3b7382
--- /dev/null
+++ b/src/core/apparmor-setup.h
@@ -0,0 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+int mac_apparmor_setup(void);
diff --git a/src/core/audit-fd.c b/src/core/audit-fd.c
new file mode 100644
index 0000000..097bea3
--- /dev/null
+++ b/src/core/audit-fd.c
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+
+#include "audit-fd.h"
+
+#if HAVE_AUDIT
+
+#include <libaudit.h>
+#include <stdbool.h>
+
+#include "capability-util.h"
+#include "fd-util.h"
+#include "log.h"
+#include "util.h"
+
+static bool initialized = false;
+static int audit_fd;
+
+int get_audit_fd(void) {
+
+ if (!initialized) {
+ if (have_effective_cap(CAP_AUDIT_WRITE) == 0) {
+ audit_fd = -EPERM;
+ initialized = true;
+
+ return audit_fd;
+ }
+
+ audit_fd = audit_open();
+
+ if (audit_fd < 0) {
+ if (!IN_SET(errno, EAFNOSUPPORT, EPROTONOSUPPORT))
+ log_error_errno(errno, "Failed to connect to audit log: %m");
+
+ audit_fd = errno ? -errno : -EINVAL;
+ }
+
+ initialized = true;
+ }
+
+ return audit_fd;
+}
+
+void close_audit_fd(void) {
+
+ if (initialized && audit_fd >= 0)
+ safe_close(audit_fd);
+
+ initialized = true;
+ audit_fd = -ECONNRESET;
+}
+
+#else
+
+int get_audit_fd(void) {
+ return -EAFNOSUPPORT;
+}
+
+void close_audit_fd(void) {
+}
+
+#endif
diff --git a/src/core/audit-fd.h b/src/core/audit-fd.h
new file mode 100644
index 0000000..5cdf61e
--- /dev/null
+++ b/src/core/audit-fd.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+int get_audit_fd(void);
+void close_audit_fd(void);
diff --git a/src/core/automount.c b/src/core/automount.c
new file mode 100644
index 0000000..a44b8e8
--- /dev/null
+++ b/src/core/automount.c
@@ -0,0 +1,1166 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/auto_dev-ioctl.h>
+#include <linux/auto_fs4.h>
+#include <sys/epoll.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "async.h"
+#include "automount.h"
+#include "bus-error.h"
+#include "bus-util.h"
+#include "dbus-automount.h"
+#include "dbus-unit.h"
+#include "fd-util.h"
+#include "format-util.h"
+#include "fstab-util.h"
+#include "io-util.h"
+#include "label.h"
+#include "mkdir-label.h"
+#include "mount-util.h"
+#include "mount.h"
+#include "mountpoint-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "serialize.h"
+#include "special.h"
+#include "stdio-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "unit-name.h"
+#include "unit.h"
+
+static const UnitActiveState state_translation_table[_AUTOMOUNT_STATE_MAX] = {
+ [AUTOMOUNT_DEAD] = UNIT_INACTIVE,
+ [AUTOMOUNT_WAITING] = UNIT_ACTIVE,
+ [AUTOMOUNT_RUNNING] = UNIT_ACTIVE,
+ [AUTOMOUNT_FAILED] = UNIT_FAILED
+};
+
+struct expire_data {
+ int dev_autofs_fd;
+ int ioctl_fd;
+};
+
+static struct expire_data* expire_data_free(struct expire_data *data) {
+ if (!data)
+ return NULL;
+
+ safe_close(data->dev_autofs_fd);
+ safe_close(data->ioctl_fd);
+ return mfree(data);
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(struct expire_data*, expire_data_free);
+
+static int open_dev_autofs(Manager *m);
+static int automount_dispatch_io(sd_event_source *s, int fd, uint32_t events, void *userdata);
+static int automount_start_expire(Automount *a);
+static void automount_stop_expire(Automount *a);
+static int automount_send_ready(Automount *a, Set *tokens, int status);
+
+static void automount_init(Unit *u) {
+ Automount *a = AUTOMOUNT(u);
+
+ assert(a);
+ assert(u);
+ assert(u->load_state == UNIT_STUB);
+
+ a->pipe_fd = -1;
+ a->directory_mode = 0755;
+ UNIT(a)->ignore_on_isolate = true;
+}
+
+static void unmount_autofs(Automount *a) {
+ int r;
+
+ assert(a);
+
+ if (a->pipe_fd < 0)
+ return;
+
+ a->pipe_event_source = sd_event_source_disable_unref(a->pipe_event_source);
+ a->pipe_fd = safe_close(a->pipe_fd);
+
+ /* If we reload/reexecute things we keep the mount point around */
+ if (!IN_SET(UNIT(a)->manager->objective, MANAGER_RELOAD, MANAGER_REEXECUTE)) {
+
+ automount_send_ready(a, a->tokens, -EHOSTDOWN);
+ automount_send_ready(a, a->expire_tokens, -EHOSTDOWN);
+
+ if (a->where) {
+ r = repeat_unmount(a->where, MNT_DETACH|UMOUNT_NOFOLLOW);
+ if (r < 0)
+ log_error_errno(r, "Failed to unmount: %m");
+ }
+ }
+}
+
+static void automount_done(Unit *u) {
+ Automount *a = AUTOMOUNT(u);
+
+ assert(a);
+
+ unmount_autofs(a);
+
+ a->where = mfree(a->where);
+ a->extra_options = mfree(a->extra_options);
+
+ a->tokens = set_free(a->tokens);
+ a->expire_tokens = set_free(a->expire_tokens);
+
+ a->expire_event_source = sd_event_source_disable_unref(a->expire_event_source);
+}
+
+static int automount_add_trigger_dependencies(Automount *a) {
+ Unit *x;
+ int r;
+
+ assert(a);
+
+ r = unit_load_related_unit(UNIT(a), ".mount", &x);
+ if (r < 0)
+ return r;
+
+ return unit_add_two_dependencies(UNIT(a), UNIT_BEFORE, UNIT_TRIGGERS, x, true, UNIT_DEPENDENCY_IMPLICIT);
+}
+
+static int automount_add_mount_dependencies(Automount *a) {
+ _cleanup_free_ char *parent = NULL;
+ int r;
+
+ assert(a);
+
+ r = path_extract_directory(a->where, &parent);
+ if (r < 0)
+ return r;
+
+ return unit_require_mounts_for(UNIT(a), parent, UNIT_DEPENDENCY_IMPLICIT);
+}
+
+static int automount_add_default_dependencies(Automount *a) {
+ int r;
+
+ assert(a);
+
+ if (!UNIT(a)->default_dependencies)
+ return 0;
+
+ if (!MANAGER_IS_SYSTEM(UNIT(a)->manager))
+ return 0;
+
+ r = unit_add_dependency_by_name(UNIT(a), UNIT_BEFORE, SPECIAL_LOCAL_FS_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+ if (r < 0)
+ return r;
+
+ r = unit_add_dependency_by_name(UNIT(a), UNIT_AFTER, SPECIAL_LOCAL_FS_PRE_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+ if (r < 0)
+ return r;
+
+ r = unit_add_two_dependencies_by_name(UNIT(a), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_UMOUNT_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+static int automount_verify(Automount *a) {
+ static const char *const reserved_options[] = {
+ "fd\0",
+ "pgrp\0",
+ "minproto\0",
+ "maxproto\0",
+ "direct\0",
+ "indirect\0",
+ };
+
+ _cleanup_free_ char *e = NULL;
+ int r;
+
+ assert(a);
+ assert(UNIT(a)->load_state == UNIT_LOADED);
+
+ if (path_equal(a->where, "/"))
+ return log_unit_error_errno(UNIT(a), SYNTHETIC_ERRNO(ENOEXEC), "Cannot have an automount unit for the root directory. Refusing.");
+
+ r = unit_name_from_path(a->where, ".automount", &e);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(a), r, "Failed to generate unit name from path: %m");
+
+ if (!unit_has_name(UNIT(a), e))
+ return log_unit_error_errno(UNIT(a), SYNTHETIC_ERRNO(ENOEXEC), "Where= setting doesn't match unit name. Refusing.");
+
+ for (size_t i = 0; i < ELEMENTSOF(reserved_options); i++)
+ if (fstab_test_option(a->extra_options, reserved_options[i]))
+ return log_unit_error_errno(
+ UNIT(a),
+ SYNTHETIC_ERRNO(ENOEXEC),
+ "ExtraOptions= setting may not contain reserved option %s.",
+ reserved_options[i]);
+
+ return 0;
+}
+
+static int automount_set_where(Automount *a) {
+ int r;
+
+ assert(a);
+
+ if (a->where)
+ return 0;
+
+ r = unit_name_to_path(UNIT(a)->id, &a->where);
+ if (r < 0)
+ return r;
+
+ path_simplify(a->where);
+ return 1;
+}
+
+static int automount_add_extras(Automount *a) {
+ int r;
+
+ r = automount_set_where(a);
+ if (r < 0)
+ return r;
+
+ r = automount_add_trigger_dependencies(a);
+ if (r < 0)
+ return r;
+
+ r = automount_add_mount_dependencies(a);
+ if (r < 0)
+ return r;
+
+ return automount_add_default_dependencies(a);
+}
+
+static int automount_load(Unit *u) {
+ Automount *a = AUTOMOUNT(u);
+ int r;
+
+ assert(u);
+ assert(u->load_state == UNIT_STUB);
+
+ /* Load a .automount file */
+ r = unit_load_fragment_and_dropin(u, true);
+ if (r < 0)
+ return r;
+
+ if (u->load_state != UNIT_LOADED)
+ return 0;
+
+ r = automount_add_extras(a);
+ if (r < 0)
+ return r;
+
+ return automount_verify(a);
+}
+
+static void automount_set_state(Automount *a, AutomountState state) {
+ AutomountState old_state;
+ assert(a);
+
+ if (a->state != state)
+ bus_unit_send_pending_change_signal(UNIT(a), false);
+
+ old_state = a->state;
+ a->state = state;
+
+ if (state != AUTOMOUNT_RUNNING)
+ automount_stop_expire(a);
+
+ if (!IN_SET(state, AUTOMOUNT_WAITING, AUTOMOUNT_RUNNING))
+ unmount_autofs(a);
+
+ if (state != old_state)
+ log_unit_debug(UNIT(a), "Changed %s -> %s", automount_state_to_string(old_state), automount_state_to_string(state));
+
+ unit_notify(UNIT(a), state_translation_table[old_state], state_translation_table[state], 0);
+}
+
+static int automount_coldplug(Unit *u) {
+ Automount *a = AUTOMOUNT(u);
+ int r;
+
+ assert(a);
+ assert(a->state == AUTOMOUNT_DEAD);
+
+ if (a->deserialized_state == a->state)
+ return 0;
+
+ if (IN_SET(a->deserialized_state, AUTOMOUNT_WAITING, AUTOMOUNT_RUNNING)) {
+
+ r = automount_set_where(a);
+ if (r < 0)
+ return r;
+
+ r = open_dev_autofs(u->manager);
+ if (r < 0)
+ return r;
+
+ assert(a->pipe_fd >= 0);
+
+ r = sd_event_add_io(u->manager->event, &a->pipe_event_source, a->pipe_fd, EPOLLIN, automount_dispatch_io, u);
+ if (r < 0)
+ return r;
+
+ (void) sd_event_source_set_description(a->pipe_event_source, "automount-io");
+ if (a->deserialized_state == AUTOMOUNT_RUNNING) {
+ r = automount_start_expire(a);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(a), r, "Failed to start expiration timer, ignoring: %m");
+ }
+
+ automount_set_state(a, a->deserialized_state);
+ }
+
+ return 0;
+}
+
+static void automount_dump(Unit *u, FILE *f, const char *prefix) {
+ Automount *a = AUTOMOUNT(u);
+
+ assert(a);
+
+ fprintf(f,
+ "%sAutomount State: %s\n"
+ "%sResult: %s\n"
+ "%sWhere: %s\n"
+ "%sExtraOptions: %s\n"
+ "%sDirectoryMode: %04o\n"
+ "%sTimeoutIdleUSec: %s\n",
+ prefix, automount_state_to_string(a->state),
+ prefix, automount_result_to_string(a->result),
+ prefix, a->where,
+ prefix, a->extra_options,
+ prefix, a->directory_mode,
+ prefix, FORMAT_TIMESPAN(a->timeout_idle_usec, USEC_PER_SEC));
+}
+
+static void automount_enter_dead(Automount *a, AutomountResult f) {
+ assert(a);
+
+ if (a->result == AUTOMOUNT_SUCCESS)
+ a->result = f;
+
+ unit_log_result(UNIT(a), a->result == AUTOMOUNT_SUCCESS, automount_result_to_string(a->result));
+ automount_set_state(a, a->result != AUTOMOUNT_SUCCESS ? AUTOMOUNT_FAILED : AUTOMOUNT_DEAD);
+}
+
+static int open_dev_autofs(Manager *m) {
+ struct autofs_dev_ioctl param;
+
+ assert(m);
+
+ if (m->dev_autofs_fd >= 0)
+ return m->dev_autofs_fd;
+
+ (void) label_fix("/dev/autofs", 0);
+
+ m->dev_autofs_fd = open("/dev/autofs", O_CLOEXEC|O_RDONLY);
+ if (m->dev_autofs_fd < 0)
+ return log_error_errno(errno, "Failed to open /dev/autofs: %m");
+
+ init_autofs_dev_ioctl(&param);
+ if (ioctl(m->dev_autofs_fd, AUTOFS_DEV_IOCTL_VERSION, &param) < 0) {
+ m->dev_autofs_fd = safe_close(m->dev_autofs_fd);
+ return -errno;
+ }
+
+ log_debug("Autofs kernel version %u.%u", param.ver_major, param.ver_minor);
+
+ return m->dev_autofs_fd;
+}
+
+static int open_ioctl_fd(int dev_autofs_fd, const char *where, dev_t devid) {
+ struct autofs_dev_ioctl *param;
+ size_t l;
+
+ assert(dev_autofs_fd >= 0);
+ assert(where);
+
+ l = sizeof(struct autofs_dev_ioctl) + strlen(where) + 1;
+ param = alloca_safe(l);
+
+ init_autofs_dev_ioctl(param);
+ param->size = l;
+ param->ioctlfd = -1;
+ param->openmount.devid = devid;
+ strcpy(param->path, where);
+
+ if (ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_OPENMOUNT, param) < 0)
+ return -errno;
+
+ if (param->ioctlfd < 0)
+ return -EIO;
+
+ (void) fd_cloexec(param->ioctlfd, true);
+ return param->ioctlfd;
+}
+
+static int autofs_protocol(int dev_autofs_fd, int ioctl_fd) {
+ uint32_t major, minor;
+ struct autofs_dev_ioctl param;
+
+ assert(dev_autofs_fd >= 0);
+ assert(ioctl_fd >= 0);
+
+ init_autofs_dev_ioctl(&param);
+ param.ioctlfd = ioctl_fd;
+
+ if (ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_PROTOVER, &param) < 0)
+ return -errno;
+
+ major = param.protover.version;
+
+ init_autofs_dev_ioctl(&param);
+ param.ioctlfd = ioctl_fd;
+
+ if (ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_PROTOSUBVER, &param) < 0)
+ return -errno;
+
+ minor = param.protosubver.sub_version;
+
+ log_debug("Autofs protocol version %u.%u", major, minor);
+ return 0;
+}
+
+static int autofs_set_timeout(int dev_autofs_fd, int ioctl_fd, usec_t usec) {
+ struct autofs_dev_ioctl param;
+
+ assert(dev_autofs_fd >= 0);
+ assert(ioctl_fd >= 0);
+
+ init_autofs_dev_ioctl(&param);
+ param.ioctlfd = ioctl_fd;
+
+ if (usec == USEC_INFINITY)
+ param.timeout.timeout = 0;
+ else
+ /* Convert to seconds, rounding up. */
+ param.timeout.timeout = DIV_ROUND_UP(usec, USEC_PER_SEC);
+
+ return RET_NERRNO(ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_TIMEOUT, &param));
+}
+
+static int autofs_send_ready(int dev_autofs_fd, int ioctl_fd, uint32_t token, int status) {
+ struct autofs_dev_ioctl param;
+
+ assert(dev_autofs_fd >= 0);
+ assert(ioctl_fd >= 0);
+
+ init_autofs_dev_ioctl(&param);
+ param.ioctlfd = ioctl_fd;
+
+ if (status != 0) {
+ param.fail.token = token;
+ param.fail.status = status;
+ } else
+ param.ready.token = token;
+
+ return RET_NERRNO(ioctl(dev_autofs_fd, status ? AUTOFS_DEV_IOCTL_FAIL : AUTOFS_DEV_IOCTL_READY, &param));
+}
+
+static int automount_send_ready(Automount *a, Set *tokens, int status) {
+ _cleanup_close_ int ioctl_fd = -1;
+ unsigned token;
+ int r;
+
+ assert(a);
+ assert(status <= 0);
+
+ if (set_isempty(tokens))
+ return 0;
+
+ ioctl_fd = open_ioctl_fd(UNIT(a)->manager->dev_autofs_fd, a->where, a->dev_id);
+ if (ioctl_fd < 0)
+ return ioctl_fd;
+
+ if (status != 0)
+ log_unit_debug_errno(UNIT(a), status, "Sending failure: %m");
+ else
+ log_unit_debug(UNIT(a), "Sending success.");
+
+ r = 0;
+
+ /* Autofs thankfully does not hand out 0 as a token */
+ while ((token = PTR_TO_UINT(set_steal_first(tokens)))) {
+ int k;
+
+ /* Autofs fun fact:
+ *
+ * if you pass a positive status code here, kernels
+ * prior to 4.12 will freeze! Yay! */
+
+ k = autofs_send_ready(UNIT(a)->manager->dev_autofs_fd,
+ ioctl_fd,
+ token,
+ status);
+ if (k < 0)
+ r = k;
+ }
+
+ return r;
+}
+
+static void automount_trigger_notify(Unit *u, Unit *other) {
+ Automount *a = AUTOMOUNT(u);
+ int r;
+
+ assert(a);
+ assert(other);
+
+ /* Filter out invocations with bogus state */
+ assert(UNIT_IS_LOAD_COMPLETE(other->load_state));
+ assert(other->type == UNIT_MOUNT);
+
+ /* Don't propagate state changes from the mount if we are already down */
+ if (!IN_SET(a->state, AUTOMOUNT_WAITING, AUTOMOUNT_RUNNING))
+ return;
+
+ /* Propagate start limit hit state */
+ if (other->start_limit_hit) {
+ automount_enter_dead(a, AUTOMOUNT_FAILURE_MOUNT_START_LIMIT_HIT);
+ return;
+ }
+
+ /* Don't propagate anything if there's still a job queued */
+ if (other->job)
+ return;
+
+ /* The mount is successfully established */
+ if (IN_SET(MOUNT(other)->state, MOUNT_MOUNTED, MOUNT_REMOUNTING)) {
+ (void) automount_send_ready(a, a->tokens, 0);
+
+ r = automount_start_expire(a);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(a), r, "Failed to start expiration timer, ignoring: %m");
+
+ automount_set_state(a, AUTOMOUNT_RUNNING);
+ }
+
+ if (IN_SET(MOUNT(other)->state,
+ MOUNT_MOUNTING, MOUNT_MOUNTING_DONE,
+ MOUNT_MOUNTED, MOUNT_REMOUNTING,
+ MOUNT_REMOUNTING_SIGTERM, MOUNT_REMOUNTING_SIGKILL,
+ MOUNT_UNMOUNTING_SIGTERM, MOUNT_UNMOUNTING_SIGKILL,
+ MOUNT_FAILED))
+ (void) automount_send_ready(a, a->expire_tokens, -ENODEV);
+
+ if (MOUNT(other)->state == MOUNT_DEAD)
+ (void) automount_send_ready(a, a->expire_tokens, 0);
+
+ /* The mount is in some unhappy state now, let's unfreeze any waiting clients */
+ if (IN_SET(MOUNT(other)->state,
+ MOUNT_DEAD, MOUNT_UNMOUNTING,
+ MOUNT_REMOUNTING_SIGTERM, MOUNT_REMOUNTING_SIGKILL,
+ MOUNT_UNMOUNTING_SIGTERM, MOUNT_UNMOUNTING_SIGKILL,
+ MOUNT_FAILED)) {
+
+ (void) automount_send_ready(a, a->tokens, -ENODEV);
+
+ automount_set_state(a, AUTOMOUNT_WAITING);
+ }
+}
+
+static void automount_enter_waiting(Automount *a) {
+ _cleanup_close_ int ioctl_fd = -1;
+ int p[2] = { -1, -1 };
+ char name[STRLEN("systemd-") + DECIMAL_STR_MAX(pid_t) + 1];
+ _cleanup_free_ char *options = NULL;
+ bool mounted = false;
+ int r, dev_autofs_fd;
+ struct stat st;
+
+ assert(a);
+ assert(a->pipe_fd < 0);
+ assert(a->where);
+
+ set_clear(a->tokens);
+
+ r = unit_fail_if_noncanonical(UNIT(a), a->where);
+ if (r < 0)
+ goto fail;
+
+ (void) mkdir_p_label(a->where, a->directory_mode);
+
+ unit_warn_if_dir_nonempty(UNIT(a), a->where);
+
+ dev_autofs_fd = open_dev_autofs(UNIT(a)->manager);
+ if (dev_autofs_fd < 0) {
+ r = dev_autofs_fd;
+ goto fail;
+ }
+
+ if (pipe2(p, O_CLOEXEC) < 0) {
+ r = -errno;
+ goto fail;
+ }
+ r = fd_nonblock(p[0], true);
+ if (r < 0)
+ goto fail;
+
+ if (asprintf(
+ &options,
+ "fd=%i,pgrp="PID_FMT",minproto=5,maxproto=5,direct%s%s",
+ p[1],
+ getpgrp(),
+ isempty(a->extra_options) ? "" : ",",
+ strempty(a->extra_options)) < 0) {
+ r = -ENOMEM;
+ goto fail;
+ }
+
+ xsprintf(name, "systemd-"PID_FMT, getpid_cached());
+ r = mount_nofollow(name, a->where, "autofs", 0, options);
+ if (r < 0)
+ goto fail;
+
+ mounted = true;
+
+ p[1] = safe_close(p[1]);
+
+ if (stat(a->where, &st) < 0) {
+ r = -errno;
+ goto fail;
+ }
+
+ ioctl_fd = open_ioctl_fd(dev_autofs_fd, a->where, st.st_dev);
+ if (ioctl_fd < 0) {
+ r = ioctl_fd;
+ goto fail;
+ }
+
+ r = autofs_protocol(dev_autofs_fd, ioctl_fd);
+ if (r < 0)
+ goto fail;
+
+ r = autofs_set_timeout(dev_autofs_fd, ioctl_fd, a->timeout_idle_usec);
+ if (r < 0)
+ goto fail;
+
+ r = sd_event_add_io(UNIT(a)->manager->event, &a->pipe_event_source, p[0], EPOLLIN, automount_dispatch_io, a);
+ if (r < 0)
+ goto fail;
+
+ (void) sd_event_source_set_description(a->pipe_event_source, "automount-io");
+
+ a->pipe_fd = p[0];
+ a->dev_id = st.st_dev;
+
+ automount_set_state(a, AUTOMOUNT_WAITING);
+
+ return;
+
+fail:
+ log_unit_error_errno(UNIT(a), r, "Failed to initialize automounter: %m");
+
+ safe_close_pair(p);
+
+ if (mounted) {
+ r = repeat_unmount(a->where, MNT_DETACH|UMOUNT_NOFOLLOW);
+ if (r < 0)
+ log_error_errno(r, "Failed to unmount, ignoring: %m");
+ }
+
+ automount_enter_dead(a, AUTOMOUNT_FAILURE_RESOURCES);
+}
+
+static void *expire_thread(void *p) {
+ struct autofs_dev_ioctl param;
+ _cleanup_(expire_data_freep) struct expire_data *data = p;
+ int r;
+
+ assert(data->dev_autofs_fd >= 0);
+ assert(data->ioctl_fd >= 0);
+
+ init_autofs_dev_ioctl(&param);
+ param.ioctlfd = data->ioctl_fd;
+
+ do {
+ r = ioctl(data->dev_autofs_fd, AUTOFS_DEV_IOCTL_EXPIRE, &param);
+ } while (r >= 0);
+
+ if (errno != EAGAIN)
+ log_warning_errno(errno, "Failed to expire automount, ignoring: %m");
+
+ return NULL;
+}
+
+static int automount_dispatch_expire(sd_event_source *source, usec_t usec, void *userdata) {
+ Automount *a = AUTOMOUNT(userdata);
+ _cleanup_(expire_data_freep) struct expire_data *data = NULL;
+ int r;
+
+ assert(a);
+ assert(source == a->expire_event_source);
+
+ data = new0(struct expire_data, 1);
+ if (!data)
+ return log_oom();
+
+ data->ioctl_fd = -1;
+
+ data->dev_autofs_fd = fcntl(UNIT(a)->manager->dev_autofs_fd, F_DUPFD_CLOEXEC, 3);
+ if (data->dev_autofs_fd < 0)
+ return log_unit_error_errno(UNIT(a), errno, "Failed to duplicate autofs fd: %m");
+
+ data->ioctl_fd = open_ioctl_fd(UNIT(a)->manager->dev_autofs_fd, a->where, a->dev_id);
+ if (data->ioctl_fd < 0)
+ return log_unit_error_errno(UNIT(a), data->ioctl_fd, "Couldn't open autofs ioctl fd: %m");
+
+ r = asynchronous_job(expire_thread, data);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(a), r, "Failed to start expire job: %m");
+
+ data = NULL;
+
+ return automount_start_expire(a);
+}
+
+static int automount_start_expire(Automount *a) {
+ usec_t timeout;
+ int r;
+
+ assert(a);
+
+ if (a->timeout_idle_usec == 0)
+ return 0;
+
+ timeout = MAX(a->timeout_idle_usec/3, USEC_PER_SEC);
+
+ if (a->expire_event_source) {
+ r = sd_event_source_set_time_relative(a->expire_event_source, timeout);
+ if (r < 0)
+ return r;
+
+ return sd_event_source_set_enabled(a->expire_event_source, SD_EVENT_ONESHOT);
+ }
+
+ r = sd_event_add_time_relative(
+ UNIT(a)->manager->event,
+ &a->expire_event_source,
+ CLOCK_MONOTONIC, timeout, 0,
+ automount_dispatch_expire, a);
+ if (r < 0)
+ return r;
+
+ (void) sd_event_source_set_description(a->expire_event_source, "automount-expire");
+
+ return 0;
+}
+
+static void automount_stop_expire(Automount *a) {
+ assert(a);
+
+ if (!a->expire_event_source)
+ return;
+
+ (void) sd_event_source_set_enabled(a->expire_event_source, SD_EVENT_OFF);
+}
+
+static void automount_enter_running(Automount *a) {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ Unit *trigger;
+ struct stat st;
+ int r;
+
+ assert(a);
+
+ /* If the user masked our unit in the meantime, fail */
+ if (UNIT(a)->load_state != UNIT_LOADED) {
+ log_unit_error(UNIT(a), "Suppressing automount event since unit is no longer loaded.");
+ goto fail;
+ }
+
+ /* We don't take mount requests anymore if we are supposed to
+ * shut down anyway */
+ if (unit_stop_pending(UNIT(a))) {
+ log_unit_debug(UNIT(a), "Suppressing automount request since unit stop is scheduled.");
+ automount_send_ready(a, a->tokens, -EHOSTDOWN);
+ automount_send_ready(a, a->expire_tokens, -EHOSTDOWN);
+ return;
+ }
+
+ (void) mkdir_p_label(a->where, a->directory_mode);
+
+ /* Before we do anything, let's see if somebody is playing games with us? */
+ if (lstat(a->where, &st) < 0) {
+ log_unit_warning_errno(UNIT(a), errno, "Failed to stat automount point: %m");
+ goto fail;
+ }
+
+ /* The mount unit may have been explicitly started before we got the
+ * autofs request. Ack it to unblock anything waiting on the mount point. */
+ if (!S_ISDIR(st.st_mode) || st.st_dev != a->dev_id) {
+ log_unit_info(UNIT(a), "Automount point already active?");
+ automount_send_ready(a, a->tokens, 0);
+ return;
+ }
+
+ trigger = UNIT_TRIGGER(UNIT(a));
+ if (!trigger) {
+ log_unit_error(UNIT(a), "Unit to trigger vanished.");
+ goto fail;
+ }
+
+ r = manager_add_job(UNIT(a)->manager, JOB_START, trigger, JOB_REPLACE, NULL, &error, NULL);
+ if (r < 0) {
+ log_unit_warning(UNIT(a), "Failed to queue mount startup job: %s", bus_error_message(&error, r));
+ goto fail;
+ }
+
+ automount_set_state(a, AUTOMOUNT_RUNNING);
+ return;
+
+fail:
+ automount_enter_dead(a, AUTOMOUNT_FAILURE_RESOURCES);
+}
+
+static int automount_start(Unit *u) {
+ Automount *a = AUTOMOUNT(u);
+ int r;
+
+ assert(a);
+ assert(IN_SET(a->state, AUTOMOUNT_DEAD, AUTOMOUNT_FAILED));
+
+ if (path_is_mount_point(a->where, NULL, 0) > 0)
+ return log_unit_error_errno(u, SYNTHETIC_ERRNO(EEXIST), "Path %s is already a mount point, refusing start.", a->where);
+
+ r = unit_test_trigger_loaded(u);
+ if (r < 0)
+ return r;
+
+ r = unit_acquire_invocation_id(u);
+ if (r < 0)
+ return r;
+
+ a->result = AUTOMOUNT_SUCCESS;
+ automount_enter_waiting(a);
+ return 1;
+}
+
+static int automount_stop(Unit *u) {
+ Automount *a = AUTOMOUNT(u);
+
+ assert(a);
+ assert(IN_SET(a->state, AUTOMOUNT_WAITING, AUTOMOUNT_RUNNING));
+
+ automount_enter_dead(a, AUTOMOUNT_SUCCESS);
+ return 1;
+}
+
+static int automount_serialize(Unit *u, FILE *f, FDSet *fds) {
+ Automount *a = AUTOMOUNT(u);
+ void *p;
+ int r;
+
+ assert(a);
+ assert(f);
+ assert(fds);
+
+ (void) serialize_item(f, "state", automount_state_to_string(a->state));
+ (void) serialize_item(f, "result", automount_result_to_string(a->result));
+ (void) serialize_item_format(f, "dev-id", "%lu", (unsigned long) a->dev_id);
+
+ SET_FOREACH(p, a->tokens)
+ (void) serialize_item_format(f, "token", "%u", PTR_TO_UINT(p));
+ SET_FOREACH(p, a->expire_tokens)
+ (void) serialize_item_format(f, "expire-token", "%u", PTR_TO_UINT(p));
+
+ r = serialize_fd(f, fds, "pipe-fd", a->pipe_fd);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+static int automount_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
+ Automount *a = AUTOMOUNT(u);
+ int r;
+
+ assert(a);
+ assert(fds);
+
+ if (streq(key, "state")) {
+ AutomountState state;
+
+ state = automount_state_from_string(value);
+ if (state < 0)
+ log_unit_debug(u, "Failed to parse state value: %s", value);
+ else
+ a->deserialized_state = state;
+ } else if (streq(key, "result")) {
+ AutomountResult f;
+
+ f = automount_result_from_string(value);
+ if (f < 0)
+ log_unit_debug(u, "Failed to parse result value: %s", value);
+ else if (f != AUTOMOUNT_SUCCESS)
+ a->result = f;
+
+ } else if (streq(key, "dev-id")) {
+ unsigned long d;
+
+ if (safe_atolu(value, &d) < 0)
+ log_unit_debug(u, "Failed to parse dev-id value: %s", value);
+ else
+ a->dev_id = (dev_t) d;
+
+ } else if (streq(key, "token")) {
+ unsigned token;
+
+ if (safe_atou(value, &token) < 0)
+ log_unit_debug(u, "Failed to parse token value: %s", value);
+ else {
+ r = set_ensure_put(&a->tokens, NULL, UINT_TO_PTR(token));
+ if (r < 0)
+ log_unit_error_errno(u, r, "Failed to add token to set: %m");
+ }
+ } else if (streq(key, "expire-token")) {
+ unsigned token;
+
+ if (safe_atou(value, &token) < 0)
+ log_unit_debug(u, "Failed to parse token value: %s", value);
+ else {
+ r = set_ensure_put(&a->expire_tokens, NULL, UINT_TO_PTR(token));
+ if (r < 0)
+ log_unit_error_errno(u, r, "Failed to add expire token to set: %m");
+ }
+ } else if (streq(key, "pipe-fd")) {
+ int fd;
+
+ if (safe_atoi(value, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd))
+ log_unit_debug(u, "Failed to parse pipe-fd value: %s", value);
+ else {
+ safe_close(a->pipe_fd);
+ a->pipe_fd = fdset_remove(fds, fd);
+ }
+ } else
+ log_unit_debug(u, "Unknown serialization key: %s", key);
+
+ return 0;
+}
+
+static UnitActiveState automount_active_state(Unit *u) {
+ assert(u);
+
+ return state_translation_table[AUTOMOUNT(u)->state];
+}
+
+static const char *automount_sub_state_to_string(Unit *u) {
+ assert(u);
+
+ return automount_state_to_string(AUTOMOUNT(u)->state);
+}
+
+static bool automount_may_gc(Unit *u) {
+ Unit *t;
+
+ assert(u);
+
+ t = UNIT_TRIGGER(u);
+ if (!t)
+ return true;
+
+ return UNIT_VTABLE(t)->may_gc(t);
+}
+
+static int automount_dispatch_io(sd_event_source *s, int fd, uint32_t events, void *userdata) {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ union autofs_v5_packet_union packet;
+ Automount *a = AUTOMOUNT(userdata);
+ Unit *trigger;
+ int r;
+
+ assert(a);
+ assert(fd == a->pipe_fd);
+
+ if (events & (EPOLLHUP|EPOLLERR)) {
+ log_unit_error(UNIT(a), "Got hangup/error on autofs pipe from kernel. Likely our automount point has been unmounted by someone or something else?");
+ automount_enter_dead(a, AUTOMOUNT_FAILURE_UNMOUNTED);
+ return 0;
+ }
+
+ if (events != EPOLLIN) {
+ log_unit_error(UNIT(a), "Got invalid poll event %"PRIu32" on pipe (fd=%d)", events, fd);
+ goto fail;
+ }
+
+ r = loop_read_exact(a->pipe_fd, &packet, sizeof(packet), true);
+ if (r < 0) {
+ log_unit_error_errno(UNIT(a), r, "Invalid read from pipe: %m");
+ goto fail;
+ }
+
+ switch (packet.hdr.type) {
+
+ case autofs_ptype_missing_direct:
+
+ if (packet.v5_packet.pid > 0) {
+ _cleanup_free_ char *p = NULL;
+
+ (void) get_process_comm(packet.v5_packet.pid, &p);
+ log_unit_info(UNIT(a), "Got automount request for %s, triggered by %"PRIu32" (%s)", a->where, packet.v5_packet.pid, strna(p));
+ } else
+ log_unit_debug(UNIT(a), "Got direct mount request on %s", a->where);
+
+ r = set_ensure_put(&a->tokens, NULL, UINT_TO_PTR(packet.v5_packet.wait_queue_token));
+ if (r < 0) {
+ log_unit_error_errno(UNIT(a), r, "Failed to remember token: %m");
+ goto fail;
+ }
+
+ automount_enter_running(a);
+ break;
+
+ case autofs_ptype_expire_direct:
+ log_unit_debug(UNIT(a), "Got direct umount request on %s", a->where);
+
+ automount_stop_expire(a);
+
+ r = set_ensure_put(&a->expire_tokens, NULL, UINT_TO_PTR(packet.v5_packet.wait_queue_token));
+ if (r < 0) {
+ log_unit_error_errno(UNIT(a), r, "Failed to remember token: %m");
+ goto fail;
+ }
+
+ trigger = UNIT_TRIGGER(UNIT(a));
+ if (!trigger) {
+ log_unit_error(UNIT(a), "Unit to trigger vanished.");
+ goto fail;
+ }
+
+ r = manager_add_job(UNIT(a)->manager, JOB_STOP, trigger, JOB_REPLACE, NULL, &error, NULL);
+ if (r < 0) {
+ log_unit_warning(UNIT(a), "Failed to queue unmount job: %s", bus_error_message(&error, r));
+ goto fail;
+ }
+ break;
+
+ default:
+ log_unit_error(UNIT(a), "Received unknown automount request %i", packet.hdr.type);
+ break;
+ }
+
+ return 0;
+
+fail:
+ automount_enter_dead(a, AUTOMOUNT_FAILURE_RESOURCES);
+ return 0;
+}
+
+static void automount_shutdown(Manager *m) {
+ assert(m);
+
+ m->dev_autofs_fd = safe_close(m->dev_autofs_fd);
+}
+
+static void automount_reset_failed(Unit *u) {
+ Automount *a = AUTOMOUNT(u);
+
+ assert(a);
+
+ if (a->state == AUTOMOUNT_FAILED)
+ automount_set_state(a, AUTOMOUNT_DEAD);
+
+ a->result = AUTOMOUNT_SUCCESS;
+}
+
+static bool automount_supported(void) {
+ static int supported = -1;
+
+ if (supported < 0)
+ supported = access("/dev/autofs", F_OK) >= 0;
+
+ return supported;
+}
+
+static int automount_can_start(Unit *u) {
+ Automount *a = AUTOMOUNT(u);
+ int r;
+
+ assert(a);
+
+ r = unit_test_start_limit(u);
+ if (r < 0) {
+ automount_enter_dead(a, AUTOMOUNT_FAILURE_START_LIMIT_HIT);
+ return r;
+ }
+
+ return 1;
+}
+
+static const char* const automount_result_table[_AUTOMOUNT_RESULT_MAX] = {
+ [AUTOMOUNT_SUCCESS] = "success",
+ [AUTOMOUNT_FAILURE_RESOURCES] = "resources",
+ [AUTOMOUNT_FAILURE_START_LIMIT_HIT] = "start-limit-hit",
+ [AUTOMOUNT_FAILURE_MOUNT_START_LIMIT_HIT] = "mount-start-limit-hit",
+ [AUTOMOUNT_FAILURE_UNMOUNTED] = "unmounted",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(automount_result, AutomountResult);
+
+const UnitVTable automount_vtable = {
+ .object_size = sizeof(Automount),
+
+ .sections =
+ "Unit\0"
+ "Automount\0"
+ "Install\0",
+ .private_section = "Automount",
+
+ .can_transient = true,
+ .can_fail = true,
+ .can_trigger = true,
+ .exclude_from_switch_root_serialization = true,
+
+ .init = automount_init,
+ .load = automount_load,
+ .done = automount_done,
+
+ .coldplug = automount_coldplug,
+
+ .dump = automount_dump,
+
+ .start = automount_start,
+ .stop = automount_stop,
+
+ .serialize = automount_serialize,
+ .deserialize_item = automount_deserialize_item,
+
+ .active_state = automount_active_state,
+ .sub_state_to_string = automount_sub_state_to_string,
+
+ .may_gc = automount_may_gc,
+
+ .trigger_notify = automount_trigger_notify,
+
+ .reset_failed = automount_reset_failed,
+
+ .bus_set_property = bus_automount_set_property,
+
+ .shutdown = automount_shutdown,
+ .supported = automount_supported,
+
+ .status_message_formats = {
+ .finished_start_job = {
+ [JOB_DONE] = "Set up automount %s.",
+ [JOB_FAILED] = "Failed to set up automount %s.",
+ },
+ .finished_stop_job = {
+ [JOB_DONE] = "Unset automount %s.",
+ [JOB_FAILED] = "Failed to unset automount %s.",
+ },
+ },
+
+ .can_start = automount_can_start,
+};
diff --git a/src/core/automount.h b/src/core/automount.h
new file mode 100644
index 0000000..e413f23
--- /dev/null
+++ b/src/core/automount.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct Automount Automount;
+
+#include "unit.h"
+
+typedef enum AutomountResult {
+ AUTOMOUNT_SUCCESS,
+ AUTOMOUNT_FAILURE_RESOURCES,
+ AUTOMOUNT_FAILURE_UNMOUNTED,
+ AUTOMOUNT_FAILURE_START_LIMIT_HIT,
+ AUTOMOUNT_FAILURE_MOUNT_START_LIMIT_HIT,
+ _AUTOMOUNT_RESULT_MAX,
+ _AUTOMOUNT_RESULT_INVALID = -EINVAL,
+} AutomountResult;
+
+struct Automount {
+ Unit meta;
+
+ AutomountState state, deserialized_state;
+
+ char *where;
+ char *extra_options;
+ usec_t timeout_idle_usec;
+
+ int pipe_fd;
+ sd_event_source *pipe_event_source;
+ mode_t directory_mode;
+ dev_t dev_id;
+
+ Set *tokens;
+ Set *expire_tokens;
+
+ sd_event_source *expire_event_source;
+
+ AutomountResult result;
+};
+
+extern const UnitVTable automount_vtable;
+
+const char* automount_result_to_string(AutomountResult i) _const_;
+AutomountResult automount_result_from_string(const char *s) _pure_;
+
+DEFINE_CAST(AUTOMOUNT, Automount);
diff --git a/src/core/bpf-devices.c b/src/core/bpf-devices.c
new file mode 100644
index 0000000..3af9e78
--- /dev/null
+++ b/src/core/bpf-devices.c
@@ -0,0 +1,531 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fnmatch.h>
+#include <linux/bpf_insn.h>
+
+#include "bpf-devices.h"
+#include "bpf-program.h"
+#include "devnum-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "nulstr-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "stdio-util.h"
+#include "string-util.h"
+
+#define PASS_JUMP_OFF 4096
+
+static int bpf_access_type(const char *acc) {
+ int r = 0;
+
+ assert(acc);
+
+ for (; *acc; acc++)
+ switch (*acc) {
+ case 'r':
+ r |= BPF_DEVCG_ACC_READ;
+ break;
+ case 'w':
+ r |= BPF_DEVCG_ACC_WRITE;
+ break;
+ case 'm':
+ r |= BPF_DEVCG_ACC_MKNOD;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return r;
+}
+
+static int bpf_prog_allow_list_device(
+ BPFProgram *prog,
+ char type,
+ int major,
+ int minor,
+ const char *acc) {
+
+ int r, access;
+
+ assert(prog);
+ assert(acc);
+
+ log_trace("%s: %c %d:%d %s", __func__, type, major, minor, acc);
+
+ access = bpf_access_type(acc);
+ if (access <= 0)
+ return -EINVAL;
+
+ assert(IN_SET(type, 'b', 'c'));
+ const int bpf_type = type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK;
+
+ const struct bpf_insn insn[] = {
+ BPF_MOV32_REG(BPF_REG_1, BPF_REG_3),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_1, access),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, 4), /* compare access type */
+
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_2, bpf_type, 3), /* compare device type */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_4, major, 2), /* compare major */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_5, minor, 1), /* compare minor */
+ BPF_JMP_A(PASS_JUMP_OFF), /* jump to PASS */
+ };
+
+ if (FLAGS_SET(access, BPF_DEVCG_ACC_READ | BPF_DEVCG_ACC_WRITE | BPF_DEVCG_ACC_MKNOD))
+ r = bpf_program_add_instructions(prog, insn + 3, ELEMENTSOF(insn) - 3);
+ else
+ r = bpf_program_add_instructions(prog, insn, ELEMENTSOF(insn));
+ if (r < 0)
+ log_error_errno(r, "Extending device control BPF program failed: %m");
+
+ return r;
+}
+
+static int bpf_prog_allow_list_major(
+ BPFProgram *prog,
+ char type,
+ int major,
+ const char *acc) {
+
+ int r, access;
+
+ assert(prog);
+ assert(acc);
+
+ log_trace("%s: %c %d:* %s", __func__, type, major, acc);
+
+ access = bpf_access_type(acc);
+ if (access <= 0)
+ return -EINVAL;
+
+ assert(IN_SET(type, 'b', 'c'));
+ const int bpf_type = type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK;
+
+ const struct bpf_insn insn[] = {
+ BPF_MOV32_REG(BPF_REG_1, BPF_REG_3),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_1, access),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, 3), /* compare access type */
+
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_2, bpf_type, 2), /* compare device type */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_4, major, 1), /* compare major */
+ BPF_JMP_A(PASS_JUMP_OFF), /* jump to PASS */
+ };
+
+ if (FLAGS_SET(access, BPF_DEVCG_ACC_READ | BPF_DEVCG_ACC_WRITE | BPF_DEVCG_ACC_MKNOD))
+ r = bpf_program_add_instructions(prog, insn + 3, ELEMENTSOF(insn) - 3);
+ else
+ r = bpf_program_add_instructions(prog, insn, ELEMENTSOF(insn));
+ if (r < 0)
+ log_error_errno(r, "Extending device control BPF program failed: %m");
+
+ return r;
+}
+
+static int bpf_prog_allow_list_class(
+ BPFProgram *prog,
+ char type,
+ const char *acc) {
+
+ int r, access;
+
+ assert(prog);
+ assert(acc);
+
+ log_trace("%s: %c *:* %s", __func__, type, acc);
+
+ access = bpf_access_type(acc);
+ if (access <= 0)
+ return -EINVAL;
+
+ assert(IN_SET(type, 'b', 'c'));
+ const int bpf_type = type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK;
+
+ const struct bpf_insn insn[] = {
+ BPF_MOV32_REG(BPF_REG_1, BPF_REG_3),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_1, access),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, 2), /* compare access type */
+
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_2, bpf_type, 1), /* compare device type */
+ BPF_JMP_A(PASS_JUMP_OFF), /* jump to PASS */
+ };
+
+ if (FLAGS_SET(access, BPF_DEVCG_ACC_READ | BPF_DEVCG_ACC_WRITE | BPF_DEVCG_ACC_MKNOD))
+ r = bpf_program_add_instructions(prog, insn + 3, ELEMENTSOF(insn) - 3);
+ else
+ r = bpf_program_add_instructions(prog, insn, ELEMENTSOF(insn));
+ if (r < 0)
+ log_error_errno(r, "Extending device control BPF program failed: %m");
+
+ return r;
+}
+
+int bpf_devices_cgroup_init(
+ BPFProgram **ret,
+ CGroupDevicePolicy policy,
+ bool allow_list) {
+
+ const struct bpf_insn pre_insn[] = {
+ /* load device type to r2 */
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct bpf_cgroup_dev_ctx, access_type)),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),
+
+ /* load access type to r3 */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct bpf_cgroup_dev_ctx, access_type)),
+ BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
+
+ /* load major number to r4 */
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
+ offsetof(struct bpf_cgroup_dev_ctx, major)),
+
+ /* load minor number to r5 */
+ BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1,
+ offsetof(struct bpf_cgroup_dev_ctx, minor)),
+ };
+
+ _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
+ int r;
+
+ assert(ret);
+
+ if (policy == CGROUP_DEVICE_POLICY_AUTO && !allow_list)
+ return 0;
+
+ r = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE, "sd_devices", &prog);
+ if (r < 0)
+ return log_error_errno(r, "Loading device control BPF program failed: %m");
+
+ if (policy == CGROUP_DEVICE_POLICY_CLOSED || allow_list) {
+ r = bpf_program_add_instructions(prog, pre_insn, ELEMENTSOF(pre_insn));
+ if (r < 0)
+ return log_error_errno(r, "Extending device control BPF program failed: %m");
+ }
+
+ *ret = TAKE_PTR(prog);
+
+ return 0;
+}
+
+int bpf_devices_apply_policy(
+ BPFProgram **prog,
+ CGroupDevicePolicy policy,
+ bool allow_list,
+ const char *cgroup_path,
+ BPFProgram **prog_installed) {
+
+ _cleanup_free_ char *controller_path = NULL;
+ int r;
+
+ /* This will assign *prog_installed if everything goes well. */
+
+ assert(prog);
+ if (!*prog)
+ goto finish;
+
+ const bool deny_everything = policy == CGROUP_DEVICE_POLICY_STRICT && !allow_list;
+
+ const struct bpf_insn post_insn[] = {
+ /* return DENY */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_JMP_A(1),
+ };
+
+ const struct bpf_insn exit_insn[] = {
+ /* finally return DENY if deny_everything else ALLOW */
+ BPF_MOV64_IMM(BPF_REG_0, deny_everything ? 0 : 1),
+ BPF_EXIT_INSN()
+ };
+
+ if (!deny_everything) {
+ r = bpf_program_add_instructions(*prog, post_insn, ELEMENTSOF(post_insn));
+ if (r < 0)
+ return log_error_errno(r, "Extending device control BPF program failed: %m");
+
+ /* Fixup PASS_JUMP_OFF jump offsets. */
+ for (size_t off = 0; off < (*prog)->n_instructions; off++) {
+ struct bpf_insn *ins = &((*prog)->instructions[off]);
+
+ if (ins->code == (BPF_JMP | BPF_JA) && ins->off == PASS_JUMP_OFF)
+ ins->off = (*prog)->n_instructions - off - 1;
+ }
+ }
+
+ r = bpf_program_add_instructions(*prog, exit_insn, ELEMENTSOF(exit_insn));
+ if (r < 0)
+ return log_error_errno(r, "Extending device control BPF program failed: %m");
+
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, NULL, &controller_path);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine cgroup path: %m");
+
+ r = bpf_program_cgroup_attach(*prog, BPF_CGROUP_DEVICE, controller_path, BPF_F_ALLOW_MULTI);
+ if (r < 0)
+ return log_error_errno(r, "Attaching device control BPF program to cgroup %s failed: %m",
+ empty_to_root(cgroup_path));
+
+ finish:
+ /* Unref the old BPF program (which will implicitly detach it) right before attaching the new program. */
+ if (prog_installed) {
+ bpf_program_free(*prog_installed);
+ *prog_installed = TAKE_PTR(*prog);
+ }
+ return 0;
+}
+
+int bpf_devices_supported(void) {
+ const struct bpf_insn trivial[] = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN()
+ };
+
+ _cleanup_(bpf_program_freep) BPFProgram *program = NULL;
+ static int supported = -1;
+ int r;
+
+ /* Checks whether BPF device controller is supported. For this, we check five things:
+ *
+ * a) whether we are privileged
+ * b) whether the unified hierarchy is being used
+ * c) the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_DEVICE programs, which we require
+ */
+
+ if (supported >= 0)
+ return supported;
+
+ if (geteuid() != 0) {
+ log_debug("Not enough privileges, BPF device control is not supported.");
+ return supported = 0;
+ }
+
+ r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
+ if (r < 0)
+ return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m");
+ if (r == 0) {
+ log_debug("Not running with unified cgroups, BPF device control is not supported.");
+ return supported = 0;
+ }
+
+ r = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE, "sd_devices", &program);
+ if (r < 0) {
+ log_debug_errno(r, "Can't allocate CGROUP DEVICE BPF program, BPF device control is not supported: %m");
+ return supported = 0;
+ }
+
+ r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
+ if (r < 0) {
+ log_debug_errno(r, "Can't add trivial instructions to CGROUP DEVICE BPF program, BPF device control is not supported: %m");
+ return supported = 0;
+ }
+
+ r = bpf_program_load_kernel(program, NULL, 0);
+ if (r < 0) {
+ log_debug_errno(r, "Can't load kernel CGROUP DEVICE BPF program, BPF device control is not supported: %m");
+ return supported = 0;
+ }
+
+ return supported = 1;
+}
+
+static int allow_list_device_pattern(
+ BPFProgram *prog,
+ const char *path,
+ char type,
+ const unsigned *maj,
+ const unsigned *min,
+ const char *acc) {
+
+ assert(IN_SET(type, 'b', 'c'));
+
+ if (cg_all_unified() > 0) {
+ if (!prog)
+ return 0;
+
+ if (maj && min)
+ return bpf_prog_allow_list_device(prog, type, *maj, *min, acc);
+ else if (maj)
+ return bpf_prog_allow_list_major(prog, type, *maj, acc);
+ else
+ return bpf_prog_allow_list_class(prog, type, acc);
+
+ } else {
+ char buf[2+DECIMAL_STR_MAX(unsigned)*2+2+4];
+ int r;
+
+ if (maj && min)
+ xsprintf(buf, "%c %u:%u %s", type, *maj, *min, acc);
+ else if (maj)
+ xsprintf(buf, "%c %u:* %s", type, *maj, acc);
+ else
+ xsprintf(buf, "%c *:* %s", type, acc);
+
+ /* Changing the devices list of a populated cgroup might result in EINVAL, hence ignore
+ * EINVAL here. */
+
+ r = cg_set_attribute("devices", path, "devices.allow", buf);
+ if (r < 0)
+ log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING,
+ r, "Failed to set devices.allow on %s: %m", path);
+
+ return r;
+ }
+}
+
+int bpf_devices_allow_list_device(
+ BPFProgram *prog,
+ const char *path,
+ const char *node,
+ const char *acc) {
+
+ mode_t mode;
+ dev_t rdev;
+ int r;
+
+ assert(path);
+ assert(acc);
+ assert(strlen(acc) <= 3);
+
+ log_trace("%s: %s %s", __func__, node, acc);
+
+ /* Some special handling for /dev/block/%u:%u, /dev/char/%u:%u, /run/systemd/inaccessible/chr and
+ * /run/systemd/inaccessible/blk paths. Instead of stat()ing these we parse out the major/minor directly. This
+ * means clients can use these path without the device node actually around */
+ r = device_path_parse_major_minor(node, &mode, &rdev);
+ if (r < 0) {
+ if (r != -ENODEV)
+ return log_warning_errno(r, "Couldn't parse major/minor from device path '%s': %m", node);
+
+ struct stat st;
+ if (stat(node, &st) < 0)
+ return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
+
+ if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
+ return log_warning_errno(SYNTHETIC_ERRNO(ENODEV), "%s is not a device.", node);
+
+ mode = st.st_mode;
+ rdev = (dev_t) st.st_rdev;
+ }
+
+ unsigned maj = major(rdev), min = minor(rdev);
+ return allow_list_device_pattern(prog, path, S_ISCHR(mode) ? 'c' : 'b', &maj, &min, acc);
+}
+
+int bpf_devices_allow_list_major(
+ BPFProgram *prog,
+ const char *path,
+ const char *name,
+ char type,
+ const char *acc) {
+
+ unsigned maj;
+ int r;
+
+ assert(path);
+ assert(acc);
+ assert(IN_SET(type, 'b', 'c'));
+
+ if (streq(name, "*"))
+ /* If the name is a wildcard, then apply this list to all devices of this type */
+ return allow_list_device_pattern(prog, path, type, NULL, NULL, acc);
+
+ if (safe_atou(name, &maj) >= 0 && DEVICE_MAJOR_VALID(maj))
+ /* The name is numeric and suitable as major. In that case, let's take its major, and create
+ * the entry directly. */
+ return allow_list_device_pattern(prog, path, type, &maj, NULL, acc);
+
+ _cleanup_fclose_ FILE *f = NULL;
+ bool good = false, any = false;
+
+ f = fopen("/proc/devices", "re");
+ if (!f)
+ return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s: %m", name);
+
+ for (;;) {
+ _cleanup_free_ char *line = NULL;
+ char *w, *p;
+
+ r = read_line(f, LONG_LINE_MAX, &line);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to read /proc/devices: %m");
+ if (r == 0)
+ break;
+
+ if (type == 'c' && streq(line, "Character devices:")) {
+ good = true;
+ continue;
+ }
+
+ if (type == 'b' && streq(line, "Block devices:")) {
+ good = true;
+ continue;
+ }
+
+ if (isempty(line)) {
+ good = false;
+ continue;
+ }
+
+ if (!good)
+ continue;
+
+ p = strstrip(line);
+
+ w = strpbrk(p, WHITESPACE);
+ if (!w)
+ continue;
+ *w = 0;
+
+ r = safe_atou(p, &maj);
+ if (r < 0)
+ continue;
+ if (maj <= 0)
+ continue;
+
+ w++;
+ w += strspn(w, WHITESPACE);
+
+ if (fnmatch(name, w, 0) != 0)
+ continue;
+
+ any = true;
+ (void) allow_list_device_pattern(prog, path, type, &maj, NULL, acc);
+ }
+
+ if (!any)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOENT),
+ "Device allow list pattern \"%s\" did not match anything.", name);
+
+ return 0;
+}
+
+int bpf_devices_allow_list_static(
+ BPFProgram *prog,
+ const char *path) {
+
+ static const char auto_devices[] =
+ "/dev/null\0" "rwm\0"
+ "/dev/zero\0" "rwm\0"
+ "/dev/full\0" "rwm\0"
+ "/dev/random\0" "rwm\0"
+ "/dev/urandom\0" "rwm\0"
+ "/dev/tty\0" "rwm\0"
+ "/dev/ptmx\0" "rwm\0"
+ /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
+ "/run/systemd/inaccessible/chr\0" "rwm\0"
+ "/run/systemd/inaccessible/blk\0" "rwm\0";
+ int r = 0, k;
+
+ const char *node, *acc;
+ NULSTR_FOREACH_PAIR(node, acc, auto_devices) {
+ k = bpf_devices_allow_list_device(prog, path, node, acc);
+ if (r >= 0 && k < 0)
+ r = k;
+ }
+
+ /* PTS (/dev/pts) devices may not be duplicated, but accessed */
+ k = bpf_devices_allow_list_major(prog, path, "pts", 'c', "rw");
+ if (r >= 0 && k < 0)
+ r = k;
+
+ return r;
+}
diff --git a/src/core/bpf-devices.h b/src/core/bpf-devices.h
new file mode 100644
index 0000000..5106364
--- /dev/null
+++ b/src/core/bpf-devices.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <inttypes.h>
+
+#include "cgroup.h"
+
+typedef struct BPFProgram BPFProgram;
+
+int bpf_devices_cgroup_init(BPFProgram **ret, CGroupDevicePolicy policy, bool allow_list);
+int bpf_devices_apply_policy(
+ BPFProgram **prog,
+ CGroupDevicePolicy policy,
+ bool allow_list,
+ const char *cgroup_path,
+ BPFProgram **prog_installed);
+
+int bpf_devices_supported(void);
+int bpf_devices_allow_list_device(BPFProgram *prog, const char *path, const char *node, const char *acc);
+int bpf_devices_allow_list_major(BPFProgram *prog, const char *path, const char *name, char type, const char *acc);
+int bpf_devices_allow_list_static(BPFProgram *prog, const char *path);
diff --git a/src/core/bpf-firewall.c b/src/core/bpf-firewall.c
new file mode 100644
index 0000000..ce3b76c
--- /dev/null
+++ b/src/core/bpf-firewall.c
@@ -0,0 +1,969 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <arpa/inet.h>
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/bpf_insn.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "bpf-firewall.h"
+#include "bpf-program.h"
+#include "fd-util.h"
+#include "in-addr-prefix-util.h"
+#include "memory-util.h"
+#include "missing_syscall.h"
+#include "unit.h"
+#include "strv.h"
+#include "virt.h"
+
+enum {
+ MAP_KEY_PACKETS,
+ MAP_KEY_BYTES,
+};
+
+enum {
+ ACCESS_ALLOWED = 1,
+ ACCESS_DENIED = 2,
+};
+
+/* Compile instructions for one list of addresses, one direction and one specific verdict on matches. */
+
+static int add_lookup_instructions(
+ BPFProgram *p,
+ int map_fd,
+ int protocol,
+ bool is_ingress,
+ int verdict) {
+
+ int r, addr_offset, addr_size;
+
+ assert(p);
+ assert(map_fd >= 0);
+
+ switch (protocol) {
+
+ case ETH_P_IP:
+ addr_size = sizeof(uint32_t);
+ addr_offset = is_ingress ?
+ offsetof(struct iphdr, saddr) :
+ offsetof(struct iphdr, daddr);
+ break;
+
+ case ETH_P_IPV6:
+ addr_size = 4 * sizeof(uint32_t);
+ addr_offset = is_ingress ?
+ offsetof(struct ip6_hdr, ip6_src.s6_addr) :
+ offsetof(struct ip6_hdr, ip6_dst.s6_addr);
+ break;
+
+ default:
+ return -EAFNOSUPPORT;
+ }
+
+ do {
+ /* Compare IPv4 with one word instruction (32bit) */
+ struct bpf_insn insn[] = {
+ /* If skb->protocol != ETH_P_IP, skip this whole block. The offset will be set later. */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0),
+
+ /*
+ * Call into BPF_FUNC_skb_load_bytes to load the dst/src IP address
+ *
+ * R1: Pointer to the skb
+ * R2: Data offset
+ * R3: Destination buffer on the stack (r10 - 4)
+ * R4: Number of bytes to read (4)
+ */
+
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_MOV32_IMM(BPF_REG_2, addr_offset),
+
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size),
+
+ BPF_MOV32_IMM(BPF_REG_4, addr_size),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),
+
+ /*
+ * Call into BPF_FUNC_map_lookup_elem to see if the address matches any entry in the
+ * LPM trie map. For this to work, the prefixlen field of 'struct bpf_lpm_trie_key'
+ * has to be set to the maximum possible value.
+ *
+ * On success, the looked up value is stored in R0. For this application, the actual
+ * value doesn't matter, however; we just set the bit in @verdict in R8 if we found any
+ * matching value.
+ */
+
+ BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)),
+ BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8),
+
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+ BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
+ };
+
+ /* Jump label fixup */
+ insn[0].off = ELEMENTSOF(insn) - 1;
+
+ r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
+ if (r < 0)
+ return r;
+
+ } while (false);
+
+ return 0;
+}
+
+static int add_instructions_for_ip_any(
+ BPFProgram *p,
+ int verdict) {
+ int r;
+
+ assert(p);
+
+ const struct bpf_insn insn[] = {
+ BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
+ };
+
+ r = bpf_program_add_instructions(p, insn, 1);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+static int bpf_firewall_compile_bpf(
+ Unit *u,
+ const char *prog_name,
+ bool is_ingress,
+ BPFProgram **ret,
+ bool ip_allow_any,
+ bool ip_deny_any) {
+
+ const struct bpf_insn pre_insn[] = {
+ /*
+ * When the eBPF program is entered, R1 contains the address of the skb.
+ * However, R1-R5 are scratch registers that are not preserved when calling
+ * into kernel functions, so we need to save anything that's supposed to
+ * stay around to R6-R9. Save the skb to R6.
+ */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+
+ /*
+ * Although we cannot access the skb data directly from eBPF programs used in this
+ * scenario, the kernel has prepared some fields for us to access through struct __sk_buff.
+ * Load the protocol (IPv4, IPv6) used by the packet in flight once and cache it in R7
+ * for later use.
+ */
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct __sk_buff, protocol)),
+
+ /*
+ * R8 is used to keep track of whether any address check has explicitly allowed or denied the packet
+ * through ACCESS_DENIED or ACCESS_ALLOWED bits. Reset them both to 0 in the beginning.
+ */
+ BPF_MOV32_IMM(BPF_REG_8, 0),
+ };
+
+ /*
+ * The access checkers compiled for the configured allowance and denial lists
+ * write to R8 at runtime. The following code prepares for an early exit that
+ * skip the accounting if the packet is denied.
+ *
+ * R0 = 1
+ * if (R8 == ACCESS_DENIED)
+ * R0 = 0
+ *
+ * This means that if both ACCESS_DENIED and ACCESS_ALLOWED are set, the packet
+ * is allowed to pass.
+ */
+ const struct bpf_insn post_insn[] = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ };
+
+ _cleanup_(bpf_program_freep) BPFProgram *p = NULL;
+ int accounting_map_fd, r;
+ bool access_enabled;
+
+ assert(u);
+ assert(ret);
+
+ accounting_map_fd = is_ingress ?
+ u->ip_accounting_ingress_map_fd :
+ u->ip_accounting_egress_map_fd;
+
+ access_enabled =
+ u->ipv4_allow_map_fd >= 0 ||
+ u->ipv6_allow_map_fd >= 0 ||
+ u->ipv4_deny_map_fd >= 0 ||
+ u->ipv6_deny_map_fd >= 0 ||
+ ip_allow_any ||
+ ip_deny_any;
+
+ if (accounting_map_fd < 0 && !access_enabled) {
+ *ret = NULL;
+ return 0;
+ }
+
+ r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, prog_name, &p);
+ if (r < 0)
+ return r;
+
+ r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn));
+ if (r < 0)
+ return r;
+
+ if (access_enabled) {
+ /*
+ * The simple rule this function translates into eBPF instructions is:
+ *
+ * - Access will be granted when an address matches an entry in @list_allow
+ * - Otherwise, access will be denied when an address matches an entry in @list_deny
+ * - Otherwise, access will be granted
+ */
+
+ if (u->ipv4_deny_map_fd >= 0) {
+ r = add_lookup_instructions(p, u->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED);
+ if (r < 0)
+ return r;
+ }
+
+ if (u->ipv6_deny_map_fd >= 0) {
+ r = add_lookup_instructions(p, u->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED);
+ if (r < 0)
+ return r;
+ }
+
+ if (u->ipv4_allow_map_fd >= 0) {
+ r = add_lookup_instructions(p, u->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED);
+ if (r < 0)
+ return r;
+ }
+
+ if (u->ipv6_allow_map_fd >= 0) {
+ r = add_lookup_instructions(p, u->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED);
+ if (r < 0)
+ return r;
+ }
+
+ if (ip_allow_any) {
+ r = add_instructions_for_ip_any(p, ACCESS_ALLOWED);
+ if (r < 0)
+ return r;
+ }
+
+ if (ip_deny_any) {
+ r = add_instructions_for_ip_any(p, ACCESS_DENIED);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn));
+ if (r < 0)
+ return r;
+
+ if (accounting_map_fd >= 0) {
+ struct bpf_insn insn[] = {
+ /*
+ * If R0 == 0, the packet will be denied; skip the accounting instructions in this case.
+ * The jump label will be fixed up later.
+ */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0),
+
+ /* Count packets */
+ BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
+ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
+ BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
+ BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+
+ /* Count bytes */
+ BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
+ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
+ BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
+ BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+
+ /* Allow the packet to pass */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ };
+
+ /* Jump label fixup */
+ insn[0].off = ELEMENTSOF(insn) - 1;
+
+ r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
+ if (r < 0)
+ return r;
+ }
+
+ do {
+ /*
+ * Exit from the eBPF program, R0 contains the verdict.
+ * 0 means the packet is denied, 1 means the packet may pass.
+ */
+ const struct bpf_insn insn[] = {
+ BPF_EXIT_INSN()
+ };
+
+ r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
+ if (r < 0)
+ return r;
+ } while (false);
+
+ *ret = TAKE_PTR(p);
+
+ return 0;
+}
+
+static int bpf_firewall_count_access_items(Set *prefixes, size_t *n_ipv4, size_t *n_ipv6) {
+ struct in_addr_prefix *a;
+
+ assert(n_ipv4);
+ assert(n_ipv6);
+
+ SET_FOREACH(a, prefixes)
+ switch (a->family) {
+
+ case AF_INET:
+ (*n_ipv4)++;
+ break;
+
+ case AF_INET6:
+ (*n_ipv6)++;
+ break;
+
+ default:
+ return -EAFNOSUPPORT;
+ }
+
+ return 0;
+}
+
+static int bpf_firewall_add_access_items(
+ Set *prefixes,
+ int ipv4_map_fd,
+ int ipv6_map_fd,
+ int verdict) {
+
+ struct bpf_lpm_trie_key *key_ipv4, *key_ipv6;
+ struct in_addr_prefix *a;
+ uint64_t value = verdict;
+ int r;
+
+ key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t));
+ key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4);
+
+ SET_FOREACH(a, prefixes)
+ switch (a->family) {
+
+ case AF_INET:
+ key_ipv4->prefixlen = a->prefixlen;
+ memcpy(key_ipv4->data, &a->address, sizeof(uint32_t));
+
+ r = bpf_map_update_element(ipv4_map_fd, key_ipv4, &value);
+ if (r < 0)
+ return r;
+
+ break;
+
+ case AF_INET6:
+ key_ipv6->prefixlen = a->prefixlen;
+ memcpy(key_ipv6->data, &a->address, 4 * sizeof(uint32_t));
+
+ r = bpf_map_update_element(ipv6_map_fd, key_ipv6, &value);
+ if (r < 0)
+ return r;
+
+ break;
+
+ default:
+ return -EAFNOSUPPORT;
+ }
+
+ return 0;
+}
+
+static int bpf_firewall_prepare_access_maps(
+ Unit *u,
+ int verdict,
+ int *ret_ipv4_map_fd,
+ int *ret_ipv6_map_fd,
+ bool *ret_has_any) {
+
+ _cleanup_close_ int ipv4_map_fd = -1, ipv6_map_fd = -1;
+ size_t n_ipv4 = 0, n_ipv6 = 0;
+ Unit *p;
+ int r;
+
+ assert(ret_ipv4_map_fd);
+ assert(ret_ipv6_map_fd);
+ assert(ret_has_any);
+
+ for (p = u; p; p = UNIT_GET_SLICE(p)) {
+ CGroupContext *cc;
+ Set *prefixes;
+ bool *reduced;
+
+ cc = unit_get_cgroup_context(p);
+ if (!cc)
+ continue;
+
+ prefixes = verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny;
+ reduced = verdict == ACCESS_ALLOWED ? &cc->ip_address_allow_reduced : &cc->ip_address_deny_reduced;
+
+ if (!*reduced) {
+ r = in_addr_prefixes_reduce(prefixes);
+ if (r < 0)
+ return r;
+
+ *reduced = true;
+ }
+
+ bpf_firewall_count_access_items(prefixes, &n_ipv4, &n_ipv6);
+
+ /* Skip making the LPM trie map in cases where we are using "any" in order to hack around
+ * needing CAP_SYS_ADMIN for allocating LPM trie map. */
+ if (in_addr_prefixes_is_any(prefixes)) {
+ *ret_has_any = true;
+ return 0;
+ }
+ }
+
+ if (n_ipv4 > 0) {
+ ipv4_map_fd = bpf_map_new(
+ BPF_MAP_TYPE_LPM_TRIE,
+ offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t),
+ sizeof(uint64_t),
+ n_ipv4,
+ BPF_F_NO_PREALLOC);
+ if (ipv4_map_fd < 0)
+ return ipv4_map_fd;
+ }
+
+ if (n_ipv6 > 0) {
+ ipv6_map_fd = bpf_map_new(
+ BPF_MAP_TYPE_LPM_TRIE,
+ offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)*4,
+ sizeof(uint64_t),
+ n_ipv6,
+ BPF_F_NO_PREALLOC);
+ if (ipv6_map_fd < 0)
+ return ipv6_map_fd;
+ }
+
+ for (p = u; p; p = UNIT_GET_SLICE(p)) {
+ CGroupContext *cc;
+
+ cc = unit_get_cgroup_context(p);
+ if (!cc)
+ continue;
+
+ r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny,
+ ipv4_map_fd, ipv6_map_fd, verdict);
+ if (r < 0)
+ return r;
+ }
+
+ *ret_ipv4_map_fd = TAKE_FD(ipv4_map_fd);
+ *ret_ipv6_map_fd = TAKE_FD(ipv6_map_fd);
+ *ret_has_any = false;
+ return 0;
+}
+
+static int bpf_firewall_prepare_accounting_maps(Unit *u, bool enabled, int *fd_ingress, int *fd_egress) {
+ int r;
+
+ assert(u);
+ assert(fd_ingress);
+ assert(fd_egress);
+
+ if (enabled) {
+ if (*fd_ingress < 0) {
+ r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
+ if (r < 0)
+ return r;
+
+ *fd_ingress = r;
+ }
+
+ if (*fd_egress < 0) {
+
+ r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
+ if (r < 0)
+ return r;
+
+ *fd_egress = r;
+ }
+
+ } else {
+ *fd_ingress = safe_close(*fd_ingress);
+ *fd_egress = safe_close(*fd_egress);
+
+ zero(u->ip_accounting_extra);
+ }
+
+ return 0;
+}
+
+int bpf_firewall_compile(Unit *u) {
+ const char *ingress_name = NULL, *egress_name = NULL;
+ bool ip_allow_any = false, ip_deny_any = false;
+ CGroupContext *cc;
+ int r, supported;
+
+ assert(u);
+
+ cc = unit_get_cgroup_context(u);
+ if (!cc)
+ return -EINVAL;
+
+ supported = bpf_firewall_supported();
+ if (supported < 0)
+ return supported;
+ if (supported == BPF_FIREWALL_UNSUPPORTED)
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "bpf-firewall: BPF firewalling not supported, proceeding without.");
+ if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE)
+ /* If BPF_F_ALLOW_MULTI is not supported we don't support any BPF magic on inner nodes (i.e. on slice
+ * units), since that would mean leaf nodes couldn't do any BPF anymore at all. Under the assumption
+ * that BPF is more interesting on leaf nodes we hence avoid it on inner nodes in that case. This is
+ * consistent with old systemd behaviour from before v238, where BPF wasn't supported in inner nodes at
+ * all, either. */
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "bpf-firewall: BPF_F_ALLOW_MULTI is not supported, not doing BPF firewall on slice units.");
+
+ /* If BPF_F_ALLOW_MULTI flag is supported program name is also supported (both were added to v4.15
+ * kernel). */
+ if (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI) {
+ ingress_name = "sd_fw_ingress";
+ egress_name = "sd_fw_egress";
+ }
+
+ /* Note that when we compile a new firewall we first flush out the access maps and the BPF programs themselves,
+ * but we reuse the accounting maps. That way the firewall in effect always maps to the actual
+ * configuration, but we don't flush out the accounting unnecessarily */
+
+ u->ip_bpf_ingress = bpf_program_free(u->ip_bpf_ingress);
+ u->ip_bpf_egress = bpf_program_free(u->ip_bpf_egress);
+
+ u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
+ u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);
+
+ u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
+ u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);
+
+ if (u->type != UNIT_SLICE) {
+ /* In inner nodes we only do accounting, we do not actually bother with access control. However, leaf
+ * nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that
+ * they can optionally cancel out system-wide rules. Since inner nodes can't contain processes this
+ * means that all configure IP access rules *will* take effect on processes, even though we never
+ * compile them for inner nodes. */
+
+ r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd, &ip_allow_any);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF allow maps failed: %m");
+
+ r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd, &ip_deny_any);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF deny maps failed: %m");
+ }
+
+ r = bpf_firewall_prepare_accounting_maps(u, cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF accounting maps failed: %m");
+
+ r = bpf_firewall_compile_bpf(u, ingress_name, true, &u->ip_bpf_ingress, ip_allow_any, ip_deny_any);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "bpf-firewall: Compilation of ingress BPF program failed: %m");
+
+ r = bpf_firewall_compile_bpf(u, egress_name, false, &u->ip_bpf_egress, ip_allow_any, ip_deny_any);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "bpf-firewall: Compilation of egress BPF program failed: %m");
+
+ return 0;
+}
+
+static int load_bpf_progs_from_fs_to_set(Unit *u, char **filter_paths, Set **set) {
+ set_clear(*set);
+
+ STRV_FOREACH(bpf_fs_path, filter_paths) {
+ _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
+ int r;
+
+ r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, NULL, &prog);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "bpf-firewall: Allocation of SKB BPF program failed: %m");
+
+ r = bpf_program_load_from_bpf_fs(prog, *bpf_fs_path);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "bpf-firewall: Loading of ingress BPF program %s failed: %m", *bpf_fs_path);
+
+ r = set_ensure_consume(set, &bpf_program_hash_ops, TAKE_PTR(prog));
+ if (r < 0)
+ return log_oom();
+ }
+
+ return 0;
+}
+
+int bpf_firewall_load_custom(Unit *u) {
+ CGroupContext *cc;
+ int r, supported;
+
+ assert(u);
+
+ cc = unit_get_cgroup_context(u);
+ if (!cc)
+ return 0;
+
+ if (!(cc->ip_filters_ingress || cc->ip_filters_egress))
+ return 0;
+
+ supported = bpf_firewall_supported();
+ if (supported < 0)
+ return supported;
+
+ if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI)
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "bpf-firewall: BPF_F_ALLOW_MULTI not supported, cannot attach custom BPF programs.");
+
+ r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_ingress, &u->ip_bpf_custom_ingress);
+ if (r < 0)
+ return r;
+ r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_egress, &u->ip_bpf_custom_egress);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+static int attach_custom_bpf_progs(Unit *u, const char *path, int attach_type, Set **set, Set **set_installed) {
+ BPFProgram *prog;
+ int r;
+
+ assert(u);
+
+ set_clear(*set_installed);
+ r = set_ensure_allocated(set_installed, &bpf_program_hash_ops);
+ if (r < 0)
+ return log_oom();
+
+ SET_FOREACH_MOVE(prog, *set_installed, *set) {
+ r = bpf_program_cgroup_attach(prog, attach_type, path, BPF_F_ALLOW_MULTI);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "bpf-firewall: Attaching custom egress BPF program to cgroup %s failed: %m", path);
+ }
+ return 0;
+}
+
+int bpf_firewall_install(Unit *u) {
+ _cleanup_(bpf_program_freep) BPFProgram *ip_bpf_ingress_uninstall = NULL, *ip_bpf_egress_uninstall = NULL;
+ _cleanup_free_ char *path = NULL;
+ CGroupContext *cc;
+ int r, supported;
+ uint32_t flags;
+
+ assert(u);
+
+ cc = unit_get_cgroup_context(u);
+ if (!cc)
+ return -EINVAL;
+ if (!u->cgroup_path)
+ return -EINVAL;
+ if (!u->cgroup_realized)
+ return -EINVAL;
+
+ supported = bpf_firewall_supported();
+ if (supported < 0)
+ return supported;
+ if (supported == BPF_FIREWALL_UNSUPPORTED)
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "bpf-firewall: BPF firewalling not supported, proceeding without.");
+ if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE)
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "bpf-firewall: BPF_F_ALLOW_MULTI not supported, not doing BPF firewall on slice units.");
+ if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
+ (!set_isempty(u->ip_bpf_custom_ingress) || !set_isempty(u->ip_bpf_custom_egress)))
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "bpf-firewall: BPF_F_ALLOW_MULTI not supported, cannot attach custom BPF programs.");
+
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "bpf-firewall: Failed to determine cgroup path: %m");
+
+ flags = supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI ? BPF_F_ALLOW_MULTI : 0;
+
+ if (FLAGS_SET(flags, BPF_F_ALLOW_MULTI)) {
+ /* If we have BPF_F_ALLOW_MULTI, then let's clear the fields, but destroy the programs only
+ * after attaching the new programs, so that there's no time window where neither program is
+ * attached. (There will be a program where both are attached, but that's OK, since this is a
+ * security feature where we rather want to lock down too much than too little */
+ ip_bpf_egress_uninstall = TAKE_PTR(u->ip_bpf_egress_installed);
+ ip_bpf_ingress_uninstall = TAKE_PTR(u->ip_bpf_ingress_installed);
+ } else {
+ /* If we don't have BPF_F_ALLOW_MULTI then unref the old BPF programs (which will implicitly
+ * detach them) right before attaching the new program, to minimize the time window when we
+ * don't account for IP traffic. */
+ u->ip_bpf_egress_installed = bpf_program_free(u->ip_bpf_egress_installed);
+ u->ip_bpf_ingress_installed = bpf_program_free(u->ip_bpf_ingress_installed);
+ }
+
+ if (u->ip_bpf_egress) {
+ r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags);
+ if (r < 0)
+ return log_unit_error_errno(u, r,
+ "bpf-firewall: Attaching egress BPF program to cgroup %s failed: %m", path);
+
+ /* Remember that this BPF program is installed now. */
+ u->ip_bpf_egress_installed = TAKE_PTR(u->ip_bpf_egress);
+ }
+
+ if (u->ip_bpf_ingress) {
+ r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags);
+ if (r < 0)
+ return log_unit_error_errno(u, r,
+ "bpf-firewall: Attaching ingress BPF program to cgroup %s failed: %m", path);
+
+ u->ip_bpf_ingress_installed = TAKE_PTR(u->ip_bpf_ingress);
+ }
+
+ /* And now, definitely get rid of the old programs, and detach them */
+ ip_bpf_egress_uninstall = bpf_program_free(ip_bpf_egress_uninstall);
+ ip_bpf_ingress_uninstall = bpf_program_free(ip_bpf_ingress_uninstall);
+
+ r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_EGRESS, &u->ip_bpf_custom_egress, &u->ip_bpf_custom_egress_installed);
+ if (r < 0)
+ return r;
+
+ r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_INGRESS, &u->ip_bpf_custom_ingress, &u->ip_bpf_custom_ingress_installed);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets) {
+ uint64_t key, packets;
+ int r;
+
+ if (map_fd < 0)
+ return -EBADF;
+
+ if (ret_packets) {
+ key = MAP_KEY_PACKETS;
+ r = bpf_map_lookup_element(map_fd, &key, &packets);
+ if (r < 0)
+ return r;
+ }
+
+ if (ret_bytes) {
+ key = MAP_KEY_BYTES;
+ r = bpf_map_lookup_element(map_fd, &key, ret_bytes);
+ if (r < 0)
+ return r;
+ }
+
+ if (ret_packets)
+ *ret_packets = packets;
+
+ return 0;
+}
+
+int bpf_firewall_reset_accounting(int map_fd) {
+ uint64_t key, value = 0;
+ int r;
+
+ if (map_fd < 0)
+ return -EBADF;
+
+ key = MAP_KEY_PACKETS;
+ r = bpf_map_update_element(map_fd, &key, &value);
+ if (r < 0)
+ return r;
+
+ key = MAP_KEY_BYTES;
+ return bpf_map_update_element(map_fd, &key, &value);
+}
+
+static int bpf_firewall_unsupported_reason = 0;
+
+int bpf_firewall_supported(void) {
+ const struct bpf_insn trivial[] = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN()
+ };
+
+ _cleanup_(bpf_program_freep) BPFProgram *program = NULL;
+ static int supported = -1;
+ union bpf_attr attr;
+ int r;
+
+ /* Checks whether BPF firewalling is supported. For this, we check the following things:
+ *
+ * - whether the unified hierarchy is being used
+ * - the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_SKB programs, which we require
+ * - the BPF implementation in the kernel supports the BPF_PROG_DETACH call, which we require
+ */
+ if (supported >= 0)
+ return supported;
+
+ r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
+ if (r < 0)
+ return log_error_errno(r, "bpf-firewall: Can't determine whether the unified hierarchy is used: %m");
+ if (r == 0) {
+ bpf_firewall_unsupported_reason =
+ log_debug_errno(SYNTHETIC_ERRNO(EUCLEAN),
+ "bpf-firewall: Not running with unified cgroup hierarchy, BPF firewalling is not supported.");
+ return supported = BPF_FIREWALL_UNSUPPORTED;
+ }
+
+ /* prog_name is NULL since it is supported only starting from v4.15 kernel. */
+ r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, NULL, &program);
+ if (r < 0) {
+ bpf_firewall_unsupported_reason =
+ log_debug_errno(r, "bpf-firewall: Can't allocate CGROUP SKB BPF program, BPF firewalling is not supported: %m");
+ return supported = BPF_FIREWALL_UNSUPPORTED;
+ }
+
+ r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
+ if (r < 0) {
+ bpf_firewall_unsupported_reason =
+ log_debug_errno(r, "bpf-firewall: Can't add trivial instructions to CGROUP SKB BPF program, BPF firewalling is not supported: %m");
+ return supported = BPF_FIREWALL_UNSUPPORTED;
+ }
+
+ r = bpf_program_load_kernel(program, NULL, 0);
+ if (r < 0) {
+ bpf_firewall_unsupported_reason =
+ log_debug_errno(r, "bpf-firewall: Can't load kernel CGROUP SKB BPF program, BPF firewalling is not supported: %m");
+ return supported = BPF_FIREWALL_UNSUPPORTED;
+ }
+
+ /* Unfortunately the kernel allows us to create BPF_PROG_TYPE_CGROUP_SKB programs even when CONFIG_CGROUP_BPF
+ * is turned off at kernel compilation time. This sucks of course: why does it allow us to create a cgroup BPF
+ * program if we can't do a thing with it later?
+ *
+ * We detect this case by issuing the BPF_PROG_DETACH bpf() call with invalid file descriptors: if
+ * CONFIG_CGROUP_BPF is turned off, then the call will fail early with EINVAL. If it is turned on the
+ * parameters are validated however, and that'll fail with EBADF then. */
+
+ // FIXME: Clang doesn't 0-pad with structured initialization, causing
+ // the kernel to reject the bpf_attr as invalid. See:
+ // https://github.com/torvalds/linux/blob/v5.9/kernel/bpf/syscall.c#L65
+ // Ideally it should behave like GCC, so that we can remove these workarounds.
+ zero(attr);
+ attr.attach_type = BPF_CGROUP_INET_EGRESS;
+ attr.target_fd = -1;
+ attr.attach_bpf_fd = -1;
+
+ if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0) {
+ if (errno != EBADF) {
+ bpf_firewall_unsupported_reason =
+ log_debug_errno(errno, "bpf-firewall: Didn't get EBADF from BPF_PROG_DETACH, BPF firewalling is not supported: %m");
+ return supported = BPF_FIREWALL_UNSUPPORTED;
+ }
+
+ /* YAY! */
+ } else {
+ bpf_firewall_unsupported_reason =
+ log_debug_errno(SYNTHETIC_ERRNO(EBADE),
+ "bpf-firewall: Wut? Kernel accepted our invalid BPF_PROG_DETACH call? "
+ "Something is weird, assuming BPF firewalling is broken and hence not supported.");
+ return supported = BPF_FIREWALL_UNSUPPORTED;
+ }
+
+ /* So now we know that the BPF program is generally available, let's see if BPF_F_ALLOW_MULTI is also supported
+ * (which was added in kernel 4.15). We use a similar logic as before, but this time we use the BPF_PROG_ATTACH
+ * bpf() call and the BPF_F_ALLOW_MULTI flags value. Since the flags are checked early in the system call we'll
+ * get EINVAL if it's not supported, and EBADF as before if it is available.
+ * Use probe result as the indicator that program name is also supported since they both were
+ * added in kernel 4.15. */
+
+ zero(attr);
+ attr.attach_type = BPF_CGROUP_INET_EGRESS;
+ attr.target_fd = -1;
+ attr.attach_bpf_fd = -1;
+ attr.attach_flags = BPF_F_ALLOW_MULTI;
+
+ if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0) {
+ if (errno == EBADF) {
+ log_debug_errno(errno, "bpf-firewall: Got EBADF when using BPF_F_ALLOW_MULTI, which indicates it is supported. Yay!");
+ return supported = BPF_FIREWALL_SUPPORTED_WITH_MULTI;
+ }
+
+ if (errno == EINVAL)
+ log_debug_errno(errno, "bpf-firewall: Got EINVAL error when using BPF_F_ALLOW_MULTI, which indicates it's not supported.");
+ else
+ log_debug_errno(errno, "bpf-firewall: Got unexpected error when using BPF_F_ALLOW_MULTI, assuming it's not supported: %m");
+
+ return supported = BPF_FIREWALL_SUPPORTED;
+ } else {
+ bpf_firewall_unsupported_reason =
+ log_debug_errno(SYNTHETIC_ERRNO(EBADE),
+ "bpf-firewall: Wut? Kernel accepted our invalid BPF_PROG_ATTACH+BPF_F_ALLOW_MULTI call? "
+ "Something is weird, assuming BPF firewalling is broken and hence not supported.");
+ return supported = BPF_FIREWALL_UNSUPPORTED;
+ }
+}
+
+void emit_bpf_firewall_warning(Unit *u) {
+ static bool warned = false;
+
+ assert(u);
+ assert(u->manager);
+
+ if (warned || MANAGER_IS_TEST_RUN(u->manager))
+ return;
+
+ bool quiet = ERRNO_IS_PRIVILEGE(bpf_firewall_unsupported_reason) && detect_container() > 0;
+
+ log_unit_full_errno(u, quiet ? LOG_DEBUG : LOG_WARNING, bpf_firewall_unsupported_reason,
+ "unit configures an IP firewall, but %s.\n"
+ "(This warning is only shown for the first unit using IP firewalling.)",
+ getuid() != 0 ? "not running as root" :
+ "the local system does not support BPF/cgroup firewalling");
+ warned = true;
+}
+
+void bpf_firewall_close(Unit *u) {
+ assert(u);
+
+ u->ip_accounting_ingress_map_fd = safe_close(u->ip_accounting_ingress_map_fd);
+ u->ip_accounting_egress_map_fd = safe_close(u->ip_accounting_egress_map_fd);
+
+ u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
+ u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
+ u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);
+ u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);
+
+ u->ip_bpf_ingress = bpf_program_free(u->ip_bpf_ingress);
+ u->ip_bpf_ingress_installed = bpf_program_free(u->ip_bpf_ingress_installed);
+ u->ip_bpf_egress = bpf_program_free(u->ip_bpf_egress);
+ u->ip_bpf_egress_installed = bpf_program_free(u->ip_bpf_egress_installed);
+
+ u->ip_bpf_custom_ingress = set_free(u->ip_bpf_custom_ingress);
+ u->ip_bpf_custom_egress = set_free(u->ip_bpf_custom_egress);
+ u->ip_bpf_custom_ingress_installed = set_free(u->ip_bpf_custom_ingress_installed);
+ u->ip_bpf_custom_egress_installed = set_free(u->ip_bpf_custom_egress_installed);
+}
diff --git a/src/core/bpf-firewall.h b/src/core/bpf-firewall.h
new file mode 100644
index 0000000..58b401f
--- /dev/null
+++ b/src/core/bpf-firewall.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <inttypes.h>
+
+#include "unit.h"
+
+enum {
+ BPF_FIREWALL_UNSUPPORTED = 0,
+ BPF_FIREWALL_SUPPORTED = 1,
+ BPF_FIREWALL_SUPPORTED_WITH_MULTI = 2,
+};
+
+int bpf_firewall_supported(void);
+
+int bpf_firewall_compile(Unit *u);
+int bpf_firewall_install(Unit *u);
+int bpf_firewall_load_custom(Unit *u);
+
+int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets);
+int bpf_firewall_reset_accounting(int map_fd);
+
+void emit_bpf_firewall_warning(Unit *u);
+
+void bpf_firewall_close(Unit *u);
diff --git a/src/core/bpf-foreign.c b/src/core/bpf-foreign.c
new file mode 100644
index 0000000..cff2f61
--- /dev/null
+++ b/src/core/bpf-foreign.c
@@ -0,0 +1,154 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bpf-foreign.h"
+#include "bpf-program.h"
+#include "cgroup.h"
+#include "memory-util.h"
+#include "missing_magic.h"
+#include "mountpoint-util.h"
+#include "set.h"
+#include "stat-util.h"
+
+typedef struct BPFForeignKey BPFForeignKey;
+struct BPFForeignKey {
+ uint32_t prog_id;
+ uint32_t attach_type;
+};
+
+static int bpf_foreign_key_new(uint32_t prog_id,
+ enum bpf_attach_type attach_type,
+ BPFForeignKey **ret) {
+ _cleanup_free_ BPFForeignKey *p = NULL;
+
+ assert(ret);
+
+ p = new(BPFForeignKey, 1);
+ if (!p)
+ return -ENOMEM;
+
+ *p = (BPFForeignKey) {
+ .prog_id = prog_id,
+ .attach_type = attach_type,
+ };
+
+ *ret = TAKE_PTR(p);
+
+ return 0;
+}
+
+static int bpf_foreign_key_compare_func(const BPFForeignKey *a, const BPFForeignKey *b) {
+ int r = CMP(a->prog_id, b->prog_id);
+ if (r != 0)
+ return r;
+
+ return CMP(a->attach_type, b->attach_type);
+}
+
+static void bpf_foreign_key_hash_func(const BPFForeignKey *p, struct siphash *h) {
+ siphash24_compress(&p->prog_id, sizeof(p->prog_id), h);
+ siphash24_compress(&p->attach_type, sizeof(p->attach_type), h);
+}
+
+DEFINE_PRIVATE_HASH_OPS_FULL(bpf_foreign_by_key_hash_ops,
+ BPFForeignKey, bpf_foreign_key_hash_func, bpf_foreign_key_compare_func, free,
+ BPFProgram, bpf_program_free);
+
+static int attach_programs(Unit *u, const char *path, Hashmap* foreign_by_key, uint32_t attach_flags) {
+ const BPFForeignKey *key;
+ BPFProgram *prog;
+ int r, ret = 0;
+
+ assert(u);
+
+ HASHMAP_FOREACH_KEY(prog, key, foreign_by_key) {
+ r = bpf_program_cgroup_attach(prog, key->attach_type, path, attach_flags);
+ if (r < 0) {
+ log_unit_error_errno(u, r, "bpf-foreign: Attaching foreign BPF program to cgroup %s failed: %m", path);
+ if (ret >= 0)
+ ret = r;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Prepare foreign BPF program for installation:
+ * - Load the program from BPF filesystem to the kernel;
+ * - Store program FD identified by program ID and attach type in the unit.
+ */
+static int bpf_foreign_prepare(
+ Unit *u,
+ enum bpf_attach_type attach_type,
+ const char *bpffs_path) {
+ _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
+ _cleanup_free_ BPFForeignKey *key = NULL;
+ uint32_t prog_id;
+ int r;
+
+ assert(u);
+ assert(bpffs_path);
+
+ r = path_is_fs_type(bpffs_path, BPF_FS_MAGIC);
+ if (r == -ENOENT) {
+ log_unit_warning_errno(u, r, "bpf-foreign: foreign program %s does not exist, skipping.", bpffs_path);
+ return 0;
+ }
+ if (r < 0)
+ return log_unit_error_errno(u, r,
+ "bpf-foreign: Failed to determine filesystem type of %s: %m", bpffs_path);
+ if (r == 0)
+ return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL),
+ "bpf-foreign: Path in BPF filesystem is expected.");
+
+ r = bpf_program_new_from_bpffs_path(bpffs_path, &prog);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "bpf-foreign: Failed to create foreign BPF program: %m");
+
+ r = bpf_program_get_id_by_fd(prog->kernel_fd, &prog_id);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "bpf-foreign: Failed to get BPF program id from fd: %m");
+
+ r = bpf_foreign_key_new(prog_id, attach_type, &key);
+ if (r < 0)
+ return log_unit_error_errno(u, r,
+ "bpf-foreign: Failed to create foreign BPF program key from path '%s': %m", bpffs_path);
+
+ r = hashmap_ensure_put(&u->bpf_foreign_by_key, &bpf_foreign_by_key_hash_ops, key, prog);
+ if (r == -EEXIST) {
+ log_unit_warning_errno(u, r, "bpf-foreign: Foreign BPF program already exists, ignoring: %m");
+ return 0;
+ }
+ if (r < 0)
+ return log_unit_error_errno(u, r, "bpf-foreign: Failed to put foreign BPF program into map: %m");
+
+ TAKE_PTR(key);
+ TAKE_PTR(prog);
+
+ return 0;
+}
+
+int bpf_foreign_install(Unit *u) {
+ _cleanup_free_ char *cgroup_path = NULL;
+ CGroupContext *cc;
+ int r, ret = 0;
+
+ assert(u);
+
+ cc = unit_get_cgroup_context(u);
+ if (!cc)
+ return 0;
+
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &cgroup_path);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "bpf-foreign: Failed to get cgroup path: %m");
+
+ LIST_FOREACH(programs, p, cc->bpf_foreign_programs) {
+ r = bpf_foreign_prepare(u, p->attach_type, p->bpffs_path);
+ if (r < 0 && ret >= 0)
+ ret = r;
+ }
+
+ r = attach_programs(u, cgroup_path, u->bpf_foreign_by_key, BPF_F_ALLOW_MULTI);
+ return ret < 0 ? ret : r;
+}
diff --git a/src/core/bpf-foreign.h b/src/core/bpf-foreign.h
new file mode 100644
index 0000000..e387b1b
--- /dev/null
+++ b/src/core/bpf-foreign.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#pragma once
+
+#include "unit.h"
+
+static inline int bpf_foreign_supported(void) {
+ return cg_all_unified();
+}
+
+/*
+ * Attach cgroup-bpf programs foreign to systemd, i.e. loaded to the kernel by an entity
+ * external to systemd.
+ */
+int bpf_foreign_install(Unit *u);
diff --git a/src/core/bpf-lsm.c b/src/core/bpf-lsm.c
new file mode 100644
index 0000000..a3726d9
--- /dev/null
+++ b/src/core/bpf-lsm.c
@@ -0,0 +1,364 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/types.h>
+#include <sys/resource.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "bpf-lsm.h"
+#include "cgroup-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "filesystems.h"
+#include "log.h"
+#include "manager.h"
+#include "mkdir.h"
+#include "nulstr-util.h"
+#include "stat-util.h"
+#include "strv.h"
+
+#if BPF_FRAMEWORK
+/* libbpf, clang and llc compile time dependencies are satisfied */
+#include "bpf-dlopen.h"
+#include "bpf-link.h"
+#include "bpf-util.h"
+#include "bpf/restrict_fs/restrict-fs-skel.h"
+
+#define CGROUP_HASH_SIZE_MAX 2048
+
+static struct restrict_fs_bpf *restrict_fs_bpf_free(struct restrict_fs_bpf *obj) {
+ /* restrict_fs_bpf__destroy handles object == NULL case */
+ (void) restrict_fs_bpf__destroy(obj);
+
+ return NULL;
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(struct restrict_fs_bpf *, restrict_fs_bpf_free);
+
+static bool bpf_can_link_lsm_program(struct bpf_program *prog) {
+ _cleanup_(bpf_link_freep) struct bpf_link *link = NULL;
+
+ assert(prog);
+
+ link = sym_bpf_program__attach_lsm(prog);
+
+ /* If bpf_program__attach_lsm fails the resulting value stores libbpf error code instead of memory
+ * pointer. That is the case when the helper is called on architectures where BPF trampoline (hence
+ * BPF_LSM_MAC attach type) is not supported. */
+ return sym_libbpf_get_error(link) == 0;
+}
+
+static int prepare_restrict_fs_bpf(struct restrict_fs_bpf **ret_obj) {
+ _cleanup_(restrict_fs_bpf_freep) struct restrict_fs_bpf *obj = NULL;
+ _cleanup_close_ int inner_map_fd = -1;
+ int r;
+
+ assert(ret_obj);
+
+ obj = restrict_fs_bpf__open();
+ if (!obj)
+ return log_error_errno(errno, "bpf-lsm: Failed to open BPF object: %m");
+
+ /* TODO Maybe choose a number based on runtime information? */
+ r = sym_bpf_map__set_max_entries(obj->maps.cgroup_hash, CGROUP_HASH_SIZE_MAX);
+ assert(r <= 0);
+ if (r < 0)
+ return log_error_errno(r, "bpf-lsm: Failed to resize BPF map '%s': %m",
+ sym_bpf_map__name(obj->maps.cgroup_hash));
+
+ /* Dummy map to satisfy the verifier */
+ inner_map_fd = compat_bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(uint32_t), sizeof(uint32_t), 128U, NULL);
+ if (inner_map_fd < 0)
+ return log_error_errno(errno, "bpf-lsm: Failed to create BPF map: %m");
+
+ r = sym_bpf_map__set_inner_map_fd(obj->maps.cgroup_hash, inner_map_fd);
+ assert(r <= 0);
+ if (r < 0)
+ return log_error_errno(r, "bpf-lsm: Failed to set inner map fd: %m");
+
+ r = restrict_fs_bpf__load(obj);
+ assert(r <= 0);
+ if (r < 0)
+ return log_error_errno(r, "bpf-lsm: Failed to load BPF object: %m");
+
+ *ret_obj = TAKE_PTR(obj);
+
+ return 0;
+}
+
+static int mac_bpf_use(void) {
+ _cleanup_free_ char *lsm_list = NULL;
+ static int cached_use = -1;
+ int r;
+
+ if (cached_use >= 0)
+ return cached_use;
+
+ cached_use = 0;
+
+ r = read_one_line_file("/sys/kernel/security/lsm", &lsm_list);
+ if (r < 0) {
+ if (r != -ENOENT)
+ log_notice_errno(r, "bpf-lsm: Failed to read /sys/kernel/security/lsm, assuming bpf is unavailable: %m");
+ return 0;
+ }
+
+ for (const char *p = lsm_list;;) {
+ _cleanup_free_ char *word = NULL;
+
+ r = extract_first_word(&p, &word, ",", 0);
+ if (r == 0)
+ return 0;
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_notice_errno(r, "bpf-lsm: Failed to parse /sys/kernel/security/lsm, assuming bpf is unavailable: %m");
+ return 0;
+ }
+
+ if (streq(word, "bpf"))
+ return cached_use = 1;
+ }
+}
+
+bool lsm_bpf_supported(bool initialize) {
+ _cleanup_(restrict_fs_bpf_freep) struct restrict_fs_bpf *obj = NULL;
+ static int supported = -1;
+ int r;
+
+ if (supported >= 0)
+ return supported;
+ if (!initialize)
+ return false;
+
+ if (!cgroup_bpf_supported())
+ return (supported = false);
+
+ r = mac_bpf_use();
+ if (r < 0) {
+ log_warning_errno(r, "bpf-lsm: Can't determine whether the BPF LSM module is used: %m");
+ return (supported = false);
+ }
+
+ if (r == 0) {
+ log_info_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "bpf-lsm: BPF LSM hook not enabled in the kernel, BPF LSM not supported");
+ return (supported = false);
+ }
+
+ r = prepare_restrict_fs_bpf(&obj);
+ if (r < 0)
+ return (supported = false);
+
+ if (!bpf_can_link_lsm_program(obj->progs.restrict_filesystems)) {
+ log_warning_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "bpf-lsm: Failed to link program; assuming BPF LSM is not available");
+ return (supported = false);
+ }
+
+ return (supported = true);
+}
+
+int lsm_bpf_setup(Manager *m) {
+ _cleanup_(restrict_fs_bpf_freep) struct restrict_fs_bpf *obj = NULL;
+ _cleanup_(bpf_link_freep) struct bpf_link *link = NULL;
+ int r;
+
+ assert(m);
+
+ r = prepare_restrict_fs_bpf(&obj);
+ if (r < 0)
+ return r;
+
+ link = sym_bpf_program__attach_lsm(obj->progs.restrict_filesystems);
+ r = sym_libbpf_get_error(link);
+ if (r != 0)
+ return log_error_errno(r, "bpf-lsm: Failed to link '%s' LSM BPF program: %m",
+ sym_bpf_program__name(obj->progs.restrict_filesystems));
+
+ log_info("bpf-lsm: LSM BPF program attached");
+
+ obj->links.restrict_filesystems = TAKE_PTR(link);
+ m->restrict_fs = TAKE_PTR(obj);
+
+ return 0;
+}
+
+int lsm_bpf_unit_restrict_filesystems(Unit *u, const Set *filesystems, bool allow_list) {
+ uint32_t dummy_value = 1, zero = 0;
+ const char *fs;
+ const statfs_f_type_t *magic;
+ int r;
+
+ assert(filesystems);
+ assert(u);
+
+ if (!u->manager->restrict_fs)
+ return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL),
+ "bpf-lsm: BPF LSM object is not installed, has setup failed?");
+
+ int inner_map_fd = compat_bpf_map_create(
+ BPF_MAP_TYPE_HASH,
+ NULL,
+ sizeof(uint32_t),
+ sizeof(uint32_t),
+ 128U, /* Should be enough for all filesystem types */
+ NULL);
+ if (inner_map_fd < 0)
+ return log_unit_error_errno(u, errno, "bpf-lsm: Failed to create inner BPF map: %m");
+
+ int outer_map_fd = sym_bpf_map__fd(u->manager->restrict_fs->maps.cgroup_hash);
+ if (outer_map_fd < 0)
+ return log_unit_error_errno(u, errno, "bpf-lsm: Failed to get BPF map fd: %m");
+
+ if (sym_bpf_map_update_elem(outer_map_fd, &u->cgroup_id, &inner_map_fd, BPF_ANY) != 0)
+ return log_unit_error_errno(u, errno, "bpf-lsm: Error populating BPF map: %m");
+
+ uint32_t allow = allow_list;
+
+ /* Use key 0 to store whether this is an allow list or a deny list */
+ if (sym_bpf_map_update_elem(inner_map_fd, &zero, &allow, BPF_ANY) != 0)
+ return log_unit_error_errno(u, errno, "bpf-lsm: Error initializing map: %m");
+
+ SET_FOREACH(fs, filesystems) {
+ r = fs_type_from_string(fs, &magic);
+ if (r < 0) {
+ log_unit_warning(u, "bpf-lsm: Invalid filesystem name '%s', ignoring.", fs);
+ continue;
+ }
+
+ log_unit_debug(u, "bpf-lsm: Restricting filesystem access to '%s'", fs);
+
+ for (int i = 0; i < FILESYSTEM_MAGIC_MAX; i++) {
+ if (magic[i] == 0)
+ break;
+
+ if (sym_bpf_map_update_elem(inner_map_fd, &magic[i], &dummy_value, BPF_ANY) != 0) {
+ r = log_unit_error_errno(u, errno, "bpf-lsm: Failed to update BPF map: %m");
+
+ if (sym_bpf_map_delete_elem(outer_map_fd, &u->cgroup_id) != 0)
+ log_unit_debug_errno(u, errno, "bpf-lsm: Failed to delete cgroup entry from BPF map: %m");
+
+ return r;
+ }
+ }
+ }
+
+ return 0;
+}
+
+int lsm_bpf_cleanup(const Unit *u) {
+ assert(u);
+ assert(u->manager);
+
+ /* If we never successfully detected support, there is nothing to clean up. */
+ if (!lsm_bpf_supported(/* initialize = */ false))
+ return 0;
+
+ if (!u->manager->restrict_fs)
+ return 0;
+
+ if (u->cgroup_id == 0)
+ return 0;
+
+ int fd = sym_bpf_map__fd(u->manager->restrict_fs->maps.cgroup_hash);
+ if (fd < 0)
+ return log_unit_error_errno(u, errno, "bpf-lsm: Failed to get BPF map fd: %m");
+
+ if (sym_bpf_map_delete_elem(fd, &u->cgroup_id) != 0 && errno != ENOENT)
+ return log_unit_debug_errno(u, errno, "bpf-lsm: Failed to delete cgroup entry from LSM BPF map: %m");
+
+ return 0;
+}
+
+int lsm_bpf_map_restrict_fs_fd(Unit *unit) {
+ assert(unit);
+ assert(unit->manager);
+
+ if (!unit->manager->restrict_fs)
+ return -ENOMEDIUM;
+
+ return sym_bpf_map__fd(unit->manager->restrict_fs->maps.cgroup_hash);
+}
+
+void lsm_bpf_destroy(struct restrict_fs_bpf *prog) {
+ restrict_fs_bpf__destroy(prog);
+}
+#else /* ! BPF_FRAMEWORK */
+bool lsm_bpf_supported(bool initialize) {
+ return false;
+}
+
+int lsm_bpf_setup(Manager *m) {
+ return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "bpf-lsm: Failed to set up LSM BPF: %m");
+}
+
+int lsm_bpf_unit_restrict_filesystems(Unit *u, const Set *filesystems, const bool allow_list) {
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), "bpf-lsm: Failed to restrict filesystems using LSM BPF: %m");
+}
+
+int lsm_bpf_cleanup(const Unit *u) {
+ return 0;
+}
+
+int lsm_bpf_map_restrict_fs_fd(Unit *unit) {
+ return -ENOMEDIUM;
+}
+
+void lsm_bpf_destroy(struct restrict_fs_bpf *prog) {
+ return;
+}
+#endif
+
+int lsm_bpf_parse_filesystem(
+ const char *name,
+ Set **filesystems,
+ FilesystemParseFlags flags,
+ const char *unit,
+ const char *filename,
+ unsigned line) {
+ int r;
+
+ assert(name);
+ assert(filesystems);
+
+ if (name[0] == '@') {
+ const FilesystemSet *set;
+ const char *i;
+
+ set = filesystem_set_find(name);
+ if (!set) {
+ log_syntax(unit, flags & FILESYSTEM_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
+ "bpf-lsm: Unknown filesystem group, ignoring: %s", name);
+ return 0;
+ }
+
+ NULSTR_FOREACH(i, set->value) {
+ /* Call ourselves again, for the group to parse. Note that we downgrade logging here
+ * (i.e. take away the FILESYSTEM_PARSE_LOG flag) since any issues in the group table
+ * are our own problem, not a problem in user configuration data and we shouldn't
+ * pretend otherwise by complaining about them. */
+ r = lsm_bpf_parse_filesystem(i, filesystems, flags &~ FILESYSTEM_PARSE_LOG, unit, filename, line);
+ if (r < 0)
+ return r;
+ }
+ } else {
+ /* If we previously wanted to forbid access to a filesystem and now
+ * we want to allow it, then remove it from the list. */
+ if (!(flags & FILESYSTEM_PARSE_INVERT) == !!(flags & FILESYSTEM_PARSE_ALLOW_LIST)) {
+ r = set_put_strdup(filesystems, name);
+ if (r == -ENOMEM)
+ return flags & FILESYSTEM_PARSE_LOG ? log_oom() : -ENOMEM;
+ if (r < 0 && r != -EEXIST) /* When already in set, ignore */
+ return r;
+ } else
+ free(set_remove(*filesystems, name));
+ }
+
+ return 0;
+}
diff --git a/src/core/bpf-lsm.h b/src/core/bpf-lsm.h
new file mode 100644
index 0000000..dff5812
--- /dev/null
+++ b/src/core/bpf-lsm.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "hashmap.h"
+
+typedef enum FilesystemParseFlags {
+ FILESYSTEM_PARSE_INVERT = 1 << 0,
+ FILESYSTEM_PARSE_ALLOW_LIST = 1 << 1,
+ FILESYSTEM_PARSE_LOG = 1 << 2,
+} FilesystemParseFlags;
+
+typedef struct Unit Unit;
+typedef struct Manager Manager;
+
+typedef struct restrict_fs_bpf restrict_fs_bpf;
+
+bool lsm_bpf_supported(bool initialize);
+int lsm_bpf_setup(Manager *m);
+int lsm_bpf_unit_restrict_filesystems(Unit *u, const Set *filesystems, bool allow_list);
+int lsm_bpf_cleanup(const Unit *u);
+int lsm_bpf_map_restrict_fs_fd(Unit *u);
+void lsm_bpf_destroy(struct restrict_fs_bpf *prog);
+int lsm_bpf_parse_filesystem(const char *name,
+ Set **filesystems,
+ FilesystemParseFlags flags,
+ const char *unit,
+ const char *filename,
+ unsigned line);
diff --git a/src/core/bpf-socket-bind.c b/src/core/bpf-socket-bind.c
new file mode 100644
index 0000000..660ffdb
--- /dev/null
+++ b/src/core/bpf-socket-bind.c
@@ -0,0 +1,244 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#if BPF_FRAMEWORK
+#include <bpf/bpf.h>
+#endif
+
+#include "fd-util.h"
+#include "bpf-socket-bind.h"
+
+#if BPF_FRAMEWORK
+/* libbpf, clang, llvm and bpftool compile time dependencies are satisfied */
+#include "bpf-dlopen.h"
+#include "bpf-link.h"
+#include "bpf-util.h"
+#include "bpf/socket_bind/socket-bind-api.bpf.h"
+#include "bpf/socket_bind/socket-bind-skel.h"
+
+static struct socket_bind_bpf *socket_bind_bpf_free(struct socket_bind_bpf *obj) {
+ /* socket_bind_bpf__destroy handles object == NULL case */
+ (void) socket_bind_bpf__destroy(obj);
+
+ return NULL;
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(struct socket_bind_bpf *, socket_bind_bpf_free);
+
+static int update_rules_map(
+ int map_fd,
+ CGroupSocketBindItem *head) {
+
+ uint32_t i = 0;
+
+ assert(map_fd >= 0);
+
+ LIST_FOREACH(socket_bind_items, item, head) {
+ struct socket_bind_rule val = {
+ .address_family = (uint32_t) item->address_family,
+ .protocol = item->ip_protocol,
+ .nr_ports = item->nr_ports,
+ .port_min = item->port_min,
+ };
+
+ uint32_t key = i++;
+
+ if (sym_bpf_map_update_elem(map_fd, &key, &val, BPF_ANY) != 0)
+ return -errno;
+ }
+
+ return 0;
+}
+
+static int prepare_socket_bind_bpf(
+ Unit *u,
+ CGroupSocketBindItem *allow,
+ CGroupSocketBindItem *deny,
+ struct socket_bind_bpf **ret_obj) {
+
+ _cleanup_(socket_bind_bpf_freep) struct socket_bind_bpf *obj = NULL;
+ size_t allow_count = 0, deny_count = 0;
+ int allow_map_fd, deny_map_fd, r;
+
+ assert(ret_obj);
+
+ LIST_FOREACH(socket_bind_items, item, allow)
+ allow_count++;
+
+ LIST_FOREACH(socket_bind_items, item, deny)
+ deny_count++;
+
+ if (allow_count > SOCKET_BIND_MAX_RULES)
+ return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, SYNTHETIC_ERRNO(EINVAL),
+ "bpf-socket-bind: Maximum number of socket bind rules=%i is exceeded", SOCKET_BIND_MAX_RULES);
+
+ if (deny_count > SOCKET_BIND_MAX_RULES)
+ return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, SYNTHETIC_ERRNO(EINVAL),
+ "bpf-socket-bind: Maximum number of socket bind rules=%i is exceeded", SOCKET_BIND_MAX_RULES);
+
+ obj = socket_bind_bpf__open();
+ if (!obj)
+ return log_unit_full_errno(u, u ? LOG_ERR : LOG_DEBUG, errno, "bpf-socket-bind: Failed to open BPF object: %m");
+
+ if (sym_bpf_map__set_max_entries(obj->maps.sd_bind_allow, MAX(allow_count, 1u)) != 0)
+ return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, errno,
+ "bpf-socket-bind: Failed to resize BPF map '%s': %m", sym_bpf_map__name(obj->maps.sd_bind_allow));
+
+ if (sym_bpf_map__set_max_entries(obj->maps.sd_bind_deny, MAX(deny_count, 1u)) != 0)
+ return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, errno,
+ "bpf-socket-bind: Failed to resize BPF map '%s': %m", sym_bpf_map__name(obj->maps.sd_bind_deny));
+
+ if (socket_bind_bpf__load(obj) != 0)
+ return log_unit_full_errno(u, u ? LOG_ERR : LOG_DEBUG, errno,
+ "bpf-socket-bind: Failed to load BPF object: %m");
+
+ allow_map_fd = sym_bpf_map__fd(obj->maps.sd_bind_allow);
+ assert(allow_map_fd >= 0);
+
+ r = update_rules_map(allow_map_fd, allow);
+ if (r < 0)
+ return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, r,
+ "bpf-socket-bind: Failed to put socket bind allow rules into BPF map '%s'",
+ sym_bpf_map__name(obj->maps.sd_bind_allow));
+
+ deny_map_fd = sym_bpf_map__fd(obj->maps.sd_bind_deny);
+ assert(deny_map_fd >= 0);
+
+ r = update_rules_map(deny_map_fd, deny);
+ if (r < 0)
+ return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, r,
+ "bpf-socket-bind: Failed to put socket bind deny rules into BPF map '%s'",
+ sym_bpf_map__name(obj->maps.sd_bind_deny));
+
+ *ret_obj = TAKE_PTR(obj);
+ return 0;
+}
+
+int bpf_socket_bind_supported(void) {
+ _cleanup_(socket_bind_bpf_freep) struct socket_bind_bpf *obj = NULL;
+ int r;
+
+ if (!cgroup_bpf_supported())
+ return false;
+
+ if (!compat_libbpf_probe_bpf_prog_type(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, /*opts=*/NULL)) {
+ log_debug("bpf-socket-bind: BPF program type cgroup_sock_addr is not supported");
+ return false;
+ }
+
+ r = prepare_socket_bind_bpf(/*unit=*/NULL, /*allow_rules=*/NULL, /*deny_rules=*/NULL, &obj);
+ if (r < 0) {
+ log_debug_errno(r, "bpf-socket-bind: socket bind filtering is not supported: %m");
+ return false;
+ }
+
+ return bpf_can_link_program(obj->progs.sd_bind4);
+}
+
+int bpf_socket_bind_add_initial_link_fd(Unit *u, int fd) {
+ int r;
+
+ assert(u);
+
+ if (!u->initial_socket_bind_link_fds) {
+ u->initial_socket_bind_link_fds = fdset_new();
+ if (!u->initial_socket_bind_link_fds)
+ return log_oom();
+ }
+
+ r = fdset_put(u->initial_socket_bind_link_fds, fd);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to put BPF fd %d to initial fdset", fd);
+
+ return 0;
+}
+
+static int socket_bind_install_impl(Unit *u) {
+ _cleanup_(bpf_link_freep) struct bpf_link *ipv4 = NULL, *ipv6 = NULL;
+ _cleanup_(socket_bind_bpf_freep) struct socket_bind_bpf *obj = NULL;
+ _cleanup_free_ char *cgroup_path = NULL;
+ _cleanup_close_ int cgroup_fd = -1;
+ CGroupContext *cc;
+ int r;
+
+ assert(u);
+
+ cc = unit_get_cgroup_context(u);
+ if (!cc)
+ return 0;
+
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &cgroup_path);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to get cgroup path: %m");
+
+ if (!cc->socket_bind_allow && !cc->socket_bind_deny)
+ return 0;
+
+ r = prepare_socket_bind_bpf(u, cc->socket_bind_allow, cc->socket_bind_deny, &obj);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to load BPF object: %m");
+
+ cgroup_fd = open(cgroup_path, O_RDONLY | O_CLOEXEC, 0);
+ if (cgroup_fd < 0)
+ return log_unit_error_errno(u, errno, "bpf-socket-bind: Failed to open cgroup %s for reading: %m", cgroup_path);
+
+ ipv4 = sym_bpf_program__attach_cgroup(obj->progs.sd_bind4, cgroup_fd);
+ r = sym_libbpf_get_error(ipv4);
+ if (r != 0)
+ return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to link '%s' cgroup-bpf program: %m",
+ sym_bpf_program__name(obj->progs.sd_bind4));
+
+ ipv6 = sym_bpf_program__attach_cgroup(obj->progs.sd_bind6, cgroup_fd);
+ r = sym_libbpf_get_error(ipv6);
+ if (r != 0)
+ return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to link '%s' cgroup-bpf program: %m",
+ sym_bpf_program__name(obj->progs.sd_bind6));
+
+ u->ipv4_socket_bind_link = TAKE_PTR(ipv4);
+ u->ipv6_socket_bind_link = TAKE_PTR(ipv6);
+
+ return 0;
+}
+
+int bpf_socket_bind_install(Unit *u) {
+ int r;
+
+ assert(u);
+
+ r = socket_bind_install_impl(u);
+ if (r == -ENOMEM)
+ return r;
+
+ fdset_close(u->initial_socket_bind_link_fds);
+ return r;
+}
+
+int bpf_serialize_socket_bind(Unit *u, FILE *f, FDSet *fds) {
+ int r;
+
+ assert(u);
+
+ r = bpf_serialize_link(f, fds, "ipv4-socket-bind-bpf-link", u->ipv4_socket_bind_link);
+ if (r < 0)
+ return r;
+
+ return bpf_serialize_link(f, fds, "ipv6-socket-bind-bpf-link", u->ipv6_socket_bind_link);
+}
+
+#else /* ! BPF_FRAMEWORK */
+int bpf_socket_bind_supported(void) {
+ return false;
+}
+
+int bpf_socket_bind_add_initial_link_fd(Unit *u, int fd) {
+ return 0;
+}
+
+int bpf_socket_bind_install(Unit *u) {
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "bpf-socket-bind: Failed to install; BPF framework is not supported");
+}
+
+int bpf_serialize_socket_bind(Unit *u, FILE *f, FDSet *fds) {
+ return 0;
+}
+#endif
diff --git a/src/core/bpf-socket-bind.h b/src/core/bpf-socket-bind.h
new file mode 100644
index 0000000..7d426df
--- /dev/null
+++ b/src/core/bpf-socket-bind.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "fdset.h"
+#include "unit.h"
+
+int bpf_socket_bind_supported(void);
+
+/* Add BPF link fd created before daemon-reload or daemon-reexec. FDs will be closed at the end of
+ * socket_bind_install. */
+int bpf_socket_bind_add_initial_link_fd(Unit *u, int fd);
+
+int bpf_socket_bind_install(Unit *u);
+
+int bpf_serialize_socket_bind(Unit *u, FILE *f, FDSet *fds);
diff --git a/src/core/bpf-util.c b/src/core/bpf-util.c
new file mode 100644
index 0000000..84170da
--- /dev/null
+++ b/src/core/bpf-util.c
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bpf-dlopen.h"
+#include "bpf-util.h"
+#include "cgroup-util.h"
+#include "log.h"
+
+bool cgroup_bpf_supported(void) {
+ static int supported = -1;
+ int r;
+
+ if (supported >= 0)
+ return supported;
+
+ r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
+ if (r < 0) {
+ log_warning_errno(r, "Can't determine whether the unified hierarchy is used: %m");
+ return (supported = false);
+ }
+
+ if (r == 0) {
+ log_info_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "Not running with unified cgroup hierarchy, disabling cgroup BPF features.");
+ return (supported = false);
+ }
+
+ r = dlopen_bpf();
+ if (r < 0) {
+ log_full_errno(in_initrd() ? LOG_DEBUG : LOG_INFO,
+ r, "Failed to open libbpf, cgroup BPF features disabled: %m");
+ return (supported = false);
+ }
+
+ return (supported = true);
+}
diff --git a/src/core/bpf-util.h b/src/core/bpf-util.h
new file mode 100644
index 0000000..a6c55cd
--- /dev/null
+++ b/src/core/bpf-util.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <stdbool.h>
+
+bool cgroup_bpf_supported(void);
diff --git a/src/core/bpf/meson.build b/src/core/bpf/meson.build
new file mode 100644
index 0000000..f654016
--- /dev/null
+++ b/src/core/bpf/meson.build
@@ -0,0 +1,111 @@
+# SPDX-License-Identifier: LGPL-2.1+
+
+if conf.get('BPF_FRAMEWORK') != 1
+ subdir_done()
+endif
+
+bpf_clang_flags = [
+ '-std=gnu11',
+ '-Wno-compare-distinct-pointer-types',
+ '-O2',
+ '-target',
+ 'bpf',
+ '-g',
+ '-c',
+]
+
+bpf_gcc_flags = [
+ '-std=gnu11',
+ '-O2',
+ '-mkernel=5.2',
+ '-mcpu=v3',
+ '-mco-re',
+ '-gbtf',
+ '-c',
+]
+
+# Generate defines that are appropriate to tell the compiler what architecture
+# we're compiling for. By default we just map meson's cpu_family to __<cpu_family>__.
+# This dictionary contains the exceptions where this doesn't work.
+#
+# C.f. https://mesonbuild.com/Reference-tables.html#cpu-families
+# and src/basic/missing_syscall_def.h.
+cpu_arch_defines = {
+ 'ppc' : ['-D__powerpc__'],
+ 'ppc64' : ['-D__powerpc64__', '-D_CALL_ELF=2'],
+ 'riscv32' : ['-D__riscv', '-D__riscv_xlen=32'],
+ 'riscv64' : ['-D__riscv', '-D__riscv_xlen=64'],
+ 'x86' : ['-D__i386__'],
+
+ # For arm, assume hardware fp is available.
+ 'arm' : ['-D__arm__', '-D__ARM_PCS_VFP'],
+}
+
+bpf_arch_flags = cpu_arch_defines.get(host_machine.cpu_family(),
+ ['-D__@0@__'.format(host_machine.cpu_family())])
+if bpf_compiler == 'gcc'
+ bpf_arch_flags += ['-m' + host_machine.endian() + '-endian']
+endif
+
+libbpf_include_dir = libbpf.get_variable(pkgconfig : 'includedir')
+
+bpf_o_unstripped_cmd = []
+if bpf_compiler == 'clang'
+ bpf_o_unstripped_cmd += [
+ clang,
+ bpf_clang_flags,
+ bpf_arch_flags,
+ ]
+elif bpf_compiler == 'gcc'
+ bpf_o_unstripped_cmd += [
+ bpf_gcc,
+ bpf_gcc_flags,
+ bpf_arch_flags,
+ ]
+endif
+
+bpf_o_unstripped_cmd += ['-I.']
+
+if not meson.is_cross_build() and bpf_compiler == 'clang'
+ target_triplet_cmd = run_command('gcc', '-dumpmachine', check: false)
+ if target_triplet_cmd.returncode() == 0
+ target_triplet = target_triplet_cmd.stdout().strip()
+ bpf_o_unstripped_cmd += [
+ '-isystem',
+ '/usr/include/@0@'.format(target_triplet)
+ ]
+ endif
+endif
+
+bpf_o_unstripped_cmd += [
+ '-idirafter',
+ libbpf_include_dir,
+ '@INPUT@',
+ '-o',
+ '@OUTPUT@'
+]
+
+if bpftool_strip
+ bpf_o_cmd = [
+ bpftool,
+ 'gen',
+ 'object',
+ '@OUTPUT@',
+ '@INPUT@'
+ ]
+elif bpf_compiler == 'clang'
+ bpf_o_cmd = [
+ llvm_strip,
+ '-g',
+ '@INPUT@',
+ '-o',
+ '@OUTPUT@'
+ ]
+endif
+
+skel_h_cmd = [
+ bpftool,
+ 'gen',
+ 'skeleton',
+ '@INPUT@'
+]
diff --git a/src/core/bpf/restrict_fs/meson.build b/src/core/bpf/restrict_fs/meson.build
new file mode 100644
index 0000000..69cde02
--- /dev/null
+++ b/src/core/bpf/restrict_fs/meson.build
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+if conf.get('BPF_FRAMEWORK') != 1
+ subdir_done()
+endif
+
+restrict_fs_bpf_o_unstripped = custom_target(
+ 'restrict-fs.bpf.unstripped.o',
+ input : 'restrict-fs.bpf.c',
+ output : 'restrict-fs.bpf.unstripped.o',
+ command : bpf_o_unstripped_cmd)
+
+restrict_fs_bpf_o = custom_target(
+ 'restrict-fs.bpf.o',
+ input : restrict_fs_bpf_o_unstripped,
+ output : 'restrict-fs.bpf.o',
+ command : bpf_o_cmd)
+
+restrict_fs_skel_h = custom_target(
+ 'restrict-fs.skel.h',
+ input : restrict_fs_bpf_o,
+ output : 'restrict-fs.skel.h',
+ command : skel_h_cmd,
+ capture : true)
diff --git a/src/core/bpf/restrict_fs/restrict-fs-skel.h b/src/core/bpf/restrict_fs/restrict-fs-skel.h
new file mode 100644
index 0000000..412cf62
--- /dev/null
+++ b/src/core/bpf/restrict_fs/restrict-fs-skel.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* The SPDX header above is actually correct in claiming this was
+ * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that
+ * compatible with GPL we will claim this to be GPL however, which should be
+ * fine given that LGPL-2.1-or-later downgrades to GPL if needed.
+ */
+
+/* libbpf is used via dlopen(), so rename symbols */
+#define bpf_object__open_skeleton sym_bpf_object__open_skeleton
+#define bpf_object__load_skeleton sym_bpf_object__load_skeleton
+#define bpf_object__destroy_skeleton sym_bpf_object__destroy_skeleton
+
+#include "bpf/restrict_fs/restrict-fs.skel.h"
diff --git a/src/core/bpf/restrict_fs/restrict-fs.bpf.c b/src/core/bpf/restrict_fs/restrict-fs.bpf.c
new file mode 100644
index 0000000..eb5ed3e
--- /dev/null
+++ b/src/core/bpf/restrict_fs/restrict-fs.bpf.c
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* The SPDX header above is actually correct in claiming this was
+ * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that
+ * compatible with GPL we will claim this to be GPL however, which should be
+ * fine given that LGPL-2.1-or-later downgrades to GPL if needed.
+ */
+
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+#include <errno.h>
+#include <stddef.h>
+#include <stdint.h>
+
+struct super_block {
+ unsigned long int s_magic;
+} __attribute__((preserve_access_index));
+
+struct inode {
+ struct super_block *i_sb;
+} __attribute__((preserve_access_index));
+
+struct file {
+ struct inode *f_inode;
+} __attribute__((preserve_access_index));
+
+/*
+ * max_entries is set from user space with the bpf_map__set_max_entries helper.
+ * */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
+ __type(key, uint64_t); /* cgroup ID */
+ __type(value, uint32_t); /* fs magic set */
+} cgroup_hash SEC(".maps");
+
+SEC("lsm/file_open")
+int BPF_PROG(restrict_filesystems, struct file *file, int ret)
+{
+ unsigned long raw_magic_number;
+ uint64_t cgroup_id;
+ uint32_t *value, *magic_map, magic_number, zero = 0, *is_allow;
+
+ /* ret is the return value from the previous BPF program or 0 if it's
+ * the first hook */
+ if (ret != 0)
+ return ret;
+
+ BPF_CORE_READ_INTO(&raw_magic_number, file, f_inode, i_sb, s_magic);
+ /* super_block.s_magic is unsigned long, but magic_map keys are
+ * uint32_t. Using s_magic as-is would fail on big-endian systems,
+ * which have 64-bit unsigned long. So cast it. */
+ magic_number = (uint32_t)raw_magic_number;
+
+ cgroup_id = bpf_get_current_cgroup_id();
+
+ magic_map = bpf_map_lookup_elem(&cgroup_hash, &cgroup_id);
+ if (!magic_map)
+ return 0;
+
+ is_allow = bpf_map_lookup_elem(magic_map, &zero);
+ if (!is_allow)
+ /* Malformed map, it doesn't include whether it's an allow list
+ * or a deny list. Allow. */
+ return 0;
+
+ if (*is_allow) {
+ /* Allow-list: Allow access only if magic_number present in inner map */
+ if (!bpf_map_lookup_elem(magic_map, &magic_number))
+ return -EPERM;
+ } else {
+ /* Deny-list: Allow access only if magic_number is not present in inner map */
+ if (bpf_map_lookup_elem(magic_map, &magic_number))
+ return -EPERM;
+ }
+
+ return 0;
+}
+
+static const char _license[] SEC("license") = "GPL";
diff --git a/src/core/bpf/restrict_ifaces/meson.build b/src/core/bpf/restrict_ifaces/meson.build
new file mode 100644
index 0000000..5f36178
--- /dev/null
+++ b/src/core/bpf/restrict_ifaces/meson.build
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+if conf.get('BPF_FRAMEWORK') != 1
+ subdir_done()
+endif
+
+restrict_ifaces_bpf_o_unstripped = custom_target(
+ 'restrict-ifaces.bpf.unstripped.o',
+ input : 'restrict-ifaces.bpf.c',
+ output : 'restrict-ifaces.bpf.unstripped.o',
+ command : bpf_o_unstripped_cmd)
+
+restrict_ifaces_bpf_o = custom_target(
+ 'restrict-ifaces.bpf.o',
+ input : restrict_ifaces_bpf_o_unstripped,
+ output : 'restrict-ifaces.bpf.o',
+ command : bpf_o_cmd)
+
+restrict_ifaces_skel_h = custom_target(
+ 'restrict-ifaces.skel.h',
+ input : restrict_ifaces_bpf_o,
+ output : 'restrict-ifaces.skel.h',
+ command : skel_h_cmd,
+ capture : true)
diff --git a/src/core/bpf/restrict_ifaces/restrict-ifaces-skel.h b/src/core/bpf/restrict_ifaces/restrict-ifaces-skel.h
new file mode 100644
index 0000000..f937490
--- /dev/null
+++ b/src/core/bpf/restrict_ifaces/restrict-ifaces-skel.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* The SPDX header above is actually correct in claiming this was
+ * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that
+ * compatible with GPL we will claim this to be GPL however, which should be
+ * fine given that LGPL-2.1-or-later downgrades to GPL if needed.
+ */
+
+/* libbpf is used via dlopen(), so rename symbols */
+#define bpf_object__open_skeleton sym_bpf_object__open_skeleton
+#define bpf_object__load_skeleton sym_bpf_object__load_skeleton
+#define bpf_object__destroy_skeleton sym_bpf_object__destroy_skeleton
+
+#include "bpf/restrict_ifaces/restrict-ifaces.skel.h"
diff --git a/src/core/bpf/restrict_ifaces/restrict-ifaces.bpf.c b/src/core/bpf/restrict_ifaces/restrict-ifaces.bpf.c
new file mode 100644
index 0000000..32cde5c
--- /dev/null
+++ b/src/core/bpf/restrict_ifaces/restrict-ifaces.bpf.c
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* <linux/bpf.h> must precede <bpf/bpf_helpers.h> due to integer types
+ * in bpf helpers signatures.
+ */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+const volatile __u8 is_allow_list = 0;
+
+/* Map containing the network interfaces indexes.
+ * The interpretation of the map depends on the value of is_allow_list.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, __u32);
+ __type(value, __u8);
+} sd_restrictif SEC(".maps");
+
+#define DROP 0
+#define PASS 1
+
+static __always_inline int restrict_network_interfaces_impl(const struct __sk_buff *sk) {
+ __u32 zero = 0, ifindex;
+ __u8 *lookup_result;
+
+ ifindex = sk->ifindex;
+ lookup_result = bpf_map_lookup_elem(&sd_restrictif, &ifindex);
+ if (is_allow_list) {
+ /* allow-list: let the packet pass if iface in the list */
+ if (lookup_result)
+ return PASS;
+ } else {
+ /* deny-list: let the packet pass if iface *not* in the list */
+ if (!lookup_result)
+ return PASS;
+ }
+
+ return DROP;
+}
+
+SEC("cgroup_skb/egress")
+int sd_restrictif_e(const struct __sk_buff *sk) {
+ return restrict_network_interfaces_impl(sk);
+}
+
+SEC("cgroup_skb/ingress")
+int sd_restrictif_i(const struct __sk_buff *sk) {
+ return restrict_network_interfaces_impl(sk);
+}
+
+static const char _license[] SEC("license") = "LGPL-2.1-or-later";
diff --git a/src/core/bpf/socket_bind/meson.build b/src/core/bpf/socket_bind/meson.build
new file mode 100644
index 0000000..05a2b9d
--- /dev/null
+++ b/src/core/bpf/socket_bind/meson.build
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+if conf.get('BPF_FRAMEWORK') != 1
+ subdir_done()
+endif
+
+socket_bind_bpf_o_unstripped = custom_target(
+ 'socket-bind.bpf.unstripped.o',
+ input : 'socket-bind.bpf.c',
+ output : 'socket-bind.bpf.unstripped.o',
+ command : bpf_o_unstripped_cmd)
+
+socket_bind_bpf_o = custom_target(
+ 'socket-bind.bpf.o',
+ input : socket_bind_bpf_o_unstripped,
+ output : 'socket-bind.bpf.o',
+ command : bpf_o_cmd)
+
+socket_bind_skel_h = custom_target(
+ 'socket-bind.skel.h',
+ input : socket_bind_bpf_o,
+ output : 'socket-bind.skel.h',
+ command : skel_h_cmd,
+ capture : true)
diff --git a/src/core/bpf/socket_bind/socket-bind-api.bpf.h b/src/core/bpf/socket_bind/socket-bind-api.bpf.h
new file mode 100644
index 0000000..277b9bb
--- /dev/null
+++ b/src/core/bpf/socket_bind/socket-bind-api.bpf.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* The SPDX header above is actually correct in claiming this was
+ * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that
+ * compatible with GPL we will claim this to be GPL however, which should be
+ * fine given that LGPL-2.1-or-later downgrades to GPL if needed.
+ */
+
+#include <linux/types.h>
+
+/*
+ * Bind rule is matched with socket fields accessible to cgroup/bind{4,6} hook
+ * through bpf_sock_addr struct.
+ * 'address_family' is expected to be one of AF_UNSPEC, AF_INET or AF_INET6.
+ * Matching by family is bypassed for rules with AF_UNSPEC set, which makes the
+ * rest of a rule applicable for both IPv4 and IPv6 addresses.
+ * If matching by family is either successful or bypassed, a rule and a socket
+ * are matched by ip protocol.
+ * If 'protocol' is 0, matching is bypassed.
+ * 'nr_ports' and 'port_min' fields specify a set of ports to match a user port
+ * with.
+ * If 'nr_ports' is 0, matching by port is bypassed, making that rule applicable
+ * for all possible ports, e.g. [1, 65535] range. Thus a rule with
+ * 'address_family', 'protocol' and 'nr_ports' equal to AF_UNSPEC, 0 and 0
+ * correspondingly forms 'allow any' or 'deny any' cases.
+ * For positive 'nr_ports', a user_port lying in a range from 'port_min' to'
+ * 'port_min' + 'nr_ports' exclusively is considered to be a match. 'nr_ports'
+ * equalling to 1 forms a rule for a single port.
+ * Ports are in host order.
+ *
+ * Examples:
+ * AF_UNSPEC, 1, 0, 7777: match IPv4 and IPv6 addresses with 7777 user port;
+ *
+ * AF_INET, 1023, 0, 1: match IPv4 addresses with user port in [1, 1023]
+ * range inclusively;
+ *
+ * AF_INET6, 0, 0, 0: match IPv6 addresses;
+ *
+ * AF_UNSPEC, 0, 0, 0: match IPv4 and IPv6 addresses;
+ *
+ * AF_INET6, IPPROTO_TCP, 0, 0: match IPv6/TCP addresses.
+ */
+
+struct socket_bind_rule {
+ __u32 address_family;
+ __u32 protocol;
+ __u16 nr_ports;
+ __u16 port_min;
+};
+
+#define SOCKET_BIND_MAX_RULES 128
diff --git a/src/core/bpf/socket_bind/socket-bind-skel.h b/src/core/bpf/socket_bind/socket-bind-skel.h
new file mode 100644
index 0000000..e0d1626
--- /dev/null
+++ b/src/core/bpf/socket_bind/socket-bind-skel.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* The SPDX header above is actually correct in claiming this was
+ * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that
+ * compatible with GPL we will claim this to be GPL however, which should be
+ * fine given that LGPL-2.1-or-later downgrades to GPL if needed.
+ */
+
+/* libbpf is used via dlopen(), so rename symbols */
+#define bpf_object__open_skeleton sym_bpf_object__open_skeleton
+#define bpf_object__load_skeleton sym_bpf_object__load_skeleton
+#define bpf_object__destroy_skeleton sym_bpf_object__destroy_skeleton
+
+#include "bpf/socket_bind/socket-bind.skel.h"
diff --git a/src/core/bpf/socket_bind/socket-bind.bpf.c b/src/core/bpf/socket_bind/socket-bind.bpf.c
new file mode 100644
index 0000000..b7972a8
--- /dev/null
+++ b/src/core/bpf/socket_bind/socket-bind.bpf.c
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* The SPDX header above is actually correct in claiming this was
+ * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that
+ * compatible with GPL we will claim this to be GPL however, which should be
+ * fine given that LGPL-2.1-or-later downgrades to GPL if needed.
+ */
+
+#include "socket-bind-api.bpf.h"
+/* <linux/types.h> must precede <bpf/bpf_helpers.h> due to
+ * <bpf/bpf_helpers.h> does not depend from type header by design.
+ */
+#include <linux/types.h>
+#include <bpf/bpf_endian.h>
+#include <bpf/bpf_helpers.h>
+#include <linux/bpf.h>
+#include <netinet/in.h>
+#include <stdbool.h>
+
+/*
+ * max_entries is set from user space with bpf_map__set_max_entries helper.
+ */
+struct socket_bind_map_t {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, __u32);
+ __type(value, struct socket_bind_rule);
+};
+
+enum socket_bind_action {
+ SOCKET_BIND_DENY = 0,
+ SOCKET_BIND_ALLOW = 1,
+};
+
+struct socket_bind_map_t sd_bind_allow SEC(".maps");
+struct socket_bind_map_t sd_bind_deny SEC(".maps");
+
+static __always_inline bool match_af(
+ __u8 address_family, const struct socket_bind_rule *r) {
+ return r->address_family == AF_UNSPEC || address_family == r->address_family;
+}
+
+static __always_inline bool match_protocol(
+ __u32 protocol, const struct socket_bind_rule *r) {
+ return r->protocol == 0 || r->protocol == protocol;
+}
+
+static __always_inline bool match_user_port(
+ __u16 port, const struct socket_bind_rule *r) {
+ return r->nr_ports == 0 ||
+ (port >= r->port_min && port < r->port_min + (__u32) r->nr_ports);
+}
+
+static __always_inline bool match(
+ __u8 address_family,
+ __u32 protocol,
+ __u16 port,
+ const struct socket_bind_rule *r) {
+ return match_af(address_family, r) &&
+ match_protocol(protocol, r) &&
+ match_user_port(port, r);
+}
+
+static __always_inline bool match_rules(
+ struct bpf_sock_addr *ctx,
+ struct socket_bind_map_t *rules) {
+ volatile __u32 user_port = ctx->user_port;
+ __u16 port = (__u16)bpf_ntohs(user_port);
+
+ for (__u32 i = 0; i < SOCKET_BIND_MAX_RULES; ++i) {
+ const __u32 key = i;
+ const struct socket_bind_rule *rule = bpf_map_lookup_elem(rules, &key);
+
+ /* Lookup returns NULL if iterator is advanced past the last
+ * element put in the map. */
+ if (!rule)
+ break;
+
+ if (match(ctx->user_family, ctx->protocol, port, rule))
+ return true;
+ }
+
+ return false;
+}
+
+static __always_inline int bind_socket(struct bpf_sock_addr *ctx) {
+ if (match_rules(ctx, &sd_bind_allow))
+ return SOCKET_BIND_ALLOW;
+
+ if (match_rules(ctx, &sd_bind_deny))
+ return SOCKET_BIND_DENY;
+
+ return SOCKET_BIND_ALLOW;
+}
+
+SEC("cgroup/bind4")
+int sd_bind4(struct bpf_sock_addr *ctx) {
+ if (ctx->user_family != AF_INET || ctx->family != AF_INET)
+ return SOCKET_BIND_ALLOW;
+
+ return bind_socket(ctx);
+}
+
+SEC("cgroup/bind6")
+int sd_bind6(struct bpf_sock_addr *ctx) {
+ if (ctx->user_family != AF_INET6 || ctx->family != AF_INET6)
+ return SOCKET_BIND_ALLOW;
+
+ return bind_socket(ctx);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/src/core/cgroup.c b/src/core/cgroup.c
new file mode 100644
index 0000000..570b39f
--- /dev/null
+++ b/src/core/cgroup.c
@@ -0,0 +1,4272 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fcntl.h>
+
+#include "sd-messages.h"
+
+#include "af-list.h"
+#include "alloc-util.h"
+#include "blockdev-util.h"
+#include "bpf-devices.h"
+#include "bpf-firewall.h"
+#include "bpf-foreign.h"
+#include "bpf-socket-bind.h"
+#include "btrfs-util.h"
+#include "bus-error.h"
+#include "bus-locator.h"
+#include "cgroup-setup.h"
+#include "cgroup-util.h"
+#include "cgroup.h"
+#include "devnum-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "in-addr-prefix-util.h"
+#include "inotify-util.h"
+#include "io-util.h"
+#include "ip-protocol-list.h"
+#include "limits-util.h"
+#include "nulstr-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "percent-util.h"
+#include "process-util.h"
+#include "procfs-util.h"
+#include "restrict-ifaces.h"
+#include "special.h"
+#include "stdio-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "virt.h"
+
+#if BPF_FRAMEWORK
+#include "bpf-dlopen.h"
+#include "bpf-link.h"
+#include "bpf/restrict_fs/restrict-fs-skel.h"
+#endif
+
+#define CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
+
+/* Returns the log level to use when cgroup attribute writes fail. When an attribute is missing or we have access
+ * problems we downgrade to LOG_DEBUG. This is supposed to be nice to container managers and kernels which want to mask
+ * out specific attributes from us. */
+#define LOG_LEVEL_CGROUP_WRITE(r) (IN_SET(abs(r), ENOENT, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING)
+
+uint64_t tasks_max_resolve(const TasksMax *tasks_max) {
+ if (tasks_max->scale == 0)
+ return tasks_max->value;
+
+ return system_tasks_max_scale(tasks_max->value, tasks_max->scale);
+}
+
+bool manager_owns_host_root_cgroup(Manager *m) {
+ assert(m);
+
+ /* Returns true if we are managing the root cgroup. Note that it isn't sufficient to just check whether the
+ * group root path equals "/" since that will also be the case if CLONE_NEWCGROUP is in the mix. Since there's
+ * appears to be no nice way to detect whether we are in a CLONE_NEWCGROUP namespace we instead just check if
+ * we run in any kind of container virtualization. */
+
+ if (MANAGER_IS_USER(m))
+ return false;
+
+ if (detect_container() > 0)
+ return false;
+
+ return empty_or_root(m->cgroup_root);
+}
+
+bool unit_has_startup_cgroup_constraints(Unit *u) {
+ assert(u);
+
+ /* Returns true if this unit has any directives which apply during
+ * startup/shutdown phases. */
+
+ CGroupContext *c;
+
+ c = unit_get_cgroup_context(u);
+ if (!c)
+ return false;
+
+ return c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ||
+ c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
+ c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
+ c->startup_cpuset_cpus.set ||
+ c->startup_cpuset_mems.set;
+}
+
+bool unit_has_host_root_cgroup(Unit *u) {
+ assert(u);
+
+ /* Returns whether this unit manages the root cgroup. This will return true if this unit is the root slice and
+ * the manager manages the root cgroup. */
+
+ if (!manager_owns_host_root_cgroup(u->manager))
+ return false;
+
+ return unit_has_name(u, SPECIAL_ROOT_SLICE);
+}
+
+static int set_attribute_and_warn(Unit *u, const char *controller, const char *attribute, const char *value) {
+ int r;
+
+ r = cg_set_attribute(controller, u->cgroup_path, attribute, value);
+ if (r < 0)
+ log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to set '%s' attribute on '%s' to '%.*s': %m",
+ strna(attribute), empty_to_root(u->cgroup_path), (int) strcspn(value, NEWLINE), value);
+
+ return r;
+}
+
+static void cgroup_compat_warn(void) {
+ static bool cgroup_compat_warned = false;
+
+ if (cgroup_compat_warned)
+ return;
+
+ log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. "
+ "See cgroup-compat debug messages for details.");
+
+ cgroup_compat_warned = true;
+}
+
+#define log_cgroup_compat(unit, fmt, ...) do { \
+ cgroup_compat_warn(); \
+ log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__); \
+ } while (false)
+
+void cgroup_context_init(CGroupContext *c) {
+ assert(c);
+
+ /* Initialize everything to the kernel defaults. */
+
+ *c = (CGroupContext) {
+ .cpu_weight = CGROUP_WEIGHT_INVALID,
+ .startup_cpu_weight = CGROUP_WEIGHT_INVALID,
+ .cpu_quota_per_sec_usec = USEC_INFINITY,
+ .cpu_quota_period_usec = USEC_INFINITY,
+
+ .cpu_shares = CGROUP_CPU_SHARES_INVALID,
+ .startup_cpu_shares = CGROUP_CPU_SHARES_INVALID,
+
+ .memory_high = CGROUP_LIMIT_MAX,
+ .memory_max = CGROUP_LIMIT_MAX,
+ .memory_swap_max = CGROUP_LIMIT_MAX,
+
+ .memory_limit = CGROUP_LIMIT_MAX,
+
+ .io_weight = CGROUP_WEIGHT_INVALID,
+ .startup_io_weight = CGROUP_WEIGHT_INVALID,
+
+ .blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID,
+ .startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID,
+
+ .tasks_max = TASKS_MAX_UNSET,
+
+ .moom_swap = MANAGED_OOM_AUTO,
+ .moom_mem_pressure = MANAGED_OOM_AUTO,
+ .moom_preference = MANAGED_OOM_PREFERENCE_NONE,
+ };
+}
+
+void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
+ assert(c);
+ assert(a);
+
+ LIST_REMOVE(device_allow, c->device_allow, a);
+ free(a->path);
+ free(a);
+}
+
+void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
+ assert(c);
+ assert(w);
+
+ LIST_REMOVE(device_weights, c->io_device_weights, w);
+ free(w->path);
+ free(w);
+}
+
+void cgroup_context_free_io_device_latency(CGroupContext *c, CGroupIODeviceLatency *l) {
+ assert(c);
+ assert(l);
+
+ LIST_REMOVE(device_latencies, c->io_device_latencies, l);
+ free(l->path);
+ free(l);
+}
+
+void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
+ assert(c);
+ assert(l);
+
+ LIST_REMOVE(device_limits, c->io_device_limits, l);
+ free(l->path);
+ free(l);
+}
+
+void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
+ assert(c);
+ assert(w);
+
+ LIST_REMOVE(device_weights, c->blockio_device_weights, w);
+ free(w->path);
+ free(w);
+}
+
+void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
+ assert(c);
+ assert(b);
+
+ LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
+ free(b->path);
+ free(b);
+}
+
+void cgroup_context_remove_bpf_foreign_program(CGroupContext *c, CGroupBPFForeignProgram *p) {
+ assert(c);
+ assert(p);
+
+ LIST_REMOVE(programs, c->bpf_foreign_programs, p);
+ free(p->bpffs_path);
+ free(p);
+}
+
+void cgroup_context_remove_socket_bind(CGroupSocketBindItem **head) {
+ assert(head);
+
+ while (*head) {
+ CGroupSocketBindItem *h = *head;
+ LIST_REMOVE(socket_bind_items, *head, h);
+ free(h);
+ }
+}
+
+void cgroup_context_done(CGroupContext *c) {
+ assert(c);
+
+ while (c->io_device_weights)
+ cgroup_context_free_io_device_weight(c, c->io_device_weights);
+
+ while (c->io_device_latencies)
+ cgroup_context_free_io_device_latency(c, c->io_device_latencies);
+
+ while (c->io_device_limits)
+ cgroup_context_free_io_device_limit(c, c->io_device_limits);
+
+ while (c->blockio_device_weights)
+ cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
+
+ while (c->blockio_device_bandwidths)
+ cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
+
+ while (c->device_allow)
+ cgroup_context_free_device_allow(c, c->device_allow);
+
+ cgroup_context_remove_socket_bind(&c->socket_bind_allow);
+ cgroup_context_remove_socket_bind(&c->socket_bind_deny);
+
+ c->ip_address_allow = set_free(c->ip_address_allow);
+ c->ip_address_deny = set_free(c->ip_address_deny);
+
+ c->ip_filters_ingress = strv_free(c->ip_filters_ingress);
+ c->ip_filters_egress = strv_free(c->ip_filters_egress);
+
+ while (c->bpf_foreign_programs)
+ cgroup_context_remove_bpf_foreign_program(c, c->bpf_foreign_programs);
+
+ c->restrict_network_interfaces = set_free_free(c->restrict_network_interfaces);
+
+ cpu_set_reset(&c->cpuset_cpus);
+ cpu_set_reset(&c->startup_cpuset_cpus);
+ cpu_set_reset(&c->cpuset_mems);
+ cpu_set_reset(&c->startup_cpuset_mems);
+}
+
+static int unit_get_kernel_memory_limit(Unit *u, const char *file, uint64_t *ret) {
+ assert(u);
+
+ if (!u->cgroup_realized)
+ return -EOWNERDEAD;
+
+ return cg_get_attribute_as_uint64("memory", u->cgroup_path, file, ret);
+}
+
+static int unit_compare_memory_limit(Unit *u, const char *property_name, uint64_t *ret_unit_value, uint64_t *ret_kernel_value) {
+ CGroupContext *c;
+ CGroupMask m;
+ const char *file;
+ uint64_t unit_value;
+ int r;
+
+ /* Compare kernel memcg configuration against our internal systemd state. Unsupported (and will
+ * return -ENODATA) on cgroup v1.
+ *
+ * Returns:
+ *
+ * <0: On error.
+ * 0: If the kernel memory setting doesn't match our configuration.
+ * >0: If the kernel memory setting matches our configuration.
+ *
+ * The following values are only guaranteed to be populated on return >=0:
+ *
+ * - ret_unit_value will contain our internal expected value for the unit, page-aligned.
+ * - ret_kernel_value will contain the actual value presented by the kernel. */
+
+ assert(u);
+
+ r = cg_all_unified();
+ if (r < 0)
+ return log_debug_errno(r, "Failed to determine cgroup hierarchy version: %m");
+
+ /* Unsupported on v1.
+ *
+ * We don't return ENOENT, since that could actually mask a genuine problem where somebody else has
+ * silently masked the controller. */
+ if (r == 0)
+ return -ENODATA;
+
+ /* The root slice doesn't have any controller files, so we can't compare anything. */
+ if (unit_has_name(u, SPECIAL_ROOT_SLICE))
+ return -ENODATA;
+
+ /* It's possible to have MemoryFoo set without systemd wanting to have the memory controller enabled,
+ * for example, in the case of DisableControllers= or cgroup_disable on the kernel command line. To
+ * avoid specious errors in these scenarios, check that we even expect the memory controller to be
+ * enabled at all. */
+ m = unit_get_target_mask(u);
+ if (!FLAGS_SET(m, CGROUP_MASK_MEMORY))
+ return -ENODATA;
+
+ assert_se(c = unit_get_cgroup_context(u));
+
+ if (streq(property_name, "MemoryLow")) {
+ unit_value = unit_get_ancestor_memory_low(u);
+ file = "memory.low";
+ } else if (streq(property_name, "MemoryMin")) {
+ unit_value = unit_get_ancestor_memory_min(u);
+ file = "memory.min";
+ } else if (streq(property_name, "MemoryHigh")) {
+ unit_value = c->memory_high;
+ file = "memory.high";
+ } else if (streq(property_name, "MemoryMax")) {
+ unit_value = c->memory_max;
+ file = "memory.max";
+ } else if (streq(property_name, "MemorySwapMax")) {
+ unit_value = c->memory_swap_max;
+ file = "memory.swap.max";
+ } else
+ return -EINVAL;
+
+ r = unit_get_kernel_memory_limit(u, file, ret_kernel_value);
+ if (r < 0)
+ return log_unit_debug_errno(u, r, "Failed to parse %s: %m", file);
+
+ /* It's intended (soon) in a future kernel to not expose cgroup memory limits rounded to page
+ * boundaries, but instead separate the user-exposed limit, which is whatever userspace told us, from
+ * our internal page-counting. To support those future kernels, just check the value itself first
+ * without any page-alignment. */
+ if (*ret_kernel_value == unit_value) {
+ *ret_unit_value = unit_value;
+ return 1;
+ }
+
+ /* The current kernel behaviour, by comparison, is that even if you write a particular number of
+ * bytes into a cgroup memory file, it always returns that number page-aligned down (since the kernel
+ * internally stores cgroup limits in pages). As such, so long as it aligns properly, everything is
+ * cricket. */
+ if (unit_value != CGROUP_LIMIT_MAX)
+ unit_value = PAGE_ALIGN_DOWN(unit_value);
+
+ *ret_unit_value = unit_value;
+
+ return *ret_kernel_value == *ret_unit_value;
+}
+
+#define FORMAT_CGROUP_DIFF_MAX 128
+
+static char *format_cgroup_memory_limit_comparison(char *buf, size_t l, Unit *u, const char *property_name) {
+ uint64_t kval, sval;
+ int r;
+
+ assert(u);
+ assert(buf);
+ assert(l > 0);
+
+ r = unit_compare_memory_limit(u, property_name, &sval, &kval);
+
+ /* memory.swap.max is special in that it relies on CONFIG_MEMCG_SWAP (and the default swapaccount=1).
+ * In the absence of reliably being able to detect whether memcg swap support is available or not,
+ * only complain if the error is not ENOENT. */
+ if (r > 0 || IN_SET(r, -ENODATA, -EOWNERDEAD) ||
+ (r == -ENOENT && streq(property_name, "MemorySwapMax")))
+ buf[0] = 0;
+ else if (r < 0) {
+ errno = -r;
+ (void) snprintf(buf, l, " (error getting kernel value: %m)");
+ } else
+ (void) snprintf(buf, l, " (different value in kernel: %" PRIu64 ")", kval);
+
+ return buf;
+}
+
+void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
+ _cleanup_free_ char *disable_controllers_str = NULL, *cpuset_cpus = NULL, *cpuset_mems = NULL, *startup_cpuset_cpus = NULL, *startup_cpuset_mems = NULL;
+ CGroupContext *c;
+ struct in_addr_prefix *iaai;
+
+ char cda[FORMAT_CGROUP_DIFF_MAX];
+ char cdb[FORMAT_CGROUP_DIFF_MAX];
+ char cdc[FORMAT_CGROUP_DIFF_MAX];
+ char cdd[FORMAT_CGROUP_DIFF_MAX];
+ char cde[FORMAT_CGROUP_DIFF_MAX];
+
+ assert(u);
+ assert(f);
+
+ assert_se(c = unit_get_cgroup_context(u));
+
+ prefix = strempty(prefix);
+
+ (void) cg_mask_to_string(c->disable_controllers, &disable_controllers_str);
+
+ cpuset_cpus = cpu_set_to_range_string(&c->cpuset_cpus);
+ startup_cpuset_cpus = cpu_set_to_range_string(&c->startup_cpuset_cpus);
+ cpuset_mems = cpu_set_to_range_string(&c->cpuset_mems);
+ startup_cpuset_mems = cpu_set_to_range_string(&c->startup_cpuset_mems);
+
+ fprintf(f,
+ "%sCPUAccounting: %s\n"
+ "%sIOAccounting: %s\n"
+ "%sBlockIOAccounting: %s\n"
+ "%sMemoryAccounting: %s\n"
+ "%sTasksAccounting: %s\n"
+ "%sIPAccounting: %s\n"
+ "%sCPUWeight: %" PRIu64 "\n"
+ "%sStartupCPUWeight: %" PRIu64 "\n"
+ "%sCPUShares: %" PRIu64 "\n"
+ "%sStartupCPUShares: %" PRIu64 "\n"
+ "%sCPUQuotaPerSecSec: %s\n"
+ "%sCPUQuotaPeriodSec: %s\n"
+ "%sAllowedCPUs: %s\n"
+ "%sStartupAllowedCPUs: %s\n"
+ "%sAllowedMemoryNodes: %s\n"
+ "%sStartupAllowedMemoryNodes: %s\n"
+ "%sIOWeight: %" PRIu64 "\n"
+ "%sStartupIOWeight: %" PRIu64 "\n"
+ "%sBlockIOWeight: %" PRIu64 "\n"
+ "%sStartupBlockIOWeight: %" PRIu64 "\n"
+ "%sDefaultMemoryMin: %" PRIu64 "\n"
+ "%sDefaultMemoryLow: %" PRIu64 "\n"
+ "%sMemoryMin: %" PRIu64 "%s\n"
+ "%sMemoryLow: %" PRIu64 "%s\n"
+ "%sMemoryHigh: %" PRIu64 "%s\n"
+ "%sMemoryMax: %" PRIu64 "%s\n"
+ "%sMemorySwapMax: %" PRIu64 "%s\n"
+ "%sMemoryLimit: %" PRIu64 "\n"
+ "%sTasksMax: %" PRIu64 "\n"
+ "%sDevicePolicy: %s\n"
+ "%sDisableControllers: %s\n"
+ "%sDelegate: %s\n"
+ "%sManagedOOMSwap: %s\n"
+ "%sManagedOOMMemoryPressure: %s\n"
+ "%sManagedOOMMemoryPressureLimit: " PERMYRIAD_AS_PERCENT_FORMAT_STR "\n"
+ "%sManagedOOMPreference: %s\n",
+ prefix, yes_no(c->cpu_accounting),
+ prefix, yes_no(c->io_accounting),
+ prefix, yes_no(c->blockio_accounting),
+ prefix, yes_no(c->memory_accounting),
+ prefix, yes_no(c->tasks_accounting),
+ prefix, yes_no(c->ip_accounting),
+ prefix, c->cpu_weight,
+ prefix, c->startup_cpu_weight,
+ prefix, c->cpu_shares,
+ prefix, c->startup_cpu_shares,
+ prefix, FORMAT_TIMESPAN(c->cpu_quota_per_sec_usec, 1),
+ prefix, FORMAT_TIMESPAN(c->cpu_quota_period_usec, 1),
+ prefix, strempty(cpuset_cpus),
+ prefix, strempty(startup_cpuset_cpus),
+ prefix, strempty(cpuset_mems),
+ prefix, strempty(startup_cpuset_mems),
+ prefix, c->io_weight,
+ prefix, c->startup_io_weight,
+ prefix, c->blockio_weight,
+ prefix, c->startup_blockio_weight,
+ prefix, c->default_memory_min,
+ prefix, c->default_memory_low,
+ prefix, c->memory_min, format_cgroup_memory_limit_comparison(cda, sizeof(cda), u, "MemoryMin"),
+ prefix, c->memory_low, format_cgroup_memory_limit_comparison(cdb, sizeof(cdb), u, "MemoryLow"),
+ prefix, c->memory_high, format_cgroup_memory_limit_comparison(cdc, sizeof(cdc), u, "MemoryHigh"),
+ prefix, c->memory_max, format_cgroup_memory_limit_comparison(cdd, sizeof(cdd), u, "MemoryMax"),
+ prefix, c->memory_swap_max, format_cgroup_memory_limit_comparison(cde, sizeof(cde), u, "MemorySwapMax"),
+ prefix, c->memory_limit,
+ prefix, tasks_max_resolve(&c->tasks_max),
+ prefix, cgroup_device_policy_to_string(c->device_policy),
+ prefix, strempty(disable_controllers_str),
+ prefix, yes_no(c->delegate),
+ prefix, managed_oom_mode_to_string(c->moom_swap),
+ prefix, managed_oom_mode_to_string(c->moom_mem_pressure),
+ prefix, PERMYRIAD_AS_PERCENT_FORMAT_VAL(UINT32_SCALE_TO_PERMYRIAD(c->moom_mem_pressure_limit)),
+ prefix, managed_oom_preference_to_string(c->moom_preference));
+
+ if (c->delegate) {
+ _cleanup_free_ char *t = NULL;
+
+ (void) cg_mask_to_string(c->delegate_controllers, &t);
+
+ fprintf(f, "%sDelegateControllers: %s\n",
+ prefix,
+ strempty(t));
+ }
+
+ LIST_FOREACH(device_allow, a, c->device_allow)
+ fprintf(f,
+ "%sDeviceAllow: %s %s%s%s\n",
+ prefix,
+ a->path,
+ a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
+
+ LIST_FOREACH(device_weights, iw, c->io_device_weights)
+ fprintf(f,
+ "%sIODeviceWeight: %s %" PRIu64 "\n",
+ prefix,
+ iw->path,
+ iw->weight);
+
+ LIST_FOREACH(device_latencies, l, c->io_device_latencies)
+ fprintf(f,
+ "%sIODeviceLatencyTargetSec: %s %s\n",
+ prefix,
+ l->path,
+ FORMAT_TIMESPAN(l->target_usec, 1));
+
+ LIST_FOREACH(device_limits, il, c->io_device_limits)
+ for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
+ if (il->limits[type] != cgroup_io_limit_defaults[type])
+ fprintf(f,
+ "%s%s: %s %s\n",
+ prefix,
+ cgroup_io_limit_type_to_string(type),
+ il->path,
+ FORMAT_BYTES(il->limits[type]));
+
+ LIST_FOREACH(device_weights, w, c->blockio_device_weights)
+ fprintf(f,
+ "%sBlockIODeviceWeight: %s %" PRIu64,
+ prefix,
+ w->path,
+ w->weight);
+
+ LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
+ if (b->rbps != CGROUP_LIMIT_MAX)
+ fprintf(f,
+ "%sBlockIOReadBandwidth: %s %s\n",
+ prefix,
+ b->path,
+ FORMAT_BYTES(b->rbps));
+ if (b->wbps != CGROUP_LIMIT_MAX)
+ fprintf(f,
+ "%sBlockIOWriteBandwidth: %s %s\n",
+ prefix,
+ b->path,
+ FORMAT_BYTES(b->wbps));
+ }
+
+ SET_FOREACH(iaai, c->ip_address_allow)
+ fprintf(f, "%sIPAddressAllow: %s\n", prefix,
+ IN_ADDR_PREFIX_TO_STRING(iaai->family, &iaai->address, iaai->prefixlen));
+ SET_FOREACH(iaai, c->ip_address_deny)
+ fprintf(f, "%sIPAddressDeny: %s\n", prefix,
+ IN_ADDR_PREFIX_TO_STRING(iaai->family, &iaai->address, iaai->prefixlen));
+
+ STRV_FOREACH(path, c->ip_filters_ingress)
+ fprintf(f, "%sIPIngressFilterPath: %s\n", prefix, *path);
+ STRV_FOREACH(path, c->ip_filters_egress)
+ fprintf(f, "%sIPEgressFilterPath: %s\n", prefix, *path);
+
+ LIST_FOREACH(programs, p, c->bpf_foreign_programs)
+ fprintf(f, "%sBPFProgram: %s:%s",
+ prefix, bpf_cgroup_attach_type_to_string(p->attach_type), p->bpffs_path);
+
+ if (c->socket_bind_allow) {
+ fprintf(f, "%sSocketBindAllow: ", prefix);
+ cgroup_context_dump_socket_bind_items(c->socket_bind_allow, f);
+ fputc('\n', f);
+ }
+
+ if (c->socket_bind_deny) {
+ fprintf(f, "%sSocketBindDeny: ", prefix);
+ cgroup_context_dump_socket_bind_items(c->socket_bind_deny, f);
+ fputc('\n', f);
+ }
+
+ if (c->restrict_network_interfaces) {
+ char *iface;
+ SET_FOREACH(iface, c->restrict_network_interfaces)
+ fprintf(f, "%sRestrictNetworkInterfaces: %s\n", prefix, iface);
+ }
+}
+
+void cgroup_context_dump_socket_bind_item(const CGroupSocketBindItem *item, FILE *f) {
+ const char *family, *colon1, *protocol = "", *colon2 = "";
+
+ family = strempty(af_to_ipv4_ipv6(item->address_family));
+ colon1 = isempty(family) ? "" : ":";
+
+ if (item->ip_protocol != 0) {
+ protocol = ip_protocol_to_tcp_udp(item->ip_protocol);
+ colon2 = ":";
+ }
+
+ if (item->nr_ports == 0)
+ fprintf(f, "%s%s%s%sany", family, colon1, protocol, colon2);
+ else if (item->nr_ports == 1)
+ fprintf(f, "%s%s%s%s%" PRIu16, family, colon1, protocol, colon2, item->port_min);
+ else {
+ uint16_t port_max = item->port_min + item->nr_ports - 1;
+ fprintf(f, "%s%s%s%s%" PRIu16 "-%" PRIu16, family, colon1, protocol, colon2,
+ item->port_min, port_max);
+ }
+}
+
+void cgroup_context_dump_socket_bind_items(const CGroupSocketBindItem *items, FILE *f) {
+ bool first = true;
+
+ LIST_FOREACH(socket_bind_items, bi, items) {
+ if (first)
+ first = false;
+ else
+ fputc(' ', f);
+
+ cgroup_context_dump_socket_bind_item(bi, f);
+ }
+}
+
+int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode) {
+ _cleanup_free_ CGroupDeviceAllow *a = NULL;
+ _cleanup_free_ char *d = NULL;
+
+ assert(c);
+ assert(dev);
+ assert(isempty(mode) || in_charset(mode, "rwm"));
+
+ a = new(CGroupDeviceAllow, 1);
+ if (!a)
+ return -ENOMEM;
+
+ d = strdup(dev);
+ if (!d)
+ return -ENOMEM;
+
+ *a = (CGroupDeviceAllow) {
+ .path = TAKE_PTR(d),
+ .r = isempty(mode) || strchr(mode, 'r'),
+ .w = isempty(mode) || strchr(mode, 'w'),
+ .m = isempty(mode) || strchr(mode, 'm'),
+ };
+
+ LIST_PREPEND(device_allow, c->device_allow, a);
+ TAKE_PTR(a);
+
+ return 0;
+}
+
+int cgroup_add_bpf_foreign_program(CGroupContext *c, uint32_t attach_type, const char *bpffs_path) {
+ CGroupBPFForeignProgram *p;
+ _cleanup_free_ char *d = NULL;
+
+ assert(c);
+ assert(bpffs_path);
+
+ if (!path_is_normalized(bpffs_path) || !path_is_absolute(bpffs_path))
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Path is not normalized: %m");
+
+ d = strdup(bpffs_path);
+ if (!d)
+ return log_oom();
+
+ p = new(CGroupBPFForeignProgram, 1);
+ if (!p)
+ return log_oom();
+
+ *p = (CGroupBPFForeignProgram) {
+ .attach_type = attach_type,
+ .bpffs_path = TAKE_PTR(d),
+ };
+
+ LIST_PREPEND(programs, c->bpf_foreign_programs, TAKE_PTR(p));
+
+ return 0;
+}
+
+#define UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(entry) \
+ uint64_t unit_get_ancestor_##entry(Unit *u) { \
+ CGroupContext *c; \
+ \
+ /* 1. Is entry set in this unit? If so, use that. \
+ * 2. Is the default for this entry set in any \
+ * ancestor? If so, use that. \
+ * 3. Otherwise, return CGROUP_LIMIT_MIN. */ \
+ \
+ assert(u); \
+ \
+ c = unit_get_cgroup_context(u); \
+ if (c && c->entry##_set) \
+ return c->entry; \
+ \
+ while ((u = UNIT_GET_SLICE(u))) { \
+ c = unit_get_cgroup_context(u); \
+ if (c && c->default_##entry##_set) \
+ return c->default_##entry; \
+ } \
+ \
+ /* We've reached the root, but nobody had default for \
+ * this entry set, so set it to the kernel default. */ \
+ return CGROUP_LIMIT_MIN; \
+}
+
+UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_low);
+UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_min);
+
+static void unit_set_xattr_graceful(Unit *u, const char *cgroup_path, const char *name, const void *data, size_t size) {
+ int r;
+
+ assert(u);
+ assert(name);
+
+ if (!cgroup_path) {
+ if (!u->cgroup_path)
+ return;
+
+ cgroup_path = u->cgroup_path;
+ }
+
+ r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, name, data, size, 0);
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to set '%s' xattr on control group %s, ignoring: %m", name, empty_to_root(cgroup_path));
+}
+
+static void unit_remove_xattr_graceful(Unit *u, const char *cgroup_path, const char *name) {
+ int r;
+
+ assert(u);
+ assert(name);
+
+ if (!cgroup_path) {
+ if (!u->cgroup_path)
+ return;
+
+ cgroup_path = u->cgroup_path;
+ }
+
+ r = cg_remove_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, name);
+ if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
+ log_unit_debug_errno(u, r, "Failed to remove '%s' xattr flag on control group %s, ignoring: %m", name, empty_to_root(cgroup_path));
+}
+
+void cgroup_oomd_xattr_apply(Unit *u, const char *cgroup_path) {
+ CGroupContext *c;
+
+ assert(u);
+
+ c = unit_get_cgroup_context(u);
+ if (!c)
+ return;
+
+ if (c->moom_preference == MANAGED_OOM_PREFERENCE_OMIT)
+ unit_set_xattr_graceful(u, cgroup_path, "user.oomd_omit", "1", 1);
+
+ if (c->moom_preference == MANAGED_OOM_PREFERENCE_AVOID)
+ unit_set_xattr_graceful(u, cgroup_path, "user.oomd_avoid", "1", 1);
+
+ if (c->moom_preference != MANAGED_OOM_PREFERENCE_AVOID)
+ unit_remove_xattr_graceful(u, cgroup_path, "user.oomd_avoid");
+
+ if (c->moom_preference != MANAGED_OOM_PREFERENCE_OMIT)
+ unit_remove_xattr_graceful(u, cgroup_path, "user.oomd_omit");
+}
+
+static void cgroup_xattr_apply(Unit *u) {
+ bool b;
+
+ assert(u);
+
+ /* The 'user.*' xattrs can be set from a user manager. */
+ cgroup_oomd_xattr_apply(u, u->cgroup_path);
+
+ if (!MANAGER_IS_SYSTEM(u->manager))
+ return;
+
+ b = !sd_id128_is_null(u->invocation_id);
+ FOREACH_STRING(xn, "trusted.invocation_id", "user.invocation_id") {
+ if (b)
+ unit_set_xattr_graceful(u, NULL, xn, SD_ID128_TO_STRING(u->invocation_id), 32);
+ else
+ unit_remove_xattr_graceful(u, NULL, xn);
+ }
+
+ /* Indicate on the cgroup whether delegation is on, via an xattr. This is best-effort, as old kernels
+ * didn't support xattrs on cgroups at all. Later they got support for setting 'trusted.*' xattrs,
+ * and even later 'user.*' xattrs. We started setting this field when 'trusted.*' was added, and
+ * given this is now pretty much API, let's continue to support that. But also set 'user.*' as well,
+ * since it is readable by any user, not just CAP_SYS_ADMIN. This hence comes with slightly weaker
+ * security (as users who got delegated cgroups could turn it off if they like), but this shouldn't
+ * be a big problem given this communicates delegation state to clients, but the manager never reads
+ * it. */
+ b = unit_cgroup_delegate(u);
+ FOREACH_STRING(xn, "trusted.delegate", "user.delegate") {
+ if (b)
+ unit_set_xattr_graceful(u, NULL, xn, "1", 1);
+ else
+ unit_remove_xattr_graceful(u, NULL, xn);
+ }
+}
+
+static int lookup_block_device(const char *p, dev_t *ret) {
+ dev_t rdev, dev = 0;
+ mode_t mode;
+ int r;
+
+ assert(p);
+ assert(ret);
+
+ r = device_path_parse_major_minor(p, &mode, &rdev);
+ if (r == -ENODEV) { /* not a parsable device node, need to go to disk */
+ struct stat st;
+
+ if (stat(p, &st) < 0)
+ return log_warning_errno(errno, "Couldn't stat device '%s': %m", p);
+
+ mode = st.st_mode;
+ rdev = st.st_rdev;
+ dev = st.st_dev;
+ } else if (r < 0)
+ return log_warning_errno(r, "Failed to parse major/minor from path '%s': %m", p);
+
+ if (S_ISCHR(mode))
+ return log_warning_errno(SYNTHETIC_ERRNO(ENOTBLK),
+ "Device node '%s' is a character device, but block device needed.", p);
+ if (S_ISBLK(mode))
+ *ret = rdev;
+ else if (major(dev) != 0)
+ *ret = dev; /* If this is not a device node then use the block device this file is stored on */
+ else {
+ /* If this is btrfs, getting the backing block device is a bit harder */
+ r = btrfs_get_block_device(p, ret);
+ if (r == -ENOTTY)
+ return log_warning_errno(SYNTHETIC_ERRNO(ENODEV),
+ "'%s' is not a block device node, and file system block device cannot be determined or is not local.", p);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to determine block device backing btrfs file system '%s': %m", p);
+ }
+
+ /* If this is a LUKS/DM device, recursively try to get the originating block device */
+ while (block_get_originating(*ret, ret) > 0);
+
+ /* If this is a partition, try to get the originating block device */
+ (void) block_get_whole_disk(*ret, ret);
+ return 0;
+}
+
+static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
+ return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
+ c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
+}
+
+static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
+ return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
+ c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
+}
+
+static bool cgroup_context_has_allowed_cpus(CGroupContext *c) {
+ return c->cpuset_cpus.set || c->startup_cpuset_cpus.set;
+}
+
+static bool cgroup_context_has_allowed_mems(CGroupContext *c) {
+ return c->cpuset_mems.set || c->startup_cpuset_mems.set;
+}
+
+static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
+ if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
+ c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
+ return c->startup_cpu_weight;
+ else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
+ return c->cpu_weight;
+ else
+ return CGROUP_WEIGHT_DEFAULT;
+}
+
+static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
+ if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
+ c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
+ return c->startup_cpu_shares;
+ else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
+ return c->cpu_shares;
+ else
+ return CGROUP_CPU_SHARES_DEFAULT;
+}
+
+static CPUSet *cgroup_context_allowed_cpus(CGroupContext *c, ManagerState state) {
+ if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
+ c->startup_cpuset_cpus.set)
+ return &c->startup_cpuset_cpus;
+ else
+ return &c->cpuset_cpus;
+}
+
+static CPUSet *cgroup_context_allowed_mems(CGroupContext *c, ManagerState state) {
+ if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
+ c->startup_cpuset_mems.set)
+ return &c->startup_cpuset_mems;
+ else
+ return &c->cpuset_mems;
+}
+
+usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period) {
+ /* kernel uses a minimum resolution of 1ms, so both period and (quota * period)
+ * need to be higher than that boundary. quota is specified in USecPerSec.
+ * Additionally, period must be at most max_period. */
+ assert(quota > 0);
+
+ return MIN(MAX3(period, resolution, resolution * USEC_PER_SEC / quota), max_period);
+}
+
+static usec_t cgroup_cpu_adjust_period_and_log(Unit *u, usec_t period, usec_t quota) {
+ usec_t new_period;
+
+ if (quota == USEC_INFINITY)
+ /* Always use default period for infinity quota. */
+ return CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;
+
+ if (period == USEC_INFINITY)
+ /* Default period was requested. */
+ period = CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;
+
+ /* Clamp to interval [1ms, 1s] */
+ new_period = cgroup_cpu_adjust_period(period, quota, USEC_PER_MSEC, USEC_PER_SEC);
+
+ if (new_period != period) {
+ log_unit_full(u, u->warned_clamping_cpu_quota_period ? LOG_DEBUG : LOG_WARNING,
+ "Clamping CPU interval for cpu.max: period is now %s",
+ FORMAT_TIMESPAN(new_period, 1));
+ u->warned_clamping_cpu_quota_period = true;
+ }
+
+ return new_period;
+}
+
+static void cgroup_apply_unified_cpu_weight(Unit *u, uint64_t weight) {
+ char buf[DECIMAL_STR_MAX(uint64_t) + 2];
+
+ if (weight == CGROUP_WEIGHT_IDLE)
+ return;
+ xsprintf(buf, "%" PRIu64 "\n", weight);
+ (void) set_attribute_and_warn(u, "cpu", "cpu.weight", buf);
+}
+
+static void cgroup_apply_unified_cpu_idle(Unit *u, uint64_t weight) {
+ int r;
+ bool is_idle;
+ const char *idle_val;
+
+ is_idle = weight == CGROUP_WEIGHT_IDLE;
+ idle_val = one_zero(is_idle);
+ r = cg_set_attribute("cpu", u->cgroup_path, "cpu.idle", idle_val);
+ if (r < 0 && (r != -ENOENT || is_idle))
+ log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to set '%s' attribute on '%s' to '%s': %m",
+ "cpu.idle", empty_to_root(u->cgroup_path), idle_val);
+}
+
+static void cgroup_apply_unified_cpu_quota(Unit *u, usec_t quota, usec_t period) {
+ char buf[(DECIMAL_STR_MAX(usec_t) + 1) * 2 + 1];
+
+ period = cgroup_cpu_adjust_period_and_log(u, period, quota);
+ if (quota != USEC_INFINITY)
+ xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
+ MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC), period);
+ else
+ xsprintf(buf, "max " USEC_FMT "\n", period);
+ (void) set_attribute_and_warn(u, "cpu", "cpu.max", buf);
+}
+
+static void cgroup_apply_legacy_cpu_shares(Unit *u, uint64_t shares) {
+ char buf[DECIMAL_STR_MAX(uint64_t) + 2];
+
+ xsprintf(buf, "%" PRIu64 "\n", shares);
+ (void) set_attribute_and_warn(u, "cpu", "cpu.shares", buf);
+}
+
+static void cgroup_apply_legacy_cpu_quota(Unit *u, usec_t quota, usec_t period) {
+ char buf[DECIMAL_STR_MAX(usec_t) + 2];
+
+ period = cgroup_cpu_adjust_period_and_log(u, period, quota);
+
+ xsprintf(buf, USEC_FMT "\n", period);
+ (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_period_us", buf);
+
+ if (quota != USEC_INFINITY) {
+ xsprintf(buf, USEC_FMT "\n", MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC));
+ (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", buf);
+ } else
+ (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", "-1\n");
+}
+
+static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
+ return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
+ CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
+}
+
+static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
+ /* we don't support idle in cgroupv1 */
+ if (weight == CGROUP_WEIGHT_IDLE)
+ return CGROUP_CPU_SHARES_MIN;
+
+ return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
+ CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
+}
+
+static void cgroup_apply_unified_cpuset(Unit *u, const CPUSet *cpus, const char *name) {
+ _cleanup_free_ char *buf = NULL;
+
+ buf = cpu_set_to_range_string(cpus);
+ if (!buf) {
+ log_oom();
+ return;
+ }
+
+ (void) set_attribute_and_warn(u, "cpuset", name, buf);
+}
+
+static bool cgroup_context_has_io_config(CGroupContext *c) {
+ return c->io_accounting ||
+ c->io_weight != CGROUP_WEIGHT_INVALID ||
+ c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
+ c->io_device_weights ||
+ c->io_device_latencies ||
+ c->io_device_limits;
+}
+
+static bool cgroup_context_has_blockio_config(CGroupContext *c) {
+ return c->blockio_accounting ||
+ c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
+ c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
+ c->blockio_device_weights ||
+ c->blockio_device_bandwidths;
+}
+
+static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
+ if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
+ c->startup_io_weight != CGROUP_WEIGHT_INVALID)
+ return c->startup_io_weight;
+ if (c->io_weight != CGROUP_WEIGHT_INVALID)
+ return c->io_weight;
+ return CGROUP_WEIGHT_DEFAULT;
+}
+
+static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
+ if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
+ c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
+ return c->startup_blockio_weight;
+ if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
+ return c->blockio_weight;
+ return CGROUP_BLKIO_WEIGHT_DEFAULT;
+}
+
+static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
+ return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
+ CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
+}
+
+static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
+ return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
+ CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
+}
+
+static int set_bfq_weight(Unit *u, const char *controller, dev_t dev, uint64_t io_weight) {
+ static const char * const prop_names[] = {
+ "IOWeight",
+ "BlockIOWeight",
+ "IODeviceWeight",
+ "BlockIODeviceWeight",
+ };
+ static bool warned = false;
+ char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+STRLEN("\n")];
+ const char *p;
+ uint64_t bfq_weight;
+ int r;
+
+ /* FIXME: drop this function when distro kernels properly support BFQ through "io.weight"
+ * See also: https://github.com/systemd/systemd/pull/13335 and
+ * https://github.com/torvalds/linux/commit/65752aef0a407e1ef17ec78a7fc31ba4e0b360f9. */
+ p = strjoina(controller, ".bfq.weight");
+ /* Adjust to kernel range is 1..1000, the default is 100. */
+ bfq_weight = BFQ_WEIGHT(io_weight);
+
+ if (major(dev) > 0)
+ xsprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), bfq_weight);
+ else
+ xsprintf(buf, "%" PRIu64 "\n", bfq_weight);
+
+ r = cg_set_attribute(controller, u->cgroup_path, p, buf);
+
+ /* FIXME: drop this when kernels prior
+ * 795fe54c2a82 ("bfq: Add per-device weight") v5.4
+ * are not interesting anymore. Old kernels will fail with EINVAL, while new kernels won't return
+ * EINVAL on properly formatted input by us. Treat EINVAL accordingly. */
+ if (r == -EINVAL && major(dev) > 0) {
+ if (!warned) {
+ log_unit_warning(u, "Kernel version does not accept per-device setting in %s.", p);
+ warned = true;
+ }
+ r = -EOPNOTSUPP; /* mask as unconfigured device */
+ } else if (r >= 0 && io_weight != bfq_weight)
+ log_unit_debug(u, "%s=%" PRIu64 " scaled to %s=%" PRIu64,
+ prop_names[2*(major(dev) > 0) + streq(controller, "blkio")],
+ io_weight, p, bfq_weight);
+ return r;
+}
+
+static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
+ char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
+ dev_t dev;
+ int r, r1, r2;
+
+ if (lookup_block_device(dev_path, &dev) < 0)
+ return;
+
+ r1 = set_bfq_weight(u, "io", dev, io_weight);
+
+ xsprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), io_weight);
+ r2 = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
+
+ /* Look at the configured device, when both fail, prefer io.weight errno. */
+ r = r2 == -EOPNOTSUPP ? r1 : r2;
+
+ if (r < 0)
+ log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r),
+ r, "Failed to set 'io[.bfq].weight' attribute on '%s' to '%.*s': %m",
+ empty_to_root(u->cgroup_path), (int) strcspn(buf, NEWLINE), buf);
+}
+
+static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
+ char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
+ dev_t dev;
+ int r;
+
+ r = lookup_block_device(dev_path, &dev);
+ if (r < 0)
+ return;
+
+ xsprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), blkio_weight);
+ (void) set_attribute_and_warn(u, "blkio", "blkio.weight_device", buf);
+}
+
+static void cgroup_apply_io_device_latency(Unit *u, const char *dev_path, usec_t target) {
+ char buf[DECIMAL_STR_MAX(dev_t)*2+2+7+DECIMAL_STR_MAX(uint64_t)+1];
+ dev_t dev;
+ int r;
+
+ r = lookup_block_device(dev_path, &dev);
+ if (r < 0)
+ return;
+
+ if (target != USEC_INFINITY)
+ xsprintf(buf, DEVNUM_FORMAT_STR " target=%" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), target);
+ else
+ xsprintf(buf, DEVNUM_FORMAT_STR " target=max\n", DEVNUM_FORMAT_VAL(dev));
+
+ (void) set_attribute_and_warn(u, "io", "io.latency", buf);
+}
+
+static void cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
+ char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)],
+ buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
+ dev_t dev;
+
+ if (lookup_block_device(dev_path, &dev) < 0)
+ return;
+
+ for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
+ if (limits[type] != cgroup_io_limit_defaults[type])
+ xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
+ else
+ xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
+
+ xsprintf(buf, DEVNUM_FORMAT_STR " rbps=%s wbps=%s riops=%s wiops=%s\n", DEVNUM_FORMAT_VAL(dev),
+ limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
+ limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
+ (void) set_attribute_and_warn(u, "io", "io.max", buf);
+}
+
+static void cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
+ char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
+ dev_t dev;
+
+ if (lookup_block_device(dev_path, &dev) < 0)
+ return;
+
+ sprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), rbps);
+ (void) set_attribute_and_warn(u, "blkio", "blkio.throttle.read_bps_device", buf);
+
+ sprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), wbps);
+ (void) set_attribute_and_warn(u, "blkio", "blkio.throttle.write_bps_device", buf);
+}
+
+static bool unit_has_unified_memory_config(Unit *u) {
+ CGroupContext *c;
+
+ assert(u);
+
+ assert_se(c = unit_get_cgroup_context(u));
+
+ return unit_get_ancestor_memory_min(u) > 0 || unit_get_ancestor_memory_low(u) > 0 ||
+ c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX ||
+ c->memory_swap_max != CGROUP_LIMIT_MAX;
+}
+
+static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
+ char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max\n";
+
+ if (v != CGROUP_LIMIT_MAX)
+ xsprintf(buf, "%" PRIu64 "\n", v);
+
+ (void) set_attribute_and_warn(u, "memory", file, buf);
+}
+
+static void cgroup_apply_firewall(Unit *u) {
+ assert(u);
+
+ /* Best-effort: let's apply IP firewalling and/or accounting if that's enabled */
+
+ if (bpf_firewall_compile(u) < 0)
+ return;
+
+ (void) bpf_firewall_load_custom(u);
+ (void) bpf_firewall_install(u);
+}
+
+static void cgroup_apply_socket_bind(Unit *u) {
+ assert(u);
+
+ (void) bpf_socket_bind_install(u);
+}
+
+static void cgroup_apply_restrict_network_interfaces(Unit *u) {
+ assert(u);
+
+ (void) restrict_network_interfaces_install(u);
+}
+
+static int cgroup_apply_devices(Unit *u) {
+ _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
+ const char *path;
+ CGroupContext *c;
+ CGroupDevicePolicy policy;
+ int r;
+
+ assert_se(c = unit_get_cgroup_context(u));
+ assert_se(path = u->cgroup_path);
+
+ policy = c->device_policy;
+
+ if (cg_all_unified() > 0) {
+ r = bpf_devices_cgroup_init(&prog, policy, c->device_allow);
+ if (r < 0)
+ return log_unit_warning_errno(u, r, "Failed to initialize device control bpf program: %m");
+
+ } else {
+ /* Changing the devices list of a populated cgroup might result in EINVAL, hence ignore
+ * EINVAL here. */
+
+ if (c->device_allow || policy != CGROUP_DEVICE_POLICY_AUTO)
+ r = cg_set_attribute("devices", path, "devices.deny", "a");
+ else
+ r = cg_set_attribute("devices", path, "devices.allow", "a");
+ if (r < 0)
+ log_unit_full_errno(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to reset devices.allow/devices.deny: %m");
+ }
+
+ bool allow_list_static = policy == CGROUP_DEVICE_POLICY_CLOSED ||
+ (policy == CGROUP_DEVICE_POLICY_AUTO && c->device_allow);
+ if (allow_list_static)
+ (void) bpf_devices_allow_list_static(prog, path);
+
+ bool any = allow_list_static;
+ LIST_FOREACH(device_allow, a, c->device_allow) {
+ char acc[4], *val;
+ unsigned k = 0;
+
+ if (a->r)
+ acc[k++] = 'r';
+ if (a->w)
+ acc[k++] = 'w';
+ if (a->m)
+ acc[k++] = 'm';
+ if (k == 0)
+ continue;
+ acc[k++] = 0;
+
+ if (path_startswith(a->path, "/dev/"))
+ r = bpf_devices_allow_list_device(prog, path, a->path, acc);
+ else if ((val = startswith(a->path, "block-")))
+ r = bpf_devices_allow_list_major(prog, path, val, 'b', acc);
+ else if ((val = startswith(a->path, "char-")))
+ r = bpf_devices_allow_list_major(prog, path, val, 'c', acc);
+ else {
+ log_unit_debug(u, "Ignoring device '%s' while writing cgroup attribute.", a->path);
+ continue;
+ }
+
+ if (r >= 0)
+ any = true;
+ }
+
+ if (prog && !any) {
+ log_unit_warning_errno(u, SYNTHETIC_ERRNO(ENODEV), "No devices matched by device filter.");
+
+ /* The kernel verifier would reject a program we would build with the normal intro and outro
+ but no allow-listing rules (outro would contain an unreachable instruction for successful
+ return). */
+ policy = CGROUP_DEVICE_POLICY_STRICT;
+ }
+
+ r = bpf_devices_apply_policy(&prog, policy, any, path, &u->bpf_device_control_installed);
+ if (r < 0) {
+ static bool warned = false;
+
+ log_full_errno(warned ? LOG_DEBUG : LOG_WARNING, r,
+ "Unit %s configures device ACL, but the local system doesn't seem to support the BPF-based device controller.\n"
+ "Proceeding WITHOUT applying ACL (all devices will be accessible)!\n"
+ "(This warning is only shown for the first loaded unit using device ACL.)", u->id);
+
+ warned = true;
+ }
+ return r;
+}
+
+static void set_io_weight(Unit *u, uint64_t weight) {
+ char buf[STRLEN("default \n")+DECIMAL_STR_MAX(uint64_t)];
+
+ assert(u);
+
+ (void) set_bfq_weight(u, "io", makedev(0, 0), weight);
+
+ xsprintf(buf, "default %" PRIu64 "\n", weight);
+ (void) set_attribute_and_warn(u, "io", "io.weight", buf);
+}
+
+static void set_blkio_weight(Unit *u, uint64_t weight) {
+ char buf[STRLEN("\n")+DECIMAL_STR_MAX(uint64_t)];
+
+ assert(u);
+
+ (void) set_bfq_weight(u, "blkio", makedev(0, 0), weight);
+
+ xsprintf(buf, "%" PRIu64 "\n", weight);
+ (void) set_attribute_and_warn(u, "blkio", "blkio.weight", buf);
+}
+
+static void cgroup_apply_bpf_foreign_program(Unit *u) {
+ assert(u);
+
+ (void) bpf_foreign_install(u);
+}
+
+static void cgroup_context_apply(
+ Unit *u,
+ CGroupMask apply_mask,
+ ManagerState state) {
+
+ const char *path;
+ CGroupContext *c;
+ bool is_host_root, is_local_root;
+ int r;
+
+ assert(u);
+
+ /* Nothing to do? Exit early! */
+ if (apply_mask == 0)
+ return;
+
+ /* Some cgroup attributes are not supported on the host root cgroup, hence silently ignore them here. And other
+ * attributes should only be managed for cgroups further down the tree. */
+ is_local_root = unit_has_name(u, SPECIAL_ROOT_SLICE);
+ is_host_root = unit_has_host_root_cgroup(u);
+
+ assert_se(c = unit_get_cgroup_context(u));
+ assert_se(path = u->cgroup_path);
+
+ if (is_local_root) /* Make sure we don't try to display messages with an empty path. */
+ path = "/";
+
+ /* We generally ignore errors caused by read-only mounted cgroup trees (assuming we are running in a container
+ * then), and missing cgroups, i.e. EROFS and ENOENT. */
+
+ /* In fully unified mode these attributes don't exist on the host cgroup root. On legacy the weights exist, but
+ * setting the weight makes very little sense on the host root cgroup, as there are no other cgroups at this
+ * level. The quota exists there too, but any attempt to write to it is refused with EINVAL. Inside of
+ * containers we want to leave control of these to the container manager (and if cgroup v2 delegation is used
+ * we couldn't even write to them if we wanted to). */
+ if ((apply_mask & CGROUP_MASK_CPU) && !is_local_root) {
+
+ if (cg_all_unified() > 0) {
+ uint64_t weight;
+
+ if (cgroup_context_has_cpu_weight(c))
+ weight = cgroup_context_cpu_weight(c, state);
+ else if (cgroup_context_has_cpu_shares(c)) {
+ uint64_t shares;
+
+ shares = cgroup_context_cpu_shares(c, state);
+ weight = cgroup_cpu_shares_to_weight(shares);
+
+ log_cgroup_compat(u, "Applying [Startup]CPUShares=%" PRIu64 " as [Startup]CPUWeight=%" PRIu64 " on %s",
+ shares, weight, path);
+ } else
+ weight = CGROUP_WEIGHT_DEFAULT;
+
+ cgroup_apply_unified_cpu_idle(u, weight);
+ cgroup_apply_unified_cpu_weight(u, weight);
+ cgroup_apply_unified_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec);
+
+ } else {
+ uint64_t shares;
+
+ if (cgroup_context_has_cpu_weight(c)) {
+ uint64_t weight;
+
+ weight = cgroup_context_cpu_weight(c, state);
+ shares = cgroup_cpu_weight_to_shares(weight);
+
+ log_cgroup_compat(u, "Applying [Startup]CPUWeight=%" PRIu64 " as [Startup]CPUShares=%" PRIu64 " on %s",
+ weight, shares, path);
+ } else if (cgroup_context_has_cpu_shares(c))
+ shares = cgroup_context_cpu_shares(c, state);
+ else
+ shares = CGROUP_CPU_SHARES_DEFAULT;
+
+ cgroup_apply_legacy_cpu_shares(u, shares);
+ cgroup_apply_legacy_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec);
+ }
+ }
+
+ if ((apply_mask & CGROUP_MASK_CPUSET) && !is_local_root) {
+ cgroup_apply_unified_cpuset(u, cgroup_context_allowed_cpus(c, state), "cpuset.cpus");
+ cgroup_apply_unified_cpuset(u, cgroup_context_allowed_mems(c, state), "cpuset.mems");
+ }
+
+ /* The 'io' controller attributes are not exported on the host's root cgroup (being a pure cgroup v2
+ * controller), and in case of containers we want to leave control of these attributes to the container manager
+ * (and we couldn't access that stuff anyway, even if we tried if proper delegation is used). */
+ if ((apply_mask & CGROUP_MASK_IO) && !is_local_root) {
+ bool has_io, has_blockio;
+ uint64_t weight;
+
+ has_io = cgroup_context_has_io_config(c);
+ has_blockio = cgroup_context_has_blockio_config(c);
+
+ if (has_io)
+ weight = cgroup_context_io_weight(c, state);
+ else if (has_blockio) {
+ uint64_t blkio_weight;
+
+ blkio_weight = cgroup_context_blkio_weight(c, state);
+ weight = cgroup_weight_blkio_to_io(blkio_weight);
+
+ log_cgroup_compat(u, "Applying [Startup]BlockIOWeight=%" PRIu64 " as [Startup]IOWeight=%" PRIu64,
+ blkio_weight, weight);
+ } else
+ weight = CGROUP_WEIGHT_DEFAULT;
+
+ set_io_weight(u, weight);
+
+ if (has_io) {
+ LIST_FOREACH(device_weights, w, c->io_device_weights)
+ cgroup_apply_io_device_weight(u, w->path, w->weight);
+
+ LIST_FOREACH(device_limits, limit, c->io_device_limits)
+ cgroup_apply_io_device_limit(u, limit->path, limit->limits);
+
+ LIST_FOREACH(device_latencies, latency, c->io_device_latencies)
+ cgroup_apply_io_device_latency(u, latency->path, latency->target_usec);
+
+ } else if (has_blockio) {
+ LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
+ weight = cgroup_weight_blkio_to_io(w->weight);
+
+ log_cgroup_compat(u, "Applying BlockIODeviceWeight=%" PRIu64 " as IODeviceWeight=%" PRIu64 " for %s",
+ w->weight, weight, w->path);
+
+ cgroup_apply_io_device_weight(u, w->path, weight);
+ }
+
+ LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
+ uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
+
+ for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
+ limits[type] = cgroup_io_limit_defaults[type];
+
+ limits[CGROUP_IO_RBPS_MAX] = b->rbps;
+ limits[CGROUP_IO_WBPS_MAX] = b->wbps;
+
+ log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax= for %s",
+ b->rbps, b->wbps, b->path);
+
+ cgroup_apply_io_device_limit(u, b->path, limits);
+ }
+ }
+ }
+
+ if (apply_mask & CGROUP_MASK_BLKIO) {
+ bool has_io, has_blockio;
+
+ has_io = cgroup_context_has_io_config(c);
+ has_blockio = cgroup_context_has_blockio_config(c);
+
+ /* Applying a 'weight' never makes sense for the host root cgroup, and for containers this should be
+ * left to our container manager, too. */
+ if (!is_local_root) {
+ uint64_t weight;
+
+ if (has_io) {
+ uint64_t io_weight;
+
+ io_weight = cgroup_context_io_weight(c, state);
+ weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state));
+
+ log_cgroup_compat(u, "Applying [Startup]IOWeight=%" PRIu64 " as [Startup]BlockIOWeight=%" PRIu64,
+ io_weight, weight);
+ } else if (has_blockio)
+ weight = cgroup_context_blkio_weight(c, state);
+ else
+ weight = CGROUP_BLKIO_WEIGHT_DEFAULT;
+
+ set_blkio_weight(u, weight);
+
+ if (has_io)
+ LIST_FOREACH(device_weights, w, c->io_device_weights) {
+ weight = cgroup_weight_io_to_blkio(w->weight);
+
+ log_cgroup_compat(u, "Applying IODeviceWeight=%" PRIu64 " as BlockIODeviceWeight=%" PRIu64 " for %s",
+ w->weight, weight, w->path);
+
+ cgroup_apply_blkio_device_weight(u, w->path, weight);
+ }
+ else if (has_blockio)
+ LIST_FOREACH(device_weights, w, c->blockio_device_weights)
+ cgroup_apply_blkio_device_weight(u, w->path, w->weight);
+ }
+
+ /* The bandwidth limits are something that make sense to be applied to the host's root but not container
+ * roots, as there we want the container manager to handle it */
+ if (is_host_root || !is_local_root) {
+ if (has_io)
+ LIST_FOREACH(device_limits, l, c->io_device_limits) {
+ log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax= for %s",
+ l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
+
+ cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]);
+ }
+ else if (has_blockio)
+ LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths)
+ cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps);
+ }
+ }
+
+ /* In unified mode 'memory' attributes do not exist on the root cgroup. In legacy mode 'memory.limit_in_bytes'
+ * exists on the root cgroup, but any writes to it are refused with EINVAL. And if we run in a container we
+ * want to leave control to the container manager (and if proper cgroup v2 delegation is used we couldn't even
+ * write to this if we wanted to.) */
+ if ((apply_mask & CGROUP_MASK_MEMORY) && !is_local_root) {
+
+ if (cg_all_unified() > 0) {
+ uint64_t max, swap_max = CGROUP_LIMIT_MAX;
+
+ if (unit_has_unified_memory_config(u)) {
+ max = c->memory_max;
+ swap_max = c->memory_swap_max;
+ } else {
+ max = c->memory_limit;
+
+ if (max != CGROUP_LIMIT_MAX)
+ log_cgroup_compat(u, "Applying MemoryLimit=%" PRIu64 " as MemoryMax=", max);
+ }
+
+ cgroup_apply_unified_memory_limit(u, "memory.min", unit_get_ancestor_memory_min(u));
+ cgroup_apply_unified_memory_limit(u, "memory.low", unit_get_ancestor_memory_low(u));
+ cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
+ cgroup_apply_unified_memory_limit(u, "memory.max", max);
+ cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
+
+ (void) set_attribute_and_warn(u, "memory", "memory.oom.group", one_zero(c->memory_oom_group));
+
+ } else {
+ char buf[DECIMAL_STR_MAX(uint64_t) + 1];
+ uint64_t val;
+
+ if (unit_has_unified_memory_config(u)) {
+ val = c->memory_max;
+ log_cgroup_compat(u, "Applying MemoryMax=%" PRIu64 " as MemoryLimit=", val);
+ } else
+ val = c->memory_limit;
+
+ if (val == CGROUP_LIMIT_MAX)
+ strncpy(buf, "-1\n", sizeof(buf));
+ else
+ xsprintf(buf, "%" PRIu64 "\n", val);
+
+ (void) set_attribute_and_warn(u, "memory", "memory.limit_in_bytes", buf);
+ }
+ }
+
+ /* On cgroup v2 we can apply BPF everywhere. On cgroup v1 we apply it everywhere except for the root of
+ * containers, where we leave this to the manager */
+ if ((apply_mask & (CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES)) &&
+ (is_host_root || cg_all_unified() > 0 || !is_local_root))
+ (void) cgroup_apply_devices(u);
+
+ if (apply_mask & CGROUP_MASK_PIDS) {
+
+ if (is_host_root) {
+ /* So, the "pids" controller does not expose anything on the root cgroup, in order not to
+ * replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when
+ * the knobs of the root cgroup are modified propagate this to the relevant sysctls. There's a
+ * non-obvious asymmetry however: unlike the cgroup properties we don't really want to take
+ * exclusive ownership of the sysctls, but we still want to honour things if the user sets
+ * limits. Hence we employ sort of a one-way strategy: when the user sets a bounded limit
+ * through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded)
+ * it also counts. But if the user never set a limit through us (i.e. we are the default of
+ * "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on
+ * the first time we set a limit. Note that this boolean is flushed out on manager reload,
+ * which is desirable so that there's an official way to release control of the sysctl from
+ * systemd: set the limit to unbounded and reload. */
+
+ if (tasks_max_isset(&c->tasks_max)) {
+ u->manager->sysctl_pid_max_changed = true;
+ r = procfs_tasks_set_limit(tasks_max_resolve(&c->tasks_max));
+ } else if (u->manager->sysctl_pid_max_changed)
+ r = procfs_tasks_set_limit(TASKS_MAX);
+ else
+ r = 0;
+ if (r < 0)
+ log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r,
+ "Failed to write to tasks limit sysctls: %m");
+ }
+
+ /* The attribute itself is not available on the host root cgroup, and in the container case we want to
+ * leave it for the container manager. */
+ if (!is_local_root) {
+ if (tasks_max_isset(&c->tasks_max)) {
+ char buf[DECIMAL_STR_MAX(uint64_t) + 1];
+
+ xsprintf(buf, "%" PRIu64 "\n", tasks_max_resolve(&c->tasks_max));
+ (void) set_attribute_and_warn(u, "pids", "pids.max", buf);
+ } else
+ (void) set_attribute_and_warn(u, "pids", "pids.max", "max\n");
+ }
+ }
+
+ if (apply_mask & CGROUP_MASK_BPF_FIREWALL)
+ cgroup_apply_firewall(u);
+
+ if (apply_mask & CGROUP_MASK_BPF_FOREIGN)
+ cgroup_apply_bpf_foreign_program(u);
+
+ if (apply_mask & CGROUP_MASK_BPF_SOCKET_BIND)
+ cgroup_apply_socket_bind(u);
+
+ if (apply_mask & CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES)
+ cgroup_apply_restrict_network_interfaces(u);
+}
+
+static bool unit_get_needs_bpf_firewall(Unit *u) {
+ CGroupContext *c;
+ assert(u);
+
+ c = unit_get_cgroup_context(u);
+ if (!c)
+ return false;
+
+ if (c->ip_accounting ||
+ !set_isempty(c->ip_address_allow) ||
+ !set_isempty(c->ip_address_deny) ||
+ c->ip_filters_ingress ||
+ c->ip_filters_egress)
+ return true;
+
+ /* If any parent slice has an IP access list defined, it applies too */
+ for (Unit *p = UNIT_GET_SLICE(u); p; p = UNIT_GET_SLICE(p)) {
+ c = unit_get_cgroup_context(p);
+ if (!c)
+ return false;
+
+ if (!set_isempty(c->ip_address_allow) ||
+ !set_isempty(c->ip_address_deny))
+ return true;
+ }
+
+ return false;
+}
+
+static bool unit_get_needs_bpf_foreign_program(Unit *u) {
+ CGroupContext *c;
+ assert(u);
+
+ c = unit_get_cgroup_context(u);
+ if (!c)
+ return false;
+
+ return !!c->bpf_foreign_programs;
+}
+
+static bool unit_get_needs_socket_bind(Unit *u) {
+ CGroupContext *c;
+ assert(u);
+
+ c = unit_get_cgroup_context(u);
+ if (!c)
+ return false;
+
+ return c->socket_bind_allow || c->socket_bind_deny;
+}
+
+static bool unit_get_needs_restrict_network_interfaces(Unit *u) {
+ CGroupContext *c;
+ assert(u);
+
+ c = unit_get_cgroup_context(u);
+ if (!c)
+ return false;
+
+ return !set_isempty(c->restrict_network_interfaces);
+}
+
+static CGroupMask unit_get_cgroup_mask(Unit *u) {
+ CGroupMask mask = 0;
+ CGroupContext *c;
+
+ assert(u);
+
+ assert_se(c = unit_get_cgroup_context(u));
+
+ /* Figure out which controllers we need, based on the cgroup context object */
+
+ if (c->cpu_accounting)
+ mask |= get_cpu_accounting_mask();
+
+ if (cgroup_context_has_cpu_weight(c) ||
+ cgroup_context_has_cpu_shares(c) ||
+ c->cpu_quota_per_sec_usec != USEC_INFINITY)
+ mask |= CGROUP_MASK_CPU;
+
+ if (cgroup_context_has_allowed_cpus(c) || cgroup_context_has_allowed_mems(c))
+ mask |= CGROUP_MASK_CPUSET;
+
+ if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
+ mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
+
+ if (c->memory_accounting ||
+ c->memory_limit != CGROUP_LIMIT_MAX ||
+ unit_has_unified_memory_config(u))
+ mask |= CGROUP_MASK_MEMORY;
+
+ if (c->device_allow ||
+ c->device_policy != CGROUP_DEVICE_POLICY_AUTO)
+ mask |= CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES;
+
+ if (c->tasks_accounting ||
+ tasks_max_isset(&c->tasks_max))
+ mask |= CGROUP_MASK_PIDS;
+
+ return CGROUP_MASK_EXTEND_JOINED(mask);
+}
+
+static CGroupMask unit_get_bpf_mask(Unit *u) {
+ CGroupMask mask = 0;
+
+ /* Figure out which controllers we need, based on the cgroup context, possibly taking into account children
+ * too. */
+
+ if (unit_get_needs_bpf_firewall(u))
+ mask |= CGROUP_MASK_BPF_FIREWALL;
+
+ if (unit_get_needs_bpf_foreign_program(u))
+ mask |= CGROUP_MASK_BPF_FOREIGN;
+
+ if (unit_get_needs_socket_bind(u))
+ mask |= CGROUP_MASK_BPF_SOCKET_BIND;
+
+ if (unit_get_needs_restrict_network_interfaces(u))
+ mask |= CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES;
+
+ return mask;
+}
+
+CGroupMask unit_get_own_mask(Unit *u) {
+ CGroupContext *c;
+
+ /* Returns the mask of controllers the unit needs for itself. If a unit is not properly loaded, return an empty
+ * mask, as we shouldn't reflect it in the cgroup hierarchy then. */
+
+ if (u->load_state != UNIT_LOADED)
+ return 0;
+
+ c = unit_get_cgroup_context(u);
+ if (!c)
+ return 0;
+
+ return unit_get_cgroup_mask(u) | unit_get_bpf_mask(u) | unit_get_delegate_mask(u);
+}
+
+CGroupMask unit_get_delegate_mask(Unit *u) {
+ CGroupContext *c;
+
+ /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
+ * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
+ *
+ * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */
+
+ if (!unit_cgroup_delegate(u))
+ return 0;
+
+ if (cg_all_unified() <= 0) {
+ ExecContext *e;
+
+ e = unit_get_exec_context(u);
+ if (e && !exec_context_maintains_privileges(e))
+ return 0;
+ }
+
+ assert_se(c = unit_get_cgroup_context(u));
+ return CGROUP_MASK_EXTEND_JOINED(c->delegate_controllers);
+}
+
+static CGroupMask unit_get_subtree_mask(Unit *u) {
+
+ /* Returns the mask of this subtree, meaning of the group
+ * itself and its children. */
+
+ return unit_get_own_mask(u) | unit_get_members_mask(u);
+}
+
+CGroupMask unit_get_members_mask(Unit *u) {
+ assert(u);
+
+ /* Returns the mask of controllers all of the unit's children require, merged */
+
+ if (u->cgroup_members_mask_valid)
+ return u->cgroup_members_mask; /* Use cached value if possible */
+
+ u->cgroup_members_mask = 0;
+
+ if (u->type == UNIT_SLICE) {
+ Unit *member;
+
+ UNIT_FOREACH_DEPENDENCY(member, u, UNIT_ATOM_SLICE_OF)
+ u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
+ }
+
+ u->cgroup_members_mask_valid = true;
+ return u->cgroup_members_mask;
+}
+
+CGroupMask unit_get_siblings_mask(Unit *u) {
+ Unit *slice;
+ assert(u);
+
+ /* Returns the mask of controllers all of the unit's siblings
+ * require, i.e. the members mask of the unit's parent slice
+ * if there is one. */
+
+ slice = UNIT_GET_SLICE(u);
+ if (slice)
+ return unit_get_members_mask(slice);
+
+ return unit_get_subtree_mask(u); /* we are the top-level slice */
+}
+
+static CGroupMask unit_get_disable_mask(Unit *u) {
+ CGroupContext *c;
+
+ c = unit_get_cgroup_context(u);
+ if (!c)
+ return 0;
+
+ return c->disable_controllers;
+}
+
+CGroupMask unit_get_ancestor_disable_mask(Unit *u) {
+ CGroupMask mask;
+ Unit *slice;
+
+ assert(u);
+ mask = unit_get_disable_mask(u);
+
+ /* Returns the mask of controllers which are marked as forcibly
+ * disabled in any ancestor unit or the unit in question. */
+
+ slice = UNIT_GET_SLICE(u);
+ if (slice)
+ mask |= unit_get_ancestor_disable_mask(slice);
+
+ return mask;
+}
+
+CGroupMask unit_get_target_mask(Unit *u) {
+ CGroupMask own_mask, mask;
+
+ /* This returns the cgroup mask of all controllers to enable for a specific cgroup, i.e. everything
+ * it needs itself, plus all that its children need, plus all that its siblings need. This is
+ * primarily useful on the legacy cgroup hierarchy, where we need to duplicate each cgroup in each
+ * hierarchy that shall be enabled for it. */
+
+ own_mask = unit_get_own_mask(u);
+
+ if (own_mask & CGROUP_MASK_BPF_FIREWALL & ~u->manager->cgroup_supported)
+ emit_bpf_firewall_warning(u);
+
+ mask = own_mask | unit_get_members_mask(u) | unit_get_siblings_mask(u);
+
+ mask &= u->manager->cgroup_supported;
+ mask &= ~unit_get_ancestor_disable_mask(u);
+
+ return mask;
+}
+
+CGroupMask unit_get_enable_mask(Unit *u) {
+ CGroupMask mask;
+
+ /* This returns the cgroup mask of all controllers to enable
+ * for the children of a specific cgroup. This is primarily
+ * useful for the unified cgroup hierarchy, where each cgroup
+ * controls which controllers are enabled for its children. */
+
+ mask = unit_get_members_mask(u);
+ mask &= u->manager->cgroup_supported;
+ mask &= ~unit_get_ancestor_disable_mask(u);
+
+ return mask;
+}
+
+void unit_invalidate_cgroup_members_masks(Unit *u) {
+ Unit *slice;
+
+ assert(u);
+
+ /* Recurse invalidate the member masks cache all the way up the tree */
+ u->cgroup_members_mask_valid = false;
+
+ slice = UNIT_GET_SLICE(u);
+ if (slice)
+ unit_invalidate_cgroup_members_masks(slice);
+}
+
+const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask) {
+
+ /* Returns the realized cgroup path of the specified unit where all specified controllers are available. */
+
+ while (u) {
+
+ if (u->cgroup_path &&
+ u->cgroup_realized &&
+ FLAGS_SET(u->cgroup_realized_mask, mask))
+ return u->cgroup_path;
+
+ u = UNIT_GET_SLICE(u);
+ }
+
+ return NULL;
+}
+
+static const char *migrate_callback(CGroupMask mask, void *userdata) {
+ /* If not realized at all, migrate to root ("").
+ * It may happen if we're upgrading from older version that didn't clean up.
+ */
+ return strempty(unit_get_realized_cgroup_path(userdata, mask));
+}
+
+char *unit_default_cgroup_path(const Unit *u) {
+ _cleanup_free_ char *escaped = NULL, *slice_path = NULL;
+ Unit *slice;
+ int r;
+
+ assert(u);
+
+ if (unit_has_name(u, SPECIAL_ROOT_SLICE))
+ return strdup(u->manager->cgroup_root);
+
+ slice = UNIT_GET_SLICE(u);
+ if (slice && !unit_has_name(slice, SPECIAL_ROOT_SLICE)) {
+ r = cg_slice_to_path(slice->id, &slice_path);
+ if (r < 0)
+ return NULL;
+ }
+
+ escaped = cg_escape(u->id);
+ if (!escaped)
+ return NULL;
+
+ return path_join(empty_to_root(u->manager->cgroup_root), slice_path, escaped);
+}
+
+int unit_set_cgroup_path(Unit *u, const char *path) {
+ _cleanup_free_ char *p = NULL;
+ int r;
+
+ assert(u);
+
+ if (streq_ptr(u->cgroup_path, path))
+ return 0;
+
+ if (path) {
+ p = strdup(path);
+ if (!p)
+ return -ENOMEM;
+ }
+
+ if (p) {
+ r = hashmap_put(u->manager->cgroup_unit, p, u);
+ if (r < 0)
+ return r;
+ }
+
+ unit_release_cgroup(u);
+ u->cgroup_path = TAKE_PTR(p);
+
+ return 1;
+}
+
+int unit_watch_cgroup(Unit *u) {
+ _cleanup_free_ char *events = NULL;
+ int r;
+
+ assert(u);
+
+ /* Watches the "cgroups.events" attribute of this unit's cgroup for "empty" events, but only if
+ * cgroupv2 is available. */
+
+ if (!u->cgroup_path)
+ return 0;
+
+ if (u->cgroup_control_inotify_wd >= 0)
+ return 0;
+
+ /* Only applies to the unified hierarchy */
+ r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
+ if (r == 0)
+ return 0;
+
+ /* No point in watch the top-level slice, it's never going to run empty. */
+ if (unit_has_name(u, SPECIAL_ROOT_SLICE))
+ return 0;
+
+ r = hashmap_ensure_allocated(&u->manager->cgroup_control_inotify_wd_unit, &trivial_hash_ops);
+ if (r < 0)
+ return log_oom();
+
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
+ if (r < 0)
+ return log_oom();
+
+ u->cgroup_control_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
+ if (u->cgroup_control_inotify_wd < 0) {
+
+ if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this
+ * is not an error */
+ return 0;
+
+ return log_unit_error_errno(u, errno, "Failed to add control inotify watch descriptor for control group %s: %m", empty_to_root(u->cgroup_path));
+ }
+
+ r = hashmap_put(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd), u);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to add control inotify watch descriptor for control group %s to hash map: %m", empty_to_root(u->cgroup_path));
+
+ return 0;
+}
+
+int unit_watch_cgroup_memory(Unit *u) {
+ _cleanup_free_ char *events = NULL;
+ CGroupContext *c;
+ int r;
+
+ assert(u);
+
+ /* Watches the "memory.events" attribute of this unit's cgroup for "oom_kill" events, but only if
+ * cgroupv2 is available. */
+
+ if (!u->cgroup_path)
+ return 0;
+
+ c = unit_get_cgroup_context(u);
+ if (!c)
+ return 0;
+
+ /* The "memory.events" attribute is only available if the memory controller is on. Let's hence tie
+ * this to memory accounting, in a way watching for OOM kills is a form of memory accounting after
+ * all. */
+ if (!c->memory_accounting)
+ return 0;
+
+ /* Don't watch inner nodes, as the kernel doesn't report oom_kill events recursively currently, and
+ * we also don't want to generate a log message for each parent cgroup of a process. */
+ if (u->type == UNIT_SLICE)
+ return 0;
+
+ if (u->cgroup_memory_inotify_wd >= 0)
+ return 0;
+
+ /* Only applies to the unified hierarchy */
+ r = cg_all_unified();
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine whether the memory controller is unified: %m");
+ if (r == 0)
+ return 0;
+
+ r = hashmap_ensure_allocated(&u->manager->cgroup_memory_inotify_wd_unit, &trivial_hash_ops);
+ if (r < 0)
+ return log_oom();
+
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "memory.events", &events);
+ if (r < 0)
+ return log_oom();
+
+ u->cgroup_memory_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
+ if (u->cgroup_memory_inotify_wd < 0) {
+
+ if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this
+ * is not an error */
+ return 0;
+
+ return log_unit_error_errno(u, errno, "Failed to add memory inotify watch descriptor for control group %s: %m", empty_to_root(u->cgroup_path));
+ }
+
+ r = hashmap_put(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd), u);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to add memory inotify watch descriptor for control group %s to hash map: %m", empty_to_root(u->cgroup_path));
+
+ return 0;
+}
+
+int unit_pick_cgroup_path(Unit *u) {
+ _cleanup_free_ char *path = NULL;
+ int r;
+
+ assert(u);
+
+ if (u->cgroup_path)
+ return 0;
+
+ if (!UNIT_HAS_CGROUP_CONTEXT(u))
+ return -EINVAL;
+
+ path = unit_default_cgroup_path(u);
+ if (!path)
+ return log_oom();
+
+ r = unit_set_cgroup_path(u, path);
+ if (r == -EEXIST)
+ return log_unit_error_errno(u, r, "Control group %s exists already.", empty_to_root(path));
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", empty_to_root(path));
+
+ return 0;
+}
+
+static int unit_update_cgroup(
+ Unit *u,
+ CGroupMask target_mask,
+ CGroupMask enable_mask,
+ ManagerState state) {
+
+ bool created, is_root_slice;
+ CGroupMask migrate_mask = 0;
+ _cleanup_free_ char *cgroup_full_path = NULL;
+ int r;
+
+ assert(u);
+
+ if (!UNIT_HAS_CGROUP_CONTEXT(u))
+ return 0;
+
+ /* Figure out our cgroup path */
+ r = unit_pick_cgroup_path(u);
+ if (r < 0)
+ return r;
+
+ /* First, create our own group */
+ r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", empty_to_root(u->cgroup_path));
+ created = r;
+
+ if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
+ uint64_t cgroup_id = 0;
+
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &cgroup_full_path);
+ if (r == 0) {
+ r = cg_path_get_cgroupid(cgroup_full_path, &cgroup_id);
+ if (r < 0)
+ log_unit_full_errno(u, ERRNO_IS_NOT_SUPPORTED(r) ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to get cgroup ID of cgroup %s, ignoring: %m", cgroup_full_path);
+ } else
+ log_unit_warning_errno(u, r, "Failed to get full cgroup path on cgroup %s, ignoring: %m", empty_to_root(u->cgroup_path));
+
+ u->cgroup_id = cgroup_id;
+ }
+
+ /* Start watching it */
+ (void) unit_watch_cgroup(u);
+ (void) unit_watch_cgroup_memory(u);
+
+ /* For v2 we preserve enabled controllers in delegated units, adjust others,
+ * for v1 we figure out which controller hierarchies need migration. */
+ if (created || !u->cgroup_realized || !unit_cgroup_delegate(u)) {
+ CGroupMask result_mask = 0;
+
+ /* Enable all controllers we need */
+ r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path, &result_mask);
+ if (r < 0)
+ log_unit_warning_errno(u, r, "Failed to enable/disable controllers on cgroup %s, ignoring: %m", empty_to_root(u->cgroup_path));
+
+ /* Remember what's actually enabled now */
+ u->cgroup_enabled_mask = result_mask;
+
+ migrate_mask = u->cgroup_realized_mask ^ target_mask;
+ }
+
+ /* Keep track that this is now realized */
+ u->cgroup_realized = true;
+ u->cgroup_realized_mask = target_mask;
+
+ /* Migrate processes in controller hierarchies both downwards (enabling) and upwards (disabling).
+ *
+ * Unnecessary controller cgroups are trimmed (after emptied by upward migration).
+ * We perform migration also with whole slices for cases when users don't care about leave
+ * granularity. Since delegated_mask is subset of target mask, we won't trim slice subtree containing
+ * delegated units.
+ */
+ if (cg_all_unified() == 0) {
+ r = cg_migrate_v1_controllers(u->manager->cgroup_supported, migrate_mask, u->cgroup_path, migrate_callback, u);
+ if (r < 0)
+ log_unit_warning_errno(u, r, "Failed to migrate controller cgroups from %s, ignoring: %m", empty_to_root(u->cgroup_path));
+
+ is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
+ r = cg_trim_v1_controllers(u->manager->cgroup_supported, ~target_mask, u->cgroup_path, !is_root_slice);
+ if (r < 0)
+ log_unit_warning_errno(u, r, "Failed to delete controller cgroups %s, ignoring: %m", empty_to_root(u->cgroup_path));
+ }
+
+ /* Set attributes */
+ cgroup_context_apply(u, target_mask, state);
+ cgroup_xattr_apply(u);
+
+ return 0;
+}
+
+static int unit_attach_pid_to_cgroup_via_bus(Unit *u, pid_t pid, const char *suffix_path) {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ char *pp;
+ int r;
+
+ assert(u);
+
+ if (MANAGER_IS_SYSTEM(u->manager))
+ return -EINVAL;
+
+ if (!u->manager->system_bus)
+ return -EIO;
+
+ if (!u->cgroup_path)
+ return -EINVAL;
+
+ /* Determine this unit's cgroup path relative to our cgroup root */
+ pp = path_startswith(u->cgroup_path, u->manager->cgroup_root);
+ if (!pp)
+ return -EINVAL;
+
+ pp = strjoina("/", pp, suffix_path);
+ path_simplify(pp);
+
+ r = bus_call_method(u->manager->system_bus,
+ bus_systemd_mgr,
+ "AttachProcessesToUnit",
+ &error, NULL,
+ "ssau",
+ NULL /* empty unit name means client's unit, i.e. us */, pp, 1, (uint32_t) pid);
+ if (r < 0)
+ return log_unit_debug_errno(u, r, "Failed to attach unit process " PID_FMT " via the bus: %s", pid, bus_error_message(&error, r));
+
+ return 0;
+}
+
+int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) {
+ _cleanup_free_ char *joined = NULL;
+ CGroupMask delegated_mask;
+ const char *p;
+ void *pidp;
+ int ret, r;
+
+ assert(u);
+
+ if (!UNIT_HAS_CGROUP_CONTEXT(u))
+ return -EINVAL;
+
+ if (set_isempty(pids))
+ return 0;
+
+ /* Load any custom firewall BPF programs here once to test if they are existing and actually loadable.
+ * Fail here early since later errors in the call chain unit_realize_cgroup to cgroup_context_apply are ignored. */
+ r = bpf_firewall_load_custom(u);
+ if (r < 0)
+ return r;
+
+ r = unit_realize_cgroup(u);
+ if (r < 0)
+ return r;
+
+ if (isempty(suffix_path))
+ p = u->cgroup_path;
+ else {
+ joined = path_join(u->cgroup_path, suffix_path);
+ if (!joined)
+ return -ENOMEM;
+
+ p = joined;
+ }
+
+ delegated_mask = unit_get_delegate_mask(u);
+
+ ret = 0;
+ SET_FOREACH(pidp, pids) {
+ pid_t pid = PTR_TO_PID(pidp);
+
+ /* First, attach the PID to the main cgroup hierarchy */
+ r = cg_attach(SYSTEMD_CGROUP_CONTROLLER, p, pid);
+ if (r < 0) {
+ bool again = MANAGER_IS_USER(u->manager) && ERRNO_IS_PRIVILEGE(r);
+
+ log_unit_full_errno(u, again ? LOG_DEBUG : LOG_INFO, r,
+ "Couldn't move process "PID_FMT" to%s requested cgroup '%s': %m",
+ pid, again ? " directly" : "", empty_to_root(p));
+
+ if (again) {
+ int z;
+
+ /* If we are in a user instance, and we can't move the process ourselves due
+ * to permission problems, let's ask the system instance about it instead.
+ * Since it's more privileged it might be able to move the process across the
+ * leaves of a subtree whose top node is not owned by us. */
+
+ z = unit_attach_pid_to_cgroup_via_bus(u, pid, suffix_path);
+ if (z < 0)
+ log_unit_info_errno(u, z, "Couldn't move process "PID_FMT" to requested cgroup '%s' (directly or via the system bus): %m", pid, empty_to_root(p));
+ else {
+ if (ret >= 0)
+ ret++; /* Count successful additions */
+ continue; /* When the bus thing worked via the bus we are fully done for this PID. */
+ }
+ }
+
+ if (ret >= 0)
+ ret = r; /* Remember first error */
+
+ continue;
+ } else if (ret >= 0)
+ ret++; /* Count successful additions */
+
+ r = cg_all_unified();
+ if (r < 0)
+ return r;
+ if (r > 0)
+ continue;
+
+ /* In the legacy hierarchy, attach the process to the request cgroup if possible, and if not to the
+ * innermost realized one */
+
+ for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
+ CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
+ const char *realized;
+
+ if (!(u->manager->cgroup_supported & bit))
+ continue;
+
+ /* If this controller is delegated and realized, honour the caller's request for the cgroup suffix. */
+ if (delegated_mask & u->cgroup_realized_mask & bit) {
+ r = cg_attach(cgroup_controller_to_string(c), p, pid);
+ if (r >= 0)
+ continue; /* Success! */
+
+ log_unit_debug_errno(u, r, "Failed to attach PID " PID_FMT " to requested cgroup %s in controller %s, falling back to unit's cgroup: %m",
+ pid, empty_to_root(p), cgroup_controller_to_string(c));
+ }
+
+ /* So this controller is either not delegate or realized, or something else weird happened. In
+ * that case let's attach the PID at least to the closest cgroup up the tree that is
+ * realized. */
+ realized = unit_get_realized_cgroup_path(u, bit);
+ if (!realized)
+ continue; /* Not even realized in the root slice? Then let's not bother */
+
+ r = cg_attach(cgroup_controller_to_string(c), realized, pid);
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to attach PID " PID_FMT " to realized cgroup %s in controller %s, ignoring: %m",
+ pid, realized, cgroup_controller_to_string(c));
+ }
+ }
+
+ return ret;
+}
+
+static bool unit_has_mask_realized(
+ Unit *u,
+ CGroupMask target_mask,
+ CGroupMask enable_mask) {
+
+ assert(u);
+
+ /* Returns true if this unit is fully realized. We check four things:
+ *
+ * 1. Whether the cgroup was created at all
+ * 2. Whether the cgroup was created in all the hierarchies we need it to be created in (in case of cgroup v1)
+ * 3. Whether the cgroup has all the right controllers enabled (in case of cgroup v2)
+ * 4. Whether the invalidation mask is currently zero
+ *
+ * If you wonder why we mask the target realization and enable mask with CGROUP_MASK_V1/CGROUP_MASK_V2: note
+ * that there are three sets of bitmasks: CGROUP_MASK_V1 (for real cgroup v1 controllers), CGROUP_MASK_V2 (for
+ * real cgroup v2 controllers) and CGROUP_MASK_BPF (for BPF-based pseudo-controllers). Now, cgroup_realized_mask
+ * is only matters for cgroup v1 controllers, and cgroup_enabled_mask only used for cgroup v2, and if they
+ * differ in the others, we don't really care. (After all, the cgroup_enabled_mask tracks with controllers are
+ * enabled through cgroup.subtree_control, and since the BPF pseudo-controllers don't show up there, they
+ * simply don't matter. */
+
+ return u->cgroup_realized &&
+ ((u->cgroup_realized_mask ^ target_mask) & CGROUP_MASK_V1) == 0 &&
+ ((u->cgroup_enabled_mask ^ enable_mask) & CGROUP_MASK_V2) == 0 &&
+ u->cgroup_invalidated_mask == 0;
+}
+
+static bool unit_has_mask_disables_realized(
+ Unit *u,
+ CGroupMask target_mask,
+ CGroupMask enable_mask) {
+
+ assert(u);
+
+ /* Returns true if all controllers which should be disabled are indeed disabled.
+ *
+ * Unlike unit_has_mask_realized, we don't care what was enabled, only that anything we want to remove is
+ * already removed. */
+
+ return !u->cgroup_realized ||
+ (FLAGS_SET(u->cgroup_realized_mask, target_mask & CGROUP_MASK_V1) &&
+ FLAGS_SET(u->cgroup_enabled_mask, enable_mask & CGROUP_MASK_V2));
+}
+
+static bool unit_has_mask_enables_realized(
+ Unit *u,
+ CGroupMask target_mask,
+ CGroupMask enable_mask) {
+
+ assert(u);
+
+ /* Returns true if all controllers which should be enabled are indeed enabled.
+ *
+ * Unlike unit_has_mask_realized, we don't care about the controllers that are not present, only that anything
+ * we want to add is already added. */
+
+ return u->cgroup_realized &&
+ ((u->cgroup_realized_mask | target_mask) & CGROUP_MASK_V1) == (u->cgroup_realized_mask & CGROUP_MASK_V1) &&
+ ((u->cgroup_enabled_mask | enable_mask) & CGROUP_MASK_V2) == (u->cgroup_enabled_mask & CGROUP_MASK_V2);
+}
+
+void unit_add_to_cgroup_realize_queue(Unit *u) {
+ assert(u);
+
+ if (u->in_cgroup_realize_queue)
+ return;
+
+ LIST_APPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
+ u->in_cgroup_realize_queue = true;
+}
+
+static void unit_remove_from_cgroup_realize_queue(Unit *u) {
+ assert(u);
+
+ if (!u->in_cgroup_realize_queue)
+ return;
+
+ LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
+ u->in_cgroup_realize_queue = false;
+}
+
+/* Controllers can only be enabled breadth-first, from the root of the
+ * hierarchy downwards to the unit in question. */
+static int unit_realize_cgroup_now_enable(Unit *u, ManagerState state) {
+ CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask;
+ Unit *slice;
+ int r;
+
+ assert(u);
+
+ /* First go deal with this unit's parent, or we won't be able to enable
+ * any new controllers at this layer. */
+ slice = UNIT_GET_SLICE(u);
+ if (slice) {
+ r = unit_realize_cgroup_now_enable(slice, state);
+ if (r < 0)
+ return r;
+ }
+
+ target_mask = unit_get_target_mask(u);
+ enable_mask = unit_get_enable_mask(u);
+
+ /* We can only enable in this direction, don't try to disable anything.
+ */
+ if (unit_has_mask_enables_realized(u, target_mask, enable_mask))
+ return 0;
+
+ new_target_mask = u->cgroup_realized_mask | target_mask;
+ new_enable_mask = u->cgroup_enabled_mask | enable_mask;
+
+ return unit_update_cgroup(u, new_target_mask, new_enable_mask, state);
+}
+
+/* Controllers can only be disabled depth-first, from the leaves of the
+ * hierarchy upwards to the unit in question. */
+static int unit_realize_cgroup_now_disable(Unit *u, ManagerState state) {
+ Unit *m;
+
+ assert(u);
+
+ if (u->type != UNIT_SLICE)
+ return 0;
+
+ UNIT_FOREACH_DEPENDENCY(m, u, UNIT_ATOM_SLICE_OF) {
+ CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask;
+ int r;
+
+ /* The cgroup for this unit might not actually be fully realised yet, in which case it isn't
+ * holding any controllers open anyway. */
+ if (!m->cgroup_realized)
+ continue;
+
+ /* We must disable those below us first in order to release the controller. */
+ if (m->type == UNIT_SLICE)
+ (void) unit_realize_cgroup_now_disable(m, state);
+
+ target_mask = unit_get_target_mask(m);
+ enable_mask = unit_get_enable_mask(m);
+
+ /* We can only disable in this direction, don't try to enable anything. */
+ if (unit_has_mask_disables_realized(m, target_mask, enable_mask))
+ continue;
+
+ new_target_mask = m->cgroup_realized_mask & target_mask;
+ new_enable_mask = m->cgroup_enabled_mask & enable_mask;
+
+ r = unit_update_cgroup(m, new_target_mask, new_enable_mask, state);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+/* Check if necessary controllers and attributes for a unit are in place.
+ *
+ * - If so, do nothing.
+ * - If not, create paths, move processes over, and set attributes.
+ *
+ * Controllers can only be *enabled* in a breadth-first way, and *disabled* in
+ * a depth-first way. As such the process looks like this:
+ *
+ * Suppose we have a cgroup hierarchy which looks like this:
+ *
+ * root
+ * / \
+ * / \
+ * / \
+ * a b
+ * / \ / \
+ * / \ / \
+ * c d e f
+ * / \ / \ / \ / \
+ * h i j k l m n o
+ *
+ * 1. We want to realise cgroup "d" now.
+ * 2. cgroup "a" has DisableControllers=cpu in the associated unit.
+ * 3. cgroup "k" just started requesting the memory controller.
+ *
+ * To make this work we must do the following in order:
+ *
+ * 1. Disable CPU controller in k, j
+ * 2. Disable CPU controller in d
+ * 3. Enable memory controller in root
+ * 4. Enable memory controller in a
+ * 5. Enable memory controller in d
+ * 6. Enable memory controller in k
+ *
+ * Notice that we need to touch j in one direction, but not the other. We also
+ * don't go beyond d when disabling -- it's up to "a" to get realized if it
+ * wants to disable further. The basic rules are therefore:
+ *
+ * - If you're disabling something, you need to realise all of the cgroups from
+ * your recursive descendants to the root. This starts from the leaves.
+ * - If you're enabling something, you need to realise from the root cgroup
+ * downwards, but you don't need to iterate your recursive descendants.
+ *
+ * Returns 0 on success and < 0 on failure. */
+static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
+ CGroupMask target_mask, enable_mask;
+ Unit *slice;
+ int r;
+
+ assert(u);
+
+ unit_remove_from_cgroup_realize_queue(u);
+
+ target_mask = unit_get_target_mask(u);
+ enable_mask = unit_get_enable_mask(u);
+
+ if (unit_has_mask_realized(u, target_mask, enable_mask))
+ return 0;
+
+ /* Disable controllers below us, if there are any */
+ r = unit_realize_cgroup_now_disable(u, state);
+ if (r < 0)
+ return r;
+
+ /* Enable controllers above us, if there are any */
+ slice = UNIT_GET_SLICE(u);
+ if (slice) {
+ r = unit_realize_cgroup_now_enable(slice, state);
+ if (r < 0)
+ return r;
+ }
+
+ /* Now actually deal with the cgroup we were trying to realise and set attributes */
+ r = unit_update_cgroup(u, target_mask, enable_mask, state);
+ if (r < 0)
+ return r;
+
+ /* Now, reset the invalidation mask */
+ u->cgroup_invalidated_mask = 0;
+ return 0;
+}
+
+unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
+ ManagerState state;
+ unsigned n = 0;
+ Unit *i;
+ int r;
+
+ assert(m);
+
+ state = manager_state(m);
+
+ while ((i = m->cgroup_realize_queue)) {
+ assert(i->in_cgroup_realize_queue);
+
+ if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
+ /* Maybe things changed, and the unit is not actually active anymore? */
+ unit_remove_from_cgroup_realize_queue(i);
+ continue;
+ }
+
+ r = unit_realize_cgroup_now(i, state);
+ if (r < 0)
+ log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
+
+ n++;
+ }
+
+ return n;
+}
+
+void unit_add_family_to_cgroup_realize_queue(Unit *u) {
+ assert(u);
+ assert(u->type == UNIT_SLICE);
+
+ /* Family of a unit for is defined as (immediate) children of the unit and immediate children of all
+ * its ancestors.
+ *
+ * Ideally we would enqueue ancestor path only (bottom up). However, on cgroup-v1 scheduling becomes
+ * very weird if two units that own processes reside in the same slice, but one is realized in the
+ * "cpu" hierarchy and one is not (for example because one has CPUWeight= set and the other does
+ * not), because that means individual processes need to be scheduled against whole cgroups. Let's
+ * avoid this asymmetry by always ensuring that siblings of a unit are always realized in their v1
+ * controller hierarchies too (if unit requires the controller to be realized).
+ *
+ * The function must invalidate cgroup_members_mask of all ancestors in order to calculate up to date
+ * masks. */
+
+ do {
+ Unit *m;
+
+ /* Children of u likely changed when we're called */
+ u->cgroup_members_mask_valid = false;
+
+ UNIT_FOREACH_DEPENDENCY(m, u, UNIT_ATOM_SLICE_OF) {
+
+ /* No point in doing cgroup application for units without active processes. */
+ if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
+ continue;
+
+ /* We only enqueue siblings if they were realized once at least, in the main
+ * hierarchy. */
+ if (!m->cgroup_realized)
+ continue;
+
+ /* If the unit doesn't need any new controllers and has current ones
+ * realized, it doesn't need any changes. */
+ if (unit_has_mask_realized(m,
+ unit_get_target_mask(m),
+ unit_get_enable_mask(m)))
+ continue;
+
+ unit_add_to_cgroup_realize_queue(m);
+ }
+
+ /* Parent comes after children */
+ unit_add_to_cgroup_realize_queue(u);
+
+ u = UNIT_GET_SLICE(u);
+ } while (u);
+}
+
+int unit_realize_cgroup(Unit *u) {
+ Unit *slice;
+
+ assert(u);
+
+ if (!UNIT_HAS_CGROUP_CONTEXT(u))
+ return 0;
+
+ /* So, here's the deal: when realizing the cgroups for this unit, we need to first create all
+ * parents, but there's more actually: for the weight-based controllers we also need to make sure
+ * that all our siblings (i.e. units that are in the same slice as we are) have cgroups, too. On the
+ * other hand, when a controller is removed from realized set, it may become unnecessary in siblings
+ * and ancestors and they should be (de)realized too.
+ *
+ * This call will defer work on the siblings and derealized ancestors to the next event loop
+ * iteration and synchronously creates the parent cgroups (unit_realize_cgroup_now). */
+
+ slice = UNIT_GET_SLICE(u);
+ if (slice)
+ unit_add_family_to_cgroup_realize_queue(slice);
+
+ /* And realize this one now (and apply the values) */
+ return unit_realize_cgroup_now(u, manager_state(u->manager));
+}
+
+void unit_release_cgroup(Unit *u) {
+ assert(u);
+
+ /* Forgets all cgroup details for this cgroup — but does *not* destroy the cgroup. This is hence OK to call
+ * when we close down everything for reexecution, where we really want to leave the cgroup in place. */
+
+ if (u->cgroup_path) {
+ (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
+ u->cgroup_path = mfree(u->cgroup_path);
+ }
+
+ if (u->cgroup_control_inotify_wd >= 0) {
+ if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_control_inotify_wd) < 0)
+ log_unit_debug_errno(u, errno, "Failed to remove cgroup control inotify watch %i for %s, ignoring: %m", u->cgroup_control_inotify_wd, u->id);
+
+ (void) hashmap_remove(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd));
+ u->cgroup_control_inotify_wd = -1;
+ }
+
+ if (u->cgroup_memory_inotify_wd >= 0) {
+ if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_memory_inotify_wd) < 0)
+ log_unit_debug_errno(u, errno, "Failed to remove cgroup memory inotify watch %i for %s, ignoring: %m", u->cgroup_memory_inotify_wd, u->id);
+
+ (void) hashmap_remove(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd));
+ u->cgroup_memory_inotify_wd = -1;
+ }
+}
+
+bool unit_maybe_release_cgroup(Unit *u) {
+ int r;
+
+ assert(u);
+
+ if (!u->cgroup_path)
+ return true;
+
+ /* Don't release the cgroup if there are still processes under it. If we get notified later when all the
+ * processes exit (e.g. the processes were in D-state and exited after the unit was marked as failed)
+ * we need the cgroup paths to continue to be tracked by the manager so they can be looked up and cleaned
+ * up later. */
+ r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Error checking if the cgroup is recursively empty, ignoring: %m");
+ else if (r == 1) {
+ unit_release_cgroup(u);
+ return true;
+ }
+
+ return false;
+}
+
+void unit_prune_cgroup(Unit *u) {
+ int r;
+ bool is_root_slice;
+
+ assert(u);
+
+ /* Removes the cgroup, if empty and possible, and stops watching it. */
+
+ if (!u->cgroup_path)
+ return;
+
+ (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */
+
+#if BPF_FRAMEWORK
+ (void) lsm_bpf_cleanup(u); /* Remove cgroup from the global LSM BPF map */
+#endif
+
+ is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
+
+ r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
+ if (r < 0)
+ /* One reason we could have failed here is, that the cgroup still contains a process.
+ * However, if the cgroup becomes removable at a later time, it might be removed when
+ * the containing slice is stopped. So even if we failed now, this unit shouldn't assume
+ * that the cgroup is still realized the next time it is started. Do not return early
+ * on error, continue cleanup. */
+ log_unit_full_errno(u, r == -EBUSY ? LOG_DEBUG : LOG_WARNING, r, "Failed to destroy cgroup %s, ignoring: %m", empty_to_root(u->cgroup_path));
+
+ if (is_root_slice)
+ return;
+
+ if (!unit_maybe_release_cgroup(u)) /* Returns true if the cgroup was released */
+ return;
+
+ u->cgroup_realized = false;
+ u->cgroup_realized_mask = 0;
+ u->cgroup_enabled_mask = 0;
+
+ u->bpf_device_control_installed = bpf_program_free(u->bpf_device_control_installed);
+}
+
+int unit_search_main_pid(Unit *u, pid_t *ret) {
+ _cleanup_fclose_ FILE *f = NULL;
+ pid_t pid = 0, npid;
+ int r;
+
+ assert(u);
+ assert(ret);
+
+ if (!u->cgroup_path)
+ return -ENXIO;
+
+ r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
+ if (r < 0)
+ return r;
+
+ while (cg_read_pid(f, &npid) > 0) {
+
+ if (npid == pid)
+ continue;
+
+ if (pid_is_my_child(npid) == 0)
+ continue;
+
+ if (pid != 0)
+ /* Dang, there's more than one daemonized PID
+ in this group, so we don't know what process
+ is the main process. */
+
+ return -ENODATA;
+
+ pid = npid;
+ }
+
+ *ret = pid;
+ return 0;
+}
+
+static int unit_watch_pids_in_path(Unit *u, const char *path) {
+ _cleanup_closedir_ DIR *d = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+ int ret = 0, r;
+
+ assert(u);
+ assert(path);
+
+ r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
+ if (r < 0)
+ ret = r;
+ else {
+ pid_t pid;
+
+ while ((r = cg_read_pid(f, &pid)) > 0) {
+ r = unit_watch_pid(u, pid, false);
+ if (r < 0 && ret >= 0)
+ ret = r;
+ }
+
+ if (r < 0 && ret >= 0)
+ ret = r;
+ }
+
+ r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
+ if (r < 0) {
+ if (ret >= 0)
+ ret = r;
+ } else {
+ char *fn;
+
+ while ((r = cg_read_subgroup(d, &fn)) > 0) {
+ _cleanup_free_ char *p = NULL;
+
+ p = path_join(empty_to_root(path), fn);
+ free(fn);
+
+ if (!p)
+ return -ENOMEM;
+
+ r = unit_watch_pids_in_path(u, p);
+ if (r < 0 && ret >= 0)
+ ret = r;
+ }
+
+ if (r < 0 && ret >= 0)
+ ret = r;
+ }
+
+ return ret;
+}
+
+int unit_synthesize_cgroup_empty_event(Unit *u) {
+ int r;
+
+ assert(u);
+
+ /* Enqueue a synthetic cgroup empty event if this unit doesn't watch any PIDs anymore. This is compatibility
+ * support for non-unified systems where notifications aren't reliable, and hence need to take whatever we can
+ * get as notification source as soon as we stopped having any useful PIDs to watch for. */
+
+ if (!u->cgroup_path)
+ return -ENOENT;
+
+ r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
+ if (r < 0)
+ return r;
+ if (r > 0) /* On unified we have reliable notifications, and don't need this */
+ return 0;
+
+ if (!set_isempty(u->pids))
+ return 0;
+
+ unit_add_to_cgroup_empty_queue(u);
+ return 0;
+}
+
+int unit_watch_all_pids(Unit *u) {
+ int r;
+
+ assert(u);
+
+ /* Adds all PIDs from our cgroup to the set of PIDs we
+ * watch. This is a fallback logic for cases where we do not
+ * get reliable cgroup empty notifications: we try to use
+ * SIGCHLD as replacement. */
+
+ if (!u->cgroup_path)
+ return -ENOENT;
+
+ r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
+ if (r < 0)
+ return r;
+ if (r > 0) /* On unified we can use proper notifications */
+ return 0;
+
+ return unit_watch_pids_in_path(u, u->cgroup_path);
+}
+
+static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
+ Manager *m = ASSERT_PTR(userdata);
+ Unit *u;
+ int r;
+
+ assert(s);
+
+ u = m->cgroup_empty_queue;
+ if (!u)
+ return 0;
+
+ assert(u->in_cgroup_empty_queue);
+ u->in_cgroup_empty_queue = false;
+ LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
+
+ if (m->cgroup_empty_queue) {
+ /* More stuff queued, let's make sure we remain enabled */
+ r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
+ if (r < 0)
+ log_debug_errno(r, "Failed to reenable cgroup empty event source, ignoring: %m");
+ }
+
+ /* Update state based on OOM kills before we notify about cgroup empty event */
+ (void) unit_check_oom(u);
+ (void) unit_check_oomd_kill(u);
+
+ unit_add_to_gc_queue(u);
+
+ if (UNIT_VTABLE(u)->notify_cgroup_empty)
+ UNIT_VTABLE(u)->notify_cgroup_empty(u);
+
+ return 0;
+}
+
+void unit_add_to_cgroup_empty_queue(Unit *u) {
+ int r;
+
+ assert(u);
+
+ /* Note that there are four different ways how cgroup empty events reach us:
+ *
+ * 1. On the unified hierarchy we get an inotify event on the cgroup
+ *
+ * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
+ *
+ * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
+ *
+ * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
+ * soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
+ *
+ * Regardless which way we got the notification, we'll verify it here, and then add it to a separate
+ * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
+ * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
+ * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
+ * case for scope units). */
+
+ if (u->in_cgroup_empty_queue)
+ return;
+
+ /* Let's verify that the cgroup is really empty */
+ if (!u->cgroup_path)
+ return;
+
+ r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
+ if (r < 0) {
+ log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", empty_to_root(u->cgroup_path));
+ return;
+ }
+ if (r == 0)
+ return;
+
+ LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
+ u->in_cgroup_empty_queue = true;
+
+ /* Trigger the defer event */
+ r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
+ if (r < 0)
+ log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
+}
+
+static void unit_remove_from_cgroup_empty_queue(Unit *u) {
+ assert(u);
+
+ if (!u->in_cgroup_empty_queue)
+ return;
+
+ LIST_REMOVE(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
+ u->in_cgroup_empty_queue = false;
+}
+
+int unit_check_oomd_kill(Unit *u) {
+ _cleanup_free_ char *value = NULL;
+ bool increased;
+ uint64_t n = 0;
+ int r;
+
+ if (!u->cgroup_path)
+ return 0;
+
+ r = cg_all_unified();
+ if (r < 0)
+ return log_unit_debug_errno(u, r, "Couldn't determine whether we are in all unified mode: %m");
+ else if (r == 0)
+ return 0;
+
+ r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "user.oomd_ooms", &value);
+ if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
+ return r;
+
+ if (!isempty(value)) {
+ r = safe_atou64(value, &n);
+ if (r < 0)
+ return r;
+ }
+
+ increased = n > u->managed_oom_kill_last;
+ u->managed_oom_kill_last = n;
+
+ if (!increased)
+ return 0;
+
+ n = 0;
+ value = mfree(value);
+ r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "user.oomd_kill", &value);
+ if (r >= 0 && !isempty(value))
+ (void) safe_atou64(value, &n);
+
+ if (n > 0)
+ log_unit_struct(u, LOG_NOTICE,
+ "MESSAGE_ID=" SD_MESSAGE_UNIT_OOMD_KILL_STR,
+ LOG_UNIT_INVOCATION_ID(u),
+ LOG_UNIT_MESSAGE(u, "systemd-oomd killed %"PRIu64" process(es) in this unit.", n),
+ "N_PROCESSES=%" PRIu64, n);
+ else
+ log_unit_struct(u, LOG_NOTICE,
+ "MESSAGE_ID=" SD_MESSAGE_UNIT_OOMD_KILL_STR,
+ LOG_UNIT_INVOCATION_ID(u),
+ LOG_UNIT_MESSAGE(u, "systemd-oomd killed some process(es) in this unit."));
+
+ unit_notify_cgroup_oom(u, /* ManagedOOM= */ true);
+
+ return 1;
+}
+
+int unit_check_oom(Unit *u) {
+ _cleanup_free_ char *oom_kill = NULL;
+ bool increased;
+ uint64_t c;
+ int r;
+
+ if (!u->cgroup_path)
+ return 0;
+
+ r = cg_get_keyed_attribute("memory", u->cgroup_path, "memory.events", STRV_MAKE("oom_kill"), &oom_kill);
+ if (IN_SET(r, -ENOENT, -ENXIO)) /* Handle gracefully if cgroup or oom_kill attribute don't exist */
+ c = 0;
+ else if (r < 0)
+ return log_unit_debug_errno(u, r, "Failed to read oom_kill field of memory.events cgroup attribute: %m");
+ else {
+ r = safe_atou64(oom_kill, &c);
+ if (r < 0)
+ return log_unit_debug_errno(u, r, "Failed to parse oom_kill field: %m");
+ }
+
+ increased = c > u->oom_kill_last;
+ u->oom_kill_last = c;
+
+ if (!increased)
+ return 0;
+
+ log_unit_struct(u, LOG_NOTICE,
+ "MESSAGE_ID=" SD_MESSAGE_UNIT_OUT_OF_MEMORY_STR,
+ LOG_UNIT_INVOCATION_ID(u),
+ LOG_UNIT_MESSAGE(u, "A process of this unit has been killed by the OOM killer."));
+
+ unit_notify_cgroup_oom(u, /* ManagedOOM= */ false);
+
+ return 1;
+}
+
+static int on_cgroup_oom_event(sd_event_source *s, void *userdata) {
+ Manager *m = ASSERT_PTR(userdata);
+ Unit *u;
+ int r;
+
+ assert(s);
+
+ u = m->cgroup_oom_queue;
+ if (!u)
+ return 0;
+
+ assert(u->in_cgroup_oom_queue);
+ u->in_cgroup_oom_queue = false;
+ LIST_REMOVE(cgroup_oom_queue, m->cgroup_oom_queue, u);
+
+ if (m->cgroup_oom_queue) {
+ /* More stuff queued, let's make sure we remain enabled */
+ r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
+ if (r < 0)
+ log_debug_errno(r, "Failed to reenable cgroup oom event source, ignoring: %m");
+ }
+
+ (void) unit_check_oom(u);
+ return 0;
+}
+
+static void unit_add_to_cgroup_oom_queue(Unit *u) {
+ int r;
+
+ assert(u);
+
+ if (u->in_cgroup_oom_queue)
+ return;
+ if (!u->cgroup_path)
+ return;
+
+ LIST_PREPEND(cgroup_oom_queue, u->manager->cgroup_oom_queue, u);
+ u->in_cgroup_oom_queue = true;
+
+ /* Trigger the defer event */
+ if (!u->manager->cgroup_oom_event_source) {
+ _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
+
+ r = sd_event_add_defer(u->manager->event, &s, on_cgroup_oom_event, u->manager);
+ if (r < 0) {
+ log_error_errno(r, "Failed to create cgroup oom event source: %m");
+ return;
+ }
+
+ r = sd_event_source_set_priority(s, SD_EVENT_PRIORITY_NORMAL-8);
+ if (r < 0) {
+ log_error_errno(r, "Failed to set priority of cgroup oom event source: %m");
+ return;
+ }
+
+ (void) sd_event_source_set_description(s, "cgroup-oom");
+ u->manager->cgroup_oom_event_source = TAKE_PTR(s);
+ }
+
+ r = sd_event_source_set_enabled(u->manager->cgroup_oom_event_source, SD_EVENT_ONESHOT);
+ if (r < 0)
+ log_error_errno(r, "Failed to enable cgroup oom event source: %m");
+}
+
+static int unit_check_cgroup_events(Unit *u) {
+ char *values[2] = {};
+ int r;
+
+ assert(u);
+
+ if (!u->cgroup_path)
+ return 0;
+
+ r = cg_get_keyed_attribute_graceful(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events",
+ STRV_MAKE("populated", "frozen"), values);
+ if (r < 0)
+ return r;
+
+ /* The cgroup.events notifications can be merged together so act as we saw the given state for the
+ * first time. The functions we call to handle given state are idempotent, which makes them
+ * effectively remember the previous state. */
+ if (values[0]) {
+ if (streq(values[0], "1"))
+ unit_remove_from_cgroup_empty_queue(u);
+ else
+ unit_add_to_cgroup_empty_queue(u);
+ }
+
+ /* Disregard freezer state changes due to operations not initiated by us */
+ if (values[1] && IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_THAWING)) {
+ if (streq(values[1], "0"))
+ unit_thawed(u);
+ else
+ unit_frozen(u);
+ }
+
+ free(values[0]);
+ free(values[1]);
+
+ return 0;
+}
+
+static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
+ Manager *m = ASSERT_PTR(userdata);
+
+ assert(s);
+ assert(fd >= 0);
+
+ for (;;) {
+ union inotify_event_buffer buffer;
+ ssize_t l;
+
+ l = read(fd, &buffer, sizeof(buffer));
+ if (l < 0) {
+ if (ERRNO_IS_TRANSIENT(errno))
+ return 0;
+
+ return log_error_errno(errno, "Failed to read control group inotify events: %m");
+ }
+
+ FOREACH_INOTIFY_EVENT_WARN(e, buffer, l) {
+ Unit *u;
+
+ if (e->wd < 0)
+ /* Queue overflow has no watch descriptor */
+ continue;
+
+ if (e->mask & IN_IGNORED)
+ /* The watch was just removed */
+ continue;
+
+ /* Note that inotify might deliver events for a watch even after it was removed,
+ * because it was queued before the removal. Let's ignore this here safely. */
+
+ u = hashmap_get(m->cgroup_control_inotify_wd_unit, INT_TO_PTR(e->wd));
+ if (u)
+ unit_check_cgroup_events(u);
+
+ u = hashmap_get(m->cgroup_memory_inotify_wd_unit, INT_TO_PTR(e->wd));
+ if (u)
+ unit_add_to_cgroup_oom_queue(u);
+ }
+ }
+}
+
+static int cg_bpf_mask_supported(CGroupMask *ret) {
+ CGroupMask mask = 0;
+ int r;
+
+ /* BPF-based firewall */
+ r = bpf_firewall_supported();
+ if (r < 0)
+ return r;
+ if (r > 0)
+ mask |= CGROUP_MASK_BPF_FIREWALL;
+
+ /* BPF-based device access control */
+ r = bpf_devices_supported();
+ if (r < 0)
+ return r;
+ if (r > 0)
+ mask |= CGROUP_MASK_BPF_DEVICES;
+
+ /* BPF pinned prog */
+ r = bpf_foreign_supported();
+ if (r < 0)
+ return r;
+ if (r > 0)
+ mask |= CGROUP_MASK_BPF_FOREIGN;
+
+ /* BPF-based bind{4|6} hooks */
+ r = bpf_socket_bind_supported();
+ if (r < 0)
+ return r;
+ if (r > 0)
+ mask |= CGROUP_MASK_BPF_SOCKET_BIND;
+
+ /* BPF-based cgroup_skb/{egress|ingress} hooks */
+ r = restrict_network_interfaces_supported();
+ if (r < 0)
+ return r;
+ if (r > 0)
+ mask |= CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES;
+
+ *ret = mask;
+ return 0;
+}
+
+int manager_setup_cgroup(Manager *m) {
+ _cleanup_free_ char *path = NULL;
+ const char *scope_path;
+ int r, all_unified;
+ CGroupMask mask;
+ char *e;
+
+ assert(m);
+
+ /* 1. Determine hierarchy */
+ m->cgroup_root = mfree(m->cgroup_root);
+ r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
+ if (r < 0)
+ return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
+
+ /* Chop off the init scope, if we are already located in it */
+ e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
+
+ /* LEGACY: Also chop off the system slice if we are in
+ * it. This is to support live upgrades from older systemd
+ * versions where PID 1 was moved there. Also see
+ * cg_get_root_path(). */
+ if (!e && MANAGER_IS_SYSTEM(m)) {
+ e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
+ if (!e)
+ e = endswith(m->cgroup_root, "/system"); /* even more legacy */
+ }
+ if (e)
+ *e = 0;
+
+ /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
+ * easily prepend it everywhere. */
+ delete_trailing_chars(m->cgroup_root, "/");
+
+ /* 2. Show data */
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
+ if (r < 0)
+ return log_error_errno(r, "Cannot find cgroup mount point: %m");
+
+ r = cg_unified();
+ if (r < 0)
+ return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
+
+ all_unified = cg_all_unified();
+ if (all_unified < 0)
+ return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
+ if (all_unified > 0)
+ log_debug("Unified cgroup hierarchy is located at %s.", path);
+ else {
+ r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
+ if (r > 0)
+ log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
+ else
+ log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
+ }
+
+ /* 3. Allocate cgroup empty defer event source */
+ m->cgroup_empty_event_source = sd_event_source_disable_unref(m->cgroup_empty_event_source);
+ r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create cgroup empty event source: %m");
+
+ /* Schedule cgroup empty checks early, but after having processed service notification messages or
+ * SIGCHLD signals, so that a cgroup running empty is always just the last safety net of
+ * notification, and we collected the metadata the notification and SIGCHLD stuff offers first. */
+ r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
+ if (r < 0)
+ return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
+
+ r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
+ if (r < 0)
+ return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
+
+ (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
+
+ /* 4. Install notifier inotify object, or agent */
+ if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
+
+ /* In the unified hierarchy we can get cgroup empty notifications via inotify. */
+
+ m->cgroup_inotify_event_source = sd_event_source_disable_unref(m->cgroup_inotify_event_source);
+ safe_close(m->cgroup_inotify_fd);
+
+ m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
+ if (m->cgroup_inotify_fd < 0)
+ return log_error_errno(errno, "Failed to create control group inotify object: %m");
+
+ r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
+ if (r < 0)
+ return log_error_errno(r, "Failed to watch control group inotify object: %m");
+
+ /* Process cgroup empty notifications early. Note that when this event is dispatched it'll
+ * just add the unit to a cgroup empty queue, hence let's run earlier than that. Also see
+ * handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
+ r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-9);
+ if (r < 0)
+ return log_error_errno(r, "Failed to set priority of inotify event source: %m");
+
+ (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
+
+ } else if (MANAGER_IS_SYSTEM(m) && manager_owns_host_root_cgroup(m) && !MANAGER_IS_TEST_RUN(m)) {
+
+ /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
+ * since it does not generate events when control groups with children run empty. */
+
+ r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUPS_AGENT_PATH);
+ if (r < 0)
+ log_warning_errno(r, "Failed to install release agent, ignoring: %m");
+ else if (r > 0)
+ log_debug("Installed release agent.");
+ else if (r == 0)
+ log_debug("Release agent already installed.");
+ }
+
+ /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
+ scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
+ r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
+ if (r >= 0) {
+ /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
+ r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
+ if (r < 0)
+ log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
+
+ /* 6. And pin it, so that it cannot be unmounted */
+ safe_close(m->pin_cgroupfs_fd);
+ m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
+ if (m->pin_cgroupfs_fd < 0)
+ return log_error_errno(errno, "Failed to open pin file: %m");
+
+ } else if (!MANAGER_IS_TEST_RUN(m))
+ return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
+
+ /* 7. Always enable hierarchical support if it exists... */
+ if (!all_unified && !MANAGER_IS_TEST_RUN(m))
+ (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
+
+ /* 8. Figure out which controllers are supported */
+ r = cg_mask_supported_subtree(m->cgroup_root, &m->cgroup_supported);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine supported controllers: %m");
+
+ /* 9. Figure out which bpf-based pseudo-controllers are supported */
+ r = cg_bpf_mask_supported(&mask);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine supported bpf-based pseudo-controllers: %m");
+ m->cgroup_supported |= mask;
+
+ /* 10. Log which controllers are supported */
+ for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
+ log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c),
+ yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
+
+ return 0;
+}
+
+void manager_shutdown_cgroup(Manager *m, bool delete) {
+ assert(m);
+
+ /* We can't really delete the group, since we are in it. But
+ * let's trim it. */
+ if (delete && m->cgroup_root && !FLAGS_SET(m->test_run_flags, MANAGER_TEST_RUN_MINIMAL))
+ (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
+
+ m->cgroup_empty_event_source = sd_event_source_disable_unref(m->cgroup_empty_event_source);
+
+ m->cgroup_control_inotify_wd_unit = hashmap_free(m->cgroup_control_inotify_wd_unit);
+ m->cgroup_memory_inotify_wd_unit = hashmap_free(m->cgroup_memory_inotify_wd_unit);
+
+ m->cgroup_inotify_event_source = sd_event_source_disable_unref(m->cgroup_inotify_event_source);
+ m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
+
+ m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
+
+ m->cgroup_root = mfree(m->cgroup_root);
+}
+
+Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
+ char *p;
+ Unit *u;
+
+ assert(m);
+ assert(cgroup);
+
+ u = hashmap_get(m->cgroup_unit, cgroup);
+ if (u)
+ return u;
+
+ p = strdupa_safe(cgroup);
+ for (;;) {
+ char *e;
+
+ e = strrchr(p, '/');
+ if (!e || e == p)
+ return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
+
+ *e = 0;
+
+ u = hashmap_get(m->cgroup_unit, p);
+ if (u)
+ return u;
+ }
+}
+
+Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
+ _cleanup_free_ char *cgroup = NULL;
+
+ assert(m);
+
+ if (!pid_is_valid(pid))
+ return NULL;
+
+ if (cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup) < 0)
+ return NULL;
+
+ return manager_get_unit_by_cgroup(m, cgroup);
+}
+
+Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
+ Unit *u, **array;
+
+ assert(m);
+
+ /* Note that a process might be owned by multiple units, we return only one here, which is good enough for most
+ * cases, though not strictly correct. We prefer the one reported by cgroup membership, as that's the most
+ * relevant one as children of the process will be assigned to that one, too, before all else. */
+
+ if (!pid_is_valid(pid))
+ return NULL;
+
+ if (pid == getpid_cached())
+ return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
+
+ u = manager_get_unit_by_pid_cgroup(m, pid);
+ if (u)
+ return u;
+
+ u = hashmap_get(m->watch_pids, PID_TO_PTR(pid));
+ if (u)
+ return u;
+
+ array = hashmap_get(m->watch_pids, PID_TO_PTR(-pid));
+ if (array)
+ return array[0];
+
+ return NULL;
+}
+
+int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
+ Unit *u;
+
+ assert(m);
+ assert(cgroup);
+
+ /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
+ * or from the --system instance */
+
+ log_debug("Got cgroup empty notification for: %s", cgroup);
+
+ u = manager_get_unit_by_cgroup(m, cgroup);
+ if (!u)
+ return 0;
+
+ unit_add_to_cgroup_empty_queue(u);
+ return 1;
+}
+
+int unit_get_memory_available(Unit *u, uint64_t *ret) {
+ uint64_t unit_current, available = UINT64_MAX;
+ CGroupContext *unit_context;
+ const char *memory_file;
+ int r;
+
+ assert(u);
+ assert(ret);
+
+ /* If data from cgroups can be accessed, try to find out how much more memory a unit can
+ * claim before hitting the configured cgroup limits (if any). Consider both MemoryHigh
+ * and MemoryMax, and also any slice the unit might be nested below. */
+
+ if (!UNIT_CGROUP_BOOL(u, memory_accounting))
+ return -ENODATA;
+
+ if (!u->cgroup_path)
+ return -ENODATA;
+
+ /* The root cgroup doesn't expose this information */
+ if (unit_has_host_root_cgroup(u))
+ return -ENODATA;
+
+ if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
+ return -ENODATA;
+
+ r = cg_all_unified();
+ if (r < 0)
+ return r;
+ memory_file = r > 0 ? "memory.current" : "memory.usage_in_bytes";
+
+ r = cg_get_attribute_as_uint64("memory", u->cgroup_path, memory_file, &unit_current);
+ if (r < 0)
+ return r;
+
+ assert_se(unit_context = unit_get_cgroup_context(u));
+
+ if (unit_context->memory_max != UINT64_MAX || unit_context->memory_high != UINT64_MAX)
+ available = LESS_BY(MIN(unit_context->memory_max, unit_context->memory_high), unit_current);
+
+ for (Unit *slice = UNIT_GET_SLICE(u); slice; slice = UNIT_GET_SLICE(slice)) {
+ uint64_t slice_current, slice_available = UINT64_MAX;
+ CGroupContext *slice_context;
+
+ /* No point in continuing if we can't go any lower */
+ if (available == 0)
+ break;
+
+ if (!slice->cgroup_path)
+ continue;
+
+ slice_context = unit_get_cgroup_context(slice);
+ if (!slice_context)
+ continue;
+
+ if (slice_context->memory_max == UINT64_MAX && slice_context->memory_high == UINT64_MAX)
+ continue;
+
+ r = cg_get_attribute_as_uint64("memory", slice->cgroup_path, memory_file, &slice_current);
+ if (r < 0)
+ continue;
+
+ slice_available = LESS_BY(MIN(slice_context->memory_max, slice_context->memory_high), slice_current);
+ available = MIN(slice_available, available);
+ }
+
+ *ret = available;
+
+ return 0;
+}
+
+int unit_get_memory_current(Unit *u, uint64_t *ret) {
+ int r;
+
+ assert(u);
+ assert(ret);
+
+ if (!UNIT_CGROUP_BOOL(u, memory_accounting))
+ return -ENODATA;
+
+ if (!u->cgroup_path)
+ return -ENODATA;
+
+ /* The root cgroup doesn't expose this information, let's get it from /proc instead */
+ if (unit_has_host_root_cgroup(u))
+ return procfs_memory_get_used(ret);
+
+ if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
+ return -ENODATA;
+
+ r = cg_all_unified();
+ if (r < 0)
+ return r;
+
+ return cg_get_attribute_as_uint64("memory", u->cgroup_path, r > 0 ? "memory.current" : "memory.usage_in_bytes", ret);
+}
+
+int unit_get_tasks_current(Unit *u, uint64_t *ret) {
+ assert(u);
+ assert(ret);
+
+ if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
+ return -ENODATA;
+
+ if (!u->cgroup_path)
+ return -ENODATA;
+
+ /* The root cgroup doesn't expose this information, let's get it from /proc instead */
+ if (unit_has_host_root_cgroup(u))
+ return procfs_tasks_get_current(ret);
+
+ if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
+ return -ENODATA;
+
+ return cg_get_attribute_as_uint64("pids", u->cgroup_path, "pids.current", ret);
+}
+
+static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
+ uint64_t ns;
+ int r;
+
+ assert(u);
+ assert(ret);
+
+ if (!u->cgroup_path)
+ return -ENODATA;
+
+ /* The root cgroup doesn't expose this information, let's get it from /proc instead */
+ if (unit_has_host_root_cgroup(u))
+ return procfs_cpu_get_usage(ret);
+
+ /* Requisite controllers for CPU accounting are not enabled */
+ if ((get_cpu_accounting_mask() & ~u->cgroup_realized_mask) != 0)
+ return -ENODATA;
+
+ r = cg_all_unified();
+ if (r < 0)
+ return r;
+ if (r > 0) {
+ _cleanup_free_ char *val = NULL;
+ uint64_t us;
+
+ r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", STRV_MAKE("usage_usec"), &val);
+ if (IN_SET(r, -ENOENT, -ENXIO))
+ return -ENODATA;
+ if (r < 0)
+ return r;
+
+ r = safe_atou64(val, &us);
+ if (r < 0)
+ return r;
+
+ ns = us * NSEC_PER_USEC;
+ } else
+ return cg_get_attribute_as_uint64("cpuacct", u->cgroup_path, "cpuacct.usage", ret);
+
+ *ret = ns;
+ return 0;
+}
+
+int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
+ nsec_t ns;
+ int r;
+
+ assert(u);
+
+ /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
+ * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
+ * call this function with a NULL return value. */
+
+ if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
+ return -ENODATA;
+
+ r = unit_get_cpu_usage_raw(u, &ns);
+ if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
+ /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
+ * cached value. */
+
+ if (ret)
+ *ret = u->cpu_usage_last;
+ return 0;
+ }
+ if (r < 0)
+ return r;
+
+ if (ns > u->cpu_usage_base)
+ ns -= u->cpu_usage_base;
+ else
+ ns = 0;
+
+ u->cpu_usage_last = ns;
+ if (ret)
+ *ret = ns;
+
+ return 0;
+}
+
+int unit_get_ip_accounting(
+ Unit *u,
+ CGroupIPAccountingMetric metric,
+ uint64_t *ret) {
+
+ uint64_t value;
+ int fd, r;
+
+ assert(u);
+ assert(metric >= 0);
+ assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
+ assert(ret);
+
+ if (!UNIT_CGROUP_BOOL(u, ip_accounting))
+ return -ENODATA;
+
+ fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
+ u->ip_accounting_ingress_map_fd :
+ u->ip_accounting_egress_map_fd;
+ if (fd < 0)
+ return -ENODATA;
+
+ if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
+ r = bpf_firewall_read_accounting(fd, &value, NULL);
+ else
+ r = bpf_firewall_read_accounting(fd, NULL, &value);
+ if (r < 0)
+ return r;
+
+ /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
+ * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
+ * ip_accounting_extra[] field, and add them in here transparently. */
+
+ *ret = value + u->ip_accounting_extra[metric];
+
+ return r;
+}
+
+static int unit_get_io_accounting_raw(Unit *u, uint64_t ret[static _CGROUP_IO_ACCOUNTING_METRIC_MAX]) {
+ static const char *const field_names[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
+ [CGROUP_IO_READ_BYTES] = "rbytes=",
+ [CGROUP_IO_WRITE_BYTES] = "wbytes=",
+ [CGROUP_IO_READ_OPERATIONS] = "rios=",
+ [CGROUP_IO_WRITE_OPERATIONS] = "wios=",
+ };
+ uint64_t acc[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {};
+ _cleanup_free_ char *path = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+ int r;
+
+ assert(u);
+
+ if (!u->cgroup_path)
+ return -ENODATA;
+
+ if (unit_has_host_root_cgroup(u))
+ return -ENODATA; /* TODO: return useful data for the top-level cgroup */
+
+ r = cg_all_unified();
+ if (r < 0)
+ return r;
+ if (r == 0) /* TODO: support cgroupv1 */
+ return -ENODATA;
+
+ if (!FLAGS_SET(u->cgroup_realized_mask, CGROUP_MASK_IO))
+ return -ENODATA;
+
+ r = cg_get_path("io", u->cgroup_path, "io.stat", &path);
+ if (r < 0)
+ return r;
+
+ f = fopen(path, "re");
+ if (!f)
+ return -errno;
+
+ for (;;) {
+ _cleanup_free_ char *line = NULL;
+ const char *p;
+
+ r = read_line(f, LONG_LINE_MAX, &line);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ p = line;
+ p += strcspn(p, WHITESPACE); /* Skip over device major/minor */
+ p += strspn(p, WHITESPACE); /* Skip over following whitespace */
+
+ for (;;) {
+ _cleanup_free_ char *word = NULL;
+
+ r = extract_first_word(&p, &word, NULL, EXTRACT_RETAIN_ESCAPE);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) {
+ const char *x;
+
+ x = startswith(word, field_names[i]);
+ if (x) {
+ uint64_t w;
+
+ r = safe_atou64(x, &w);
+ if (r < 0)
+ return r;
+
+ /* Sum up the stats of all devices */
+ acc[i] += w;
+ break;
+ }
+ }
+ }
+ }
+
+ memcpy(ret, acc, sizeof(acc));
+ return 0;
+}
+
+int unit_get_io_accounting(
+ Unit *u,
+ CGroupIOAccountingMetric metric,
+ bool allow_cache,
+ uint64_t *ret) {
+
+ uint64_t raw[_CGROUP_IO_ACCOUNTING_METRIC_MAX];
+ int r;
+
+ /* Retrieve an IO account parameter. This will subtract the counter when the unit was started. */
+
+ if (!UNIT_CGROUP_BOOL(u, io_accounting))
+ return -ENODATA;
+
+ if (allow_cache && u->io_accounting_last[metric] != UINT64_MAX)
+ goto done;
+
+ r = unit_get_io_accounting_raw(u, raw);
+ if (r == -ENODATA && u->io_accounting_last[metric] != UINT64_MAX)
+ goto done;
+ if (r < 0)
+ return r;
+
+ for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) {
+ /* Saturated subtraction */
+ if (raw[i] > u->io_accounting_base[i])
+ u->io_accounting_last[i] = raw[i] - u->io_accounting_base[i];
+ else
+ u->io_accounting_last[i] = 0;
+ }
+
+done:
+ if (ret)
+ *ret = u->io_accounting_last[metric];
+
+ return 0;
+}
+
+int unit_reset_cpu_accounting(Unit *u) {
+ int r;
+
+ assert(u);
+
+ u->cpu_usage_last = NSEC_INFINITY;
+
+ r = unit_get_cpu_usage_raw(u, &u->cpu_usage_base);
+ if (r < 0) {
+ u->cpu_usage_base = 0;
+ return r;
+ }
+
+ return 0;
+}
+
+int unit_reset_ip_accounting(Unit *u) {
+ int r = 0, q = 0;
+
+ assert(u);
+
+ if (u->ip_accounting_ingress_map_fd >= 0)
+ r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);
+
+ if (u->ip_accounting_egress_map_fd >= 0)
+ q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);
+
+ zero(u->ip_accounting_extra);
+
+ return r < 0 ? r : q;
+}
+
+int unit_reset_io_accounting(Unit *u) {
+ int r;
+
+ assert(u);
+
+ for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++)
+ u->io_accounting_last[i] = UINT64_MAX;
+
+ r = unit_get_io_accounting_raw(u, u->io_accounting_base);
+ if (r < 0) {
+ zero(u->io_accounting_base);
+ return r;
+ }
+
+ return 0;
+}
+
+int unit_reset_accounting(Unit *u) {
+ int r, q, v;
+
+ assert(u);
+
+ r = unit_reset_cpu_accounting(u);
+ q = unit_reset_io_accounting(u);
+ v = unit_reset_ip_accounting(u);
+
+ return r < 0 ? r : q < 0 ? q : v;
+}
+
+void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
+ assert(u);
+
+ if (!UNIT_HAS_CGROUP_CONTEXT(u))
+ return;
+
+ if (m == 0)
+ return;
+
+ /* always invalidate compat pairs together */
+ if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
+ m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
+
+ if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
+ m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;
+
+ if (FLAGS_SET(u->cgroup_invalidated_mask, m)) /* NOP? */
+ return;
+
+ u->cgroup_invalidated_mask |= m;
+ unit_add_to_cgroup_realize_queue(u);
+}
+
+void unit_invalidate_cgroup_bpf(Unit *u) {
+ assert(u);
+
+ if (!UNIT_HAS_CGROUP_CONTEXT(u))
+ return;
+
+ if (u->cgroup_invalidated_mask & CGROUP_MASK_BPF_FIREWALL) /* NOP? */
+ return;
+
+ u->cgroup_invalidated_mask |= CGROUP_MASK_BPF_FIREWALL;
+ unit_add_to_cgroup_realize_queue(u);
+
+ /* If we are a slice unit, we also need to put compile a new BPF program for all our children, as the IP access
+ * list of our children includes our own. */
+ if (u->type == UNIT_SLICE) {
+ Unit *member;
+
+ UNIT_FOREACH_DEPENDENCY(member, u, UNIT_ATOM_SLICE_OF)
+ unit_invalidate_cgroup_bpf(member);
+ }
+}
+
+void unit_cgroup_catchup(Unit *u) {
+ assert(u);
+
+ if (!UNIT_HAS_CGROUP_CONTEXT(u))
+ return;
+
+ /* We dropped the inotify watch during reexec/reload, so we need to
+ * check these as they may have changed.
+ * Note that (currently) the kernel doesn't actually update cgroup
+ * file modification times, so we can't just serialize and then check
+ * the mtime for file(s) we are interested in. */
+ (void) unit_check_cgroup_events(u);
+ unit_add_to_cgroup_oom_queue(u);
+}
+
+bool unit_cgroup_delegate(Unit *u) {
+ CGroupContext *c;
+
+ assert(u);
+
+ if (!UNIT_VTABLE(u)->can_delegate)
+ return false;
+
+ c = unit_get_cgroup_context(u);
+ if (!c)
+ return false;
+
+ return c->delegate;
+}
+
+void manager_invalidate_startup_units(Manager *m) {
+ Unit *u;
+
+ assert(m);
+
+ SET_FOREACH(u, m->startup_units)
+ unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO|CGROUP_MASK_CPUSET);
+}
+
+static int unit_get_nice(Unit *u) {
+ ExecContext *ec;
+
+ ec = unit_get_exec_context(u);
+ return ec ? ec->nice : 0;
+}
+
+static uint64_t unit_get_cpu_weight(Unit *u) {
+ ManagerState state = manager_state(u->manager);
+ CGroupContext *cc;
+
+ cc = unit_get_cgroup_context(u);
+ return cc ? cgroup_context_cpu_weight(cc, state) : CGROUP_WEIGHT_DEFAULT;
+}
+
+int compare_job_priority(const void *a, const void *b) {
+ const Job *x = a, *y = b;
+ int nice_x, nice_y;
+ uint64_t weight_x, weight_y;
+ int ret;
+
+ if ((ret = CMP(x->unit->type, y->unit->type)) != 0)
+ return -ret;
+
+ weight_x = unit_get_cpu_weight(x->unit);
+ weight_y = unit_get_cpu_weight(y->unit);
+
+ if ((ret = CMP(weight_x, weight_y)) != 0)
+ return -ret;
+
+ nice_x = unit_get_nice(x->unit);
+ nice_y = unit_get_nice(y->unit);
+
+ if ((ret = CMP(nice_x, nice_y)) != 0)
+ return ret;
+
+ return strcmp(x->unit->id, y->unit->id);
+}
+
+int unit_cgroup_freezer_action(Unit *u, FreezerAction action) {
+ _cleanup_free_ char *path = NULL;
+ FreezerState target, kernel = _FREEZER_STATE_INVALID;
+ int r, ret;
+
+ assert(u);
+ assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW));
+
+ if (!cg_freezer_supported())
+ return 0;
+
+ /* Ignore all requests to thaw init.scope or -.slice and reject all requests to freeze them */
+ if (unit_has_name(u, SPECIAL_ROOT_SLICE) || unit_has_name(u, SPECIAL_INIT_SCOPE))
+ return action == FREEZER_FREEZE ? -EPERM : 0;
+
+ if (!u->cgroup_realized)
+ return -EBUSY;
+
+ if (action == FREEZER_THAW) {
+ Unit *slice = UNIT_GET_SLICE(u);
+
+ if (slice) {
+ r = unit_cgroup_freezer_action(slice, FREEZER_THAW);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to thaw slice %s of unit: %m", slice->id);
+ }
+ }
+
+ target = action == FREEZER_FREEZE ? FREEZER_FROZEN : FREEZER_RUNNING;
+
+ r = unit_freezer_state_kernel(u, &kernel);
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to obtain cgroup freezer state: %m");
+
+ if (target == kernel) {
+ u->freezer_state = target;
+ if (action == FREEZER_FREEZE)
+ return 0;
+ ret = 0;
+ } else
+ ret = 1;
+
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.freeze", &path);
+ if (r < 0)
+ return r;
+
+ log_unit_debug(u, "%s unit.", action == FREEZER_FREEZE ? "Freezing" : "Thawing");
+
+ if (target != kernel) {
+ if (action == FREEZER_FREEZE)
+ u->freezer_state = FREEZER_FREEZING;
+ else
+ u->freezer_state = FREEZER_THAWING;
+ }
+
+ r = write_string_file(path, one_zero(action == FREEZER_FREEZE), WRITE_STRING_FILE_DISABLE_BUFFER);
+ if (r < 0)
+ return r;
+
+ return ret;
+}
+
+int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) {
+ _cleanup_free_ char *v = NULL;
+ int r;
+
+ assert(u);
+ assert(cpus);
+
+ if (!u->cgroup_path)
+ return -ENODATA;
+
+ if ((u->cgroup_realized_mask & CGROUP_MASK_CPUSET) == 0)
+ return -ENODATA;
+
+ r = cg_all_unified();
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return -ENODATA;
+
+ r = cg_get_attribute("cpuset", u->cgroup_path, name, &v);
+ if (r == -ENOENT)
+ return -ENODATA;
+ if (r < 0)
+ return r;
+
+ return parse_cpu_set_full(v, cpus, false, NULL, NULL, 0, NULL);
+}
+
+static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
+ [CGROUP_DEVICE_POLICY_AUTO] = "auto",
+ [CGROUP_DEVICE_POLICY_CLOSED] = "closed",
+ [CGROUP_DEVICE_POLICY_STRICT] = "strict",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);
+
+static const char* const freezer_action_table[_FREEZER_ACTION_MAX] = {
+ [FREEZER_FREEZE] = "freeze",
+ [FREEZER_THAW] = "thaw",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(freezer_action, FreezerAction);
diff --git a/src/core/cgroup.h b/src/core/cgroup.h
new file mode 100644
index 0000000..8c53048
--- /dev/null
+++ b/src/core/cgroup.h
@@ -0,0 +1,339 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "bpf-lsm.h"
+#include "cgroup-util.h"
+#include "cpu-set-util.h"
+#include "list.h"
+#include "time-util.h"
+
+typedef struct TasksMax {
+ /* If scale == 0, just use value; otherwise, value / scale.
+ * See tasks_max_resolve(). */
+ uint64_t value;
+ uint64_t scale;
+} TasksMax;
+
+#define TASKS_MAX_UNSET ((TasksMax) { .value = UINT64_MAX, .scale = 0 })
+
+static inline bool tasks_max_isset(const TasksMax *tasks_max) {
+ return tasks_max->value != UINT64_MAX || tasks_max->scale != 0;
+}
+
+uint64_t tasks_max_resolve(const TasksMax *tasks_max);
+
+typedef struct CGroupContext CGroupContext;
+typedef struct CGroupDeviceAllow CGroupDeviceAllow;
+typedef struct CGroupIODeviceWeight CGroupIODeviceWeight;
+typedef struct CGroupIODeviceLimit CGroupIODeviceLimit;
+typedef struct CGroupIODeviceLatency CGroupIODeviceLatency;
+typedef struct CGroupBlockIODeviceWeight CGroupBlockIODeviceWeight;
+typedef struct CGroupBlockIODeviceBandwidth CGroupBlockIODeviceBandwidth;
+typedef struct CGroupBPFForeignProgram CGroupBPFForeignProgram;
+typedef struct CGroupSocketBindItem CGroupSocketBindItem;
+
+typedef enum CGroupDevicePolicy {
+ /* When devices listed, will allow those, plus built-in ones, if none are listed will allow
+ * everything. */
+ CGROUP_DEVICE_POLICY_AUTO,
+
+ /* Everything forbidden, except built-in ones and listed ones. */
+ CGROUP_DEVICE_POLICY_CLOSED,
+
+ /* Everything forbidden, except for the listed devices */
+ CGROUP_DEVICE_POLICY_STRICT,
+
+ _CGROUP_DEVICE_POLICY_MAX,
+ _CGROUP_DEVICE_POLICY_INVALID = -EINVAL,
+} CGroupDevicePolicy;
+
+typedef enum FreezerAction {
+ FREEZER_FREEZE,
+ FREEZER_THAW,
+
+ _FREEZER_ACTION_MAX,
+ _FREEZER_ACTION_INVALID = -EINVAL,
+} FreezerAction;
+
+struct CGroupDeviceAllow {
+ LIST_FIELDS(CGroupDeviceAllow, device_allow);
+ char *path;
+ bool r:1;
+ bool w:1;
+ bool m:1;
+};
+
+struct CGroupIODeviceWeight {
+ LIST_FIELDS(CGroupIODeviceWeight, device_weights);
+ char *path;
+ uint64_t weight;
+};
+
+struct CGroupIODeviceLimit {
+ LIST_FIELDS(CGroupIODeviceLimit, device_limits);
+ char *path;
+ uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
+};
+
+struct CGroupIODeviceLatency {
+ LIST_FIELDS(CGroupIODeviceLatency, device_latencies);
+ char *path;
+ usec_t target_usec;
+};
+
+struct CGroupBlockIODeviceWeight {
+ LIST_FIELDS(CGroupBlockIODeviceWeight, device_weights);
+ char *path;
+ uint64_t weight;
+};
+
+struct CGroupBlockIODeviceBandwidth {
+ LIST_FIELDS(CGroupBlockIODeviceBandwidth, device_bandwidths);
+ char *path;
+ uint64_t rbps;
+ uint64_t wbps;
+};
+
+struct CGroupBPFForeignProgram {
+ LIST_FIELDS(CGroupBPFForeignProgram, programs);
+ uint32_t attach_type;
+ char *bpffs_path;
+};
+
+struct CGroupSocketBindItem {
+ LIST_FIELDS(CGroupSocketBindItem, socket_bind_items);
+ int address_family;
+ int ip_protocol;
+ uint16_t nr_ports;
+ uint16_t port_min;
+};
+
+struct CGroupContext {
+ bool cpu_accounting;
+ bool io_accounting;
+ bool blockio_accounting;
+ bool memory_accounting;
+ bool tasks_accounting;
+ bool ip_accounting;
+
+ /* Configures the memory.oom.group attribute (on unified) */
+ bool memory_oom_group;
+
+ bool delegate;
+ CGroupMask delegate_controllers;
+ CGroupMask disable_controllers;
+
+ /* For unified hierarchy */
+ uint64_t cpu_weight;
+ uint64_t startup_cpu_weight;
+ usec_t cpu_quota_per_sec_usec;
+ usec_t cpu_quota_period_usec;
+
+ CPUSet cpuset_cpus;
+ CPUSet startup_cpuset_cpus;
+ CPUSet cpuset_mems;
+ CPUSet startup_cpuset_mems;
+
+ uint64_t io_weight;
+ uint64_t startup_io_weight;
+ LIST_HEAD(CGroupIODeviceWeight, io_device_weights);
+ LIST_HEAD(CGroupIODeviceLimit, io_device_limits);
+ LIST_HEAD(CGroupIODeviceLatency, io_device_latencies);
+
+ uint64_t default_memory_min;
+ uint64_t default_memory_low;
+ uint64_t memory_min;
+ uint64_t memory_low;
+ uint64_t memory_high;
+ uint64_t memory_max;
+ uint64_t memory_swap_max;
+
+ bool default_memory_min_set:1;
+ bool default_memory_low_set:1;
+ bool memory_min_set:1;
+ bool memory_low_set:1;
+
+ Set *ip_address_allow;
+ Set *ip_address_deny;
+ /* These two flags indicate that redundant entries have been removed from
+ * ip_address_allow/ip_address_deny, i.e. in_addr_prefixes_reduce() has already been called. */
+ bool ip_address_allow_reduced;
+ bool ip_address_deny_reduced;
+
+ char **ip_filters_ingress;
+ char **ip_filters_egress;
+ LIST_HEAD(CGroupBPFForeignProgram, bpf_foreign_programs);
+
+ Set *restrict_network_interfaces;
+ bool restrict_network_interfaces_is_allow_list;
+
+ /* For legacy hierarchies */
+ uint64_t cpu_shares;
+ uint64_t startup_cpu_shares;
+
+ uint64_t blockio_weight;
+ uint64_t startup_blockio_weight;
+ LIST_HEAD(CGroupBlockIODeviceWeight, blockio_device_weights);
+ LIST_HEAD(CGroupBlockIODeviceBandwidth, blockio_device_bandwidths);
+
+ uint64_t memory_limit;
+
+ CGroupDevicePolicy device_policy;
+ LIST_HEAD(CGroupDeviceAllow, device_allow);
+
+ LIST_HEAD(CGroupSocketBindItem, socket_bind_allow);
+ LIST_HEAD(CGroupSocketBindItem, socket_bind_deny);
+
+ /* Common */
+ TasksMax tasks_max;
+
+ /* Settings for systemd-oomd */
+ ManagedOOMMode moom_swap;
+ ManagedOOMMode moom_mem_pressure;
+ uint32_t moom_mem_pressure_limit; /* Normalized to 2^32-1 == 100% */
+ ManagedOOMPreference moom_preference;
+};
+
+/* Used when querying IP accounting data */
+typedef enum CGroupIPAccountingMetric {
+ CGROUP_IP_INGRESS_BYTES,
+ CGROUP_IP_INGRESS_PACKETS,
+ CGROUP_IP_EGRESS_BYTES,
+ CGROUP_IP_EGRESS_PACKETS,
+ _CGROUP_IP_ACCOUNTING_METRIC_MAX,
+ _CGROUP_IP_ACCOUNTING_METRIC_INVALID = -EINVAL,
+} CGroupIPAccountingMetric;
+
+/* Used when querying IO accounting data */
+typedef enum CGroupIOAccountingMetric {
+ CGROUP_IO_READ_BYTES,
+ CGROUP_IO_WRITE_BYTES,
+ CGROUP_IO_READ_OPERATIONS,
+ CGROUP_IO_WRITE_OPERATIONS,
+ _CGROUP_IO_ACCOUNTING_METRIC_MAX,
+ _CGROUP_IO_ACCOUNTING_METRIC_INVALID = -EINVAL,
+} CGroupIOAccountingMetric;
+
+typedef struct Unit Unit;
+typedef struct Manager Manager;
+
+usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period);
+
+void cgroup_context_init(CGroupContext *c);
+void cgroup_context_done(CGroupContext *c);
+void cgroup_context_dump(Unit *u, FILE* f, const char *prefix);
+void cgroup_context_dump_socket_bind_item(const CGroupSocketBindItem *item, FILE *f);
+void cgroup_context_dump_socket_bind_items(const CGroupSocketBindItem *items, FILE *f);
+
+void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a);
+void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w);
+void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l);
+void cgroup_context_free_io_device_latency(CGroupContext *c, CGroupIODeviceLatency *l);
+void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w);
+void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b);
+void cgroup_context_remove_bpf_foreign_program(CGroupContext *c, CGroupBPFForeignProgram *p);
+void cgroup_context_remove_socket_bind(CGroupSocketBindItem **head);
+
+int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode);
+int cgroup_add_bpf_foreign_program(CGroupContext *c, uint32_t attach_type, const char *path);
+
+void cgroup_oomd_xattr_apply(Unit *u, const char *cgroup_path);
+
+CGroupMask unit_get_own_mask(Unit *u);
+CGroupMask unit_get_delegate_mask(Unit *u);
+CGroupMask unit_get_members_mask(Unit *u);
+CGroupMask unit_get_siblings_mask(Unit *u);
+CGroupMask unit_get_ancestor_disable_mask(Unit *u);
+
+CGroupMask unit_get_target_mask(Unit *u);
+CGroupMask unit_get_enable_mask(Unit *u);
+
+void unit_invalidate_cgroup_members_masks(Unit *u);
+
+void unit_add_family_to_cgroup_realize_queue(Unit *u);
+
+const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask);
+char *unit_default_cgroup_path(const Unit *u);
+int unit_set_cgroup_path(Unit *u, const char *path);
+int unit_pick_cgroup_path(Unit *u);
+
+int unit_realize_cgroup(Unit *u);
+void unit_prune_cgroup(Unit *u);
+int unit_watch_cgroup(Unit *u);
+int unit_watch_cgroup_memory(Unit *u);
+void unit_add_to_cgroup_realize_queue(Unit *u);
+
+void unit_release_cgroup(Unit *u);
+/* Releases the cgroup only if it is recursively empty.
+ * Returns true if the cgroup was released, false otherwise. */
+bool unit_maybe_release_cgroup(Unit *u);
+
+void unit_add_to_cgroup_empty_queue(Unit *u);
+int unit_check_oomd_kill(Unit *u);
+int unit_check_oom(Unit *u);
+
+int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path);
+
+int manager_setup_cgroup(Manager *m);
+void manager_shutdown_cgroup(Manager *m, bool delete);
+
+unsigned manager_dispatch_cgroup_realize_queue(Manager *m);
+
+Unit *manager_get_unit_by_cgroup(Manager *m, const char *cgroup);
+Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid);
+Unit* manager_get_unit_by_pid(Manager *m, pid_t pid);
+
+uint64_t unit_get_ancestor_memory_min(Unit *u);
+uint64_t unit_get_ancestor_memory_low(Unit *u);
+
+int unit_search_main_pid(Unit *u, pid_t *ret);
+int unit_watch_all_pids(Unit *u);
+
+int unit_synthesize_cgroup_empty_event(Unit *u);
+
+int unit_get_memory_current(Unit *u, uint64_t *ret);
+int unit_get_memory_available(Unit *u, uint64_t *ret);
+int unit_get_tasks_current(Unit *u, uint64_t *ret);
+int unit_get_cpu_usage(Unit *u, nsec_t *ret);
+int unit_get_io_accounting(Unit *u, CGroupIOAccountingMetric metric, bool allow_cache, uint64_t *ret);
+int unit_get_ip_accounting(Unit *u, CGroupIPAccountingMetric metric, uint64_t *ret);
+
+int unit_reset_cpu_accounting(Unit *u);
+int unit_reset_ip_accounting(Unit *u);
+int unit_reset_io_accounting(Unit *u);
+int unit_reset_accounting(Unit *u);
+
+#define UNIT_CGROUP_BOOL(u, name) \
+ ({ \
+ CGroupContext *cc = unit_get_cgroup_context(u); \
+ cc ? cc->name : false; \
+ })
+
+bool manager_owns_host_root_cgroup(Manager *m);
+bool unit_has_host_root_cgroup(Unit *u);
+
+bool unit_has_startup_cgroup_constraints(Unit *u);
+
+int manager_notify_cgroup_empty(Manager *m, const char *group);
+
+void unit_invalidate_cgroup(Unit *u, CGroupMask m);
+void unit_invalidate_cgroup_bpf(Unit *u);
+
+void manager_invalidate_startup_units(Manager *m);
+
+const char* cgroup_device_policy_to_string(CGroupDevicePolicy i) _const_;
+CGroupDevicePolicy cgroup_device_policy_from_string(const char *s) _pure_;
+
+void unit_cgroup_catchup(Unit *u);
+
+bool unit_cgroup_delegate(Unit *u);
+
+int compare_job_priority(const void *a, const void *b);
+
+int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name);
+int unit_cgroup_freezer_action(Unit *u, FreezerAction action);
+
+const char* freezer_action_to_string(FreezerAction a) _const_;
+FreezerAction freezer_action_from_string(const char *s) _pure_;
diff --git a/src/core/core-varlink.c b/src/core/core-varlink.c
new file mode 100644
index 0000000..8432715
--- /dev/null
+++ b/src/core/core-varlink.c
@@ -0,0 +1,630 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "core-varlink.h"
+#include "mkdir-label.h"
+#include "strv.h"
+#include "user-util.h"
+#include "varlink.h"
+
+typedef struct LookupParameters {
+ const char *user_name;
+ const char *group_name;
+ union {
+ uid_t uid;
+ gid_t gid;
+ };
+ const char *service;
+} LookupParameters;
+
+static const char* const managed_oom_mode_properties[] = {
+ "ManagedOOMSwap",
+ "ManagedOOMMemoryPressure",
+};
+
+static int build_user_json(const char *user_name, uid_t uid, JsonVariant **ret) {
+ assert(user_name);
+ assert(uid_is_valid(uid));
+ assert(ret);
+
+ return json_build(ret, JSON_BUILD_OBJECT(
+ JSON_BUILD_PAIR("record", JSON_BUILD_OBJECT(
+ JSON_BUILD_PAIR("userName", JSON_BUILD_STRING(user_name)),
+ JSON_BUILD_PAIR("uid", JSON_BUILD_UNSIGNED(uid)),
+ JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(uid)),
+ JSON_BUILD_PAIR("realName", JSON_BUILD_CONST_STRING("Dynamic User")),
+ JSON_BUILD_PAIR("homeDirectory", JSON_BUILD_CONST_STRING("/")),
+ JSON_BUILD_PAIR("shell", JSON_BUILD_CONST_STRING(NOLOGIN)),
+ JSON_BUILD_PAIR("locked", JSON_BUILD_BOOLEAN(true)),
+ JSON_BUILD_PAIR("service", JSON_BUILD_CONST_STRING("io.systemd.DynamicUser")),
+ JSON_BUILD_PAIR("disposition", JSON_BUILD_CONST_STRING("dynamic"))))));
+}
+
+static bool user_match_lookup_parameters(LookupParameters *p, const char *name, uid_t uid) {
+ assert(p);
+
+ if (p->user_name && !streq(name, p->user_name))
+ return false;
+
+ if (uid_is_valid(p->uid) && uid != p->uid)
+ return false;
+
+ return true;
+}
+
+static int build_managed_oom_json_array_element(Unit *u, const char *property, JsonVariant **ret_v) {
+ bool use_limit = false;
+ CGroupContext *c;
+ const char *mode;
+
+ assert(u);
+ assert(property);
+ assert(ret_v);
+
+ if (!UNIT_VTABLE(u)->can_set_managed_oom)
+ return -EOPNOTSUPP;
+
+ c = unit_get_cgroup_context(u);
+ if (!c)
+ return -EINVAL;
+
+ if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u)))
+ /* systemd-oomd should always treat inactive units as though they didn't enable any action since they
+ * should not have a valid cgroup */
+ mode = managed_oom_mode_to_string(MANAGED_OOM_AUTO);
+ else if (streq(property, "ManagedOOMSwap"))
+ mode = managed_oom_mode_to_string(c->moom_swap);
+ else if (streq(property, "ManagedOOMMemoryPressure")) {
+ mode = managed_oom_mode_to_string(c->moom_mem_pressure);
+ use_limit = true;
+ } else
+ return -EINVAL;
+
+ return json_build(ret_v, JSON_BUILD_OBJECT(
+ JSON_BUILD_PAIR("mode", JSON_BUILD_STRING(mode)),
+ JSON_BUILD_PAIR("path", JSON_BUILD_STRING(u->cgroup_path)),
+ JSON_BUILD_PAIR("property", JSON_BUILD_STRING(property)),
+ JSON_BUILD_PAIR_CONDITION(use_limit, "limit", JSON_BUILD_UNSIGNED(c->moom_mem_pressure_limit))));
+}
+
+int manager_varlink_send_managed_oom_update(Unit *u) {
+ _cleanup_(json_variant_unrefp) JsonVariant *arr = NULL, *v = NULL;
+ CGroupContext *c;
+ int r;
+
+ assert(u);
+
+ if (!UNIT_VTABLE(u)->can_set_managed_oom || !u->manager || !u->cgroup_path)
+ return 0;
+
+ if (MANAGER_IS_SYSTEM(u->manager)) {
+ /* In system mode we can't send any notifications unless oomd connected back to us. In this
+ * mode oomd must initiate communication, not us. */
+ if (!u->manager->managed_oom_varlink)
+ return 0;
+ } else {
+ /* If we are in user mode, let's connect to oomd if we aren't connected yet. In this mode we
+ * must initiate communication to oomd, not the other way round. */
+ r = manager_varlink_init(u->manager);
+ if (r <= 0)
+ return r;
+ }
+
+ c = unit_get_cgroup_context(u);
+ if (!c)
+ return 0;
+
+ r = json_build(&arr, JSON_BUILD_EMPTY_ARRAY);
+ if (r < 0)
+ return r;
+
+ for (size_t i = 0; i < ELEMENTSOF(managed_oom_mode_properties); i++) {
+ _cleanup_(json_variant_unrefp) JsonVariant *e = NULL;
+
+ r = build_managed_oom_json_array_element(u, managed_oom_mode_properties[i], &e);
+ if (r < 0)
+ return r;
+
+ r = json_variant_append_array(&arr, e);
+ if (r < 0)
+ return r;
+ }
+
+ r = json_build(&v, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("cgroups", JSON_BUILD_VARIANT(arr))));
+ if (r < 0)
+ return r;
+
+ if (MANAGER_IS_SYSTEM(u->manager))
+ /* in system mode, oomd is our client, thus send out notifications as replies to the
+ * initiating method call from them. */
+ r = varlink_notify(u->manager->managed_oom_varlink, v);
+ else
+ /* in user mode, we are oomd's client, thus send out notifications as method calls that do
+ * not expect a reply. */
+ r = varlink_send(u->manager->managed_oom_varlink, "io.systemd.oom.ReportManagedOOMCGroups", v);
+
+ return r;
+}
+
+static int build_managed_oom_cgroups_json(Manager *m, JsonVariant **ret) {
+ static const UnitType supported_unit_types[] = { UNIT_SLICE, UNIT_SERVICE, UNIT_SCOPE };
+ _cleanup_(json_variant_unrefp) JsonVariant *v = NULL, *arr = NULL;
+ int r;
+
+ assert(m);
+ assert(ret);
+
+ r = json_build(&arr, JSON_BUILD_EMPTY_ARRAY);
+ if (r < 0)
+ return r;
+
+ for (size_t i = 0; i < ELEMENTSOF(supported_unit_types); i++)
+ LIST_FOREACH(units_by_type, u, m->units_by_type[supported_unit_types[i]]) {
+ CGroupContext *c;
+
+ if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u)))
+ continue;
+
+ c = unit_get_cgroup_context(u);
+ if (!c)
+ continue;
+
+ for (size_t j = 0; j < ELEMENTSOF(managed_oom_mode_properties); j++) {
+ _cleanup_(json_variant_unrefp) JsonVariant *e = NULL;
+
+ /* For the initial varlink call we only care about units that enabled (i.e. mode is not
+ * set to "auto") oomd properties. */
+ if (!(streq(managed_oom_mode_properties[j], "ManagedOOMSwap") && c->moom_swap == MANAGED_OOM_KILL) &&
+ !(streq(managed_oom_mode_properties[j], "ManagedOOMMemoryPressure") && c->moom_mem_pressure == MANAGED_OOM_KILL))
+ continue;
+
+ r = build_managed_oom_json_array_element(u, managed_oom_mode_properties[j], &e);
+ if (r < 0)
+ return r;
+
+ r = json_variant_append_array(&arr, e);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ r = json_build(&v, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("cgroups", JSON_BUILD_VARIANT(arr))));
+ if (r < 0)
+ return r;
+
+ *ret = TAKE_PTR(v);
+ return 0;
+}
+
+static int vl_method_subscribe_managed_oom_cgroups(
+ Varlink *link,
+ JsonVariant *parameters,
+ VarlinkMethodFlags flags,
+ void *userdata) {
+
+ _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ pid_t pid;
+ Unit *u;
+ int r;
+
+ assert(link);
+
+ r = varlink_get_peer_pid(link, &pid);
+ if (r < 0)
+ return r;
+
+ u = manager_get_unit_by_pid(m, pid);
+ if (!u)
+ return varlink_error(link, VARLINK_ERROR_PERMISSION_DENIED, NULL);
+
+ /* This is meant to be a deterrent and not actual security. The alternative is to check for the systemd-oom
+ * user that this unit runs as, but NSS lookups are blocking and not allowed from PID 1. */
+ if (!streq(u->id, "systemd-oomd.service"))
+ return varlink_error(link, VARLINK_ERROR_PERMISSION_DENIED, NULL);
+
+ if (json_variant_elements(parameters) > 0)
+ return varlink_error_invalid_parameter(link, parameters);
+
+ /* We only take one subscriber for this method so return an error if there's already an existing one.
+ * This shouldn't happen since systemd-oomd is the only client of this method. */
+ if (FLAGS_SET(flags, VARLINK_METHOD_MORE) && m->managed_oom_varlink)
+ return varlink_error(link, VARLINK_ERROR_SUBSCRIPTION_TAKEN, NULL);
+
+ r = build_managed_oom_cgroups_json(m, &v);
+ if (r < 0)
+ return r;
+
+ if (!FLAGS_SET(flags, VARLINK_METHOD_MORE))
+ return varlink_reply(link, v);
+
+ assert(!m->managed_oom_varlink);
+ m->managed_oom_varlink = varlink_ref(link);
+ return varlink_notify(m->managed_oom_varlink, v);
+}
+
+static int manager_varlink_send_managed_oom_initial(Manager *m) {
+ _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+ int r;
+
+ assert(m);
+
+ if (MANAGER_IS_SYSTEM(m))
+ return 0;
+
+ assert(m->managed_oom_varlink);
+
+ r = build_managed_oom_cgroups_json(m, &v);
+ if (r < 0)
+ return r;
+
+ return varlink_send(m->managed_oom_varlink, "io.systemd.oom.ReportManagedOOMCGroups", v);
+}
+
+static int vl_method_get_user_record(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+
+ static const JsonDispatch dispatch_table[] = {
+ { "uid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(LookupParameters, uid), 0 },
+ { "userName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, user_name), JSON_SAFE },
+ { "service", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, service), 0 },
+ {}
+ };
+
+ _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+ LookupParameters p = {
+ .uid = UID_INVALID,
+ };
+ _cleanup_free_ char *found_name = NULL;
+ uid_t found_uid = UID_INVALID, uid;
+ Manager *m = ASSERT_PTR(userdata);
+ const char *un;
+ int r;
+
+ assert(parameters);
+
+ r = json_dispatch(parameters, dispatch_table, NULL, 0, &p);
+ if (r < 0)
+ return r;
+
+ if (!streq_ptr(p.service, "io.systemd.DynamicUser"))
+ return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL);
+
+ if (uid_is_valid(p.uid))
+ r = dynamic_user_lookup_uid(m, p.uid, &found_name);
+ else if (p.user_name)
+ r = dynamic_user_lookup_name(m, p.user_name, &found_uid);
+ else {
+ DynamicUser *d;
+
+ HASHMAP_FOREACH(d, m->dynamic_users) {
+ r = dynamic_user_current(d, &uid);
+ if (r == -EAGAIN) /* not realized yet? */
+ continue;
+ if (r < 0)
+ return r;
+
+ if (!user_match_lookup_parameters(&p, d->name, uid))
+ continue;
+
+ if (v) {
+ r = varlink_notify(link, v);
+ if (r < 0)
+ return r;
+
+ v = json_variant_unref(v);
+ }
+
+ r = build_user_json(d->name, uid, &v);
+ if (r < 0)
+ return r;
+ }
+
+ if (!v)
+ return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
+
+ return varlink_reply(link, v);
+ }
+ if (r == -ESRCH)
+ return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
+ if (r < 0)
+ return r;
+
+ uid = uid_is_valid(found_uid) ? found_uid : p.uid;
+ un = found_name ?: p.user_name;
+
+ if (!user_match_lookup_parameters(&p, un, uid))
+ return varlink_error(link, "io.systemd.UserDatabase.ConflictingRecordFound", NULL);
+
+ r = build_user_json(un, uid, &v);
+ if (r < 0)
+ return r;
+
+ return varlink_reply(link, v);
+}
+
+static int build_group_json(const char *group_name, gid_t gid, JsonVariant **ret) {
+ assert(group_name);
+ assert(gid_is_valid(gid));
+ assert(ret);
+
+ return json_build(ret, JSON_BUILD_OBJECT(
+ JSON_BUILD_PAIR("record", JSON_BUILD_OBJECT(
+ JSON_BUILD_PAIR("groupName", JSON_BUILD_STRING(group_name)),
+ JSON_BUILD_PAIR("description", JSON_BUILD_CONST_STRING("Dynamic Group")),
+ JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(gid)),
+ JSON_BUILD_PAIR("service", JSON_BUILD_CONST_STRING("io.systemd.DynamicUser")),
+ JSON_BUILD_PAIR("disposition", JSON_BUILD_CONST_STRING("dynamic"))))));
+ }
+
+static bool group_match_lookup_parameters(LookupParameters *p, const char *name, gid_t gid) {
+ assert(p);
+
+ if (p->group_name && !streq(name, p->group_name))
+ return false;
+
+ if (gid_is_valid(p->gid) && gid != p->gid)
+ return false;
+
+ return true;
+}
+
+static int vl_method_get_group_record(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+
+ static const JsonDispatch dispatch_table[] = {
+ { "gid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(LookupParameters, gid), 0 },
+ { "groupName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, group_name), JSON_SAFE },
+ { "service", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, service), 0 },
+ {}
+ };
+
+ _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+ LookupParameters p = {
+ .gid = GID_INVALID,
+ };
+ _cleanup_free_ char *found_name = NULL;
+ uid_t found_gid = GID_INVALID, gid;
+ Manager *m = ASSERT_PTR(userdata);
+ const char *gn;
+ int r;
+
+ assert(parameters);
+
+ r = json_dispatch(parameters, dispatch_table, NULL, 0, &p);
+ if (r < 0)
+ return r;
+
+ if (!streq_ptr(p.service, "io.systemd.DynamicUser"))
+ return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL);
+
+ if (gid_is_valid(p.gid))
+ r = dynamic_user_lookup_uid(m, (uid_t) p.gid, &found_name);
+ else if (p.group_name)
+ r = dynamic_user_lookup_name(m, p.group_name, (uid_t*) &found_gid);
+ else {
+ DynamicUser *d;
+
+ HASHMAP_FOREACH(d, m->dynamic_users) {
+ uid_t uid;
+
+ r = dynamic_user_current(d, &uid);
+ if (r == -EAGAIN)
+ continue;
+ if (r < 0)
+ return r;
+
+ if (!group_match_lookup_parameters(&p, d->name, (gid_t) uid))
+ continue;
+
+ if (v) {
+ r = varlink_notify(link, v);
+ if (r < 0)
+ return r;
+
+ v = json_variant_unref(v);
+ }
+
+ r = build_group_json(d->name, (gid_t) uid, &v);
+ if (r < 0)
+ return r;
+ }
+
+ if (!v)
+ return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
+
+ return varlink_reply(link, v);
+ }
+ if (r == -ESRCH)
+ return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
+ if (r < 0)
+ return r;
+
+ gid = gid_is_valid(found_gid) ? found_gid : p.gid;
+ gn = found_name ?: p.group_name;
+
+ if (!group_match_lookup_parameters(&p, gn, gid))
+ return varlink_error(link, "io.systemd.UserDatabase.ConflictingRecordFound", NULL);
+
+ r = build_group_json(gn, gid, &v);
+ if (r < 0)
+ return r;
+
+ return varlink_reply(link, v);
+}
+
+static int vl_method_get_memberships(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+
+ static const JsonDispatch dispatch_table[] = {
+ { "userName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, user_name), JSON_SAFE },
+ { "groupName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, group_name), JSON_SAFE },
+ { "service", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, service), 0 },
+ {}
+ };
+
+ LookupParameters p = {};
+ int r;
+
+ assert(parameters);
+
+ r = json_dispatch(parameters, dispatch_table, NULL, 0, &p);
+ if (r < 0)
+ return r;
+
+ if (!streq_ptr(p.service, "io.systemd.DynamicUser"))
+ return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL);
+
+ /* We don't support auxiliary groups with dynamic users. */
+ return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
+}
+
+static void vl_disconnect(VarlinkServer *s, Varlink *link, void *userdata) {
+ Manager *m = ASSERT_PTR(userdata);
+
+ assert(s);
+ assert(link);
+
+ if (link == m->managed_oom_varlink)
+ m->managed_oom_varlink = varlink_unref(link);
+}
+
+static int manager_varlink_init_system(Manager *m) {
+ _cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL;
+ int r;
+
+ assert(m);
+
+ if (m->varlink_server)
+ return 1;
+
+ if (!MANAGER_IS_SYSTEM(m))
+ return 0;
+
+ r = manager_setup_varlink_server(m, &s);
+ if (r < 0)
+ return log_error_errno(r, "Failed to set up varlink server: %m");
+
+ if (!MANAGER_IS_TEST_RUN(m)) {
+ (void) mkdir_p_label("/run/systemd/userdb", 0755);
+
+ r = varlink_server_listen_address(s, "/run/systemd/userdb/io.systemd.DynamicUser", 0666);
+ if (r < 0)
+ return log_error_errno(r, "Failed to bind to varlink socket: %m");
+
+ r = varlink_server_listen_address(s, VARLINK_ADDR_PATH_MANAGED_OOM_SYSTEM, 0666);
+ if (r < 0)
+ return log_error_errno(r, "Failed to bind to varlink socket: %m");
+ }
+
+ r = varlink_server_attach_event(s, m->event, SD_EVENT_PRIORITY_NORMAL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to attach varlink connection to event loop: %m");
+
+ m->varlink_server = TAKE_PTR(s);
+ return 1;
+}
+
+static int vl_reply(Varlink *link, JsonVariant *parameters, const char *error_id, VarlinkReplyFlags flags, void *userdata) {
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ if (error_id)
+ log_debug("varlink systemd-oomd client error: %s", error_id);
+
+ if (FLAGS_SET(flags, VARLINK_REPLY_ERROR) && FLAGS_SET(flags, VARLINK_REPLY_LOCAL)) {
+ /* Varlink connection was closed, likely because of systemd-oomd restart. Let's try to
+ * reconnect and send the initial ManagedOOM update again. */
+
+ m->managed_oom_varlink = varlink_unref(link);
+
+ log_debug("Reconnecting to %s", VARLINK_ADDR_PATH_MANAGED_OOM_USER);
+
+ r = manager_varlink_init(m);
+ if (r <= 0)
+ return r;
+ }
+
+ return 0;
+}
+
+static int manager_varlink_init_user(Manager *m) {
+ _cleanup_(varlink_close_unrefp) Varlink *link = NULL;
+ int r;
+
+ assert(m);
+
+ if (m->managed_oom_varlink)
+ return 1;
+
+ if (MANAGER_IS_TEST_RUN(m))
+ return 0;
+
+ r = varlink_connect_address(&link, VARLINK_ADDR_PATH_MANAGED_OOM_USER);
+ if (r == -ENOENT || ERRNO_IS_DISCONNECT(r)) {
+ log_debug("systemd-oomd varlink unix socket not found, skipping user manager varlink setup");
+ return 0;
+ }
+ if (r < 0)
+ return log_error_errno(r, "Failed to connect to %s: %m", VARLINK_ADDR_PATH_MANAGED_OOM_USER);
+
+ varlink_set_userdata(link, m);
+
+ r = varlink_bind_reply(link, vl_reply);
+ if (r < 0)
+ return r;
+
+ r = varlink_attach_event(link, m->event, SD_EVENT_PRIORITY_NORMAL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to attach varlink connection to event loop: %m");
+
+ m->managed_oom_varlink = TAKE_PTR(link);
+
+ /* Queue the initial ManagedOOM update. */
+ (void) manager_varlink_send_managed_oom_initial(m);
+
+ return 1;
+}
+
+int manager_setup_varlink_server(Manager *m, VarlinkServer **ret) {
+ _cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL;
+ int r;
+
+ assert(m);
+ assert(ret);
+
+ r = varlink_server_new(&s, VARLINK_SERVER_ACCOUNT_UID|VARLINK_SERVER_INHERIT_USERDATA);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to allocate varlink server object: %m");
+
+ varlink_server_set_userdata(s, m);
+
+ r = varlink_server_bind_method_many(
+ s,
+ "io.systemd.UserDatabase.GetUserRecord", vl_method_get_user_record,
+ "io.systemd.UserDatabase.GetGroupRecord", vl_method_get_group_record,
+ "io.systemd.UserDatabase.GetMemberships", vl_method_get_memberships,
+ "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups", vl_method_subscribe_managed_oom_cgroups);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to register varlink methods: %m");
+
+ r = varlink_server_bind_disconnect(s, vl_disconnect);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to register varlink disconnect handler: %m");
+
+ *ret = TAKE_PTR(s);
+ return 0;
+}
+
+int manager_varlink_init(Manager *m) {
+ return MANAGER_IS_SYSTEM(m) ? manager_varlink_init_system(m) : manager_varlink_init_user(m);
+}
+
+void manager_varlink_done(Manager *m) {
+ assert(m);
+
+ /* Explicitly close the varlink connection to oomd. Note we first take the varlink connection out of
+ * the manager, and only then disconnect it — in two steps – so that we don't end up accidentally
+ * unreffing it twice. After all, closing the connection might cause the disconnect handler we
+ * installed (vl_disconnect() above) to be called, where we will unref it too. */
+ varlink_close_unref(TAKE_PTR(m->managed_oom_varlink));
+
+ m->varlink_server = varlink_server_unref(m->varlink_server);
+ m->managed_oom_varlink = varlink_close_unref(m->managed_oom_varlink);
+}
diff --git a/src/core/core-varlink.h b/src/core/core-varlink.h
new file mode 100644
index 0000000..7f810d1
--- /dev/null
+++ b/src/core/core-varlink.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "manager.h"
+
+int manager_varlink_init(Manager *m);
+void manager_varlink_done(Manager *m);
+
+/* Creates a new VarlinkServer and binds methods. Does not set up sockets or attach events.
+ * Used for manager serialize/deserialize. */
+int manager_setup_varlink_server(Manager *m, VarlinkServer **ret_s);
+
+/* The manager is expected to send an update to systemd-oomd if one of the following occurs:
+ * - The value of ManagedOOM*= properties change
+ * - A unit with ManagedOOM*= properties changes unit active state */
+int manager_varlink_send_managed_oom_update(Unit *u);
diff --git a/src/core/crash-handler.c b/src/core/crash-handler.c
new file mode 100644
index 0000000..561b7fc
--- /dev/null
+++ b/src/core/crash-handler.c
@@ -0,0 +1,166 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/reboot.h>
+
+#include "crash-handler.h"
+#include "exit-status.h"
+#include "macro.h"
+#include "main.h"
+#include "missing_syscall.h"
+#include "process-util.h"
+#include "raw-clone.h"
+#include "rlimit-util.h"
+#include "signal-util.h"
+#include "terminal-util.h"
+#include "virt.h"
+
+_noreturn_ void freeze_or_exit_or_reboot(void) {
+
+ /* If we are running in a container, let's prefer exiting, after all we can propagate an exit code to
+ * the container manager, and thus inform it that something went wrong. */
+ if (detect_container() > 0) {
+ log_emergency("Exiting PID 1...");
+ _exit(EXIT_EXCEPTION);
+ }
+
+ if (arg_crash_reboot) {
+ log_notice("Rebooting in 10s...");
+ (void) sleep(10);
+
+ log_notice("Rebooting now...");
+ (void) reboot(RB_AUTOBOOT);
+ log_emergency_errno(errno, "Failed to reboot: %m");
+ }
+
+ log_emergency("Freezing execution.");
+ sync();
+ freeze();
+}
+
+_noreturn_ static void crash(int sig, siginfo_t *siginfo, void *context) {
+ struct sigaction sa;
+ pid_t pid;
+
+ /* NB: 💣 💣 💣 This is a signal handler, most likely executed in a situation where we have corrupted
+ * memory. Thus: please avoid any libc memory allocation here, or any functions that internally use
+ * memory allocation, as we cannot rely on memory allocation still working at this point! (Note that
+ * memory allocation is not async-signal-safe anyway — see signal-safety(7) for details —, and thus
+ * is not permissible in signal handlers.) */
+
+ if (getpid_cached() != 1)
+ /* Pass this on immediately, if this is not PID 1 */
+ (void) raise(sig);
+ else if (!arg_dump_core)
+ log_emergency("Caught <%s>, not dumping core.", signal_to_string(sig));
+ else {
+ sa = (struct sigaction) {
+ .sa_handler = nop_signal_handler,
+ .sa_flags = SA_NOCLDSTOP|SA_RESTART,
+ };
+
+ /* We want to wait for the core process, hence let's enable SIGCHLD */
+ (void) sigaction(SIGCHLD, &sa, NULL);
+
+ pid = raw_clone(SIGCHLD);
+ if (pid < 0)
+ log_emergency_errno(errno, "Caught <%s>, cannot fork for core dump: %m", signal_to_string(sig));
+ else if (pid == 0) {
+ /* Enable default signal handler for core dump */
+
+ sa = (struct sigaction) {
+ .sa_handler = SIG_DFL,
+ };
+ (void) sigaction(sig, &sa, NULL);
+
+ /* Don't limit the coredump size */
+ (void) setrlimit(RLIMIT_CORE, &RLIMIT_MAKE_CONST(RLIM_INFINITY));
+
+ /* Just to be sure... */
+ (void) chdir("/");
+
+ /* Raise the signal again */
+ pid = raw_getpid();
+ (void) kill(pid, sig); /* raise() would kill the parent */
+
+ assert_not_reached();
+ _exit(EXIT_EXCEPTION);
+ } else {
+ siginfo_t status;
+ int r;
+
+ if (siginfo) {
+ if (siginfo->si_pid == 0)
+ log_emergency("Caught <%s> from unknown sender process.", signal_to_string(sig));
+ else if (siginfo->si_pid == 1)
+ log_emergency("Caught <%s> from our own process.", signal_to_string(sig));
+ else
+ log_emergency("Caught <%s> from PID "PID_FMT".", signal_to_string(sig), siginfo->si_pid);
+ }
+
+ /* Order things nicely. */
+ r = wait_for_terminate(pid, &status);
+ if (r < 0)
+ log_emergency_errno(r, "Caught <%s>, waitpid() failed: %m", signal_to_string(sig));
+ else if (status.si_code != CLD_DUMPED) {
+ const char *s = status.si_code == CLD_EXITED
+ ? exit_status_to_string(status.si_status, EXIT_STATUS_LIBC)
+ : signal_to_string(status.si_status);
+
+ log_emergency("Caught <%s>, core dump failed (child "PID_FMT", code=%s, status=%i/%s).",
+ signal_to_string(sig),
+ pid,
+ sigchld_code_to_string(status.si_code),
+ status.si_status, strna(s));
+ } else
+ log_emergency("Caught <%s>, dumped core as pid "PID_FMT".",
+ signal_to_string(sig), pid);
+ }
+ }
+
+ if (arg_crash_chvt >= 0)
+ (void) chvt(arg_crash_chvt);
+
+ sa = (struct sigaction) {
+ .sa_handler = SIG_IGN,
+ .sa_flags = SA_NOCLDSTOP|SA_NOCLDWAIT|SA_RESTART,
+ };
+
+ /* Let the kernel reap children for us */
+ (void) sigaction(SIGCHLD, &sa, NULL);
+
+ if (arg_crash_shell) {
+ log_notice("Executing crash shell in 10s...");
+ (void) sleep(10);
+
+ pid = raw_clone(SIGCHLD);
+ if (pid < 0)
+ log_emergency_errno(errno, "Failed to fork off crash shell: %m");
+ else if (pid == 0) {
+ (void) setsid();
+ (void) make_console_stdio();
+ (void) rlimit_nofile_safe();
+ (void) execle("/bin/sh", "/bin/sh", NULL, environ);
+
+ log_emergency_errno(errno, "execle() failed: %m");
+ _exit(EXIT_EXCEPTION);
+ } else {
+ log_info("Spawned crash shell as PID "PID_FMT".", pid);
+ (void) wait_for_terminate(pid, NULL);
+ }
+ }
+
+ freeze_or_exit_or_reboot();
+}
+
+void install_crash_handler(void) {
+ static const struct sigaction sa = {
+ .sa_sigaction = crash,
+ .sa_flags = SA_NODEFER | SA_SIGINFO, /* So that we can raise the signal again from the signal handler */
+ };
+ int r;
+
+ /* We ignore the return value here, since, we don't mind if we cannot set up a crash handler */
+ r = sigaction_many(&sa, SIGNALS_CRASH_HANDLER);
+ if (r < 0)
+ log_debug_errno(r, "I had trouble setting up the crash handler, ignoring: %m");
+}
diff --git a/src/core/crash-handler.h b/src/core/crash-handler.h
new file mode 100644
index 0000000..dc14335
--- /dev/null
+++ b/src/core/crash-handler.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "macro.h"
+
+_noreturn_ void freeze_or_exit_or_reboot(void);
+void install_crash_handler(void);
diff --git a/src/core/dbus-automount.c b/src/core/dbus-automount.c
new file mode 100644
index 0000000..881bf50
--- /dev/null
+++ b/src/core/dbus-automount.c
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "automount.h"
+#include "bus-get-properties.h"
+#include "dbus-automount.h"
+#include "dbus-util.h"
+#include "string-util.h"
+
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, automount_result, AutomountResult);
+
+const sd_bus_vtable bus_automount_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ SD_BUS_PROPERTY("Where", "s", NULL, offsetof(Automount, where), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ExtraOptions", "s", NULL, offsetof(Automount, extra_options), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DirectoryMode", "u", bus_property_get_mode, offsetof(Automount, directory_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Automount, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("TimeoutIdleUSec", "t", bus_property_get_usec, offsetof(Automount, timeout_idle_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_VTABLE_END
+};
+
+static int bus_automount_set_transient_property(
+ Automount *a,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ Unit *u = UNIT(a);
+
+ assert(a);
+ assert(name);
+ assert(message);
+
+ flags |= UNIT_PRIVATE;
+
+ if (streq(name, "Where"))
+ return bus_set_transient_path(u, name, &a->where, message, flags, error);
+
+ if (streq(name, "ExtraOptions"))
+ return bus_set_transient_string(u, name, &a->extra_options, message, flags, error);
+
+ if (streq(name, "TimeoutIdleUSec"))
+ return bus_set_transient_usec_fix_0(u, name, &a->timeout_idle_usec, message, flags, error);
+
+ if (streq(name, "DirectoryMode"))
+ return bus_set_transient_mode_t(u, name, &a->directory_mode, message, flags, error);
+
+ return 0;
+}
+
+int bus_automount_set_property(
+ Unit *u,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ Automount *a = AUTOMOUNT(u);
+
+ assert(a);
+ assert(name);
+ assert(message);
+
+ if (u->transient && u->load_state == UNIT_STUB) /* This is a transient unit? let's load a little more */
+ return bus_automount_set_transient_property(a, name, message, flags, error);
+
+ return 0;
+}
diff --git a/src/core/dbus-automount.h b/src/core/dbus-automount.h
new file mode 100644
index 0000000..cfceaec
--- /dev/null
+++ b/src/core/dbus-automount.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "unit.h"
+
+extern const sd_bus_vtable bus_automount_vtable[];
+
+int bus_automount_set_property(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c
new file mode 100644
index 0000000..a57b7e8
--- /dev/null
+++ b/src/core/dbus-cgroup.c
@@ -0,0 +1,2073 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <arpa/inet.h>
+
+#include "af-list.h"
+#include "alloc-util.h"
+#include "bpf-firewall.h"
+#include "bpf-foreign.h"
+#include "bus-get-properties.h"
+#include "cgroup-util.h"
+#include "cgroup.h"
+#include "core-varlink.h"
+#include "dbus-cgroup.h"
+#include "dbus-util.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "in-addr-prefix-util.h"
+#include "ip-protocol-list.h"
+#include "limits-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "percent-util.h"
+#include "socket-util.h"
+
+BUS_DEFINE_PROPERTY_GET(bus_property_get_tasks_max, "t", TasksMax, tasks_max_resolve);
+
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_cgroup_device_policy, cgroup_device_policy, CGroupDevicePolicy);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_managed_oom_mode, managed_oom_mode, ManagedOOMMode);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_managed_oom_preference, managed_oom_preference, ManagedOOMPreference);
+
+static int property_get_cgroup_mask(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ CGroupMask *mask = userdata;
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "s");
+ if (r < 0)
+ return r;
+
+ for (CGroupController ctrl = 0; ctrl < _CGROUP_CONTROLLER_MAX; ctrl++) {
+ if ((*mask & CGROUP_CONTROLLER_TO_MASK(ctrl)) == 0)
+ continue;
+
+ r = sd_bus_message_append(reply, "s", cgroup_controller_to_string(ctrl));
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_delegate_controllers(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ CGroupContext *c = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ if (!c->delegate)
+ return sd_bus_message_append(reply, "as", 0);
+
+ return property_get_cgroup_mask(bus, path, interface, property, reply, &c->delegate_controllers, error);
+}
+
+static int property_get_cpuset(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ CPUSet *cpus = ASSERT_PTR(userdata);
+ _cleanup_free_ uint8_t *array = NULL;
+ size_t allocated;
+
+ assert(bus);
+ assert(reply);
+
+ (void) cpu_set_to_dbus(cpus, &array, &allocated);
+ return sd_bus_message_append_array(reply, 'y', array, allocated);
+}
+
+static int property_get_io_device_weight(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ CGroupContext *c = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(st)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(device_weights, w, c->io_device_weights) {
+ r = sd_bus_message_append(reply, "(st)", w->path, w->weight);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_io_device_limits(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ CGroupContext *c = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(st)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(device_limits, l, c->io_device_limits) {
+ CGroupIOLimitType type;
+
+ type = cgroup_io_limit_type_from_string(property);
+ if (type < 0 || l->limits[type] == cgroup_io_limit_defaults[type])
+ continue;
+
+ r = sd_bus_message_append(reply, "(st)", l->path, l->limits[type]);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_io_device_latency(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ CGroupContext *c = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(st)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(device_latencies, l, c->io_device_latencies) {
+ r = sd_bus_message_append(reply, "(st)", l->path, l->target_usec);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_blockio_device_weight(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ CGroupContext *c = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(st)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
+ r = sd_bus_message_append(reply, "(st)", w->path, w->weight);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_blockio_device_bandwidths(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ CGroupContext *c = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(st)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
+ uint64_t v;
+
+ if (streq(property, "BlockIOReadBandwidth"))
+ v = b->rbps;
+ else
+ v = b->wbps;
+
+ if (v == CGROUP_LIMIT_MAX)
+ continue;
+
+ r = sd_bus_message_append(reply, "(st)", b->path, v);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_device_allow(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ CGroupContext *c = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(device_allow, a, c->device_allow) {
+ unsigned k = 0;
+ char rwm[4];
+
+ if (a->r)
+ rwm[k++] = 'r';
+ if (a->w)
+ rwm[k++] = 'w';
+ if (a->m)
+ rwm[k++] = 'm';
+
+ rwm[k] = 0;
+
+ r = sd_bus_message_append(reply, "(ss)", a->path, rwm);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_ip_address_access(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Set **prefixes = ASSERT_PTR(userdata);
+ struct in_addr_prefix *i;
+ int r;
+
+ r = sd_bus_message_open_container(reply, 'a', "(iayu)");
+ if (r < 0)
+ return r;
+
+ SET_FOREACH(i, *prefixes) {
+
+ r = sd_bus_message_open_container(reply, 'r', "iayu");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(reply, "i", i->family);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append_array(reply, 'y', &i->address, FAMILY_ADDRESS_SIZE(i->family));
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(reply, "u", (uint32_t) i->prefixlen);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_bpf_foreign_program(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+ CGroupContext *c = userdata;
+ int r;
+
+ r = sd_bus_message_open_container(reply, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(programs, p, c->bpf_foreign_programs) {
+ const char *attach_type = bpf_cgroup_attach_type_to_string(p->attach_type);
+
+ r = sd_bus_message_append(reply, "(ss)", attach_type, p->bpffs_path);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_socket_bind(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ CGroupSocketBindItem **items = ASSERT_PTR(userdata);
+ int r;
+
+ r = sd_bus_message_open_container(reply, 'a', "(iiqq)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(socket_bind_items, i, *items) {
+ r = sd_bus_message_append(reply, "(iiqq)", i->address_family, i->ip_protocol, i->nr_ports, i->port_min);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_restrict_network_interfaces(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+ int r;
+ CGroupContext *c = ASSERT_PTR(userdata);
+ char *iface;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'r', "bas");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(reply, "b", c->restrict_network_interfaces_is_allow_list);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_open_container(reply, 'a', "s");
+ if (r < 0)
+ return r;
+
+ SET_FOREACH(iface, c->restrict_network_interfaces) {
+ r = sd_bus_message_append(reply, "s", iface);
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+
+ return sd_bus_message_close_container(reply);
+}
+
+const sd_bus_vtable bus_cgroup_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ SD_BUS_PROPERTY("Delegate", "b", bus_property_get_bool, offsetof(CGroupContext, delegate), 0),
+ SD_BUS_PROPERTY("DelegateControllers", "as", property_get_delegate_controllers, 0, 0),
+ SD_BUS_PROPERTY("CPUAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, cpu_accounting), 0),
+ SD_BUS_PROPERTY("CPUWeight", "t", NULL, offsetof(CGroupContext, cpu_weight), 0),
+ SD_BUS_PROPERTY("StartupCPUWeight", "t", NULL, offsetof(CGroupContext, startup_cpu_weight), 0),
+ SD_BUS_PROPERTY("CPUShares", "t", NULL, offsetof(CGroupContext, cpu_shares), 0),
+ SD_BUS_PROPERTY("StartupCPUShares", "t", NULL, offsetof(CGroupContext, startup_cpu_shares), 0),
+ SD_BUS_PROPERTY("CPUQuotaPerSecUSec", "t", bus_property_get_usec, offsetof(CGroupContext, cpu_quota_per_sec_usec), 0),
+ SD_BUS_PROPERTY("CPUQuotaPeriodUSec", "t", bus_property_get_usec, offsetof(CGroupContext, cpu_quota_period_usec), 0),
+ SD_BUS_PROPERTY("AllowedCPUs", "ay", property_get_cpuset, offsetof(CGroupContext, cpuset_cpus), 0),
+ SD_BUS_PROPERTY("StartupAllowedCPUs", "ay", property_get_cpuset, offsetof(CGroupContext, startup_cpuset_cpus), 0),
+ SD_BUS_PROPERTY("AllowedMemoryNodes", "ay", property_get_cpuset, offsetof(CGroupContext, cpuset_mems), 0),
+ SD_BUS_PROPERTY("StartupAllowedMemoryNodes", "ay", property_get_cpuset, offsetof(CGroupContext, startup_cpuset_mems), 0),
+ SD_BUS_PROPERTY("IOAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, io_accounting), 0),
+ SD_BUS_PROPERTY("IOWeight", "t", NULL, offsetof(CGroupContext, io_weight), 0),
+ SD_BUS_PROPERTY("StartupIOWeight", "t", NULL, offsetof(CGroupContext, startup_io_weight), 0),
+ SD_BUS_PROPERTY("IODeviceWeight", "a(st)", property_get_io_device_weight, 0, 0),
+ SD_BUS_PROPERTY("IOReadBandwidthMax", "a(st)", property_get_io_device_limits, 0, 0),
+ SD_BUS_PROPERTY("IOWriteBandwidthMax", "a(st)", property_get_io_device_limits, 0, 0),
+ SD_BUS_PROPERTY("IOReadIOPSMax", "a(st)", property_get_io_device_limits, 0, 0),
+ SD_BUS_PROPERTY("IOWriteIOPSMax", "a(st)", property_get_io_device_limits, 0, 0),
+ SD_BUS_PROPERTY("IODeviceLatencyTargetUSec", "a(st)", property_get_io_device_latency, 0, 0),
+ SD_BUS_PROPERTY("BlockIOAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, blockio_accounting), 0),
+ SD_BUS_PROPERTY("BlockIOWeight", "t", NULL, offsetof(CGroupContext, blockio_weight), 0),
+ SD_BUS_PROPERTY("StartupBlockIOWeight", "t", NULL, offsetof(CGroupContext, startup_blockio_weight), 0),
+ SD_BUS_PROPERTY("BlockIODeviceWeight", "a(st)", property_get_blockio_device_weight, 0, 0),
+ SD_BUS_PROPERTY("BlockIOReadBandwidth", "a(st)", property_get_blockio_device_bandwidths, 0, 0),
+ SD_BUS_PROPERTY("BlockIOWriteBandwidth", "a(st)", property_get_blockio_device_bandwidths, 0, 0),
+ SD_BUS_PROPERTY("MemoryAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, memory_accounting), 0),
+ SD_BUS_PROPERTY("DefaultMemoryLow", "t", NULL, offsetof(CGroupContext, default_memory_low), 0),
+ SD_BUS_PROPERTY("DefaultMemoryMin", "t", NULL, offsetof(CGroupContext, default_memory_min), 0),
+ SD_BUS_PROPERTY("MemoryMin", "t", NULL, offsetof(CGroupContext, memory_min), 0),
+ SD_BUS_PROPERTY("MemoryLow", "t", NULL, offsetof(CGroupContext, memory_low), 0),
+ SD_BUS_PROPERTY("MemoryHigh", "t", NULL, offsetof(CGroupContext, memory_high), 0),
+ SD_BUS_PROPERTY("MemoryMax", "t", NULL, offsetof(CGroupContext, memory_max), 0),
+ SD_BUS_PROPERTY("MemorySwapMax", "t", NULL, offsetof(CGroupContext, memory_swap_max), 0),
+ SD_BUS_PROPERTY("MemoryLimit", "t", NULL, offsetof(CGroupContext, memory_limit), 0),
+ SD_BUS_PROPERTY("DevicePolicy", "s", property_get_cgroup_device_policy, offsetof(CGroupContext, device_policy), 0),
+ SD_BUS_PROPERTY("DeviceAllow", "a(ss)", property_get_device_allow, 0, 0),
+ SD_BUS_PROPERTY("TasksAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, tasks_accounting), 0),
+ SD_BUS_PROPERTY("TasksMax", "t", bus_property_get_tasks_max, offsetof(CGroupContext, tasks_max), 0),
+ SD_BUS_PROPERTY("IPAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, ip_accounting), 0),
+ SD_BUS_PROPERTY("IPAddressAllow", "a(iayu)", property_get_ip_address_access, offsetof(CGroupContext, ip_address_allow), 0),
+ SD_BUS_PROPERTY("IPAddressDeny", "a(iayu)", property_get_ip_address_access, offsetof(CGroupContext, ip_address_deny), 0),
+ SD_BUS_PROPERTY("IPIngressFilterPath", "as", NULL, offsetof(CGroupContext, ip_filters_ingress), 0),
+ SD_BUS_PROPERTY("IPEgressFilterPath", "as", NULL, offsetof(CGroupContext, ip_filters_egress), 0),
+ SD_BUS_PROPERTY("DisableControllers", "as", property_get_cgroup_mask, offsetof(CGroupContext, disable_controllers), 0),
+ SD_BUS_PROPERTY("ManagedOOMSwap", "s", property_get_managed_oom_mode, offsetof(CGroupContext, moom_swap), 0),
+ SD_BUS_PROPERTY("ManagedOOMMemoryPressure", "s", property_get_managed_oom_mode, offsetof(CGroupContext, moom_mem_pressure), 0),
+ SD_BUS_PROPERTY("ManagedOOMMemoryPressureLimit", "u", NULL, offsetof(CGroupContext, moom_mem_pressure_limit), 0),
+ SD_BUS_PROPERTY("ManagedOOMPreference", "s", property_get_managed_oom_preference, offsetof(CGroupContext, moom_preference), 0),
+ SD_BUS_PROPERTY("BPFProgram", "a(ss)", property_get_bpf_foreign_program, 0, 0),
+ SD_BUS_PROPERTY("SocketBindAllow", "a(iiqq)", property_get_socket_bind, offsetof(CGroupContext, socket_bind_allow), 0),
+ SD_BUS_PROPERTY("SocketBindDeny", "a(iiqq)", property_get_socket_bind, offsetof(CGroupContext, socket_bind_deny), 0),
+ SD_BUS_PROPERTY("RestrictNetworkInterfaces", "(bas)", property_get_restrict_network_interfaces, 0, 0),
+ SD_BUS_VTABLE_END
+};
+
+static int bus_cgroup_set_transient_property(
+ Unit *u,
+ CGroupContext *c,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ int r;
+
+ assert(u);
+ assert(c);
+ assert(name);
+ assert(message);
+
+ flags |= UNIT_PRIVATE;
+
+ if (streq(name, "Delegate")) {
+ int b;
+
+ if (!UNIT_VTABLE(u)->can_delegate)
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Delegation not available for unit type");
+
+ r = sd_bus_message_read(message, "b", &b);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->delegate = b;
+ c->delegate_controllers = b ? _CGROUP_MASK_ALL : 0;
+
+ unit_write_settingf(u, flags, name, "Delegate=%s", yes_no(b));
+ }
+
+ return 1;
+
+ } else if (STR_IN_SET(name, "DelegateControllers", "DisableControllers")) {
+ CGroupMask mask = 0;
+
+ if (streq(name, "DelegateControllers") && !UNIT_VTABLE(u)->can_delegate)
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Delegation not available for unit type");
+
+ r = sd_bus_message_enter_container(message, 'a', "s");
+ if (r < 0)
+ return r;
+
+ for (;;) {
+ CGroupController cc;
+ const char *t;
+
+ r = sd_bus_message_read(message, "s", &t);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ cc = cgroup_controller_from_string(t);
+ if (cc < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown cgroup controller '%s'", t);
+
+ mask |= CGROUP_CONTROLLER_TO_MASK(cc);
+ }
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *t = NULL;
+
+ r = cg_mask_to_string(mask, &t);
+ if (r < 0)
+ return r;
+
+ if (streq(name, "DelegateControllers")) {
+
+ c->delegate = true;
+ if (mask == 0)
+ c->delegate_controllers = 0;
+ else
+ c->delegate_controllers |= mask;
+
+ unit_write_settingf(u, flags, name, "Delegate=%s", strempty(t));
+
+ } else if (streq(name, "DisableControllers")) {
+
+ if (mask == 0)
+ c->disable_controllers = 0;
+ else
+ c->disable_controllers |= mask;
+
+ unit_write_settingf(u, flags, name, "%s=%s", name, strempty(t));
+ }
+ }
+
+ return 1;
+ } else if (STR_IN_SET(name, "IPIngressFilterPath", "IPEgressFilterPath")) {
+ char ***filters;
+ size_t n = 0;
+
+ filters = streq(name, "IPIngressFilterPath") ? &c->ip_filters_ingress : &c->ip_filters_egress;
+ r = sd_bus_message_enter_container(message, 'a', "s");
+ if (r < 0)
+ return r;
+
+ for (;;) {
+ const char *path;
+
+ r = sd_bus_message_read(message, "s", &path);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ if (!path_is_normalized(path) || !path_is_absolute(path))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= expects a normalized absolute path.", name);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags) && !strv_contains(*filters, path)) {
+ r = strv_extend(filters, path);
+ if (r < 0)
+ return log_oom();
+ }
+ n++;
+ }
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *buf = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+ size_t size = 0;
+
+ if (n == 0)
+ *filters = strv_free(*filters);
+
+ unit_invalidate_cgroup_bpf(u);
+ f = open_memstream_unlocked(&buf, &size);
+ if (!f)
+ return -ENOMEM;
+
+ fputs(name, f);
+ fputs("=\n", f);
+
+ STRV_FOREACH(entry, *filters)
+ fprintf(f, "%s=%s\n", name, *entry);
+
+ r = fflush_and_check(f);
+ if (r < 0)
+ return r;
+
+ unit_write_setting(u, flags, name, buf);
+
+ if (*filters) {
+ r = bpf_firewall_supported();
+ if (r < 0)
+ return r;
+ if (r != BPF_FIREWALL_SUPPORTED_WITH_MULTI) {
+ static bool warned = false;
+
+ log_full(warned ? LOG_DEBUG : LOG_WARNING,
+ "Transient unit %s configures an IP firewall with BPF, but the local system does not support BPF/cgroup firewalling with multiple filters.\n"
+ "Starting this unit will fail! (This warning is only shown for the first started transient unit using IP firewalling.)", u->id);
+ warned = true;
+ }
+ }
+ }
+
+ return 1;
+ } else if (streq(name, "BPFProgram")) {
+ const char *a, *p;
+ size_t n = 0;
+
+ r = sd_bus_message_enter_container(message, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(ss)", &a, &p)) > 0) {
+ int attach_type = bpf_cgroup_attach_type_from_string(a);
+ if (attach_type < 0)
+ return sd_bus_error_setf(
+ error,
+ SD_BUS_ERROR_INVALID_ARGS,
+ "%s expects a valid BPF attach type, got '%s'.",
+ name, a);
+
+ if (!path_is_normalized(p) || !path_is_absolute(p))
+ return sd_bus_error_setf(
+ error,
+ SD_BUS_ERROR_INVALID_ARGS,
+ "%s= expects a normalized absolute path.",
+ name);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ r = cgroup_add_bpf_foreign_program(c, attach_type, p);
+ if (r < 0)
+ return r;
+ }
+ n++;
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *buf = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+ size_t size = 0;
+
+ if (n == 0)
+ while (c->bpf_foreign_programs)
+ cgroup_context_remove_bpf_foreign_program(c, c->bpf_foreign_programs);
+
+ f = open_memstream_unlocked(&buf, &size);
+ if (!f)
+ return -ENOMEM;
+
+ fputs(name, f);
+ fputs("=\n", f);
+
+ LIST_FOREACH(programs, fp, c->bpf_foreign_programs)
+ fprintf(f, "%s=%s:%s\n", name,
+ bpf_cgroup_attach_type_to_string(fp->attach_type),
+ fp->bpffs_path);
+
+ r = fflush_and_check(f);
+ if (r < 0)
+ return r;
+
+ unit_write_setting(u, flags, name, buf);
+
+ if (c->bpf_foreign_programs) {
+ r = bpf_foreign_supported();
+ if (r < 0)
+ return r;
+ if (r == 0)
+ log_full(LOG_DEBUG,
+ "Transient unit %s configures a BPF program pinned to BPF "
+ "filesystem, but the local system does not support that.\n"
+ "Starting this unit will fail!", u->id);
+ }
+ }
+
+ return 1;
+ }
+
+ return 0;
+}
+
+static int bus_cgroup_set_boolean(
+ Unit *u,
+ const char *name,
+ bool *p,
+ CGroupMask mask,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ int b, r;
+
+ assert(p);
+
+ r = sd_bus_message_read(message, "b", &b);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ *p = b;
+ unit_invalidate_cgroup(u, mask);
+ unit_write_settingf(u, flags, name, "%s=%s", name, yes_no(b));
+ }
+
+ return 1;
+}
+
+#define BUS_DEFINE_SET_CGROUP_WEIGHT(function, mask, check, val) \
+ static int bus_cgroup_set_##function( \
+ Unit *u, \
+ const char *name, \
+ uint64_t *p, \
+ sd_bus_message *message, \
+ UnitWriteFlags flags, \
+ sd_bus_error *error) { \
+ \
+ uint64_t v; \
+ int r; \
+ \
+ assert(p); \
+ \
+ r = sd_bus_message_read(message, "t", &v); \
+ if (r < 0) \
+ return r; \
+ \
+ if (!check(v)) \
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \
+ "Value specified in %s is out of range", name); \
+ \
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \
+ *p = v; \
+ unit_invalidate_cgroup(u, mask); \
+ \
+ if (v == (val)) \
+ unit_write_settingf(u, flags, name, \
+ "%s=", name); \
+ else \
+ unit_write_settingf(u, flags, name, \
+ "%s=%" PRIu64, name, v); \
+ } \
+ \
+ return 1; \
+ }
+
+#define BUS_DEFINE_SET_CGROUP_LIMIT(function, mask, scale, minimum) \
+ static int bus_cgroup_set_##function( \
+ Unit *u, \
+ const char *name, \
+ uint64_t *p, \
+ sd_bus_message *message, \
+ UnitWriteFlags flags, \
+ sd_bus_error *error) { \
+ \
+ uint64_t v; \
+ int r; \
+ \
+ assert(p); \
+ \
+ r = sd_bus_message_read(message, "t", &v); \
+ if (r < 0) \
+ return r; \
+ \
+ if (v < minimum) \
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \
+ "Value specified in %s is out of range", name); \
+ \
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \
+ *p = v; \
+ unit_invalidate_cgroup(u, mask); \
+ \
+ if (v == CGROUP_LIMIT_MAX) \
+ unit_write_settingf(u, flags, name, \
+ "%s=infinity", name); \
+ else \
+ unit_write_settingf(u, flags, name, \
+ "%s=%" PRIu64, name, v); \
+ } \
+ \
+ return 1; \
+ } \
+ static int bus_cgroup_set_##function##_scale( \
+ Unit *u, \
+ const char *name, \
+ uint64_t *p, \
+ sd_bus_message *message, \
+ UnitWriteFlags flags, \
+ sd_bus_error *error) { \
+ \
+ uint64_t v; \
+ uint32_t raw; \
+ int r; \
+ \
+ assert(p); \
+ \
+ r = sd_bus_message_read(message, "u", &raw); \
+ if (r < 0) \
+ return r; \
+ \
+ v = scale(raw, UINT32_MAX); \
+ if (v < minimum || v >= UINT64_MAX) \
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \
+ "Value specified in %s is out of range", name); \
+ \
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \
+ *p = v; \
+ unit_invalidate_cgroup(u, mask); \
+ \
+ /* Prepare to chop off suffix */ \
+ assert_se(endswith(name, "Scale")); \
+ \
+ int scaled = UINT32_SCALE_TO_PERMYRIAD(raw); \
+ unit_write_settingf(u, flags, name, "%.*s=" PERMYRIAD_AS_PERCENT_FORMAT_STR, \
+ (int)(strlen(name) - strlen("Scale")), name, \
+ PERMYRIAD_AS_PERCENT_FORMAT_VAL(scaled)); \
+ } \
+ \
+ return 1; \
+ }
+
+DISABLE_WARNING_TYPE_LIMITS;
+BUS_DEFINE_SET_CGROUP_WEIGHT(cpu_shares, CGROUP_MASK_CPU, CGROUP_CPU_SHARES_IS_OK, CGROUP_CPU_SHARES_INVALID);
+BUS_DEFINE_SET_CGROUP_WEIGHT(io_weight, CGROUP_MASK_IO, CGROUP_WEIGHT_IS_OK, CGROUP_WEIGHT_INVALID);
+BUS_DEFINE_SET_CGROUP_WEIGHT(blockio_weight, CGROUP_MASK_BLKIO, CGROUP_BLKIO_WEIGHT_IS_OK, CGROUP_BLKIO_WEIGHT_INVALID);
+BUS_DEFINE_SET_CGROUP_LIMIT(memory, CGROUP_MASK_MEMORY, physical_memory_scale, 1);
+BUS_DEFINE_SET_CGROUP_LIMIT(memory_protection, CGROUP_MASK_MEMORY, physical_memory_scale, 0);
+BUS_DEFINE_SET_CGROUP_LIMIT(swap, CGROUP_MASK_MEMORY, physical_memory_scale, 0);
+REENABLE_WARNING;
+
+static int bus_cgroup_set_cpu_weight(
+ Unit *u,
+ const char *name,
+ uint64_t *p,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+ uint64_t v;
+ int r;
+ assert(p);
+ r = sd_bus_message_read(message, "t", &v);
+ if (r < 0)
+ return r;
+ if (!CGROUP_WEIGHT_IS_OK(v) && v != CGROUP_WEIGHT_IDLE)
+ return sd_bus_error_setf(
+ error, SD_BUS_ERROR_INVALID_ARGS, "Value specified in %s is out of range", name);
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ *p = v;
+ unit_invalidate_cgroup(u, CGROUP_MASK_CPU);
+ if (v == CGROUP_WEIGHT_INVALID)
+ unit_write_settingf(u, flags, name, "%s=", name);
+ else if (v == CGROUP_WEIGHT_IDLE)
+ unit_write_settingf(u, flags, name, "%s=idle", name);
+ else
+ unit_write_settingf(u, flags, name, "%s=%" PRIu64, name, v);
+ }
+ return 1;
+}
+
+static int bus_cgroup_set_tasks_max(
+ Unit *u,
+ const char *name,
+ TasksMax *p,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ uint64_t v;
+ int r;
+
+ assert(p);
+
+ r = sd_bus_message_read(message, "t", &v);
+ if (r < 0)
+ return r;
+
+ if (v < 1)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Value specified in %s is out of range", name);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ *p = (TasksMax) { .value = v, .scale = 0 }; /* When .scale==0, .value is the absolute value */
+ unit_invalidate_cgroup(u, CGROUP_MASK_PIDS);
+
+ if (v == CGROUP_LIMIT_MAX)
+ unit_write_settingf(u, flags, name,
+ "%s=infinity", name);
+ else
+ unit_write_settingf(u, flags, name,
+ "%s=%" PRIu64, name, v);
+ }
+
+ return 1;
+}
+
+static int bus_cgroup_set_tasks_max_scale(
+ Unit *u,
+ const char *name,
+ TasksMax *p,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ uint32_t v;
+ int r;
+
+ assert(p);
+
+ r = sd_bus_message_read(message, "u", &v);
+ if (r < 0)
+ return r;
+
+ if (v < 1 || v >= UINT32_MAX)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Value specified in %s is out of range", name);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ *p = (TasksMax) { v, UINT32_MAX }; /* .scale is not 0, so this is interpreted as v/UINT32_MAX. */
+ unit_invalidate_cgroup(u, CGROUP_MASK_PIDS);
+
+ uint32_t scaled = DIV_ROUND_UP((uint64_t) v * 100U, (uint64_t) UINT32_MAX);
+ unit_write_settingf(u, flags, name, "%s=%" PRIu32 ".%" PRIu32 "%%", "TasksMax",
+ scaled / 10, scaled % 10);
+ }
+
+ return 1;
+}
+
+int bus_cgroup_set_property(
+ Unit *u,
+ CGroupContext *c,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ CGroupIOLimitType iol_type;
+ int r;
+
+ assert(u);
+ assert(c);
+ assert(name);
+ assert(message);
+
+ flags |= UNIT_PRIVATE;
+
+ if (streq(name, "CPUAccounting"))
+ return bus_cgroup_set_boolean(u, name, &c->cpu_accounting, get_cpu_accounting_mask(), message, flags, error);
+
+ if (streq(name, "CPUWeight"))
+ return bus_cgroup_set_cpu_weight(u, name, &c->cpu_weight, message, flags, error);
+
+ if (streq(name, "StartupCPUWeight"))
+ return bus_cgroup_set_cpu_weight(u, name, &c->startup_cpu_weight, message, flags, error);
+
+ if (streq(name, "CPUShares"))
+ return bus_cgroup_set_cpu_shares(u, name, &c->cpu_shares, message, flags, error);
+
+ if (streq(name, "StartupCPUShares"))
+ return bus_cgroup_set_cpu_shares(u, name, &c->startup_cpu_shares, message, flags, error);
+
+ if (streq(name, "IOAccounting"))
+ return bus_cgroup_set_boolean(u, name, &c->io_accounting, CGROUP_MASK_IO, message, flags, error);
+
+ if (streq(name, "IOWeight"))
+ return bus_cgroup_set_io_weight(u, name, &c->io_weight, message, flags, error);
+
+ if (streq(name, "StartupIOWeight"))
+ return bus_cgroup_set_io_weight(u, name, &c->startup_io_weight, message, flags, error);
+
+ if (streq(name, "BlockIOAccounting"))
+ return bus_cgroup_set_boolean(u, name, &c->blockio_accounting, CGROUP_MASK_BLKIO, message, flags, error);
+
+ if (streq(name, "BlockIOWeight"))
+ return bus_cgroup_set_blockio_weight(u, name, &c->blockio_weight, message, flags, error);
+
+ if (streq(name, "StartupBlockIOWeight"))
+ return bus_cgroup_set_blockio_weight(u, name, &c->startup_blockio_weight, message, flags, error);
+
+ if (streq(name, "MemoryAccounting"))
+ return bus_cgroup_set_boolean(u, name, &c->memory_accounting, CGROUP_MASK_MEMORY, message, flags, error);
+
+ if (streq(name, "MemoryMin")) {
+ r = bus_cgroup_set_memory_protection(u, name, &c->memory_min, message, flags, error);
+ if (r > 0)
+ c->memory_min_set = true;
+ return r;
+ }
+
+ if (streq(name, "MemoryLow")) {
+ r = bus_cgroup_set_memory_protection(u, name, &c->memory_low, message, flags, error);
+ if (r > 0)
+ c->memory_low_set = true;
+ return r;
+ }
+
+ if (streq(name, "DefaultMemoryMin")) {
+ r = bus_cgroup_set_memory_protection(u, name, &c->default_memory_min, message, flags, error);
+ if (r > 0)
+ c->default_memory_min_set = true;
+ return r;
+ }
+
+ if (streq(name, "DefaultMemoryLow")) {
+ r = bus_cgroup_set_memory_protection(u, name, &c->default_memory_low, message, flags, error);
+ if (r > 0)
+ c->default_memory_low_set = true;
+ return r;
+ }
+
+ if (streq(name, "MemoryHigh"))
+ return bus_cgroup_set_memory(u, name, &c->memory_high, message, flags, error);
+
+ if (streq(name, "MemorySwapMax"))
+ return bus_cgroup_set_swap(u, name, &c->memory_swap_max, message, flags, error);
+
+ if (streq(name, "MemoryMax"))
+ return bus_cgroup_set_memory(u, name, &c->memory_max, message, flags, error);
+
+ if (streq(name, "MemoryLimit"))
+ return bus_cgroup_set_memory(u, name, &c->memory_limit, message, flags, error);
+
+ if (streq(name, "MemoryMinScale")) {
+ r = bus_cgroup_set_memory_protection_scale(u, name, &c->memory_min, message, flags, error);
+ if (r > 0)
+ c->memory_min_set = true;
+ return r;
+ }
+
+ if (streq(name, "MemoryLowScale")) {
+ r = bus_cgroup_set_memory_protection_scale(u, name, &c->memory_low, message, flags, error);
+ if (r > 0)
+ c->memory_low_set = true;
+ return r;
+ }
+
+ if (streq(name, "DefaultMemoryMinScale")) {
+ r = bus_cgroup_set_memory_protection_scale(u, name, &c->default_memory_min, message, flags, error);
+ if (r > 0)
+ c->default_memory_min_set = true;
+ return r;
+ }
+
+ if (streq(name, "DefaultMemoryLowScale")) {
+ r = bus_cgroup_set_memory_protection_scale(u, name, &c->default_memory_low, message, flags, error);
+ if (r > 0)
+ c->default_memory_low_set = true;
+ return r;
+ }
+
+ if (streq(name, "MemoryHighScale"))
+ return bus_cgroup_set_memory_scale(u, name, &c->memory_high, message, flags, error);
+
+ if (streq(name, "MemorySwapMaxScale"))
+ return bus_cgroup_set_swap_scale(u, name, &c->memory_swap_max, message, flags, error);
+
+ if (streq(name, "MemoryMaxScale"))
+ return bus_cgroup_set_memory_scale(u, name, &c->memory_max, message, flags, error);
+
+ if (streq(name, "MemoryLimitScale"))
+ return bus_cgroup_set_memory_scale(u, name, &c->memory_limit, message, flags, error);
+
+ if (streq(name, "TasksAccounting"))
+ return bus_cgroup_set_boolean(u, name, &c->tasks_accounting, CGROUP_MASK_PIDS, message, flags, error);
+
+ if (streq(name, "TasksMax"))
+ return bus_cgroup_set_tasks_max(u, name, &c->tasks_max, message, flags, error);
+
+ if (streq(name, "TasksMaxScale"))
+ return bus_cgroup_set_tasks_max_scale(u, name, &c->tasks_max, message, flags, error);
+
+ if (streq(name, "CPUQuotaPerSecUSec")) {
+ uint64_t u64;
+
+ r = sd_bus_message_read(message, "t", &u64);
+ if (r < 0)
+ return r;
+
+ if (u64 <= 0)
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "CPUQuotaPerSecUSec= value out of range");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->cpu_quota_per_sec_usec = u64;
+ u->warned_clamping_cpu_quota_period = false;
+ unit_invalidate_cgroup(u, CGROUP_MASK_CPU);
+
+ if (c->cpu_quota_per_sec_usec == USEC_INFINITY)
+ unit_write_setting(u, flags, "CPUQuota", "CPUQuota=");
+ else
+ /* config_parse_cpu_quota() requires an integer, so truncating division is used on
+ * purpose here. */
+ unit_write_settingf(u, flags, "CPUQuota",
+ "CPUQuota=%0.f%%",
+ (double) (c->cpu_quota_per_sec_usec / 10000));
+ }
+
+ return 1;
+
+ } else if (streq(name, "CPUQuotaPeriodUSec")) {
+ uint64_t u64;
+
+ r = sd_bus_message_read(message, "t", &u64);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->cpu_quota_period_usec = u64;
+ u->warned_clamping_cpu_quota_period = false;
+ unit_invalidate_cgroup(u, CGROUP_MASK_CPU);
+ if (c->cpu_quota_period_usec == USEC_INFINITY)
+ unit_write_setting(u, flags, "CPUQuotaPeriodSec", "CPUQuotaPeriodSec=");
+ else
+ unit_write_settingf(u, flags, "CPUQuotaPeriodSec",
+ "CPUQuotaPeriodSec=%s",
+ FORMAT_TIMESPAN(c->cpu_quota_period_usec, 1));
+ }
+
+ return 1;
+
+ } else if (STR_IN_SET(name, "AllowedCPUs", "StartupAllowedCPUs", "AllowedMemoryNodes", "StartupAllowedMemoryNodes")) {
+ const void *a;
+ size_t n;
+ _cleanup_(cpu_set_reset) CPUSet new_set = {};
+
+ r = sd_bus_message_read_array(message, 'y', &a, &n);
+ if (r < 0)
+ return r;
+
+ r = cpu_set_from_dbus(a, n, &new_set);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *setstr = NULL;
+ CPUSet *set = NULL;
+
+ setstr = cpu_set_to_range_string(&new_set);
+ if (!setstr)
+ return -ENOMEM;
+
+ if (streq(name, "AllowedCPUs"))
+ set = &c->cpuset_cpus;
+ else if (streq(name, "StartupAllowedCPUs"))
+ set = &c->startup_cpuset_cpus;
+ else if (streq(name, "AllowedMemoryNodes"))
+ set = &c->cpuset_mems;
+ else if (streq(name, "StartupAllowedMemoryNodes"))
+ set = &c->startup_cpuset_mems;
+
+ assert(set);
+
+ cpu_set_reset(set);
+ *set = new_set;
+ new_set = (CPUSet) {};
+
+ unit_invalidate_cgroup(u, CGROUP_MASK_CPUSET);
+ unit_write_settingf(u, flags, name, "%s=\n%s=%s", name, name, setstr);
+ }
+
+ return 1;
+
+ } else if ((iol_type = cgroup_io_limit_type_from_string(name)) >= 0) {
+ const char *path;
+ unsigned n = 0;
+ uint64_t u64;
+
+ r = sd_bus_message_enter_container(message, 'a', "(st)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(st)", &path, &u64)) > 0) {
+
+ if (!path_is_normalized(path))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' specified in %s= is not normalized.", name, path);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ CGroupIODeviceLimit *a = NULL;
+
+ LIST_FOREACH(device_limits, b, c->io_device_limits)
+ if (path_equal(path, b->path)) {
+ a = b;
+ break;
+ }
+
+ if (!a) {
+ CGroupIOLimitType type;
+
+ a = new0(CGroupIODeviceLimit, 1);
+ if (!a)
+ return -ENOMEM;
+
+ a->path = strdup(path);
+ if (!a->path) {
+ free(a);
+ return -ENOMEM;
+ }
+
+ for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
+ a->limits[type] = cgroup_io_limit_defaults[type];
+
+ LIST_PREPEND(device_limits, c->io_device_limits, a);
+ }
+
+ a->limits[iol_type] = u64;
+ }
+
+ n++;
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *buf = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+ size_t size = 0;
+
+ if (n == 0)
+ LIST_FOREACH(device_limits, a, c->io_device_limits)
+ a->limits[iol_type] = cgroup_io_limit_defaults[iol_type];
+
+ unit_invalidate_cgroup(u, CGROUP_MASK_IO);
+
+ f = open_memstream_unlocked(&buf, &size);
+ if (!f)
+ return -ENOMEM;
+
+ fprintf(f, "%s=\n", name);
+ LIST_FOREACH(device_limits, a, c->io_device_limits)
+ if (a->limits[iol_type] != cgroup_io_limit_defaults[iol_type])
+ fprintf(f, "%s=%s %" PRIu64 "\n", name, a->path, a->limits[iol_type]);
+
+ r = fflush_and_check(f);
+ if (r < 0)
+ return r;
+ unit_write_setting(u, flags, name, buf);
+ }
+
+ return 1;
+
+ } else if (streq(name, "IODeviceWeight")) {
+ const char *path;
+ uint64_t weight;
+ unsigned n = 0;
+
+ r = sd_bus_message_enter_container(message, 'a', "(st)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(st)", &path, &weight)) > 0) {
+
+ if (!path_is_normalized(path))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' specified in %s= is not normalized.", name, path);
+
+ if (!CGROUP_WEIGHT_IS_OK(weight) || weight == CGROUP_WEIGHT_INVALID)
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "IODeviceWeight= value out of range");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ CGroupIODeviceWeight *a = NULL;
+
+ LIST_FOREACH(device_weights, b, c->io_device_weights)
+ if (path_equal(b->path, path)) {
+ a = b;
+ break;
+ }
+
+ if (!a) {
+ a = new0(CGroupIODeviceWeight, 1);
+ if (!a)
+ return -ENOMEM;
+
+ a->path = strdup(path);
+ if (!a->path) {
+ free(a);
+ return -ENOMEM;
+ }
+ LIST_PREPEND(device_weights, c->io_device_weights, a);
+ }
+
+ a->weight = weight;
+ }
+
+ n++;
+ }
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *buf = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+ size_t size = 0;
+
+ if (n == 0)
+ while (c->io_device_weights)
+ cgroup_context_free_io_device_weight(c, c->io_device_weights);
+
+ unit_invalidate_cgroup(u, CGROUP_MASK_IO);
+
+ f = open_memstream_unlocked(&buf, &size);
+ if (!f)
+ return -ENOMEM;
+
+ fputs("IODeviceWeight=\n", f);
+ LIST_FOREACH(device_weights, a, c->io_device_weights)
+ fprintf(f, "IODeviceWeight=%s %" PRIu64 "\n", a->path, a->weight);
+
+ r = fflush_and_check(f);
+ if (r < 0)
+ return r;
+ unit_write_setting(u, flags, name, buf);
+ }
+
+ return 1;
+
+ } else if (streq(name, "IODeviceLatencyTargetUSec")) {
+ const char *path;
+ uint64_t target;
+ unsigned n = 0;
+
+ r = sd_bus_message_enter_container(message, 'a', "(st)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(st)", &path, &target)) > 0) {
+
+ if (!path_is_normalized(path))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' specified in %s= is not normalized.", name, path);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ CGroupIODeviceLatency *a = NULL;
+
+ LIST_FOREACH(device_latencies, b, c->io_device_latencies)
+ if (path_equal(b->path, path)) {
+ a = b;
+ break;
+ }
+
+ if (!a) {
+ a = new0(CGroupIODeviceLatency, 1);
+ if (!a)
+ return -ENOMEM;
+
+ a->path = strdup(path);
+ if (!a->path) {
+ free(a);
+ return -ENOMEM;
+ }
+ LIST_PREPEND(device_latencies, c->io_device_latencies, a);
+ }
+
+ a->target_usec = target;
+ }
+
+ n++;
+ }
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *buf = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+ size_t size = 0;
+
+ if (n == 0)
+ while (c->io_device_latencies)
+ cgroup_context_free_io_device_latency(c, c->io_device_latencies);
+
+ unit_invalidate_cgroup(u, CGROUP_MASK_IO);
+
+ f = open_memstream_unlocked(&buf, &size);
+ if (!f)
+ return -ENOMEM;
+
+ fputs("IODeviceLatencyTargetSec=\n", f);
+ LIST_FOREACH(device_latencies, a, c->io_device_latencies)
+ fprintf(f, "IODeviceLatencyTargetSec=%s %s\n",
+ a->path, FORMAT_TIMESPAN(a->target_usec, 1));
+
+ r = fflush_and_check(f);
+ if (r < 0)
+ return r;
+ unit_write_setting(u, flags, name, buf);
+ }
+
+ return 1;
+
+ } else if (STR_IN_SET(name, "BlockIOReadBandwidth", "BlockIOWriteBandwidth")) {
+ const char *path;
+ unsigned n = 0;
+ uint64_t u64;
+ bool read;
+
+ read = streq(name, "BlockIOReadBandwidth");
+
+ r = sd_bus_message_enter_container(message, 'a', "(st)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(st)", &path, &u64)) > 0) {
+
+ if (!path_is_normalized(path))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' specified in %s= is not normalized.", name, path);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ CGroupBlockIODeviceBandwidth *a = NULL;
+
+ LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths)
+ if (path_equal(path, b->path)) {
+ a = b;
+ break;
+ }
+
+ if (!a) {
+ a = new0(CGroupBlockIODeviceBandwidth, 1);
+ if (!a)
+ return -ENOMEM;
+
+ a->rbps = CGROUP_LIMIT_MAX;
+ a->wbps = CGROUP_LIMIT_MAX;
+ a->path = strdup(path);
+ if (!a->path) {
+ free(a);
+ return -ENOMEM;
+ }
+
+ LIST_PREPEND(device_bandwidths, c->blockio_device_bandwidths, a);
+ }
+
+ if (read)
+ a->rbps = u64;
+ else
+ a->wbps = u64;
+ }
+
+ n++;
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *buf = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+ size_t size = 0;
+
+ if (n == 0)
+ LIST_FOREACH(device_bandwidths, a, c->blockio_device_bandwidths) {
+ if (read)
+ a->rbps = CGROUP_LIMIT_MAX;
+ else
+ a->wbps = CGROUP_LIMIT_MAX;
+ }
+
+ unit_invalidate_cgroup(u, CGROUP_MASK_BLKIO);
+
+ f = open_memstream_unlocked(&buf, &size);
+ if (!f)
+ return -ENOMEM;
+
+ if (read) {
+ fputs("BlockIOReadBandwidth=\n", f);
+ LIST_FOREACH(device_bandwidths, a, c->blockio_device_bandwidths)
+ if (a->rbps != CGROUP_LIMIT_MAX)
+ fprintf(f, "BlockIOReadBandwidth=%s %" PRIu64 "\n", a->path, a->rbps);
+ } else {
+ fputs("BlockIOWriteBandwidth=\n", f);
+ LIST_FOREACH(device_bandwidths, a, c->blockio_device_bandwidths)
+ if (a->wbps != CGROUP_LIMIT_MAX)
+ fprintf(f, "BlockIOWriteBandwidth=%s %" PRIu64 "\n", a->path, a->wbps);
+ }
+
+ r = fflush_and_check(f);
+ if (r < 0)
+ return r;
+
+ unit_write_setting(u, flags, name, buf);
+ }
+
+ return 1;
+
+ } else if (streq(name, "BlockIODeviceWeight")) {
+ const char *path;
+ uint64_t weight;
+ unsigned n = 0;
+
+ r = sd_bus_message_enter_container(message, 'a', "(st)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(st)", &path, &weight)) > 0) {
+
+ if (!path_is_normalized(path))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' specified in %s= is not normalized.", name, path);
+
+ if (!CGROUP_BLKIO_WEIGHT_IS_OK(weight) || weight == CGROUP_BLKIO_WEIGHT_INVALID)
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "BlockIODeviceWeight= out of range");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ CGroupBlockIODeviceWeight *a = NULL;
+
+ LIST_FOREACH(device_weights, b, c->blockio_device_weights)
+ if (path_equal(b->path, path)) {
+ a = b;
+ break;
+ }
+
+ if (!a) {
+ a = new0(CGroupBlockIODeviceWeight, 1);
+ if (!a)
+ return -ENOMEM;
+
+ a->path = strdup(path);
+ if (!a->path) {
+ free(a);
+ return -ENOMEM;
+ }
+ LIST_PREPEND(device_weights, c->blockio_device_weights, a);
+ }
+
+ a->weight = weight;
+ }
+
+ n++;
+ }
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *buf = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+ size_t size = 0;
+
+ if (n == 0)
+ while (c->blockio_device_weights)
+ cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
+
+ unit_invalidate_cgroup(u, CGROUP_MASK_BLKIO);
+
+ f = open_memstream_unlocked(&buf, &size);
+ if (!f)
+ return -ENOMEM;
+
+ fputs("BlockIODeviceWeight=\n", f);
+ LIST_FOREACH(device_weights, a, c->blockio_device_weights)
+ fprintf(f, "BlockIODeviceWeight=%s %" PRIu64 "\n", a->path, a->weight);
+
+ r = fflush_and_check(f);
+ if (r < 0)
+ return r;
+
+ unit_write_setting(u, flags, name, buf);
+ }
+
+ return 1;
+
+ } else if (streq(name, "DevicePolicy")) {
+ const char *policy;
+ CGroupDevicePolicy p;
+
+ r = sd_bus_message_read(message, "s", &policy);
+ if (r < 0)
+ return r;
+
+ p = cgroup_device_policy_from_string(policy);
+ if (p < 0)
+ return p;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->device_policy = p;
+ unit_invalidate_cgroup(u, CGROUP_MASK_DEVICES);
+ unit_write_settingf(u, flags, name, "DevicePolicy=%s", policy);
+ }
+
+ return 1;
+
+ } else if (streq(name, "DeviceAllow")) {
+ const char *path, *rwm;
+ unsigned n = 0;
+
+ r = sd_bus_message_enter_container(message, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(ss)", &path, &rwm)) > 0) {
+
+ if (!valid_device_allow_pattern(path) || strpbrk(path, WHITESPACE))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "DeviceAllow= requires device node or pattern");
+
+ if (isempty(rwm))
+ rwm = "rwm";
+ else if (!in_charset(rwm, "rwm"))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "DeviceAllow= requires combination of rwm flags");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ CGroupDeviceAllow *a = NULL;
+
+ LIST_FOREACH(device_allow, b, c->device_allow)
+ if (path_equal(b->path, path)) {
+ a = b;
+ break;
+ }
+
+ if (!a) {
+ a = new0(CGroupDeviceAllow, 1);
+ if (!a)
+ return -ENOMEM;
+
+ a->path = strdup(path);
+ if (!a->path) {
+ free(a);
+ return -ENOMEM;
+ }
+
+ LIST_PREPEND(device_allow, c->device_allow, a);
+ }
+
+ a->r = strchr(rwm, 'r');
+ a->w = strchr(rwm, 'w');
+ a->m = strchr(rwm, 'm');
+ }
+
+ n++;
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *buf = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+ size_t size = 0;
+
+ if (n == 0)
+ while (c->device_allow)
+ cgroup_context_free_device_allow(c, c->device_allow);
+
+ unit_invalidate_cgroup(u, CGROUP_MASK_DEVICES);
+
+ f = open_memstream_unlocked(&buf, &size);
+ if (!f)
+ return -ENOMEM;
+
+ fputs("DeviceAllow=\n", f);
+ LIST_FOREACH(device_allow, a, c->device_allow)
+ fprintf(f, "DeviceAllow=%s %s%s%s\n", a->path, a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
+
+ r = fflush_and_check(f);
+ if (r < 0)
+ return r;
+ unit_write_setting(u, flags, name, buf);
+ }
+
+ return 1;
+
+ } else if (streq(name, "IPAccounting")) {
+ int b;
+
+ r = sd_bus_message_read(message, "b", &b);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->ip_accounting = b;
+
+ unit_invalidate_cgroup_bpf(u);
+ unit_write_settingf(u, flags, name, "IPAccounting=%s", yes_no(b));
+ }
+
+ return 1;
+
+ } else if (STR_IN_SET(name, "IPAddressAllow", "IPAddressDeny")) {
+ _cleanup_set_free_ Set *new_prefixes = NULL;
+ size_t n = 0;
+
+ r = sd_bus_message_enter_container(message, 'a', "(iayu)");
+ if (r < 0)
+ return r;
+
+ for (;;) {
+ const void *ap;
+ int32_t family;
+ uint32_t prefixlen;
+ size_t an;
+
+ r = sd_bus_message_enter_container(message, 'r', "iayu");
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ r = sd_bus_message_read(message, "i", &family);
+ if (r < 0)
+ return r;
+
+ if (!IN_SET(family, AF_INET, AF_INET6))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= expects IPv4 or IPv6 addresses only.", name);
+
+ r = sd_bus_message_read_array(message, 'y', &ap, &an);
+ if (r < 0)
+ return r;
+
+ if (an != FAMILY_ADDRESS_SIZE(family))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "IP address has wrong size for family (%s, expected %zu, got %zu)",
+ af_to_name(family), FAMILY_ADDRESS_SIZE(family), an);
+
+ r = sd_bus_message_read(message, "u", &prefixlen);
+ if (r < 0)
+ return r;
+
+ if (prefixlen > FAMILY_ADDRESS_SIZE(family)*8)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Prefix length %" PRIu32 " too large for address family %s.", prefixlen, af_to_name(family));
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ struct in_addr_prefix prefix = {
+ .family = family,
+ .prefixlen = prefixlen,
+ };
+
+ memcpy(&prefix.address, ap, an);
+
+ r = in_addr_prefix_add(&new_prefixes, &prefix);
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ n++;
+ }
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *buf = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+ size_t size = 0;
+ Set **prefixes;
+ bool *reduced;
+
+ unit_invalidate_cgroup_bpf(u);
+ f = open_memstream_unlocked(&buf, &size);
+ if (!f)
+ return -ENOMEM;
+
+ prefixes = streq(name, "IPAddressAllow") ? &c->ip_address_allow : &c->ip_address_deny;
+ reduced = streq(name, "IPAddressAllow") ? &c->ip_address_allow_reduced : &c->ip_address_deny_reduced;
+
+ if (n == 0) {
+ *reduced = true;
+ *prefixes = set_free(*prefixes);
+ fputs(name, f);
+ fputs("=\n", f);
+ } else {
+ *reduced = false;
+
+ r = in_addr_prefixes_merge(prefixes, new_prefixes);
+ if (r < 0)
+ return r;
+
+ const struct in_addr_prefix *p;
+ SET_FOREACH(p, new_prefixes)
+ fprintf(f, "%s=%s\n", name,
+ IN_ADDR_PREFIX_TO_STRING(p->family, &p->address, p->prefixlen));
+ }
+
+ r = fflush_and_check(f);
+ if (r < 0)
+ return r;
+
+ unit_write_setting(u, flags, name, buf);
+ }
+
+ return 1;
+ }
+
+ if (STR_IN_SET(name, "ManagedOOMSwap", "ManagedOOMMemoryPressure")) {
+ ManagedOOMMode *cgroup_mode = streq(name, "ManagedOOMSwap") ? &c->moom_swap : &c->moom_mem_pressure;
+ ManagedOOMMode m;
+ const char *mode;
+
+ if (!UNIT_VTABLE(u)->can_set_managed_oom)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot set %s for this unit type", name);
+
+ r = sd_bus_message_read(message, "s", &mode);
+ if (r < 0)
+ return r;
+
+ m = managed_oom_mode_from_string(mode);
+ if (m < 0)
+ return -EINVAL;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ *cgroup_mode = m;
+ unit_write_settingf(u, flags, name, "%s=%s", name, mode);
+ }
+
+ (void) manager_varlink_send_managed_oom_update(u);
+ return 1;
+ }
+
+ if (streq(name, "ManagedOOMMemoryPressureLimit")) {
+ uint32_t v;
+
+ if (!UNIT_VTABLE(u)->can_set_managed_oom)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot set %s for this unit type", name);
+
+ r = sd_bus_message_read(message, "u", &v);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->moom_mem_pressure_limit = v;
+ unit_write_settingf(u, flags, name,
+ "ManagedOOMMemoryPressureLimit=" PERMYRIAD_AS_PERCENT_FORMAT_STR,
+ PERMYRIAD_AS_PERCENT_FORMAT_VAL(UINT32_SCALE_TO_PERMYRIAD(v)));
+ }
+
+ if (c->moom_mem_pressure == MANAGED_OOM_KILL)
+ (void) manager_varlink_send_managed_oom_update(u);
+
+ return 1;
+ }
+
+ if (streq(name, "ManagedOOMPreference")) {
+ ManagedOOMPreference p;
+ const char *pref;
+
+ r = sd_bus_message_read(message, "s", &pref);
+ if (r < 0)
+ return r;
+
+ p = managed_oom_preference_from_string(pref);
+ if (p < 0)
+ return p;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->moom_preference = p;
+ unit_write_settingf(u, flags, name, "ManagedOOMPreference=%s", pref);
+ }
+
+ return 1;
+ }
+ if (STR_IN_SET(name, "SocketBindAllow", "SocketBindDeny")) {
+ CGroupSocketBindItem **list;
+ uint16_t nr_ports, port_min;
+ size_t n = 0;
+ int32_t family, ip_protocol;
+
+ list = streq(name, "SocketBindAllow") ? &c->socket_bind_allow : &c->socket_bind_deny;
+
+ r = sd_bus_message_enter_container(message, 'a', "(iiqq)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(iiqq)", &family, &ip_protocol, &nr_ports, &port_min)) > 0) {
+
+ if (!IN_SET(family, AF_UNSPEC, AF_INET, AF_INET6))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= expects INET or INET6 family, if specified.", name);
+
+ if (!IN_SET(ip_protocol, 0, IPPROTO_TCP, IPPROTO_UDP))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= expects TCP or UDP protocol, if specified.", name);
+
+ if (port_min + (uint32_t) nr_ports > (1 << 16))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= expects maximum port value lesser than 65536.", name);
+
+ if (port_min == 0 && nr_ports != 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= expects port range starting with positive value.", name);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ CGroupSocketBindItem *item = NULL;
+
+ item = new(CGroupSocketBindItem, 1);
+ if (!item)
+ return log_oom();
+
+ *item = (CGroupSocketBindItem) {
+ .address_family = family,
+ .ip_protocol = ip_protocol,
+ .nr_ports = nr_ports,
+ .port_min = port_min
+ };
+
+ LIST_PREPEND(socket_bind_items, *list, TAKE_PTR(item));
+ }
+ n++;
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *buf = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+ size_t size = 0;
+
+ if (n == 0)
+ cgroup_context_remove_socket_bind(list);
+ else {
+ if ((u->manager->cgroup_supported & CGROUP_MASK_BPF_SOCKET_BIND) == 0)
+ log_full(LOG_DEBUG,
+ "Unit %s configures source compiled BPF programs "
+ "but the local system does not support that.\n"
+ "Starting this unit will fail!", u->id);
+ }
+
+ f = open_memstream_unlocked(&buf, &size);
+ if (!f)
+ return -ENOMEM;
+
+ if (n == 0)
+ fprintf(f, "%s=\n", name);
+ else
+ LIST_FOREACH(socket_bind_items, item, *list) {
+ fprintf(f, "%s=", name);
+ cgroup_context_dump_socket_bind_item(item, f);
+ fputc('\n', f);
+ }
+
+ r = fflush_and_check(f);
+ if (r < 0)
+ return r;
+
+ unit_write_setting(u, flags, name, buf);
+ }
+
+ return 1;
+ }
+ if (streq(name, "RestrictNetworkInterfaces")) {
+ int is_allow_list;
+ _cleanup_strv_free_ char **l = NULL;
+
+ r = sd_bus_message_enter_container(message, 'r', "bas");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "b", &is_allow_list);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *joined = NULL;
+
+ if (strv_isempty(l)) {
+ c->restrict_network_interfaces_is_allow_list = false;
+ c->restrict_network_interfaces = set_free_free(c->restrict_network_interfaces);
+
+ unit_write_settingf(u, flags, name, "%s=", name);
+ return 1;
+ }
+
+ if (set_isempty(c->restrict_network_interfaces))
+ c->restrict_network_interfaces_is_allow_list = is_allow_list;
+
+ STRV_FOREACH(s, l) {
+ if (!ifname_valid(*s)) {
+ log_full(LOG_WARNING, "Invalid interface name, ignoring: %s", *s);
+ continue;
+ }
+ if (c->restrict_network_interfaces_is_allow_list != (bool) is_allow_list)
+ free(set_remove(c->restrict_network_interfaces, *s));
+ else {
+ r = set_put_strdup(&c->restrict_network_interfaces, *s);
+ if (r < 0)
+ return log_oom();
+ }
+ }
+
+ joined = strv_join(l, " ");
+ if (!joined)
+ return -ENOMEM;
+
+ unit_write_settingf(u, flags, name, "%s=%s%s", name, is_allow_list ? "" : "~", joined);
+ }
+
+ return 1;
+ }
+
+ if (streq(name, "DisableControllers") || (u->transient && u->load_state == UNIT_STUB))
+ return bus_cgroup_set_transient_property(u, c, name, message, flags, error);
+
+ return 0;
+}
diff --git a/src/core/dbus-cgroup.h b/src/core/dbus-cgroup.h
new file mode 100644
index 0000000..5bf45eb
--- /dev/null
+++ b/src/core/dbus-cgroup.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "unit.h"
+#include "cgroup.h"
+
+extern const sd_bus_vtable bus_cgroup_vtable[];
+
+int bus_property_get_tasks_max(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error);
+
+int bus_cgroup_set_property(Unit *u, CGroupContext *c, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
diff --git a/src/core/dbus-device.c b/src/core/dbus-device.c
new file mode 100644
index 0000000..b5e18d8
--- /dev/null
+++ b/src/core/dbus-device.c
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "dbus-device.h"
+#include "device.h"
+#include "unit.h"
+
+const sd_bus_vtable bus_device_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ SD_BUS_PROPERTY("SysFSPath", "s", NULL, offsetof(Device, sysfs), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_VTABLE_END
+};
diff --git a/src/core/dbus-device.h b/src/core/dbus-device.h
new file mode 100644
index 0000000..bfb5770
--- /dev/null
+++ b/src/core/dbus-device.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus-vtable.h"
+
+extern const sd_bus_vtable bus_device_vtable[];
diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c
new file mode 100644
index 0000000..b4c4cd9
--- /dev/null
+++ b/src/core/dbus-execute.c
@@ -0,0 +1,3802 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/mount.h>
+#include <sys/prctl.h>
+
+#if HAVE_SECCOMP
+#include <seccomp.h>
+#endif
+
+#include "af-list.h"
+#include "alloc-util.h"
+#include "bus-get-properties.h"
+#include "cap-list.h"
+#include "capability-util.h"
+#include "cpu-set-util.h"
+#include "creds-util.h"
+#include "dbus-execute.h"
+#include "dbus-util.h"
+#include "env-util.h"
+#include "errno-list.h"
+#include "escape.h"
+#include "execute.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "hexdecoct.h"
+#include "io-util.h"
+#include "ioprio-util.h"
+#include "journal-file.h"
+#include "missing_ioprio.h"
+#include "mountpoint-util.h"
+#include "namespace.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "rlimit-util.h"
+#if HAVE_SECCOMP
+#include "seccomp-util.h"
+#endif
+#include "securebits-util.h"
+#include "specifier.h"
+#include "stat-util.h"
+#include "strv.h"
+#include "syslog-util.h"
+#include "unit-printf.h"
+#include "user-util.h"
+#include "utf8.h"
+
+BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_exec_output, exec_output, ExecOutput);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_input, exec_input, ExecInput);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_utmp_mode, exec_utmp_mode, ExecUtmpMode);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_preserve_mode, exec_preserve_mode, ExecPreserveMode);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_proc, protect_proc, ProtectProc);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_proc_subset, proc_subset, ProcSubset);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_home, protect_home, ProtectHome);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_system, protect_system, ProtectSystem);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_personality, personality, unsigned long);
+static BUS_DEFINE_PROPERTY_GET(property_get_ioprio, "i", ExecContext, exec_context_get_effective_ioprio);
+static BUS_DEFINE_PROPERTY_GET(property_get_mount_apivfs, "b", ExecContext, exec_context_get_effective_mount_apivfs);
+static BUS_DEFINE_PROPERTY_GET2(property_get_ioprio_class, "i", ExecContext, exec_context_get_effective_ioprio, ioprio_prio_class);
+static BUS_DEFINE_PROPERTY_GET2(property_get_ioprio_priority, "i", ExecContext, exec_context_get_effective_ioprio, ioprio_prio_data);
+static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_empty_string, "s", NULL);
+static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_level, "i", int, LOG_PRI);
+static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_facility, "i", int, LOG_FAC);
+static BUS_DEFINE_PROPERTY_GET(property_get_cpu_affinity_from_numa, "b", ExecContext, exec_context_get_cpu_affinity_from_numa);
+
+static int property_get_environment_files(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(sb)");
+ if (r < 0)
+ return r;
+
+ STRV_FOREACH(j, c->environment_files) {
+ const char *fn = *j;
+
+ r = sd_bus_message_append(reply, "(sb)", fn[0] == '-' ? fn + 1 : fn, fn[0] == '-');
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_oom_score_adjust(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ int r, n;
+
+ assert(bus);
+ assert(reply);
+
+ if (c->oom_score_adjust_set)
+ n = c->oom_score_adjust;
+ else {
+ n = 0;
+ r = get_oom_score_adjust(&n);
+ if (r < 0)
+ log_debug_errno(r, "Failed to read /proc/self/oom_score_adj, ignoring: %m");
+ }
+
+ return sd_bus_message_append(reply, "i", n);
+}
+
+static int property_get_coredump_filter(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ uint64_t n;
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ if (c->coredump_filter_set)
+ n = c->coredump_filter;
+ else {
+ _cleanup_free_ char *t = NULL;
+
+ n = COREDUMP_FILTER_MASK_DEFAULT;
+ r = read_one_line_file("/proc/self/coredump_filter", &t);
+ if (r < 0)
+ log_debug_errno(r, "Failed to read /proc/self/coredump_filter, ignoring: %m");
+ else {
+ r = safe_atoux64(t, &n);
+ if (r < 0)
+ log_debug_errno(r, "Failed to parse \"%s\" from /proc/self/coredump_filter, ignoring: %m", t);
+ }
+ }
+
+ return sd_bus_message_append(reply, "t", n);
+}
+
+static int property_get_nice(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ int32_t n;
+
+ assert(bus);
+ assert(reply);
+
+ if (c->nice_set)
+ n = c->nice;
+ else {
+ errno = 0;
+ n = getpriority(PRIO_PROCESS, 0);
+ if (errno > 0)
+ n = 0;
+ }
+
+ return sd_bus_message_append(reply, "i", n);
+}
+
+static int property_get_cpu_sched_policy(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ int32_t n;
+
+ assert(bus);
+ assert(reply);
+
+ if (c->cpu_sched_set)
+ n = c->cpu_sched_policy;
+ else {
+ n = sched_getscheduler(0);
+ if (n < 0)
+ n = SCHED_OTHER;
+ }
+
+ return sd_bus_message_append(reply, "i", n);
+}
+
+static int property_get_cpu_sched_priority(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ int32_t n;
+
+ assert(bus);
+ assert(reply);
+
+ if (c->cpu_sched_set)
+ n = c->cpu_sched_priority;
+ else {
+ struct sched_param p = {};
+
+ if (sched_getparam(0, &p) >= 0)
+ n = p.sched_priority;
+ else
+ n = 0;
+ }
+
+ return sd_bus_message_append(reply, "i", n);
+}
+
+static int property_get_cpu_affinity(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ _cleanup_(cpu_set_reset) CPUSet s = {};
+ _cleanup_free_ uint8_t *array = NULL;
+ size_t allocated;
+
+ assert(bus);
+ assert(reply);
+
+ if (c->cpu_affinity_from_numa) {
+ int r;
+
+ r = numa_to_cpu_set(&c->numa_policy, &s);
+ if (r < 0)
+ return r;
+ }
+
+ (void) cpu_set_to_dbus(c->cpu_affinity_from_numa ? &s : &c->cpu_set, &array, &allocated);
+
+ return sd_bus_message_append_array(reply, 'y', array, allocated);
+}
+
+static int property_get_numa_mask(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ _cleanup_free_ uint8_t *array = NULL;
+ size_t allocated;
+
+ assert(bus);
+ assert(reply);
+
+ (void) cpu_set_to_dbus(&c->numa_policy.nodes, &array, &allocated);
+
+ return sd_bus_message_append_array(reply, 'y', array, allocated);
+}
+
+static int property_get_numa_policy(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+ ExecContext *c = ASSERT_PTR(userdata);
+ int32_t policy;
+
+ assert(bus);
+ assert(reply);
+
+ policy = numa_policy_get_type(&c->numa_policy);
+
+ return sd_bus_message_append_basic(reply, 'i', &policy);
+}
+
+static int property_get_timer_slack_nsec(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ uint64_t u;
+
+ assert(bus);
+ assert(reply);
+
+ if (c->timer_slack_nsec != NSEC_INFINITY)
+ u = (uint64_t) c->timer_slack_nsec;
+ else
+ u = (uint64_t) prctl(PR_GET_TIMERSLACK);
+
+ return sd_bus_message_append(reply, "t", u);
+}
+
+static int property_get_syscall_filter(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ _cleanup_strv_free_ char **l = NULL;
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'r', "bas");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(reply, "b", c->syscall_allow_list);
+ if (r < 0)
+ return r;
+
+#if HAVE_SECCOMP
+ void *id, *val;
+ HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
+ _cleanup_free_ char *name = NULL;
+ const char *e = NULL;
+ char *s;
+ int num = PTR_TO_INT(val);
+
+ if (c->syscall_allow_list && num >= 0)
+ /* syscall with num >= 0 in allow-list is denied. */
+ continue;
+
+ name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
+ if (!name)
+ continue;
+
+ if (num >= 0) {
+ e = seccomp_errno_or_action_to_string(num);
+ if (e) {
+ s = strjoin(name, ":", e);
+ if (!s)
+ return -ENOMEM;
+ } else {
+ r = asprintf(&s, "%s:%d", name, num);
+ if (r < 0)
+ return -ENOMEM;
+ }
+ } else
+ s = TAKE_PTR(name);
+
+ r = strv_consume(&l, s);
+ if (r < 0)
+ return r;
+ }
+#endif
+
+ strv_sort(l);
+
+ r = sd_bus_message_append_strv(reply, l);
+ if (r < 0)
+ return r;
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_syscall_log(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ _cleanup_strv_free_ char **l = NULL;
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'r', "bas");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(reply, "b", c->syscall_log_allow_list);
+ if (r < 0)
+ return r;
+
+#if HAVE_SECCOMP
+ void *id, *val;
+ HASHMAP_FOREACH_KEY(val, id, c->syscall_log) {
+ char *name = NULL;
+
+ name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
+ if (!name)
+ continue;
+
+ r = strv_consume(&l, name);
+ if (r < 0)
+ return r;
+ }
+#endif
+
+ strv_sort(l);
+
+ r = sd_bus_message_append_strv(reply, l);
+ if (r < 0)
+ return r;
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_syscall_archs(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ _cleanup_strv_free_ char **l = NULL;
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+#if HAVE_SECCOMP
+ void *id;
+ SET_FOREACH(id, ASSERT_PTR((ExecContext*) userdata)->syscall_archs) {
+ const char *name;
+
+ name = seccomp_arch_to_string(PTR_TO_UINT32(id) - 1);
+ if (!name)
+ continue;
+
+ r = strv_extend(&l, name);
+ if (r < 0)
+ return -ENOMEM;
+ }
+#endif
+
+ strv_sort(l);
+
+ r = sd_bus_message_append_strv(reply, l);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+static int property_get_selinux_context(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ return sd_bus_message_append(reply, "(bs)", c->selinux_context_ignore, c->selinux_context);
+}
+
+static int property_get_apparmor_profile(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ return sd_bus_message_append(reply, "(bs)", c->apparmor_profile_ignore, c->apparmor_profile);
+}
+
+static int property_get_smack_process_label(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ return sd_bus_message_append(reply, "(bs)", c->smack_process_label_ignore, c->smack_process_label);
+}
+
+static int property_get_address_families(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ _cleanup_strv_free_ char **l = NULL;
+ void *af;
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'r', "bas");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(reply, "b", c->address_families_allow_list);
+ if (r < 0)
+ return r;
+
+ SET_FOREACH(af, c->address_families) {
+ const char *name;
+
+ name = af_to_name(PTR_TO_INT(af));
+ if (!name)
+ continue;
+
+ r = strv_extend(&l, name);
+ if (r < 0)
+ return -ENOMEM;
+ }
+
+ strv_sort(l);
+
+ r = sd_bus_message_append_strv(reply, l);
+ if (r < 0)
+ return r;
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_working_directory(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ const char *wd;
+
+ assert(bus);
+ assert(reply);
+
+ if (c->working_directory_home)
+ wd = "~";
+ else
+ wd = c->working_directory;
+
+ if (c->working_directory_missing_ok)
+ wd = strjoina("!", wd);
+
+ return sd_bus_message_append(reply, "s", wd);
+}
+
+static int property_get_stdio_fdname(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ int fileno;
+
+ assert(bus);
+ assert(property);
+ assert(reply);
+
+ if (streq(property, "StandardInputFileDescriptorName"))
+ fileno = STDIN_FILENO;
+ else if (streq(property, "StandardOutputFileDescriptorName"))
+ fileno = STDOUT_FILENO;
+ else {
+ assert(streq(property, "StandardErrorFileDescriptorName"));
+ fileno = STDERR_FILENO;
+ }
+
+ return sd_bus_message_append(reply, "s", exec_context_fdname(c, fileno));
+}
+
+static int property_get_input_data(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(property);
+ assert(reply);
+
+ return sd_bus_message_append_array(reply, 'y', c->stdin_data, c->stdin_data_size);
+}
+
+static int property_get_restrict_filesystems(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ _cleanup_free_ char **l = NULL;
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'r', "bas");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(reply, "b", c->restrict_filesystems_allow_list);
+ if (r < 0)
+ return r;
+
+#if HAVE_LIBBPF
+ l = set_get_strv(c->restrict_filesystems);
+ if (!l)
+ return -ENOMEM;
+#endif
+
+ strv_sort(l);
+
+ r = sd_bus_message_append_strv(reply, l);
+ if (r < 0)
+ return r;
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_bind_paths(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ bool ro;
+ int r;
+
+ assert(bus);
+ assert(property);
+ assert(reply);
+
+ ro = strstr(property, "ReadOnly");
+
+ r = sd_bus_message_open_container(reply, 'a', "(ssbt)");
+ if (r < 0)
+ return r;
+
+ for (size_t i = 0; i < c->n_bind_mounts; i++) {
+
+ if (ro != c->bind_mounts[i].read_only)
+ continue;
+
+ r = sd_bus_message_append(
+ reply, "(ssbt)",
+ c->bind_mounts[i].source,
+ c->bind_mounts[i].destination,
+ c->bind_mounts[i].ignore_enoent,
+ c->bind_mounts[i].recursive ? (uint64_t) MS_REC : (uint64_t) 0);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_temporary_filesystems(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(property);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ for (unsigned i = 0; i < c->n_temporary_filesystems; i++) {
+ TemporaryFileSystem *t = c->temporary_filesystems + i;
+
+ r = sd_bus_message_append(
+ reply, "(ss)",
+ t->path,
+ t->options);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_log_extra_fields(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(property);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "ay");
+ if (r < 0)
+ return r;
+
+ for (size_t i = 0; i < c->n_log_extra_fields; i++) {
+ r = sd_bus_message_append_array(reply, 'y', c->log_extra_fields[i].iov_base, c->log_extra_fields[i].iov_len);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_set_credential(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ ExecSetCredential *sc;
+ int r;
+
+ assert(bus);
+ assert(property);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(say)");
+ if (r < 0)
+ return r;
+
+ HASHMAP_FOREACH(sc, c->set_credentials) {
+
+ if (sc->encrypted != streq(property, "SetCredentialEncrypted"))
+ continue;
+
+ r = sd_bus_message_open_container(reply, 'r', "say");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(reply, "s", sc->id);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append_array(reply, 'y', sc->data, sc->size);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_load_credential(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ ExecLoadCredential *lc;
+ int r;
+
+ assert(bus);
+ assert(property);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ HASHMAP_FOREACH(lc, c->load_credentials) {
+
+ if (lc->encrypted != streq(property, "LoadCredentialEncrypted"))
+ continue;
+
+ r = sd_bus_message_append(reply, "(ss)", lc->id, lc->path);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_root_hash(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(property);
+ assert(reply);
+
+ return sd_bus_message_append_array(reply, 'y', c->root_hash, c->root_hash_size);
+}
+
+static int property_get_root_hash_sig(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(property);
+ assert(reply);
+
+ return sd_bus_message_append_array(reply, 'y', c->root_hash_sig, c->root_hash_sig_size);
+}
+
+static int property_get_root_image_options(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(property);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(mount_options, m, c->root_image_options) {
+ r = sd_bus_message_append(reply, "(ss)",
+ partition_designator_to_string(m->partition_designator),
+ m->options);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_mount_images(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(property);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(ssba(ss))");
+ if (r < 0)
+ return r;
+
+ for (size_t i = 0; i < c->n_mount_images; i++) {
+ r = sd_bus_message_open_container(reply, SD_BUS_TYPE_STRUCT, "ssba(ss)");
+ if (r < 0)
+ return r;
+ r = sd_bus_message_append(
+ reply, "ssb",
+ c->mount_images[i].source,
+ c->mount_images[i].destination,
+ c->mount_images[i].ignore_enoent);
+ if (r < 0)
+ return r;
+ r = sd_bus_message_open_container(reply, 'a', "(ss)");
+ if (r < 0)
+ return r;
+ LIST_FOREACH(mount_options, m, c->mount_images[i].mount_options) {
+ r = sd_bus_message_append(reply, "(ss)",
+ partition_designator_to_string(m->partition_designator),
+ m->options);
+ if (r < 0)
+ return r;
+ }
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_extension_images(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(property);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(sba(ss))");
+ if (r < 0)
+ return r;
+
+ for (size_t i = 0; i < c->n_extension_images; i++) {
+ r = sd_bus_message_open_container(reply, SD_BUS_TYPE_STRUCT, "sba(ss)");
+ if (r < 0)
+ return r;
+ r = sd_bus_message_append(
+ reply, "sb",
+ c->extension_images[i].source,
+ c->extension_images[i].ignore_enoent);
+ if (r < 0)
+ return r;
+ r = sd_bus_message_open_container(reply, 'a', "(ss)");
+ if (r < 0)
+ return r;
+ LIST_FOREACH(mount_options, m, c->extension_images[i].mount_options) {
+ r = sd_bus_message_append(reply, "(ss)",
+ partition_designator_to_string(m->partition_designator),
+ m->options);
+ if (r < 0)
+ return r;
+ }
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int bus_property_get_exec_dir(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecDirectory *d = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(property);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "s");
+ if (r < 0)
+ return r;
+
+ for (size_t i = 0; i < d->n_items; i++) {
+ r = sd_bus_message_append_basic(reply, 's', d->items[i].path);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int bus_property_get_exec_dir_symlink(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecDirectory *d = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(property);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(sst)");
+ if (r < 0)
+ return r;
+
+ for (size_t i = 0; i < d->n_items; i++)
+ STRV_FOREACH(dst, d->items[i].symlinks) {
+ r = sd_bus_message_append(reply, "(sst)", d->items[i].path, *dst, 0 /* flags, unused for now */);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+const sd_bus_vtable bus_exec_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ SD_BUS_PROPERTY("Environment", "as", NULL, offsetof(ExecContext, environment), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("EnvironmentFiles", "a(sb)", property_get_environment_files, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PassEnvironment", "as", NULL, offsetof(ExecContext, pass_environment), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("UnsetEnvironment", "as", NULL, offsetof(ExecContext, unset_environment), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("UMask", "u", bus_property_get_mode, offsetof(ExecContext, umask), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitCPU", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_CPU]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitCPUSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_CPU]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitFSIZE", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_FSIZE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitFSIZESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_FSIZE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitDATA", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_DATA]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitDATASoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_DATA]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitSTACK", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_STACK]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitSTACKSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_STACK]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitCORE", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_CORE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitCORESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_CORE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitRSS", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RSS]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitRSSSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RSS]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitNOFILE", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NOFILE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitNOFILESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NOFILE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitAS", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_AS]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitASSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_AS]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitNPROC", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NPROC]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitNPROCSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NPROC]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitMEMLOCK", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_MEMLOCK]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitMEMLOCKSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_MEMLOCK]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitLOCKS", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_LOCKS]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitLOCKSSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_LOCKS]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitSIGPENDING", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_SIGPENDING]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitSIGPENDINGSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_SIGPENDING]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitMSGQUEUE", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_MSGQUEUE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitMSGQUEUESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_MSGQUEUE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitNICE", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NICE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitNICESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NICE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitRTPRIO", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RTPRIO]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitRTPRIOSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RTPRIO]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitRTTIME", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitRTTIMESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("WorkingDirectory", "s", property_get_working_directory, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RootDirectory", "s", NULL, offsetof(ExecContext, root_directory), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RootImage", "s", NULL, offsetof(ExecContext, root_image), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RootImageOptions", "a(ss)", property_get_root_image_options, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RootHash", "ay", property_get_root_hash, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RootHashPath", "s", NULL, offsetof(ExecContext, root_hash_path), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RootHashSignature", "ay", property_get_root_hash_sig, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RootHashSignaturePath", "s", NULL, offsetof(ExecContext, root_hash_sig_path), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RootVerity", "s", NULL, offsetof(ExecContext, root_verity), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ExtensionDirectories", "as", NULL, offsetof(ExecContext, extension_directories), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ExtensionImages", "a(sba(ss))", property_get_extension_images, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("MountImages", "a(ssba(ss))", property_get_mount_images, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("OOMScoreAdjust", "i", property_get_oom_score_adjust, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CoredumpFilter", "t", property_get_coredump_filter, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Nice", "i", property_get_nice, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("IOSchedulingClass", "i", property_get_ioprio_class, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("IOSchedulingPriority", "i", property_get_ioprio_priority, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CPUSchedulingPolicy", "i", property_get_cpu_sched_policy, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CPUSchedulingPriority", "i", property_get_cpu_sched_priority, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CPUAffinity", "ay", property_get_cpu_affinity, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CPUAffinityFromNUMA", "b", property_get_cpu_affinity_from_numa, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("NUMAPolicy", "i", property_get_numa_policy, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("NUMAMask", "ay", property_get_numa_mask, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TimerSlackNSec", "t", property_get_timer_slack_nsec, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CPUSchedulingResetOnFork", "b", bus_property_get_bool, offsetof(ExecContext, cpu_sched_reset_on_fork), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("NonBlocking", "b", bus_property_get_bool, offsetof(ExecContext, non_blocking), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("StandardInput", "s", property_get_exec_input, offsetof(ExecContext, std_input), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("StandardInputFileDescriptorName", "s", property_get_stdio_fdname, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("StandardInputData", "ay", property_get_input_data, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("StandardOutput", "s", bus_property_get_exec_output, offsetof(ExecContext, std_output), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("StandardOutputFileDescriptorName", "s", property_get_stdio_fdname, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("StandardError", "s", bus_property_get_exec_output, offsetof(ExecContext, std_error), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("StandardErrorFileDescriptorName", "s", property_get_stdio_fdname, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TTYPath", "s", NULL, offsetof(ExecContext, tty_path), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TTYReset", "b", bus_property_get_bool, offsetof(ExecContext, tty_reset), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TTYVHangup", "b", bus_property_get_bool, offsetof(ExecContext, tty_vhangup), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TTYVTDisallocate", "b", bus_property_get_bool, offsetof(ExecContext, tty_vt_disallocate), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TTYRows", "q", bus_property_get_unsigned, offsetof(ExecContext, tty_rows), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TTYColumns", "q", bus_property_get_unsigned, offsetof(ExecContext, tty_cols), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SyslogPriority", "i", bus_property_get_int, offsetof(ExecContext, syslog_priority), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SyslogIdentifier", "s", NULL, offsetof(ExecContext, syslog_identifier), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SyslogLevelPrefix", "b", bus_property_get_bool, offsetof(ExecContext, syslog_level_prefix), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SyslogLevel", "i", property_get_syslog_level, offsetof(ExecContext, syslog_priority), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SyslogFacility", "i", property_get_syslog_facility, offsetof(ExecContext, syslog_priority), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LogLevelMax", "i", bus_property_get_int, offsetof(ExecContext, log_level_max), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LogRateLimitIntervalUSec", "t", bus_property_get_usec, offsetof(ExecContext, log_ratelimit_interval_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LogRateLimitBurst", "u", bus_property_get_unsigned, offsetof(ExecContext, log_ratelimit_burst), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LogExtraFields", "aay", property_get_log_extra_fields, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LogNamespace", "s", NULL, offsetof(ExecContext, log_namespace), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SecureBits", "i", bus_property_get_int, offsetof(ExecContext, secure_bits), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CapabilityBoundingSet", "t", NULL, offsetof(ExecContext, capability_bounding_set), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("AmbientCapabilities", "t", NULL, offsetof(ExecContext, capability_ambient_set), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("User", "s", NULL, offsetof(ExecContext, user), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Group", "s", NULL, offsetof(ExecContext, group), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DynamicUser", "b", bus_property_get_bool, offsetof(ExecContext, dynamic_user), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RemoveIPC", "b", bus_property_get_bool, offsetof(ExecContext, remove_ipc), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SetCredential", "a(say)", property_get_set_credential, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SetCredentialEncrypted", "a(say)", property_get_set_credential, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LoadCredential", "a(ss)", property_get_load_credential, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LoadCredentialEncrypted", "a(ss)", property_get_load_credential, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SupplementaryGroups", "as", NULL, offsetof(ExecContext, supplementary_groups), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PAMName", "s", NULL, offsetof(ExecContext, pam_name), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ReadWritePaths", "as", NULL, offsetof(ExecContext, read_write_paths), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ReadOnlyPaths", "as", NULL, offsetof(ExecContext, read_only_paths), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("InaccessiblePaths", "as", NULL, offsetof(ExecContext, inaccessible_paths), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ExecPaths", "as", NULL, offsetof(ExecContext, exec_paths), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("NoExecPaths", "as", NULL, offsetof(ExecContext, no_exec_paths), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ExecSearchPath", "as", NULL, offsetof(ExecContext, exec_search_path), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("MountFlags", "t", bus_property_get_ulong, offsetof(ExecContext, mount_flags), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_bool, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PrivateDevices", "b", bus_property_get_bool, offsetof(ExecContext, private_devices), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ProtectClock", "b", bus_property_get_bool, offsetof(ExecContext, protect_clock), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ProtectKernelTunables", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_tunables), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ProtectKernelModules", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_modules), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ProtectKernelLogs", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_logs), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ProtectControlGroups", "b", bus_property_get_bool, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PrivateUsers", "b", bus_property_get_bool, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PrivateMounts", "b", bus_property_get_bool, offsetof(ExecContext, private_mounts), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PrivateIPC", "b", bus_property_get_bool, offsetof(ExecContext, private_ipc), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ProtectHome", "s", property_get_protect_home, offsetof(ExecContext, protect_home), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ProtectSystem", "s", property_get_protect_system, offsetof(ExecContext, protect_system), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SameProcessGroup", "b", bus_property_get_bool, offsetof(ExecContext, same_pgrp), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("UtmpIdentifier", "s", NULL, offsetof(ExecContext, utmp_id), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("UtmpMode", "s", property_get_exec_utmp_mode, offsetof(ExecContext, utmp_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SELinuxContext", "(bs)", property_get_selinux_context, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("AppArmorProfile", "(bs)", property_get_apparmor_profile, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SmackProcessLabel", "(bs)", property_get_smack_process_label, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("IgnoreSIGPIPE", "b", bus_property_get_bool, offsetof(ExecContext, ignore_sigpipe), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("NoNewPrivileges", "b", bus_property_get_bool, offsetof(ExecContext, no_new_privileges), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SystemCallFilter", "(bas)", property_get_syscall_filter, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SystemCallArchitectures", "as", property_get_syscall_archs, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SystemCallErrorNumber", "i", bus_property_get_int, offsetof(ExecContext, syscall_errno), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SystemCallLog", "(bas)", property_get_syscall_log, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Personality", "s", property_get_personality, offsetof(ExecContext, personality), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LockPersonality", "b", bus_property_get_bool, offsetof(ExecContext, lock_personality), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RestrictAddressFamilies", "(bas)", property_get_address_families, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RuntimeDirectorySymlink", "a(sst)", bus_property_get_exec_dir_symlink, offsetof(ExecContext, directories[EXEC_DIRECTORY_RUNTIME]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RuntimeDirectoryPreserve", "s", property_get_exec_preserve_mode, offsetof(ExecContext, runtime_directory_preserve_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RuntimeDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_RUNTIME].mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RuntimeDirectory", "as", bus_property_get_exec_dir, offsetof(ExecContext, directories[EXEC_DIRECTORY_RUNTIME]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("StateDirectorySymlink", "a(sst)", bus_property_get_exec_dir_symlink, offsetof(ExecContext, directories[EXEC_DIRECTORY_STATE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("StateDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_STATE].mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("StateDirectory", "as", bus_property_get_exec_dir, offsetof(ExecContext, directories[EXEC_DIRECTORY_STATE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CacheDirectorySymlink", "a(sst)", bus_property_get_exec_dir_symlink, offsetof(ExecContext, directories[EXEC_DIRECTORY_CACHE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CacheDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_CACHE].mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CacheDirectory", "as", bus_property_get_exec_dir, offsetof(ExecContext, directories[EXEC_DIRECTORY_CACHE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LogsDirectorySymlink", "a(sst)", bus_property_get_exec_dir_symlink, offsetof(ExecContext, directories[EXEC_DIRECTORY_LOGS]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LogsDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_LOGS].mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LogsDirectory", "as", bus_property_get_exec_dir, offsetof(ExecContext, directories[EXEC_DIRECTORY_LOGS]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ConfigurationDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_CONFIGURATION].mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ConfigurationDirectory", "as", bus_property_get_exec_dir, offsetof(ExecContext, directories[EXEC_DIRECTORY_CONFIGURATION]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TimeoutCleanUSec", "t", bus_property_get_usec, offsetof(ExecContext, timeout_clean_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("MemoryDenyWriteExecute", "b", bus_property_get_bool, offsetof(ExecContext, memory_deny_write_execute), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RestrictRealtime", "b", bus_property_get_bool, offsetof(ExecContext, restrict_realtime), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RestrictSUIDSGID", "b", bus_property_get_bool, offsetof(ExecContext, restrict_suid_sgid), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RestrictNamespaces", "t", bus_property_get_ulong, offsetof(ExecContext, restrict_namespaces), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RestrictFileSystems", "(bas)", property_get_restrict_filesystems, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("BindPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("BindReadOnlyPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TemporaryFileSystem", "a(ss)", property_get_temporary_filesystems, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("MountAPIVFS", "b", property_get_mount_apivfs, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("KeyringMode", "s", property_get_exec_keyring_mode, offsetof(ExecContext, keyring_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ProtectProc", "s", property_get_protect_proc, offsetof(ExecContext, protect_proc), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ProcSubset", "s", property_get_proc_subset, offsetof(ExecContext, proc_subset), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ProtectHostname", "b", bus_property_get_bool, offsetof(ExecContext, protect_hostname), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("NetworkNamespacePath", "s", NULL, offsetof(ExecContext, network_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("IPCNamespacePath", "s", NULL, offsetof(ExecContext, ipc_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST),
+
+ /* Obsolete/redundant properties: */
+ SD_BUS_PROPERTY("Capabilities", "s", property_get_empty_string, 0, SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_PROPERTY("ReadWriteDirectories", "as", NULL, offsetof(ExecContext, read_write_paths), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_PROPERTY("ReadOnlyDirectories", "as", NULL, offsetof(ExecContext, read_only_paths), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_PROPERTY("InaccessibleDirectories", "as", NULL, offsetof(ExecContext, inaccessible_paths), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_PROPERTY("IOScheduling", "i", property_get_ioprio, 0, SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+
+ SD_BUS_VTABLE_END
+};
+
+static int append_exec_command(sd_bus_message *reply, ExecCommand *c) {
+ int r;
+
+ assert(reply);
+ assert(c);
+
+ if (!c->path)
+ return 0;
+
+ r = sd_bus_message_open_container(reply, 'r', "sasbttttuii");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(reply, "s", c->path);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append_strv(reply, c->argv);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(reply, "bttttuii",
+ !!(c->flags & EXEC_COMMAND_IGNORE_FAILURE),
+ c->exec_status.start_timestamp.realtime,
+ c->exec_status.start_timestamp.monotonic,
+ c->exec_status.exit_timestamp.realtime,
+ c->exec_status.exit_timestamp.monotonic,
+ (uint32_t) c->exec_status.pid,
+ (int32_t) c->exec_status.code,
+ (int32_t) c->exec_status.status);
+ if (r < 0)
+ return r;
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int append_exec_ex_command(sd_bus_message *reply, ExecCommand *c) {
+ _cleanup_strv_free_ char **ex_opts = NULL;
+ int r;
+
+ assert(reply);
+ assert(c);
+
+ if (!c->path)
+ return 0;
+
+ r = sd_bus_message_open_container(reply, 'r', "sasasttttuii");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(reply, "s", c->path);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append_strv(reply, c->argv);
+ if (r < 0)
+ return r;
+
+ r = exec_command_flags_to_strv(c->flags, &ex_opts);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append_strv(reply, ex_opts);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(reply, "ttttuii",
+ c->exec_status.start_timestamp.realtime,
+ c->exec_status.start_timestamp.monotonic,
+ c->exec_status.exit_timestamp.realtime,
+ c->exec_status.exit_timestamp.monotonic,
+ (uint32_t) c->exec_status.pid,
+ (int32_t) c->exec_status.code,
+ (int32_t) c->exec_status.status);
+ if (r < 0)
+ return r;
+
+ return sd_bus_message_close_container(reply);
+}
+
+int bus_property_get_exec_command(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *ret_error) {
+
+ ExecCommand *c = (ExecCommand*) userdata;
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(sasbttttuii)");
+ if (r < 0)
+ return r;
+
+ r = append_exec_command(reply, c);
+ if (r < 0)
+ return r;
+
+ return sd_bus_message_close_container(reply);
+}
+
+int bus_property_get_exec_command_list(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *ret_error) {
+
+ ExecCommand *exec_command = *(ExecCommand**) userdata;
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(sasbttttuii)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(command, c, exec_command) {
+ r = append_exec_command(reply, c);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+int bus_property_get_exec_ex_command_list(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *ret_error) {
+
+ ExecCommand *exec_command = *(ExecCommand**) userdata;
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(sasasttttuii)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(command, c, exec_command) {
+ r = append_exec_ex_command(reply, c);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static char *exec_command_flags_to_exec_chars(ExecCommandFlags flags) {
+ return strjoin(FLAGS_SET(flags, EXEC_COMMAND_IGNORE_FAILURE) ? "-" : "",
+ FLAGS_SET(flags, EXEC_COMMAND_NO_ENV_EXPAND) ? ":" : "",
+ FLAGS_SET(flags, EXEC_COMMAND_FULLY_PRIVILEGED) ? "+" : "",
+ FLAGS_SET(flags, EXEC_COMMAND_NO_SETUID) ? "!" : "",
+ FLAGS_SET(flags, EXEC_COMMAND_AMBIENT_MAGIC) ? "!!" : "");
+}
+
+int bus_set_transient_exec_command(
+ Unit *u,
+ const char *name,
+ ExecCommand **exec_command,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+ bool is_ex_prop = endswith(name, "Ex");
+ unsigned n = 0;
+ int r;
+
+ r = sd_bus_message_enter_container(message, 'a', is_ex_prop ? "(sasas)" : "(sasb)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_enter_container(message, 'r', is_ex_prop ? "sasas" : "sasb")) > 0) {
+ _cleanup_strv_free_ char **argv = NULL, **ex_opts = NULL;
+ const char *path;
+ int b;
+
+ r = sd_bus_message_read(message, "s", &path);
+ if (r < 0)
+ return r;
+
+ if (!path_is_absolute(path) && !filename_is_valid(path))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "\"%s\" is neither a valid executable name nor an absolute path",
+ path);
+
+ r = sd_bus_message_read_strv(message, &argv);
+ if (r < 0)
+ return r;
+
+ if (strv_isempty(argv))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "\"%s\" argv cannot be empty", name);
+
+ r = is_ex_prop ? sd_bus_message_read_strv(message, &ex_opts) : sd_bus_message_read(message, "b", &b);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ ExecCommand *c;
+
+ c = new0(ExecCommand, 1);
+ if (!c)
+ return -ENOMEM;
+
+ c->path = strdup(path);
+ if (!c->path) {
+ free(c);
+ return -ENOMEM;
+ }
+
+ c->argv = TAKE_PTR(argv);
+
+ if (is_ex_prop) {
+ r = exec_command_flags_from_strv(ex_opts, &c->flags);
+ if (r < 0)
+ return r;
+ } else
+ c->flags = b ? EXEC_COMMAND_IGNORE_FAILURE : 0;
+
+ path_simplify(c->path);
+ exec_command_append_list(exec_command, c);
+ }
+
+ n++;
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *buf = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+ size_t size = 0;
+
+ if (n == 0)
+ *exec_command = exec_command_free_list(*exec_command);
+
+ f = open_memstream_unlocked(&buf, &size);
+ if (!f)
+ return -ENOMEM;
+
+ fprintf(f, "%s=\n", name);
+
+ LIST_FOREACH(command, c, *exec_command) {
+ _cleanup_free_ char *a = NULL, *exec_chars = NULL;
+
+ exec_chars = exec_command_flags_to_exec_chars(c->flags);
+ if (!exec_chars)
+ return -ENOMEM;
+
+ a = unit_concat_strv(c->argv, UNIT_ESCAPE_SPECIFIERS|UNIT_ESCAPE_EXEC_SYNTAX);
+ if (!a)
+ return -ENOMEM;
+
+ if (streq_ptr(c->path, c->argv ? c->argv[0] : NULL))
+ fprintf(f, "%s=%s%s\n", name, exec_chars, a);
+ else {
+ _cleanup_free_ char *t = NULL;
+ const char *p;
+
+ p = unit_escape_setting(c->path,
+ UNIT_ESCAPE_SPECIFIERS|UNIT_ESCAPE_EXEC_SYNTAX, &t);
+ if (!p)
+ return -ENOMEM;
+
+ fprintf(f, "%s=%s@%s %s\n", name, exec_chars, p, a);
+ }
+ }
+
+ r = fflush_and_check(f);
+ if (r < 0)
+ return r;
+
+ unit_write_setting(u, flags, name, buf);
+ }
+
+ return 1;
+}
+
+static int parse_personality(const char *s, unsigned long *p) {
+ unsigned long v;
+
+ assert(p);
+
+ v = personality_from_string(s);
+ if (v == PERSONALITY_INVALID)
+ return -EINVAL;
+
+ *p = v;
+ return 0;
+}
+
+static const char* mount_propagation_flags_to_string_with_check(unsigned long n) {
+ if (!IN_SET(n, 0, MS_SHARED, MS_PRIVATE, MS_SLAVE))
+ return NULL;
+
+ return mount_propagation_flags_to_string(n);
+}
+
+static BUS_DEFINE_SET_TRANSIENT(nsec, "t", uint64_t, nsec_t, NSEC_FMT);
+static BUS_DEFINE_SET_TRANSIENT_IS_VALID(log_level, "i", int32_t, int, "%" PRIi32, log_level_is_valid);
+#if HAVE_SECCOMP
+static BUS_DEFINE_SET_TRANSIENT_IS_VALID(errno, "i", int32_t, int, "%" PRIi32, seccomp_errno_or_action_is_valid);
+#endif
+static BUS_DEFINE_SET_TRANSIENT_PARSE(std_input, ExecInput, exec_input_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(std_output, ExecOutput, exec_output_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(utmp_mode, ExecUtmpMode, exec_utmp_mode_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_system, ProtectSystem, protect_system_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_home, ProtectHome, protect_home_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(keyring_mode, ExecKeyringMode, exec_keyring_mode_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_proc, ProtectProc, protect_proc_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(proc_subset, ProcSubset, proc_subset_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(preserve_mode, ExecPreserveMode, exec_preserve_mode_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE_PTR(personality, unsigned long, parse_personality);
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(secure_bits, "i", int32_t, int, "%" PRIi32, secure_bits_to_string_alloc_with_check);
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(capability, "t", uint64_t, uint64_t, "%" PRIu64, capability_set_to_string_alloc);
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(namespace_flag, "t", uint64_t, unsigned long, "%" PRIu64, namespace_flags_to_string);
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING(mount_flags, "t", uint64_t, unsigned long, "%" PRIu64, mount_propagation_flags_to_string_with_check);
+
+int bus_exec_context_set_transient_property(
+ Unit *u,
+ ExecContext *c,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ const char *suffix;
+ int r;
+
+ assert(u);
+ assert(c);
+ assert(name);
+ assert(message);
+
+ flags |= UNIT_PRIVATE;
+
+ if (streq(name, "User"))
+ return bus_set_transient_user_relaxed(u, name, &c->user, message, flags, error);
+
+ if (streq(name, "Group"))
+ return bus_set_transient_user_relaxed(u, name, &c->group, message, flags, error);
+
+ if (streq(name, "TTYPath"))
+ return bus_set_transient_path(u, name, &c->tty_path, message, flags, error);
+
+ if (streq(name, "RootImage"))
+ return bus_set_transient_path(u, name, &c->root_image, message, flags, error);
+
+ if (streq(name, "RootImageOptions")) {
+ _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
+ _cleanup_free_ char *format_str = NULL;
+
+ r = bus_read_mount_options(message, error, &options, &format_str, " ");
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (options) {
+ LIST_JOIN(mount_options, c->root_image_options, options);
+ unit_write_settingf(
+ u, flags|UNIT_ESCAPE_SPECIFIERS, name,
+ "%s=%s",
+ name,
+ format_str);
+ } else {
+ c->root_image_options = mount_options_free_all(c->root_image_options);
+ unit_write_settingf(u, flags, name, "%s=", name);
+ }
+ }
+
+ return 1;
+ }
+
+ if (streq(name, "RootHash")) {
+ const void *roothash_decoded;
+ size_t roothash_decoded_size;
+
+ r = sd_bus_message_read_array(message, 'y', &roothash_decoded, &roothash_decoded_size);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *encoded = NULL;
+
+ if (roothash_decoded_size == 0) {
+ c->root_hash_path = mfree(c->root_hash_path);
+ c->root_hash = mfree(c->root_hash);
+ c->root_hash_size = 0;
+
+ unit_write_settingf(u, flags, name, "RootHash=");
+ } else {
+ _cleanup_free_ void *p = NULL;
+
+ encoded = hexmem(roothash_decoded, roothash_decoded_size);
+ if (!encoded)
+ return -ENOMEM;
+
+ p = memdup(roothash_decoded, roothash_decoded_size);
+ if (!p)
+ return -ENOMEM;
+
+ free_and_replace(c->root_hash, p);
+ c->root_hash_size = roothash_decoded_size;
+ c->root_hash_path = mfree(c->root_hash_path);
+
+ unit_write_settingf(u, flags, name, "RootHash=%s", encoded);
+ }
+ }
+
+ return 1;
+ }
+
+ if (streq(name, "RootHashPath")) {
+ c->root_hash_size = 0;
+ c->root_hash = mfree(c->root_hash);
+
+ return bus_set_transient_path(u, "RootHash", &c->root_hash_path, message, flags, error);
+ }
+
+ if (streq(name, "RootHashSignature")) {
+ const void *roothash_sig_decoded;
+ size_t roothash_sig_decoded_size;
+
+ r = sd_bus_message_read_array(message, 'y', &roothash_sig_decoded, &roothash_sig_decoded_size);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *encoded = NULL;
+
+ if (roothash_sig_decoded_size == 0) {
+ c->root_hash_sig_path = mfree(c->root_hash_sig_path);
+ c->root_hash_sig = mfree(c->root_hash_sig);
+ c->root_hash_sig_size = 0;
+
+ unit_write_settingf(u, flags, name, "RootHashSignature=");
+ } else {
+ _cleanup_free_ void *p = NULL;
+ ssize_t len;
+
+ len = base64mem(roothash_sig_decoded, roothash_sig_decoded_size, &encoded);
+ if (len < 0)
+ return -ENOMEM;
+
+ p = memdup(roothash_sig_decoded, roothash_sig_decoded_size);
+ if (!p)
+ return -ENOMEM;
+
+ free_and_replace(c->root_hash_sig, p);
+ c->root_hash_sig_size = roothash_sig_decoded_size;
+ c->root_hash_sig_path = mfree(c->root_hash_sig_path);
+
+ unit_write_settingf(u, flags, name, "RootHashSignature=base64:%s", encoded);
+ }
+ }
+
+ return 1;
+ }
+
+ if (streq(name, "RootHashSignaturePath")) {
+ c->root_hash_sig_size = 0;
+ c->root_hash_sig = mfree(c->root_hash_sig);
+
+ return bus_set_transient_path(u, "RootHashSignature", &c->root_hash_sig_path, message, flags, error);
+ }
+
+ if (streq(name, "RootVerity"))
+ return bus_set_transient_path(u, name, &c->root_verity, message, flags, error);
+
+ if (streq(name, "RootDirectory"))
+ return bus_set_transient_path(u, name, &c->root_directory, message, flags, error);
+
+ if (streq(name, "SyslogIdentifier"))
+ return bus_set_transient_string(u, name, &c->syslog_identifier, message, flags, error);
+
+ if (streq(name, "LogLevelMax"))
+ return bus_set_transient_log_level(u, name, &c->log_level_max, message, flags, error);
+
+ if (streq(name, "LogRateLimitIntervalUSec"))
+ return bus_set_transient_usec(u, name, &c->log_ratelimit_interval_usec, message, flags, error);
+
+ if (streq(name, "LogRateLimitBurst"))
+ return bus_set_transient_unsigned(u, name, &c->log_ratelimit_burst, message, flags, error);
+
+ if (streq(name, "Personality"))
+ return bus_set_transient_personality(u, name, &c->personality, message, flags, error);
+
+ if (streq(name, "StandardInput"))
+ return bus_set_transient_std_input(u, name, &c->std_input, message, flags, error);
+
+ if (streq(name, "StandardOutput"))
+ return bus_set_transient_std_output(u, name, &c->std_output, message, flags, error);
+
+ if (streq(name, "StandardError"))
+ return bus_set_transient_std_output(u, name, &c->std_error, message, flags, error);
+
+ if (streq(name, "IgnoreSIGPIPE"))
+ return bus_set_transient_bool(u, name, &c->ignore_sigpipe, message, flags, error);
+
+ if (streq(name, "TTYVHangup"))
+ return bus_set_transient_bool(u, name, &c->tty_vhangup, message, flags, error);
+
+ if (streq(name, "TTYReset"))
+ return bus_set_transient_bool(u, name, &c->tty_reset, message, flags, error);
+
+ if (streq(name, "TTYVTDisallocate"))
+ return bus_set_transient_bool(u, name, &c->tty_vt_disallocate, message, flags, error);
+
+ if (streq(name, "TTYRows"))
+ return bus_set_transient_unsigned(u, name, &c->tty_rows, message, flags, error);
+
+ if (streq(name, "TTYColumns"))
+ return bus_set_transient_unsigned(u, name, &c->tty_cols, message, flags, error);
+
+ if (streq(name, "PrivateTmp"))
+ return bus_set_transient_bool(u, name, &c->private_tmp, message, flags, error);
+
+ if (streq(name, "PrivateDevices"))
+ return bus_set_transient_bool(u, name, &c->private_devices, message, flags, error);
+
+ if (streq(name, "PrivateMounts"))
+ return bus_set_transient_bool(u, name, &c->private_mounts, message, flags, error);
+
+ if (streq(name, "PrivateNetwork"))
+ return bus_set_transient_bool(u, name, &c->private_network, message, flags, error);
+
+ if (streq(name, "PrivateIPC"))
+ return bus_set_transient_bool(u, name, &c->private_ipc, message, flags, error);
+
+ if (streq(name, "PrivateUsers"))
+ return bus_set_transient_bool(u, name, &c->private_users, message, flags, error);
+
+ if (streq(name, "NoNewPrivileges"))
+ return bus_set_transient_bool(u, name, &c->no_new_privileges, message, flags, error);
+
+ if (streq(name, "SyslogLevelPrefix"))
+ return bus_set_transient_bool(u, name, &c->syslog_level_prefix, message, flags, error);
+
+ if (streq(name, "MemoryDenyWriteExecute"))
+ return bus_set_transient_bool(u, name, &c->memory_deny_write_execute, message, flags, error);
+
+ if (streq(name, "RestrictRealtime"))
+ return bus_set_transient_bool(u, name, &c->restrict_realtime, message, flags, error);
+
+ if (streq(name, "RestrictSUIDSGID"))
+ return bus_set_transient_bool(u, name, &c->restrict_suid_sgid, message, flags, error);
+
+ if (streq(name, "DynamicUser"))
+ return bus_set_transient_bool(u, name, &c->dynamic_user, message, flags, error);
+
+ if (streq(name, "RemoveIPC"))
+ return bus_set_transient_bool(u, name, &c->remove_ipc, message, flags, error);
+
+ if (streq(name, "ProtectKernelTunables"))
+ return bus_set_transient_bool(u, name, &c->protect_kernel_tunables, message, flags, error);
+
+ if (streq(name, "ProtectKernelModules"))
+ return bus_set_transient_bool(u, name, &c->protect_kernel_modules, message, flags, error);
+
+ if (streq(name, "ProtectKernelLogs"))
+ return bus_set_transient_bool(u, name, &c->protect_kernel_logs, message, flags, error);
+
+ if (streq(name, "ProtectClock"))
+ return bus_set_transient_bool(u, name, &c->protect_clock, message, flags, error);
+
+ if (streq(name, "ProtectControlGroups"))
+ return bus_set_transient_bool(u, name, &c->protect_control_groups, message, flags, error);
+
+ if (streq(name, "CPUSchedulingResetOnFork"))
+ return bus_set_transient_bool(u, name, &c->cpu_sched_reset_on_fork, message, flags, error);
+
+ if (streq(name, "NonBlocking"))
+ return bus_set_transient_bool(u, name, &c->non_blocking, message, flags, error);
+
+ if (streq(name, "LockPersonality"))
+ return bus_set_transient_bool(u, name, &c->lock_personality, message, flags, error);
+
+ if (streq(name, "ProtectHostname"))
+ return bus_set_transient_bool(u, name, &c->protect_hostname, message, flags, error);
+
+ if (streq(name, "UtmpIdentifier"))
+ return bus_set_transient_string(u, name, &c->utmp_id, message, flags, error);
+
+ if (streq(name, "UtmpMode"))
+ return bus_set_transient_utmp_mode(u, name, &c->utmp_mode, message, flags, error);
+
+ if (streq(name, "PAMName"))
+ return bus_set_transient_string(u, name, &c->pam_name, message, flags, error);
+
+ if (streq(name, "TimerSlackNSec"))
+ return bus_set_transient_nsec(u, name, &c->timer_slack_nsec, message, flags, error);
+
+ if (streq(name, "ProtectSystem"))
+ return bus_set_transient_protect_system(u, name, &c->protect_system, message, flags, error);
+
+ if (streq(name, "ProtectHome"))
+ return bus_set_transient_protect_home(u, name, &c->protect_home, message, flags, error);
+
+ if (streq(name, "KeyringMode"))
+ return bus_set_transient_keyring_mode(u, name, &c->keyring_mode, message, flags, error);
+
+ if (streq(name, "ProtectProc"))
+ return bus_set_transient_protect_proc(u, name, &c->protect_proc, message, flags, error);
+
+ if (streq(name, "ProcSubset"))
+ return bus_set_transient_proc_subset(u, name, &c->proc_subset, message, flags, error);
+
+ if (streq(name, "RuntimeDirectoryPreserve"))
+ return bus_set_transient_preserve_mode(u, name, &c->runtime_directory_preserve_mode, message, flags, error);
+
+ if (streq(name, "UMask"))
+ return bus_set_transient_mode_t(u, name, &c->umask, message, flags, error);
+
+ if (streq(name, "RuntimeDirectoryMode"))
+ return bus_set_transient_mode_t(u, name, &c->directories[EXEC_DIRECTORY_RUNTIME].mode, message, flags, error);
+
+ if (streq(name, "StateDirectoryMode"))
+ return bus_set_transient_mode_t(u, name, &c->directories[EXEC_DIRECTORY_STATE].mode, message, flags, error);
+
+ if (streq(name, "CacheDirectoryMode"))
+ return bus_set_transient_mode_t(u, name, &c->directories[EXEC_DIRECTORY_CACHE].mode, message, flags, error);
+
+ if (streq(name, "LogsDirectoryMode"))
+ return bus_set_transient_mode_t(u, name, &c->directories[EXEC_DIRECTORY_LOGS].mode, message, flags, error);
+
+ if (streq(name, "ConfigurationDirectoryMode"))
+ return bus_set_transient_mode_t(u, name, &c->directories[EXEC_DIRECTORY_CONFIGURATION].mode, message, flags, error);
+
+ if (streq(name, "SELinuxContext"))
+ return bus_set_transient_string(u, name, &c->selinux_context, message, flags, error);
+
+ if (streq(name, "SecureBits"))
+ return bus_set_transient_secure_bits(u, name, &c->secure_bits, message, flags, error);
+
+ if (streq(name, "CapabilityBoundingSet"))
+ return bus_set_transient_capability(u, name, &c->capability_bounding_set, message, flags, error);
+
+ if (streq(name, "AmbientCapabilities"))
+ return bus_set_transient_capability(u, name, &c->capability_ambient_set, message, flags, error);
+
+ if (streq(name, "RestrictNamespaces"))
+ return bus_set_transient_namespace_flag(u, name, &c->restrict_namespaces, message, flags, error);
+
+ if (streq(name, "RestrictFileSystems")) {
+ int allow_list;
+ _cleanup_strv_free_ char **l = NULL;
+
+ r = sd_bus_message_enter_container(message, 'r', "bas");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "b", &allow_list);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *joined = NULL;
+ FilesystemParseFlags invert_flag = allow_list ? 0 : FILESYSTEM_PARSE_INVERT;
+
+ if (strv_isempty(l)) {
+ c->restrict_filesystems_allow_list = false;
+ c->restrict_filesystems = set_free_free(c->restrict_filesystems);
+
+ unit_write_setting(u, flags, name, "RestrictFileSystems=");
+ return 1;
+ }
+
+ if (!c->restrict_filesystems)
+ c->restrict_filesystems_allow_list = allow_list;
+
+ STRV_FOREACH(s, l) {
+ r = lsm_bpf_parse_filesystem(
+ *s,
+ &c->restrict_filesystems,
+ FILESYSTEM_PARSE_LOG|
+ (invert_flag ? FILESYSTEM_PARSE_INVERT : 0)|
+ (c->restrict_filesystems_allow_list ? FILESYSTEM_PARSE_ALLOW_LIST : 0),
+ u->id, NULL, 0);
+ if (r < 0)
+ return r;
+ }
+
+ joined = strv_join(l, " ");
+ if (!joined)
+ return -ENOMEM;
+
+ unit_write_settingf(u, flags, name, "%s=%s%s", name, allow_list ? "" : "~", joined);
+ }
+
+ return 1;
+ }
+
+ if (streq(name, "MountFlags"))
+ return bus_set_transient_mount_flags(u, name, &c->mount_flags, message, flags, error);
+
+ if (streq(name, "NetworkNamespacePath"))
+ return bus_set_transient_path(u, name, &c->network_namespace_path, message, flags, error);
+
+ if (streq(name, "IPCNamespacePath"))
+ return bus_set_transient_path(u, name, &c->ipc_namespace_path, message, flags, error);
+
+ if (streq(name, "SupplementaryGroups")) {
+ _cleanup_strv_free_ char **l = NULL;
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ STRV_FOREACH(p, l)
+ if (!isempty(*p) && !valid_user_group_name(*p, VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX|VALID_USER_WARN))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Invalid supplementary group names");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (strv_isempty(l)) {
+ c->supplementary_groups = strv_free(c->supplementary_groups);
+ unit_write_settingf(u, flags, name, "%s=", name);
+ } else {
+ _cleanup_free_ char *joined = NULL;
+
+ r = strv_extend_strv(&c->supplementary_groups, l, true);
+ if (r < 0)
+ return -ENOMEM;
+
+ joined = strv_join(c->supplementary_groups, " ");
+ if (!joined)
+ return -ENOMEM;
+
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s", name, joined);
+ }
+ }
+
+ return 1;
+
+ } else if (STR_IN_SET(name, "SetCredential", "SetCredentialEncrypted")) {
+ bool isempty = true;
+
+ r = sd_bus_message_enter_container(message, 'a', "(say)");
+ if (r < 0)
+ return r;
+
+ for (;;) {
+ const char *id;
+ const void *p;
+ size_t sz;
+
+ r = sd_bus_message_enter_container(message, 'r', "say");
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ r = sd_bus_message_read(message, "s", &id);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read_array(message, 'y', &p, &sz);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!credential_name_valid(id))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Credential ID is invalid: %s", id);
+
+ isempty = false;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *a = NULL, *b = NULL;
+ _cleanup_free_ void *copy = NULL;
+ ExecSetCredential *old;
+
+ copy = memdup(p, sz);
+ if (!copy)
+ return -ENOMEM;
+
+ old = hashmap_get(c->set_credentials, id);
+ if (old) {
+ free_and_replace(old->data, copy);
+ old->size = sz;
+ old->encrypted = streq(name, "SetCredentialEncrypted");
+ } else {
+ _cleanup_(exec_set_credential_freep) ExecSetCredential *sc = NULL;
+
+ sc = new(ExecSetCredential, 1);
+ if (!sc)
+ return -ENOMEM;
+
+ *sc = (ExecSetCredential) {
+ .id = strdup(id),
+ .data = TAKE_PTR(copy),
+ .size = sz,
+ .encrypted = streq(name, "SetCredentialEncrypted"),
+ };
+
+ if (!sc->id)
+ return -ENOMEM;
+
+ r = hashmap_ensure_put(&c->set_credentials, &exec_set_credential_hash_ops, sc->id, sc);
+ if (r < 0)
+ return r;
+
+ TAKE_PTR(sc);
+ }
+
+ a = specifier_escape(id);
+ if (!a)
+ return -ENOMEM;
+
+ b = cescape_length(p, sz);
+ if (!b)
+ return -ENOMEM;
+
+ (void) unit_write_settingf(u, flags, name, "%s=%s:%s", name, a, b);
+ }
+ }
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags) && isempty) {
+ c->set_credentials = hashmap_free(c->set_credentials);
+ (void) unit_write_settingf(u, flags, name, "%s=", name);
+ }
+
+ return 1;
+
+ } else if (STR_IN_SET(name, "LoadCredential", "LoadCredentialEncrypted")) {
+ bool isempty = true;
+
+ r = sd_bus_message_enter_container(message, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ for (;;) {
+ const char *id, *source;
+
+ r = sd_bus_message_read(message, "(ss)", &id, &source);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ if (!credential_name_valid(id))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Credential ID is invalid: %s", id);
+
+ if (!(path_is_absolute(source) ? path_is_normalized(source) : credential_name_valid(source)))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Credential source is invalid: %s", source);
+
+ isempty = false;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *copy = NULL;
+ ExecLoadCredential *old;
+
+ copy = strdup(source);
+ if (!copy)
+ return -ENOMEM;
+
+ old = hashmap_get(c->load_credentials, id);
+ if (old) {
+ free_and_replace(old->path, copy);
+ old->encrypted = streq(name, "LoadCredentialEncrypted");
+ } else {
+ _cleanup_(exec_load_credential_freep) ExecLoadCredential *lc = NULL;
+
+ lc = new(ExecLoadCredential, 1);
+ if (!lc)
+ return -ENOMEM;
+
+ *lc = (ExecLoadCredential) {
+ .id = strdup(id),
+ .path = TAKE_PTR(copy),
+ .encrypted = streq(name, "LoadCredentialEncrypted"),
+ };
+
+ if (!lc->id)
+ return -ENOMEM;
+
+ r = hashmap_ensure_put(&c->load_credentials, &exec_load_credential_hash_ops, lc->id, lc);
+ if (r < 0)
+ return r;
+
+ TAKE_PTR(lc);
+ }
+
+ (void) unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s:%s", name, id, source);
+ }
+ }
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags) && isempty) {
+ c->load_credentials = hashmap_free(c->load_credentials);
+ (void) unit_write_settingf(u, flags, name, "%s=", name);
+ }
+
+ return 1;
+
+ } else if (streq(name, "SyslogLevel")) {
+ int32_t level;
+
+ r = sd_bus_message_read(message, "i", &level);
+ if (r < 0)
+ return r;
+
+ if (!log_level_is_valid(level))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Log level value out of range");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->syslog_priority = (c->syslog_priority & LOG_FACMASK) | level;
+ unit_write_settingf(u, flags, name, "SyslogLevel=%i", level);
+ }
+
+ return 1;
+
+ } else if (streq(name, "SyslogFacility")) {
+ int32_t facility;
+
+ r = sd_bus_message_read(message, "i", &facility);
+ if (r < 0)
+ return r;
+
+ if (!log_facility_unshifted_is_valid(facility))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Log facility value out of range");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->syslog_priority = (facility << 3) | LOG_PRI(c->syslog_priority);
+ unit_write_settingf(u, flags, name, "SyslogFacility=%i", facility);
+ }
+
+ return 1;
+
+ } else if (streq(name, "LogNamespace")) {
+ const char *n;
+
+ r = sd_bus_message_read(message, "s", &n);
+ if (r < 0)
+ return r;
+
+ if (!isempty(n) && !log_namespace_name_valid(n))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Log namespace name not valid");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+
+ if (isempty(n)) {
+ c->log_namespace = mfree(c->log_namespace);
+ unit_write_settingf(u, flags, name, "%s=", name);
+ } else {
+ r = free_and_strdup(&c->log_namespace, n);
+ if (r < 0)
+ return r;
+
+ unit_write_settingf(u, flags, name, "%s=%s", name, n);
+ }
+ }
+
+ return 1;
+
+ } else if (streq(name, "LogExtraFields")) {
+ size_t n = 0;
+
+ r = sd_bus_message_enter_container(message, 'a', "ay");
+ if (r < 0)
+ return r;
+
+ for (;;) {
+ _cleanup_free_ void *copy = NULL;
+ struct iovec *t;
+ const char *eq;
+ const void *p;
+ size_t sz;
+
+ /* Note that we expect a byte array for each field, instead of a string. That's because on the
+ * lower-level journal fields can actually contain binary data and are not restricted to text,
+ * and we should not "lose precision" in our types on the way. That said, I am pretty sure
+ * actually encoding binary data as unit metadata is not a good idea. Hence we actually refuse
+ * any actual binary data, and only accept UTF-8. This allows us to eventually lift this
+ * limitation, should a good, valid usecase arise. */
+
+ r = sd_bus_message_read_array(message, 'y', &p, &sz);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ if (memchr(p, 0, sz))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Journal field contains zero byte");
+
+ eq = memchr(p, '=', sz);
+ if (!eq)
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Journal field contains no '=' character");
+ if (!journal_field_valid(p, eq - (const char*) p, false))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Journal field invalid");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ t = reallocarray(c->log_extra_fields, c->n_log_extra_fields+1, sizeof(struct iovec));
+ if (!t)
+ return -ENOMEM;
+ c->log_extra_fields = t;
+ }
+
+ copy = malloc(sz + 1);
+ if (!copy)
+ return -ENOMEM;
+
+ memcpy(copy, p, sz);
+ ((uint8_t*) copy)[sz] = 0;
+
+ if (!utf8_is_valid(copy))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Journal field is not valid UTF-8");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->log_extra_fields[c->n_log_extra_fields++] = IOVEC_MAKE(copy, sz);
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS|UNIT_ESCAPE_C, name, "LogExtraFields=%s", (char*) copy);
+
+ copy = NULL;
+ }
+
+ n++;
+ }
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags) && n == 0) {
+ exec_context_free_log_extra_fields(c);
+ unit_write_setting(u, flags, name, "LogExtraFields=");
+ }
+
+ return 1;
+ }
+
+#if HAVE_SECCOMP
+
+ if (streq(name, "SystemCallErrorNumber"))
+ return bus_set_transient_errno(u, name, &c->syscall_errno, message, flags, error);
+
+ if (streq(name, "SystemCallFilter")) {
+ int allow_list;
+ _cleanup_strv_free_ char **l = NULL;
+
+ r = sd_bus_message_enter_container(message, 'r', "bas");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "b", &allow_list);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *joined = NULL;
+ SeccompParseFlags invert_flag = allow_list ? 0 : SECCOMP_PARSE_INVERT;
+
+ if (strv_isempty(l)) {
+ c->syscall_allow_list = false;
+ c->syscall_filter = hashmap_free(c->syscall_filter);
+
+ unit_write_settingf(u, flags, name, "SystemCallFilter=");
+ return 1;
+ }
+
+ if (!c->syscall_filter) {
+ c->syscall_filter = hashmap_new(NULL);
+ if (!c->syscall_filter)
+ return log_oom();
+
+ c->syscall_allow_list = allow_list;
+
+ if (c->syscall_allow_list) {
+ r = seccomp_parse_syscall_filter("@default",
+ -1,
+ c->syscall_filter,
+ SECCOMP_PARSE_PERMISSIVE |
+ SECCOMP_PARSE_ALLOW_LIST,
+ u->id,
+ NULL, 0);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ STRV_FOREACH(s, l) {
+ _cleanup_free_ char *n = NULL;
+ int e;
+
+ r = parse_syscall_and_errno(*s, &n, &e);
+ if (r < 0)
+ return r;
+
+ if (allow_list && e >= 0)
+ return -EINVAL;
+
+ r = seccomp_parse_syscall_filter(n,
+ e,
+ c->syscall_filter,
+ SECCOMP_PARSE_LOG | SECCOMP_PARSE_PERMISSIVE |
+ invert_flag |
+ (c->syscall_allow_list ? SECCOMP_PARSE_ALLOW_LIST : 0),
+ u->id,
+ NULL, 0);
+ if (r < 0)
+ return r;
+ }
+
+ joined = strv_join(l, " ");
+ if (!joined)
+ return -ENOMEM;
+
+ unit_write_settingf(u, flags, name, "SystemCallFilter=%s%s", allow_list ? "" : "~", joined);
+ }
+
+ return 1;
+
+ } else if (streq(name, "SystemCallLog")) {
+ int allow_list;
+ _cleanup_strv_free_ char **l = NULL;
+
+ r = sd_bus_message_enter_container(message, 'r', "bas");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "b", &allow_list);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *joined = NULL;
+ SeccompParseFlags invert_flag = allow_list ? 0 : SECCOMP_PARSE_INVERT;
+
+ if (strv_isempty(l)) {
+ c->syscall_log_allow_list = false;
+ c->syscall_log = hashmap_free(c->syscall_log);
+
+ unit_write_settingf(u, flags, name, "SystemCallLog=");
+ return 1;
+ }
+
+ if (!c->syscall_log) {
+ c->syscall_log = hashmap_new(NULL);
+ if (!c->syscall_log)
+ return log_oom();
+
+ c->syscall_log_allow_list = allow_list;
+ }
+
+ STRV_FOREACH(s, l) {
+ r = seccomp_parse_syscall_filter(*s,
+ -1, /* errno not used */
+ c->syscall_log,
+ SECCOMP_PARSE_LOG | SECCOMP_PARSE_PERMISSIVE |
+ invert_flag |
+ (c->syscall_log_allow_list ? SECCOMP_PARSE_ALLOW_LIST : 0),
+ u->id,
+ NULL, 0);
+ if (r < 0)
+ return r;
+ }
+
+ joined = strv_join(l, " ");
+ if (!joined)
+ return -ENOMEM;
+
+ unit_write_settingf(u, flags, name, "SystemCallLog=%s%s", allow_list ? "" : "~", joined);
+ }
+
+ return 1;
+
+ } else if (streq(name, "SystemCallArchitectures")) {
+ _cleanup_strv_free_ char **l = NULL;
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *joined = NULL;
+
+ if (strv_isempty(l))
+ c->syscall_archs = set_free(c->syscall_archs);
+ else
+ STRV_FOREACH(s, l) {
+ uint32_t a;
+
+ r = seccomp_arch_from_string(*s, &a);
+ if (r < 0)
+ return r;
+
+ r = set_ensure_put(&c->syscall_archs, NULL, UINT32_TO_PTR(a + 1));
+ if (r < 0)
+ return r;
+ }
+
+ joined = strv_join(l, " ");
+ if (!joined)
+ return -ENOMEM;
+
+ unit_write_settingf(u, flags, name, "%s=%s", name, joined);
+ }
+
+ return 1;
+
+ } else if (streq(name, "RestrictAddressFamilies")) {
+ _cleanup_strv_free_ char **l = NULL;
+ int allow_list;
+
+ r = sd_bus_message_enter_container(message, 'r', "bas");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "b", &allow_list);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *joined = NULL;
+
+ if (strv_isempty(l)) {
+ c->address_families_allow_list = allow_list;
+ c->address_families = set_free(c->address_families);
+
+ unit_write_settingf(u, flags, name, "RestrictAddressFamilies=%s",
+ allow_list ? "none" : "");
+ return 1;
+ }
+
+ if (!c->address_families) {
+ c->address_families = set_new(NULL);
+ if (!c->address_families)
+ return log_oom();
+
+ c->address_families_allow_list = allow_list;
+ }
+
+ STRV_FOREACH(s, l) {
+ int af;
+
+ af = af_from_name(*s);
+ if (af < 0)
+ return af;
+
+ if (allow_list == c->address_families_allow_list) {
+ r = set_put(c->address_families, INT_TO_PTR(af));
+ if (r < 0)
+ return r;
+ } else
+ set_remove(c->address_families, INT_TO_PTR(af));
+ }
+
+ joined = strv_join(l, " ");
+ if (!joined)
+ return -ENOMEM;
+
+ unit_write_settingf(u, flags, name, "RestrictAddressFamilies=%s%s", allow_list ? "" : "~", joined);
+ }
+
+ return 1;
+ }
+#endif
+ if (STR_IN_SET(name, "CPUAffinity", "NUMAMask")) {
+ const void *a;
+ size_t n;
+ bool affinity = streq(name, "CPUAffinity");
+ _cleanup_(cpu_set_reset) CPUSet set = {};
+
+ r = sd_bus_message_read_array(message, 'y', &a, &n);
+ if (r < 0)
+ return r;
+
+ r = cpu_set_from_dbus(a, n, &set);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (n == 0) {
+ cpu_set_reset(affinity ? &c->cpu_set : &c->numa_policy.nodes);
+ unit_write_settingf(u, flags, name, "%s=", name);
+ } else {
+ _cleanup_free_ char *str = NULL;
+
+ str = cpu_set_to_string(&set);
+ if (!str)
+ return -ENOMEM;
+
+ /* We forego any optimizations here, and always create the structure using
+ * cpu_set_add_all(), because we don't want to care if the existing size we
+ * got over dbus is appropriate. */
+ r = cpu_set_add_all(affinity ? &c->cpu_set : &c->numa_policy.nodes, &set);
+ if (r < 0)
+ return r;
+
+ unit_write_settingf(u, flags, name, "%s=%s", name, str);
+ }
+ }
+
+ return 1;
+
+ } else if (streq(name, "CPUAffinityFromNUMA")) {
+ int q;
+
+ r = sd_bus_message_read_basic(message, 'b', &q);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->cpu_affinity_from_numa = q;
+ unit_write_settingf(u, flags, name, "%s=%s", "CPUAffinity", "numa");
+ }
+
+ return 1;
+
+ } else if (streq(name, "NUMAPolicy")) {
+ int32_t type;
+
+ r = sd_bus_message_read(message, "i", &type);
+ if (r < 0)
+ return r;
+
+ if (!mpol_is_valid(type))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid NUMAPolicy value: %i", type);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags))
+ c->numa_policy.type = type;
+
+ return 1;
+
+ } else if (streq(name, "Nice")) {
+ int32_t q;
+
+ r = sd_bus_message_read(message, "i", &q);
+ if (r < 0)
+ return r;
+
+ if (!nice_is_valid(q))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid Nice value: %i", q);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->nice = q;
+ c->nice_set = true;
+
+ unit_write_settingf(u, flags, name, "Nice=%i", q);
+ }
+
+ return 1;
+
+ } else if (streq(name, "CPUSchedulingPolicy")) {
+ int32_t q;
+
+ r = sd_bus_message_read(message, "i", &q);
+ if (r < 0)
+ return r;
+
+ if (!sched_policy_is_valid(q))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid CPU scheduling policy: %i", q);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *s = NULL;
+
+ r = sched_policy_to_string_alloc(q, &s);
+ if (r < 0)
+ return r;
+
+ c->cpu_sched_policy = q;
+ c->cpu_sched_priority = CLAMP(c->cpu_sched_priority, sched_get_priority_min(q), sched_get_priority_max(q));
+ c->cpu_sched_set = true;
+
+ unit_write_settingf(u, flags, name, "CPUSchedulingPolicy=%s", s);
+ }
+
+ return 1;
+
+ } else if (streq(name, "CPUSchedulingPriority")) {
+ int32_t p;
+
+ r = sd_bus_message_read(message, "i", &p);
+ if (r < 0)
+ return r;
+
+ /* On Linux RR/FIFO range from 1 to 99 and OTHER/BATCH may only be 0. Policy might be set
+ * later so we do not check the precise range, but only the generic outer bounds. */
+ if (p < 0 || p > 99)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid CPU scheduling priority: %i", p);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->cpu_sched_priority = p;
+ c->cpu_sched_set = true;
+
+ unit_write_settingf(u, flags, name, "CPUSchedulingPriority=%i", p);
+ }
+
+ return 1;
+
+ } else if (streq(name, "IOSchedulingClass")) {
+ int32_t q;
+
+ r = sd_bus_message_read(message, "i", &q);
+ if (r < 0)
+ return r;
+
+ if (!ioprio_class_is_valid(q))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid IO scheduling class: %i", q);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *s = NULL;
+
+ r = ioprio_class_to_string_alloc(q, &s);
+ if (r < 0)
+ return r;
+
+ c->ioprio = ioprio_normalize(ioprio_prio_value(q, ioprio_prio_data(c->ioprio)));
+ c->ioprio_set = true;
+
+ unit_write_settingf(u, flags, name, "IOSchedulingClass=%s", s);
+ }
+
+ return 1;
+
+ } else if (streq(name, "IOSchedulingPriority")) {
+ int32_t p;
+
+ r = sd_bus_message_read(message, "i", &p);
+ if (r < 0)
+ return r;
+
+ if (!ioprio_priority_is_valid(p))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid IO scheduling priority: %i", p);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->ioprio = ioprio_normalize(ioprio_prio_value(ioprio_prio_class(c->ioprio), p));
+ c->ioprio_set = true;
+
+ unit_write_settingf(u, flags, name, "IOSchedulingPriority=%i", p);
+ }
+
+ return 1;
+
+ } else if (streq(name, "MountAPIVFS")) {
+ bool b;
+
+ r = bus_set_transient_bool(u, name, &b, message, flags, error);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->mount_apivfs = b;
+ c->mount_apivfs_set = true;
+ }
+
+ return 1;
+
+ } else if (streq(name, "WorkingDirectory")) {
+ const char *s;
+ bool missing_ok;
+
+ r = sd_bus_message_read(message, "s", &s);
+ if (r < 0)
+ return r;
+
+ if (s[0] == '-') {
+ missing_ok = true;
+ s++;
+ } else
+ missing_ok = false;
+
+ if (!isempty(s) && !streq(s, "~") && !path_is_absolute(s))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "WorkingDirectory= expects an absolute path or '~'");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (streq(s, "~")) {
+ c->working_directory = mfree(c->working_directory);
+ c->working_directory_home = true;
+ } else {
+ r = free_and_strdup(&c->working_directory, empty_to_null(s));
+ if (r < 0)
+ return r;
+
+ c->working_directory_home = false;
+ }
+
+ c->working_directory_missing_ok = missing_ok;
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "WorkingDirectory=%s%s", missing_ok ? "-" : "", s);
+ }
+
+ return 1;
+
+ } else if (STR_IN_SET(name,
+ "StandardInputFileDescriptorName", "StandardOutputFileDescriptorName", "StandardErrorFileDescriptorName")) {
+ const char *s;
+
+ r = sd_bus_message_read(message, "s", &s);
+ if (r < 0)
+ return r;
+
+ if (!isempty(s) && !fdname_is_valid(s))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid file descriptor name");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+
+ if (streq(name, "StandardInputFileDescriptorName")) {
+ r = free_and_strdup(c->stdio_fdname + STDIN_FILENO, empty_to_null(s));
+ if (r < 0)
+ return r;
+
+ c->std_input = EXEC_INPUT_NAMED_FD;
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardInput=fd:%s", exec_context_fdname(c, STDIN_FILENO));
+
+ } else if (streq(name, "StandardOutputFileDescriptorName")) {
+ r = free_and_strdup(c->stdio_fdname + STDOUT_FILENO, empty_to_null(s));
+ if (r < 0)
+ return r;
+
+ c->std_output = EXEC_OUTPUT_NAMED_FD;
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardOutput=fd:%s", exec_context_fdname(c, STDOUT_FILENO));
+
+ } else {
+ assert(streq(name, "StandardErrorFileDescriptorName"));
+
+ r = free_and_strdup(&c->stdio_fdname[STDERR_FILENO], empty_to_null(s));
+ if (r < 0)
+ return r;
+
+ c->std_error = EXEC_OUTPUT_NAMED_FD;
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardError=fd:%s", exec_context_fdname(c, STDERR_FILENO));
+ }
+ }
+
+ return 1;
+
+ } else if (STR_IN_SET(name,
+ "StandardInputFile",
+ "StandardOutputFile", "StandardOutputFileToAppend", "StandardOutputFileToTruncate",
+ "StandardErrorFile", "StandardErrorFileToAppend", "StandardErrorFileToTruncate")) {
+ const char *s;
+
+ r = sd_bus_message_read(message, "s", &s);
+ if (r < 0)
+ return r;
+
+ if (!isempty(s)) {
+ if (!path_is_absolute(s))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path %s is not absolute", s);
+ if (!path_is_normalized(s))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path %s is not normalized", s);
+ }
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+
+ if (streq(name, "StandardInputFile")) {
+ r = free_and_strdup(&c->stdio_file[STDIN_FILENO], empty_to_null(s));
+ if (r < 0)
+ return r;
+
+ c->std_input = EXEC_INPUT_FILE;
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardInput=file:%s", s);
+
+ } else if (STR_IN_SET(name, "StandardOutputFile", "StandardOutputFileToAppend", "StandardOutputFileToTruncate")) {
+ r = free_and_strdup(&c->stdio_file[STDOUT_FILENO], empty_to_null(s));
+ if (r < 0)
+ return r;
+
+ if (streq(name, "StandardOutputFile")) {
+ c->std_output = EXEC_OUTPUT_FILE;
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardOutput=file:%s", s);
+ } else if (streq(name, "StandardOutputFileToAppend")) {
+ c->std_output = EXEC_OUTPUT_FILE_APPEND;
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardOutput=append:%s", s);
+ } else {
+ assert(streq(name, "StandardOutputFileToTruncate"));
+ c->std_output = EXEC_OUTPUT_FILE_TRUNCATE;
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardOutput=truncate:%s", s);
+ }
+ } else {
+ assert(STR_IN_SET(name, "StandardErrorFile", "StandardErrorFileToAppend", "StandardErrorFileToTruncate"));
+
+ r = free_and_strdup(&c->stdio_file[STDERR_FILENO], empty_to_null(s));
+ if (r < 0)
+ return r;
+
+ if (streq(name, "StandardErrorFile")) {
+ c->std_error = EXEC_OUTPUT_FILE;
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardError=file:%s", s);
+ } else if (streq(name, "StandardErrorFileToAppend")) {
+ c->std_error = EXEC_OUTPUT_FILE_APPEND;
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardError=append:%s", s);
+ } else {
+ assert(streq(name, "StandardErrorFileToTruncate"));
+ c->std_error = EXEC_OUTPUT_FILE_TRUNCATE;
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardError=truncate:%s", s);
+ }
+ }
+ }
+
+ return 1;
+
+ } else if (streq(name, "StandardInputData")) {
+ const void *p;
+ size_t sz;
+
+ r = sd_bus_message_read_array(message, 'y', &p, &sz);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *encoded = NULL;
+
+ if (sz == 0) {
+ c->stdin_data = mfree(c->stdin_data);
+ c->stdin_data_size = 0;
+
+ unit_write_settingf(u, flags, name, "StandardInputData=");
+ } else {
+ void *q;
+ ssize_t n;
+
+ if (c->stdin_data_size + sz < c->stdin_data_size || /* check for overflow */
+ c->stdin_data_size + sz > EXEC_STDIN_DATA_MAX)
+ return -E2BIG;
+
+ n = base64mem(p, sz, &encoded);
+ if (n < 0)
+ return (int) n;
+
+ q = realloc(c->stdin_data, c->stdin_data_size + sz);
+ if (!q)
+ return -ENOMEM;
+
+ memcpy((uint8_t*) q + c->stdin_data_size, p, sz);
+
+ c->stdin_data = q;
+ c->stdin_data_size += sz;
+
+ unit_write_settingf(u, flags, name, "StandardInputData=%s", encoded);
+ }
+ }
+
+ return 1;
+
+ } else if (streq(name, "Environment")) {
+
+ _cleanup_strv_free_ char **l = NULL;
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ if (!strv_env_is_valid(l))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid environment block.");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (strv_isempty(l)) {
+ c->environment = strv_free(c->environment);
+ unit_write_setting(u, flags, name, "Environment=");
+ } else {
+ _cleanup_free_ char *joined = NULL;
+ char **e;
+
+ joined = unit_concat_strv(l, UNIT_ESCAPE_SPECIFIERS|UNIT_ESCAPE_C);
+ if (!joined)
+ return -ENOMEM;
+
+ e = strv_env_merge(c->environment, l);
+ if (!e)
+ return -ENOMEM;
+
+ strv_free_and_replace(c->environment, e);
+ unit_write_settingf(u, flags, name, "Environment=%s", joined);
+ }
+ }
+
+ return 1;
+
+ } else if (streq(name, "UnsetEnvironment")) {
+
+ _cleanup_strv_free_ char **l = NULL;
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ if (!strv_env_name_or_assignment_is_valid(l))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid UnsetEnvironment= list.");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (strv_isempty(l)) {
+ c->unset_environment = strv_free(c->unset_environment);
+ unit_write_setting(u, flags, name, "UnsetEnvironment=");
+ } else {
+ _cleanup_free_ char *joined = NULL;
+ char **e;
+
+ joined = unit_concat_strv(l, UNIT_ESCAPE_SPECIFIERS|UNIT_ESCAPE_C);
+ if (!joined)
+ return -ENOMEM;
+
+ e = strv_env_merge(c->unset_environment, l);
+ if (!e)
+ return -ENOMEM;
+
+ strv_free_and_replace(c->unset_environment, e);
+ unit_write_settingf(u, flags, name, "UnsetEnvironment=%s", joined);
+ }
+ }
+
+ return 1;
+
+ } else if (streq(name, "OOMScoreAdjust")) {
+ int oa;
+
+ r = sd_bus_message_read(message, "i", &oa);
+ if (r < 0)
+ return r;
+
+ if (!oom_score_adjust_is_valid(oa))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "OOM score adjust value out of range");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->oom_score_adjust = oa;
+ c->oom_score_adjust_set = true;
+ unit_write_settingf(u, flags, name, "OOMScoreAdjust=%i", oa);
+ }
+
+ return 1;
+
+ } else if (streq(name, "CoredumpFilter")) {
+ uint64_t f;
+
+ r = sd_bus_message_read(message, "t", &f);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->coredump_filter = f;
+ c->coredump_filter_set = true;
+ unit_write_settingf(u, flags, name, "CoredumpFilter=0x%"PRIx64, f);
+ }
+
+ return 1;
+
+ } else if (streq(name, "EnvironmentFiles")) {
+
+ _cleanup_free_ char *joined = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+ _cleanup_strv_free_ char **l = NULL;
+ size_t size = 0;
+
+ r = sd_bus_message_enter_container(message, 'a', "(sb)");
+ if (r < 0)
+ return r;
+
+ f = open_memstream_unlocked(&joined, &size);
+ if (!f)
+ return -ENOMEM;
+
+ fputs("EnvironmentFile=\n", f);
+
+ STRV_FOREACH(i, c->environment_files) {
+ _cleanup_free_ char *q = NULL;
+
+ q = specifier_escape(*i);
+ if (!q)
+ return -ENOMEM;
+
+ fprintf(f, "EnvironmentFile=%s\n", q);
+ }
+
+ while ((r = sd_bus_message_enter_container(message, 'r', "sb")) > 0) {
+ const char *path;
+ int b;
+
+ r = sd_bus_message_read(message, "sb", &path, &b);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!path_is_absolute(path))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path %s is not absolute.", path);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *q = NULL, *buf = NULL;
+
+ buf = strjoin(b ? "-" : "", path);
+ if (!buf)
+ return -ENOMEM;
+
+ q = specifier_escape(buf);
+ if (!q)
+ return -ENOMEM;
+
+ fprintf(f, "EnvironmentFile=%s\n", q);
+
+ r = strv_consume(&l, TAKE_PTR(buf));
+ if (r < 0)
+ return r;
+ }
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ r = fflush_and_check(f);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (strv_isempty(l)) {
+ c->environment_files = strv_free(c->environment_files);
+ unit_write_setting(u, flags, name, "EnvironmentFile=");
+ } else {
+ r = strv_extend_strv(&c->environment_files, l, true);
+ if (r < 0)
+ return r;
+
+ unit_write_setting(u, flags, name, joined);
+ }
+ }
+
+ return 1;
+
+ } else if (streq(name, "PassEnvironment")) {
+
+ _cleanup_strv_free_ char **l = NULL;
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ if (!strv_env_name_is_valid(l))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid PassEnvironment= block.");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (strv_isempty(l)) {
+ c->pass_environment = strv_free(c->pass_environment);
+ unit_write_setting(u, flags, name, "PassEnvironment=");
+ } else {
+ _cleanup_free_ char *joined = NULL;
+
+ r = strv_extend_strv(&c->pass_environment, l, true);
+ if (r < 0)
+ return r;
+
+ /* We write just the new settings out to file, with unresolved specifiers. */
+ joined = unit_concat_strv(l, UNIT_ESCAPE_SPECIFIERS);
+ if (!joined)
+ return -ENOMEM;
+
+ unit_write_settingf(u, flags, name, "PassEnvironment=%s", joined);
+ }
+ }
+
+ return 1;
+
+ } else if (STR_IN_SET(name, "ReadWriteDirectories", "ReadOnlyDirectories", "InaccessibleDirectories",
+ "ReadWritePaths", "ReadOnlyPaths", "InaccessiblePaths", "ExecPaths", "NoExecPaths",
+ "ExtensionDirectories")) {
+ _cleanup_strv_free_ char **l = NULL;
+ char ***dirs;
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ STRV_FOREACH(p, l) {
+ char *i = *p;
+ size_t offset;
+
+ offset = i[0] == '-';
+ offset += i[offset] == '+';
+ if (!path_is_absolute(i + offset))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid %s", name);
+
+ path_simplify(i + offset);
+ }
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (STR_IN_SET(name, "ReadWriteDirectories", "ReadWritePaths"))
+ dirs = &c->read_write_paths;
+ else if (STR_IN_SET(name, "ReadOnlyDirectories", "ReadOnlyPaths"))
+ dirs = &c->read_only_paths;
+ else if (streq(name, "ExecPaths"))
+ dirs = &c->exec_paths;
+ else if (streq(name, "NoExecPaths"))
+ dirs = &c->no_exec_paths;
+ else if (streq(name, "ExtensionDirectories"))
+ dirs = &c->extension_directories;
+ else /* "InaccessiblePaths" */
+ dirs = &c->inaccessible_paths;
+
+ if (strv_isempty(l)) {
+ *dirs = strv_free(*dirs);
+ unit_write_settingf(u, flags, name, "%s=", name);
+ } else {
+ _cleanup_free_ char *joined = NULL;
+
+ joined = unit_concat_strv(l, UNIT_ESCAPE_SPECIFIERS);
+ if (!joined)
+ return -ENOMEM;
+
+ r = strv_extend_strv(dirs, l, true);
+ if (r < 0)
+ return -ENOMEM;
+
+ unit_write_settingf(u, flags, name, "%s=%s", name, joined);
+ }
+ }
+
+ return 1;
+
+ } else if (streq(name, "ExecSearchPath")) {
+ _cleanup_strv_free_ char **l = NULL;
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ STRV_FOREACH(p, l)
+ if (!path_is_absolute(*p) || !path_is_normalized(*p) || strchr(*p, ':'))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid %s", name);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (strv_isempty(l)) {
+ c->exec_search_path = strv_free(c->exec_search_path);
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "ExecSearchPath=");
+ } else {
+ _cleanup_free_ char *joined = NULL;
+ r = strv_extend_strv(&c->exec_search_path, l, true);
+ if (r < 0)
+ return -ENOMEM;
+ joined = strv_join(c->exec_search_path, ":");
+ if (!joined)
+ return log_oom();
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "ExecSearchPath=%s", joined);
+ }
+ }
+
+ return 1;
+
+ } else if (STR_IN_SET(name, "RuntimeDirectory", "StateDirectory", "CacheDirectory", "LogsDirectory", "ConfigurationDirectory")) {
+ _cleanup_strv_free_ char **l = NULL;
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ STRV_FOREACH(p, l) {
+ if (!path_is_normalized(*p))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= path is not normalized: %s", name, *p);
+
+ if (path_is_absolute(*p))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= path is absolute: %s", name, *p);
+
+ if (path_startswith(*p, "private"))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= path can't be 'private': %s", name, *p);
+ }
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ ExecDirectoryType i;
+ ExecDirectory *d;
+
+ assert_se((i = exec_directory_type_from_string(name)) >= 0);
+ d = c->directories + i;
+
+ if (strv_isempty(l)) {
+ exec_directory_done(d);
+ unit_write_settingf(u, flags, name, "%s=", name);
+ } else {
+ _cleanup_free_ char *joined = NULL;
+
+ STRV_FOREACH(source, l) {
+ r = exec_directory_add(d, *source, NULL);
+ if (r < 0)
+ return log_oom();
+ }
+ exec_directory_sort(d);
+
+ joined = unit_concat_strv(l, UNIT_ESCAPE_SPECIFIERS);
+ if (!joined)
+ return -ENOMEM;
+
+ unit_write_settingf(u, flags, name, "%s=%s", name, joined);
+ }
+ }
+
+ return 1;
+
+ } else if (STR_IN_SET(name, "AppArmorProfile", "SmackProcessLabel")) {
+ int ignore;
+ const char *s;
+
+ r = sd_bus_message_read(message, "(bs)", &ignore, &s);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ char **p;
+ bool *b;
+
+ if (streq(name, "AppArmorProfile")) {
+ p = &c->apparmor_profile;
+ b = &c->apparmor_profile_ignore;
+ } else { /* "SmackProcessLabel" */
+ p = &c->smack_process_label;
+ b = &c->smack_process_label_ignore;
+ }
+
+ if (isempty(s)) {
+ *p = mfree(*p);
+ *b = false;
+ } else {
+ if (free_and_strdup(p, s) < 0)
+ return -ENOMEM;
+ *b = ignore;
+ }
+
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s%s", name, ignore ? "-" : "", strempty(s));
+ }
+
+ return 1;
+
+ } else if (STR_IN_SET(name, "BindPaths", "BindReadOnlyPaths")) {
+ char *source, *destination;
+ int ignore_enoent;
+ uint64_t mount_flags;
+ bool empty = true;
+
+ r = sd_bus_message_enter_container(message, 'a', "(ssbt)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(ssbt)", &source, &destination, &ignore_enoent, &mount_flags)) > 0) {
+
+ if (!path_is_absolute(source))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not absolute.", source);
+ if (!path_is_absolute(destination))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path %s is not absolute.", destination);
+ if (!IN_SET(mount_flags, 0, MS_REC))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown mount flags.");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ r = bind_mount_add(&c->bind_mounts, &c->n_bind_mounts,
+ &(BindMount) {
+ .source = source,
+ .destination = destination,
+ .read_only = !!strstr(name, "ReadOnly"),
+ .recursive = !!(mount_flags & MS_REC),
+ .ignore_enoent = ignore_enoent,
+ });
+ if (r < 0)
+ return r;
+
+ unit_write_settingf(
+ u, flags|UNIT_ESCAPE_SPECIFIERS, name,
+ "%s=%s%s:%s:%s",
+ name,
+ ignore_enoent ? "-" : "",
+ source,
+ destination,
+ (mount_flags & MS_REC) ? "rbind" : "norbind");
+ }
+
+ empty = false;
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (empty) {
+ bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
+ c->bind_mounts = NULL;
+ c->n_bind_mounts = 0;
+
+ unit_write_settingf(u, flags, name, "%s=", name);
+ }
+
+ return 1;
+
+ } else if (streq(name, "TemporaryFileSystem")) {
+ const char *path, *options;
+ bool empty = true;
+
+ r = sd_bus_message_enter_container(message, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(ss)", &path, &options)) > 0) {
+
+ if (!path_is_absolute(path))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Mount point %s is not absolute.", path);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ r = temporary_filesystem_add(&c->temporary_filesystems, &c->n_temporary_filesystems, path, options);
+ if (r < 0)
+ return r;
+
+ unit_write_settingf(
+ u, flags|UNIT_ESCAPE_SPECIFIERS, name,
+ "%s=%s:%s",
+ name,
+ path,
+ options);
+ }
+
+ empty = false;
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (empty) {
+ temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
+ c->temporary_filesystems = NULL;
+ c->n_temporary_filesystems = 0;
+
+ unit_write_settingf(u, flags, name, "%s=", name);
+ }
+
+ return 1;
+
+ } else if ((suffix = startswith(name, "Limit"))) {
+ const char *soft = NULL;
+ int ri;
+
+ ri = rlimit_from_string(suffix);
+ if (ri < 0) {
+ soft = endswith(suffix, "Soft");
+ if (soft) {
+ const char *n;
+
+ n = strndupa_safe(suffix, soft - suffix);
+ ri = rlimit_from_string(n);
+ if (ri >= 0)
+ name = strjoina("Limit", n);
+ }
+ }
+
+ if (ri >= 0) {
+ uint64_t rl;
+ rlim_t x;
+
+ r = sd_bus_message_read(message, "t", &rl);
+ if (r < 0)
+ return r;
+
+ if (rl == UINT64_MAX)
+ x = RLIM_INFINITY;
+ else {
+ x = (rlim_t) rl;
+
+ if ((uint64_t) x != rl)
+ return -ERANGE;
+ }
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *f = NULL;
+ struct rlimit nl;
+
+ if (c->rlimit[ri]) {
+ nl = *c->rlimit[ri];
+
+ if (soft)
+ nl.rlim_cur = x;
+ else
+ nl.rlim_max = x;
+ } else
+ /* When the resource limit is not initialized yet, then assign the value to both fields */
+ nl = (struct rlimit) {
+ .rlim_cur = x,
+ .rlim_max = x,
+ };
+
+ r = rlimit_format(&nl, &f);
+ if (r < 0)
+ return r;
+
+ if (c->rlimit[ri])
+ *c->rlimit[ri] = nl;
+ else {
+ c->rlimit[ri] = newdup(struct rlimit, &nl, 1);
+ if (!c->rlimit[ri])
+ return -ENOMEM;
+ }
+
+ unit_write_settingf(u, flags, name, "%s=%s", name, f);
+ }
+
+ return 1;
+ }
+
+ } else if (streq(name, "MountImages")) {
+ _cleanup_free_ char *format_str = NULL;
+ MountImage *mount_images = NULL;
+ size_t n_mount_images = 0;
+ char *source, *destination;
+ int permissive;
+
+ r = sd_bus_message_enter_container(message, 'a', "(ssba(ss))");
+ if (r < 0)
+ return r;
+
+ for (;;) {
+ _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
+ _cleanup_free_ char *source_escaped = NULL, *destination_escaped = NULL;
+ char *tuple;
+
+ r = sd_bus_message_enter_container(message, 'r', "ssba(ss)");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "ssb", &source, &destination, &permissive);
+ if (r <= 0)
+ break;
+
+ if (!path_is_absolute(source))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not absolute.", source);
+ if (!path_is_normalized(source))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not normalized.", source);
+ if (!path_is_absolute(destination))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path %s is not absolute.", destination);
+ if (!path_is_normalized(destination))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path %s is not normalized.", destination);
+
+ /* Need to store them in the unit with the escapes, so that they can be parsed again */
+ source_escaped = shell_escape(source, ":");
+ if (!source_escaped)
+ return -ENOMEM;
+ destination_escaped = shell_escape(destination, ":");
+ if (!destination_escaped)
+ return -ENOMEM;
+
+ tuple = strjoin(format_str,
+ format_str ? " " : "",
+ permissive ? "-" : "",
+ source_escaped,
+ ":",
+ destination_escaped);
+ if (!tuple)
+ return -ENOMEM;
+ free_and_replace(format_str, tuple);
+
+ r = bus_read_mount_options(message, error, &options, &format_str, ":");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ r = mount_image_add(&mount_images, &n_mount_images,
+ &(MountImage) {
+ .source = source,
+ .destination = destination,
+ .mount_options = options,
+ .ignore_enoent = permissive,
+ .type = MOUNT_IMAGE_DISCRETE,
+ });
+ if (r < 0)
+ return r;
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (n_mount_images == 0) {
+ c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
+
+ unit_write_settingf(u, flags, name, "%s=", name);
+ } else {
+ for (size_t i = 0; i < n_mount_images; ++i) {
+ r = mount_image_add(&c->mount_images, &c->n_mount_images, &mount_images[i]);
+ if (r < 0)
+ return r;
+ }
+
+ unit_write_settingf(u, flags|UNIT_ESCAPE_C|UNIT_ESCAPE_SPECIFIERS,
+ name,
+ "%s=%s",
+ name,
+ format_str);
+ }
+ }
+
+ mount_images = mount_image_free_many(mount_images, &n_mount_images);
+
+ return 1;
+ } else if (streq(name, "ExtensionImages")) {
+ _cleanup_free_ char *format_str = NULL;
+ MountImage *extension_images = NULL;
+ size_t n_extension_images = 0;
+
+ r = sd_bus_message_enter_container(message, 'a', "(sba(ss))");
+ if (r < 0)
+ return r;
+
+ for (;;) {
+ _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
+ _cleanup_free_ char *source_escaped = NULL;
+ char *source, *tuple;
+ int permissive;
+
+ r = sd_bus_message_enter_container(message, 'r', "sba(ss)");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "sb", &source, &permissive);
+ if (r <= 0)
+ break;
+
+ if (!path_is_absolute(source))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not absolute.", source);
+ if (!path_is_normalized(source))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not normalized.", source);
+
+ /* Need to store them in the unit with the escapes, so that they can be parsed again */
+ source_escaped = shell_escape(source, ":");
+ if (!source_escaped)
+ return -ENOMEM;
+
+ tuple = strjoin(format_str,
+ format_str ? " " : "",
+ permissive ? "-" : "",
+ source_escaped);
+ if (!tuple)
+ return -ENOMEM;
+ free_and_replace(format_str, tuple);
+
+ r = bus_read_mount_options(message, error, &options, &format_str, ":");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ r = mount_image_add(&extension_images, &n_extension_images,
+ &(MountImage) {
+ .source = source,
+ .mount_options = options,
+ .ignore_enoent = permissive,
+ .type = MOUNT_IMAGE_EXTENSION,
+ });
+ if (r < 0)
+ return r;
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (n_extension_images == 0) {
+ c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
+
+ unit_write_settingf(u, flags, name, "%s=", name);
+ } else {
+ for (size_t i = 0; i < n_extension_images; ++i) {
+ r = mount_image_add(&c->extension_images, &c->n_extension_images, &extension_images[i]);
+ if (r < 0)
+ return r;
+ }
+
+ unit_write_settingf(u, flags|UNIT_ESCAPE_C|UNIT_ESCAPE_SPECIFIERS,
+ name,
+ "%s=%s",
+ name,
+ format_str);
+ }
+ }
+
+ extension_images = mount_image_free_many(extension_images, &n_extension_images);
+
+ return 1;
+
+ } else if (STR_IN_SET(name, "StateDirectorySymlink", "RuntimeDirectorySymlink", "CacheDirectorySymlink", "LogsDirectorySymlink")) {
+ char *source, *destination;
+ ExecDirectory *directory;
+ uint64_t symlink_flags; /* No flags for now, reserved for future uses. */
+ ExecDirectoryType i;
+
+ assert_se((i = exec_directory_type_symlink_from_string(name)) >= 0);
+ directory = c->directories + i;
+
+ r = sd_bus_message_enter_container(message, 'a', "(sst)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(sst)", &source, &destination, &symlink_flags)) > 0) {
+ if (!path_is_valid(source))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not valid.", source);
+ if (path_is_absolute(source))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is absolute.", source);
+ if (!path_is_normalized(source))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not normalized.", source);
+ if (!path_is_valid(destination))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path %s is not valid.", destination);
+ if (path_is_absolute(destination))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path %s is absolute.", destination);
+ if (!path_is_normalized(destination))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path %s is not normalized.", destination);
+ if (symlink_flags != 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Flags must be zero.");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *destination_escaped = NULL, *source_escaped = NULL;
+
+ r = exec_directory_add(directory, source, destination);
+ if (r < 0)
+ return r;
+
+ /* Need to store them in the unit with the escapes, so that they can be parsed again */
+ source_escaped = xescape(source, ":");
+ destination_escaped = xescape(destination, ":");
+ if (!source_escaped || !destination_escaped)
+ return -ENOMEM;
+
+ unit_write_settingf(
+ u, flags|UNIT_ESCAPE_SPECIFIERS, exec_directory_type_to_string(i),
+ "%s=%s:%s",
+ exec_directory_type_to_string(i),
+ source_escaped,
+ destination_escaped);
+ }
+ }
+ if (r < 0)
+ return r;
+
+ exec_directory_sort(directory);
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ return 1;
+
+ }
+
+ return 0;
+}
diff --git a/src/core/dbus-execute.h b/src/core/dbus-execute.h
new file mode 100644
index 0000000..c538341
--- /dev/null
+++ b/src/core/dbus-execute.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "execute.h"
+
+#define BUS_EXEC_STATUS_VTABLE(prefix, offset, flags) \
+ BUS_PROPERTY_DUAL_TIMESTAMP(prefix "StartTimestamp", (offset) + offsetof(ExecStatus, start_timestamp), flags), \
+ BUS_PROPERTY_DUAL_TIMESTAMP(prefix "ExitTimestamp", (offset) + offsetof(ExecStatus, exit_timestamp), flags), \
+ SD_BUS_PROPERTY(prefix "PID", "u", bus_property_get_pid, (offset) + offsetof(ExecStatus, pid), flags), \
+ SD_BUS_PROPERTY(prefix "Code", "i", bus_property_get_int, (offset) + offsetof(ExecStatus, code), flags), \
+ SD_BUS_PROPERTY(prefix "Status", "i", bus_property_get_int, (offset) + offsetof(ExecStatus, status), flags)
+
+#define BUS_EXEC_COMMAND_VTABLE(name, offset, flags) \
+ SD_BUS_PROPERTY(name, "a(sasbttttuii)", bus_property_get_exec_command, offset, flags)
+
+#define BUS_EXEC_COMMAND_LIST_VTABLE(name, offset, flags) \
+ SD_BUS_PROPERTY(name, "a(sasbttttuii)", bus_property_get_exec_command_list, offset, flags)
+
+#define BUS_EXEC_EX_COMMAND_LIST_VTABLE(name, offset, flags) \
+ SD_BUS_PROPERTY(name, "a(sasasttttuii)", bus_property_get_exec_ex_command_list, offset, flags)
+
+extern const sd_bus_vtable bus_exec_vtable[];
+
+int bus_property_get_exec_output(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error);
+int bus_property_get_exec_command(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error);
+int bus_property_get_exec_command_list(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error);
+int bus_property_get_exec_ex_command_list(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error);
+
+int bus_exec_context_set_transient_property(Unit *u, ExecContext *c, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_set_transient_exec_command(Unit *u, const char *name, ExecCommand **exec_command, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
diff --git a/src/core/dbus-job.c b/src/core/dbus-job.c
new file mode 100644
index 0000000..9792a5c
--- /dev/null
+++ b/src/core/dbus-job.c
@@ -0,0 +1,371 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "sd-bus.h"
+
+#include "alloc-util.h"
+#include "bus-get-properties.h"
+#include "bus-util.h"
+#include "dbus-job.h"
+#include "dbus-unit.h"
+#include "dbus-util.h"
+#include "dbus.h"
+#include "job.h"
+#include "log.h"
+#include "selinux-access.h"
+#include "string-util.h"
+#include "strv.h"
+
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_type, job_type, JobType);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_state, job_state, JobState);
+
+static int property_get_unit(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ _cleanup_free_ char *p = NULL;
+ Job *j = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ p = unit_dbus_path(j->unit);
+ if (!p)
+ return -ENOMEM;
+
+ return sd_bus_message_append(reply, "(so)", j->unit->id, p);
+}
+
+int bus_job_method_cancel(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Job *j = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_unit_access_check(j->unit, message, "stop", error);
+ if (r < 0)
+ return r;
+
+ /* Access is granted to the job owner */
+ if (!sd_bus_track_contains(j->bus_track, sd_bus_message_get_sender(message))) {
+
+ /* And for everybody else consult polkit */
+ r = bus_verify_manage_units_async(j->unit->manager, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+ }
+
+ job_finish_and_invalidate(j, JOB_CANCELED, true, false);
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+int bus_job_method_get_waiting_jobs(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+ _cleanup_free_ Job **list = NULL;
+ Job *j = userdata;
+ int r, n;
+
+ if (strstr(sd_bus_message_get_member(message), "After"))
+ n = job_get_after(j, &list);
+ else
+ n = job_get_before(j, &list);
+ if (n < 0)
+ return n;
+
+ r = sd_bus_message_new_method_return(message, &reply);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_open_container(reply, 'a', "(usssoo)");
+ if (r < 0)
+ return r;
+
+ for (int i = 0; i < n; i ++) {
+ _cleanup_free_ char *unit_path = NULL, *job_path = NULL;
+
+ job_path = job_dbus_path(list[i]);
+ if (!job_path)
+ return -ENOMEM;
+
+ unit_path = unit_dbus_path(list[i]->unit);
+ if (!unit_path)
+ return -ENOMEM;
+
+ r = sd_bus_message_append(reply, "(usssoo)",
+ list[i]->id,
+ list[i]->unit->id,
+ job_type_to_string(list[i]->type),
+ job_state_to_string(list[i]->state),
+ job_path,
+ unit_path);
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(NULL, reply, NULL);
+}
+
+const sd_bus_vtable bus_job_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+
+ SD_BUS_METHOD("Cancel", NULL, NULL, bus_job_method_cancel, SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("GetAfter",
+ SD_BUS_NO_ARGS,
+ SD_BUS_RESULT("a(usssoo)", jobs),
+ bus_job_method_get_waiting_jobs,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("GetBefore",
+ SD_BUS_NO_ARGS,
+ SD_BUS_RESULT("a(usssoo)", jobs),
+ bus_job_method_get_waiting_jobs,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+
+ SD_BUS_PROPERTY("Id", "u", NULL, offsetof(Job, id), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Unit", "(so)", property_get_unit, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("JobType", "s", property_get_type, offsetof(Job, type), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("State", "s", property_get_state, offsetof(Job, state), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("ActivationDetails", "a(ss)", bus_property_get_activation_details, offsetof(Job, activation_details), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_VTABLE_END
+};
+
+static int bus_job_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ Job *j;
+ int r;
+
+ assert(bus);
+ assert(path);
+ assert(interface);
+ assert(found);
+
+ r = manager_get_job_from_dbus_path(m, path, &j);
+ if (r < 0)
+ return 0;
+
+ *found = j;
+ return 1;
+}
+
+static int bus_job_enumerate(sd_bus *bus, const char *path, void *userdata, char ***nodes, sd_bus_error *error) {
+ _cleanup_strv_free_ char **l = NULL;
+ Manager *m = userdata;
+ unsigned k = 0;
+ Job *j;
+
+ l = new0(char*, hashmap_size(m->jobs)+1);
+ if (!l)
+ return -ENOMEM;
+
+ HASHMAP_FOREACH(j, m->jobs) {
+ l[k] = job_dbus_path(j);
+ if (!l[k])
+ return -ENOMEM;
+
+ k++;
+ }
+
+ assert(hashmap_size(m->jobs) == k);
+
+ *nodes = TAKE_PTR(l);
+
+ return k;
+}
+
+const BusObjectImplementation job_object = {
+ "/org/freedesktop/systemd1/job",
+ "org.freedesktop.systemd1.Job",
+ .fallback_vtables = BUS_FALLBACK_VTABLES({bus_job_vtable, bus_job_find}),
+ .node_enumerator = bus_job_enumerate,
+};
+
+static int send_new_signal(sd_bus *bus, void *userdata) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
+ _cleanup_free_ char *p = NULL;
+ Job *j = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+
+ p = job_dbus_path(j);
+ if (!p)
+ return -ENOMEM;
+
+ r = sd_bus_message_new_signal(
+ bus,
+ &m,
+ "/org/freedesktop/systemd1",
+ "org.freedesktop.systemd1.Manager",
+ "JobNew");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(m, "uos", j->id, p, j->unit->id);
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(bus, m, NULL);
+}
+
+static int send_changed_signal(sd_bus *bus, void *userdata) {
+ _cleanup_free_ char *p = NULL;
+ Job *j = ASSERT_PTR(userdata);
+
+ assert(bus);
+
+ p = job_dbus_path(j);
+ if (!p)
+ return -ENOMEM;
+
+ return sd_bus_emit_properties_changed(bus, p, "org.freedesktop.systemd1.Job", "State", NULL);
+}
+
+void bus_job_send_change_signal(Job *j) {
+ int r;
+
+ assert(j);
+
+ /* Make sure that any change signal on the unit is reflected before we send out the change signal on the job */
+ bus_unit_send_pending_change_signal(j->unit, true);
+
+ if (j->in_dbus_queue) {
+ LIST_REMOVE(dbus_queue, j->manager->dbus_job_queue, j);
+ j->in_dbus_queue = false;
+ }
+
+ r = bus_foreach_bus(j->manager, j->bus_track, j->sent_dbus_new_signal ? send_changed_signal : send_new_signal, j);
+ if (r < 0)
+ log_debug_errno(r, "Failed to send job change signal for %u: %m", j->id);
+
+ j->sent_dbus_new_signal = true;
+}
+
+void bus_job_send_pending_change_signal(Job *j, bool including_new) {
+ assert(j);
+
+ if (!j->in_dbus_queue)
+ return;
+
+ if (!j->sent_dbus_new_signal && !including_new)
+ return;
+
+ if (MANAGER_IS_RELOADING(j->unit->manager))
+ return;
+
+ bus_job_send_change_signal(j);
+}
+
+static int send_removed_signal(sd_bus *bus, void *userdata) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
+ _cleanup_free_ char *p = NULL;
+ Job *j = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+
+ p = job_dbus_path(j);
+ if (!p)
+ return -ENOMEM;
+
+ r = sd_bus_message_new_signal(
+ bus,
+ &m,
+ "/org/freedesktop/systemd1",
+ "org.freedesktop.systemd1.Manager",
+ "JobRemoved");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(m, "uoss", j->id, p, j->unit->id, job_result_to_string(j->result));
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(bus, m, NULL);
+}
+
+void bus_job_send_removed_signal(Job *j) {
+ int r;
+
+ assert(j);
+
+ if (!j->sent_dbus_new_signal)
+ bus_job_send_change_signal(j);
+
+ /* Make sure that any change signal on the unit is reflected before we send out the change signal on the job */
+ bus_unit_send_pending_change_signal(j->unit, true);
+
+ r = bus_foreach_bus(j->manager, j->bus_track, send_removed_signal, j);
+ if (r < 0)
+ log_debug_errno(r, "Failed to send job remove signal for %u: %m", j->id);
+}
+
+static int bus_job_track_handler(sd_bus_track *t, void *userdata) {
+ Job *j = ASSERT_PTR(userdata);
+
+ assert(t);
+
+ j->bus_track = sd_bus_track_unref(j->bus_track); /* make sure we aren't called again */
+
+ /* Last client dropped off the bus, maybe we should GC this now? */
+ job_add_to_gc_queue(j);
+ return 0;
+}
+
+static int bus_job_allocate_bus_track(Job *j) {
+
+ assert(j);
+
+ if (j->bus_track)
+ return 0;
+
+ return sd_bus_track_new(j->unit->manager->api_bus, &j->bus_track, bus_job_track_handler, j);
+}
+
+int bus_job_coldplug_bus_track(Job *j) {
+ int r;
+ _cleanup_strv_free_ char **deserialized_clients = NULL;
+
+ assert(j);
+
+ deserialized_clients = TAKE_PTR(j->deserialized_clients);
+
+ if (strv_isempty(deserialized_clients))
+ return 0;
+
+ if (!j->manager->api_bus)
+ return 0;
+
+ r = bus_job_allocate_bus_track(j);
+ if (r < 0)
+ return r;
+
+ return bus_track_add_name_many(j->bus_track, deserialized_clients);
+}
+
+int bus_job_track_sender(Job *j, sd_bus_message *m) {
+ int r;
+
+ assert(j);
+ assert(m);
+
+ if (sd_bus_message_get_bus(m) != j->unit->manager->api_bus) {
+ j->ref_by_private_bus = true;
+ return 0;
+ }
+
+ r = bus_job_allocate_bus_track(j);
+ if (r < 0)
+ return r;
+
+ return sd_bus_track_add_sender(j->bus_track, m);
+}
diff --git a/src/core/dbus-job.h b/src/core/dbus-job.h
new file mode 100644
index 0000000..6f00581
--- /dev/null
+++ b/src/core/dbus-job.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+
+#include "unit.h"
+#include "bus-object.h"
+
+extern const sd_bus_vtable bus_job_vtable[];
+extern const BusObjectImplementation job_object;
+
+int bus_job_method_cancel(sd_bus_message *message, void *job, sd_bus_error *error);
+int bus_job_method_get_waiting_jobs(sd_bus_message *message, void *userdata, sd_bus_error *error);
+
+void bus_job_send_change_signal(Job *j);
+void bus_job_send_pending_change_signal(Job *j, bool including_new);
+void bus_job_send_removed_signal(Job *j);
+
+int bus_job_coldplug_bus_track(Job *j);
+int bus_job_track_sender(Job *j, sd_bus_message *m);
diff --git a/src/core/dbus-kill.c b/src/core/dbus-kill.c
new file mode 100644
index 0000000..19e439f
--- /dev/null
+++ b/src/core/dbus-kill.c
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bus-get-properties.h"
+#include "dbus-kill.h"
+#include "dbus-util.h"
+#include "kill.h"
+#include "signal-util.h"
+
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_kill_mode, kill_mode, KillMode);
+
+static int property_get_restart_kill_signal(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+ KillContext *c = ASSERT_PTR(userdata);
+ int s;
+
+ s = restart_kill_signal(c);
+ return sd_bus_message_append_basic(reply, 'i', &s);
+}
+
+const sd_bus_vtable bus_kill_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ SD_BUS_PROPERTY("KillMode", "s", property_get_kill_mode, offsetof(KillContext, kill_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("KillSignal", "i", bus_property_get_int, offsetof(KillContext, kill_signal), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RestartKillSignal", "i", property_get_restart_kill_signal, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("FinalKillSignal", "i", bus_property_get_int, offsetof(KillContext, final_kill_signal), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SendSIGKILL", "b", bus_property_get_bool, offsetof(KillContext, send_sigkill), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SendSIGHUP", "b", bus_property_get_bool, offsetof(KillContext, send_sighup), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("WatchdogSignal", "i", bus_property_get_int, offsetof(KillContext, watchdog_signal), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_VTABLE_END
+};
+
+static BUS_DEFINE_SET_TRANSIENT_PARSE(kill_mode, KillMode, kill_mode_from_string);
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING(kill_signal, "i", int32_t, int, "%" PRIi32, signal_to_string_with_check);
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING(restart_kill_signal, "i", int32_t, int, "%" PRIi32, signal_to_string_with_check);
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING(final_kill_signal, "i", int32_t, int, "%" PRIi32, signal_to_string_with_check);
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING(watchdog_signal, "i", int32_t, int, "%" PRIi32, signal_to_string_with_check);
+
+int bus_kill_context_set_transient_property(
+ Unit *u,
+ KillContext *c,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ assert(u);
+ assert(c);
+ assert(name);
+ assert(message);
+
+ flags |= UNIT_PRIVATE;
+
+ if (streq(name, "KillMode"))
+ return bus_set_transient_kill_mode(u, name, &c->kill_mode, message, flags, error);
+
+ if (streq(name, "SendSIGHUP"))
+ return bus_set_transient_bool(u, name, &c->send_sighup, message, flags, error);
+
+ if (streq(name, "SendSIGKILL"))
+ return bus_set_transient_bool(u, name, &c->send_sigkill, message, flags, error);
+
+ if (streq(name, "KillSignal"))
+ return bus_set_transient_kill_signal(u, name, &c->kill_signal, message, flags, error);
+
+ if (streq(name, "RestartKillSignal"))
+ return bus_set_transient_restart_kill_signal(u, name, &c->restart_kill_signal, message, flags, error);
+
+ if (streq(name, "FinalKillSignal"))
+ return bus_set_transient_final_kill_signal(u, name, &c->final_kill_signal, message, flags, error);
+
+ if (streq(name, "WatchdogSignal"))
+ return bus_set_transient_watchdog_signal(u, name, &c->watchdog_signal, message, flags, error);
+
+ return 0;
+}
diff --git a/src/core/dbus-kill.h b/src/core/dbus-kill.h
new file mode 100644
index 0000000..5a90287
--- /dev/null
+++ b/src/core/dbus-kill.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "kill.h"
+#include "unit.h"
+
+extern const sd_bus_vtable bus_kill_vtable[];
+
+int bus_kill_context_set_transient_property(Unit *u, KillContext *c, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
diff --git a/src/core/dbus-manager.c b/src/core/dbus-manager.c
new file mode 100644
index 0000000..6fa483e
--- /dev/null
+++ b/src/core/dbus-manager.c
@@ -0,0 +1,3440 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <sys/prctl.h>
+#include <sys/statvfs.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "architecture.h"
+#include "build.h"
+#include "bus-common-errors.h"
+#include "bus-get-properties.h"
+#include "bus-log-control-api.h"
+#include "chase-symlinks.h"
+#include "data-fd-util.h"
+#include "dbus-cgroup.h"
+#include "dbus-execute.h"
+#include "dbus-job.h"
+#include "dbus-manager.h"
+#include "dbus-scope.h"
+#include "dbus-service.h"
+#include "dbus-unit.h"
+#include "dbus.h"
+#include "env-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "install.h"
+#include "log.h"
+#include "manager-dump.h"
+#include "os-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "selinux-access.h"
+#include "stat-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "syslog-util.h"
+#include "user-util.h"
+#include "virt.h"
+#include "watchdog.h"
+
+/* Require 16MiB free in /run/systemd for reloading/reexecing. After all we need to serialize our state
+ * there, and if we can't we'll fail badly. */
+#define RELOAD_DISK_SPACE_MIN (UINT64_C(16) * UINT64_C(1024) * UINT64_C(1024))
+
+static UnitFileFlags unit_file_bools_to_flags(bool runtime, bool force) {
+ return (runtime ? UNIT_FILE_RUNTIME : 0) |
+ (force ? UNIT_FILE_FORCE : 0);
+}
+
+BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_oom_policy, oom_policy, OOMPolicy);
+BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_emergency_action, emergency_action, EmergencyAction);
+
+static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_version, "s", GIT_VERSION);
+static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_features, "s", systemd_features);
+static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_architecture, "s", architecture_to_string(uname_architecture()));
+static BUS_DEFINE_PROPERTY_GET2(property_get_system_state, "s", Manager, manager_state, manager_state_to_string);
+static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_timer_slack_nsec, "t", (uint64_t) prctl(PR_GET_TIMERSLACK));
+static BUS_DEFINE_PROPERTY_GET_REF(property_get_hashmap_size, "u", Hashmap *, hashmap_size);
+static BUS_DEFINE_PROPERTY_GET_REF(property_get_set_size, "u", Set *, set_size);
+static BUS_DEFINE_PROPERTY_GET(property_get_default_timeout_abort_usec, "t", Manager, manager_default_timeout_abort_usec);
+static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_watchdog_device, "s", watchdog_get_device());
+static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_watchdog_last_ping_realtime, "t", watchdog_get_last_ping(CLOCK_REALTIME));
+static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_watchdog_last_ping_monotonic, "t", watchdog_get_last_ping(CLOCK_MONOTONIC));
+
+static int property_get_virtualization(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Virtualization v;
+
+ assert(bus);
+ assert(reply);
+
+ v = detect_virtualization();
+
+ /* Make sure to return the empty string when we detect no virtualization, as that is the API.
+ *
+ * https://github.com/systemd/systemd/issues/1423
+ */
+
+ return sd_bus_message_append(
+ reply, "s",
+ v == VIRTUALIZATION_NONE ? NULL : virtualization_to_string(v));
+}
+
+static int property_get_tainted(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ _cleanup_free_ char *s = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ s = manager_taint_string(m);
+ if (!s)
+ return log_oom();
+
+ return sd_bus_message_append(reply, "s", s);
+}
+
+static int property_set_log_target(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *value,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Manager *m = userdata;
+ const char *t;
+ int r;
+
+ assert(bus);
+ assert(value);
+
+ r = sd_bus_message_read(value, "s", &t);
+ if (r < 0)
+ return r;
+
+ if (isempty(t))
+ manager_restore_original_log_target(m);
+ else {
+ LogTarget target;
+
+ target = log_target_from_string(t);
+ if (target < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid log target '%s'", t);
+
+ manager_override_log_target(m, target);
+ }
+
+ return 0;
+}
+
+static int property_set_log_level(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *value,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Manager *m = userdata;
+ const char *t;
+ int r;
+
+ assert(bus);
+ assert(value);
+
+ r = sd_bus_message_read(value, "s", &t);
+ if (r < 0)
+ return r;
+
+ if (isempty(t))
+ manager_restore_original_log_level(m);
+ else {
+ int level;
+
+ level = log_level_from_string(t);
+ if (level < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid log level '%s'", t);
+
+ manager_override_log_level(m, level);
+ }
+
+ return 0;
+}
+
+static int property_get_progress(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Manager *m = ASSERT_PTR(userdata);
+ double d;
+
+ assert(bus);
+ assert(reply);
+
+ if (MANAGER_IS_FINISHED(m))
+ d = 1.0;
+ else
+ d = 1.0 - ((double) hashmap_size(m->jobs) / (double) m->n_installed_jobs);
+
+ return sd_bus_message_append(reply, "d", d);
+}
+
+static int property_get_environment(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ _cleanup_strv_free_ char **l = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = manager_get_effective_environment(m, &l);
+ if (r < 0)
+ return r;
+
+ return sd_bus_message_append_strv(reply, l);
+}
+
+static int property_get_show_status(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Manager *m = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ return sd_bus_message_append(reply, "b", manager_get_show_status_on(m));
+}
+
+static int property_get_runtime_watchdog(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Manager *m = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ return sd_bus_message_append(reply, "t", manager_get_watchdog(m, WATCHDOG_RUNTIME));
+}
+
+static int property_get_pretimeout_watchdog(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Manager *m = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ return sd_bus_message_append(reply, "t", manager_get_watchdog(m, WATCHDOG_PRETIMEOUT));
+}
+
+static int property_get_pretimeout_watchdog_governor(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Manager *m = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ return sd_bus_message_append(reply, "s", m->watchdog_pretimeout_governor);
+}
+
+static int property_get_reboot_watchdog(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Manager *m = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ return sd_bus_message_append(reply, "t", manager_get_watchdog(m, WATCHDOG_REBOOT));
+}
+
+static int property_get_kexec_watchdog(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Manager *m = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ return sd_bus_message_append(reply, "t", manager_get_watchdog(m, WATCHDOG_KEXEC));
+}
+
+static int property_set_watchdog(Manager *m, WatchdogType type, sd_bus_message *value) {
+ usec_t timeout;
+ int r;
+
+ assert(m);
+ assert(value);
+
+ assert_cc(sizeof(usec_t) == sizeof(uint64_t));
+
+ r = sd_bus_message_read(value, "t", &timeout);
+ if (r < 0)
+ return r;
+
+ manager_override_watchdog(m, type, timeout);
+ return 0;
+}
+
+static int property_set_runtime_watchdog(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *value,
+ void *userdata,
+ sd_bus_error *error) {
+
+ return property_set_watchdog(userdata, WATCHDOG_RUNTIME, value);
+}
+
+static int property_set_pretimeout_watchdog(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *value,
+ void *userdata,
+ sd_bus_error *error) {
+
+ return property_set_watchdog(userdata, WATCHDOG_PRETIMEOUT, value);
+}
+
+static int property_set_pretimeout_watchdog_governor(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *value,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Manager *m = ASSERT_PTR(userdata);
+ char *governor;
+ int r;
+
+ r = sd_bus_message_read(value, "s", &governor);
+ if (r < 0)
+ return r;
+ if (!string_is_safe(governor))
+ return -EINVAL;
+
+ return manager_override_watchdog_pretimeout_governor(m, governor);
+}
+
+static int property_set_reboot_watchdog(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *value,
+ void *userdata,
+ sd_bus_error *error) {
+
+ return property_set_watchdog(userdata, WATCHDOG_REBOOT, value);
+}
+
+static int property_set_kexec_watchdog(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *value,
+ void *userdata,
+ sd_bus_error *error) {
+
+ _unused_ Manager *m = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(value);
+
+ return property_set_watchdog(userdata, WATCHDOG_KEXEC, value);
+}
+
+static int property_get_oom_score_adjust(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Manager *m = ASSERT_PTR(userdata);
+ int r, n;
+
+ assert(bus);
+ assert(reply);
+
+ if (m->default_oom_score_adjust_set)
+ n = m->default_oom_score_adjust;
+ else {
+ n = 0;
+ r = get_oom_score_adjust(&n);
+ if (r < 0)
+ log_debug_errno(r, "Failed to read current OOM score adjustment value, ignoring: %m");
+ }
+
+ return sd_bus_message_append(reply, "i", n);
+}
+
+static int bus_get_unit_by_name(Manager *m, sd_bus_message *message, const char *name, Unit **ret_unit, sd_bus_error *error) {
+ Unit *u;
+ int r;
+
+ assert(m);
+ assert(message);
+ assert(ret_unit);
+
+ /* More or less a wrapper around manager_get_unit() that generates nice errors and has one trick up
+ * its sleeve: if the name is specified empty we use the client's unit. */
+
+ if (isempty(name)) {
+ _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+ pid_t pid;
+
+ r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_creds_get_pid(creds, &pid);
+ if (r < 0)
+ return r;
+
+ u = manager_get_unit_by_pid(m, pid);
+ if (!u)
+ return sd_bus_error_set(error, BUS_ERROR_NO_SUCH_UNIT, "Client not member of any unit.");
+ } else {
+ u = manager_get_unit(m, name);
+ if (!u)
+ return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Unit %s not loaded.", name);
+ }
+
+ *ret_unit = u;
+ return 0;
+}
+
+static int bus_load_unit_by_name(Manager *m, sd_bus_message *message, const char *name, Unit **ret_unit, sd_bus_error *error) {
+ assert(m);
+ assert(message);
+ assert(ret_unit);
+
+ /* Pretty much the same as bus_get_unit_by_name(), but we also load the unit if necessary. */
+
+ if (isempty(name))
+ return bus_get_unit_by_name(m, message, name, ret_unit, error);
+
+ return manager_load_unit(m, name, NULL, error, ret_unit);
+}
+
+static int reply_unit_path(Unit *u, sd_bus_message *message, sd_bus_error *error) {
+ _cleanup_free_ char *path = NULL;
+ int r;
+
+ assert(u);
+ assert(message);
+
+ r = mac_selinux_unit_access_check(u, message, "status", error);
+ if (r < 0)
+ return r;
+
+ path = unit_dbus_path(u);
+ if (!path)
+ return log_oom();
+
+ return sd_bus_reply_method_return(message, "o", path);
+}
+
+static int method_get_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ const char *name;
+ Unit *u;
+ int r;
+
+ assert(message);
+
+ /* Anyone can call this method */
+
+ r = sd_bus_message_read(message, "s", &name);
+ if (r < 0)
+ return r;
+
+ r = bus_get_unit_by_name(m, message, name, &u, error);
+ if (r < 0)
+ return r;
+
+ return reply_unit_path(u, message, error);
+}
+
+static int method_get_unit_by_pid(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ pid_t pid;
+ Unit *u;
+ int r;
+
+ assert(message);
+
+ assert_cc(sizeof(pid_t) == sizeof(uint32_t));
+
+ /* Anyone can call this method */
+
+ r = sd_bus_message_read(message, "u", &pid);
+ if (r < 0)
+ return r;
+ if (pid < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid PID " PID_FMT, pid);
+
+ if (pid == 0) {
+ _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+
+ r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_creds_get_pid(creds, &pid);
+ if (r < 0)
+ return r;
+ }
+
+ u = manager_get_unit_by_pid(m, pid);
+ if (!u)
+ return sd_bus_error_setf(error, BUS_ERROR_NO_UNIT_FOR_PID, "PID "PID_FMT" does not belong to any loaded unit.", pid);
+
+ return reply_unit_path(u, message, error);
+}
+
+static int method_get_unit_by_invocation_id(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_free_ char *path = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ sd_id128_t id;
+ const void *a;
+ Unit *u;
+ size_t sz;
+ int r;
+
+ assert(message);
+
+ /* Anyone can call this method */
+
+ r = sd_bus_message_read_array(message, 'y', &a, &sz);
+ if (r < 0)
+ return r;
+ if (sz == 0)
+ id = SD_ID128_NULL;
+ else if (sz == 16)
+ memcpy(&id, a, sz);
+ else
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid invocation ID");
+
+ if (sd_id128_is_null(id)) {
+ _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+ pid_t pid;
+
+ r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_creds_get_pid(creds, &pid);
+ if (r < 0)
+ return r;
+
+ u = manager_get_unit_by_pid(m, pid);
+ if (!u)
+ return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT,
+ "Client " PID_FMT " not member of any unit.", pid);
+ } else {
+ u = hashmap_get(m->units_by_invocation_id, &id);
+ if (!u)
+ return sd_bus_error_setf(error, BUS_ERROR_NO_UNIT_FOR_INVOCATION_ID, "No unit with the specified invocation ID " SD_ID128_FORMAT_STR " known.", SD_ID128_FORMAT_VAL(id));
+ }
+
+ r = mac_selinux_unit_access_check(u, message, "status", error);
+ if (r < 0)
+ return r;
+
+ /* So here's a special trick: the bus path we return actually references the unit by its invocation
+ * ID instead of the unit name. This means it stays valid only as long as the invocation ID stays the
+ * same. */
+ path = unit_dbus_path_invocation_id(u);
+ if (!path)
+ return -ENOMEM;
+
+ return sd_bus_reply_method_return(message, "o", path);
+}
+
+static int method_get_unit_by_control_group(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = userdata;
+ const char *cgroup;
+ Unit *u;
+ int r;
+
+ r = sd_bus_message_read(message, "s", &cgroup);
+ if (r < 0)
+ return r;
+
+ u = manager_get_unit_by_cgroup(m, cgroup);
+ if (!u)
+ return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT,
+ "Control group '%s' is not valid or not managed by this instance",
+ cgroup);
+
+ return reply_unit_path(u, message, error);
+}
+
+static int method_load_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ const char *name;
+ Unit *u;
+ int r;
+
+ assert(message);
+
+ /* Anyone can call this method */
+
+ r = sd_bus_message_read(message, "s", &name);
+ if (r < 0)
+ return r;
+
+ r = bus_load_unit_by_name(m, message, name, &u, error);
+ if (r < 0)
+ return r;
+
+ return reply_unit_path(u, message, error);
+}
+
+static int method_start_unit_generic(sd_bus_message *message, Manager *m, JobType job_type, bool reload_if_possible, sd_bus_error *error) {
+ const char *name;
+ Unit *u;
+ int r;
+
+ assert(message);
+ assert(m);
+
+ r = sd_bus_message_read(message, "s", &name);
+ if (r < 0)
+ return r;
+
+ r = manager_load_unit(m, name, NULL, error, &u);
+ if (r < 0)
+ return r;
+
+ return bus_unit_method_start_generic(message, u, job_type, reload_if_possible, error);
+}
+
+static int method_start_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_start_unit_generic(message, userdata, JOB_START, false, error);
+}
+
+static int method_stop_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_start_unit_generic(message, userdata, JOB_STOP, false, error);
+}
+
+static int method_reload_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_start_unit_generic(message, userdata, JOB_RELOAD, false, error);
+}
+
+static int method_restart_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_start_unit_generic(message, userdata, JOB_RESTART, false, error);
+}
+
+static int method_try_restart_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_start_unit_generic(message, userdata, JOB_TRY_RESTART, false, error);
+}
+
+static int method_reload_or_restart_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_start_unit_generic(message, userdata, JOB_RESTART, true, error);
+}
+
+static int method_reload_or_try_restart_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_start_unit_generic(message, userdata, JOB_TRY_RESTART, true, error);
+}
+
+typedef enum GenericUnitOperationFlags {
+ GENERIC_UNIT_LOAD = 1 << 0, /* Load if the unit is not loaded yet */
+ GENERIC_UNIT_VALIDATE_LOADED = 1 << 1, /* Verify unit is properly loaded before forwarding call */
+} GenericUnitOperationFlags;
+
+static int method_generic_unit_operation(
+ sd_bus_message *message,
+ Manager *m,
+ sd_bus_error *error,
+ sd_bus_message_handler_t handler,
+ GenericUnitOperationFlags flags) {
+
+ const char *name;
+ Unit *u;
+ int r;
+
+ assert(message);
+ assert(m);
+
+ /* Read the first argument from the command and pass the operation to the specified per-unit
+ * method. */
+
+ r = sd_bus_message_read(message, "s", &name);
+ if (r < 0)
+ return r;
+
+ if (!isempty(name) && FLAGS_SET(flags, GENERIC_UNIT_LOAD))
+ r = manager_load_unit(m, name, NULL, error, &u);
+ else
+ r = bus_get_unit_by_name(m, message, name, &u, error);
+ if (r < 0)
+ return r;
+
+ if (FLAGS_SET(flags, GENERIC_UNIT_VALIDATE_LOADED)) {
+ r = bus_unit_validate_load_state(u, error);
+ if (r < 0)
+ return r;
+ }
+
+ return handler(message, u, error);
+}
+
+static int method_enqueue_unit_job(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ /* We don't bother with GENERIC_UNIT_VALIDATE_LOADED here, as the job logic validates that anyway */
+ return method_generic_unit_operation(message, userdata, error, bus_unit_method_enqueue_job, GENERIC_UNIT_LOAD);
+}
+
+static int method_start_unit_replace(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ const char *old_name;
+ Unit *u;
+ int r;
+
+ assert(message);
+
+ r = sd_bus_message_read(message, "s", &old_name);
+ if (r < 0)
+ return r;
+
+ r = bus_get_unit_by_name(m, message, old_name, &u, error);
+ if (r < 0)
+ return r;
+ if (!u->job || u->job->type != JOB_START)
+ return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_JOB, "No job queued for unit %s", old_name);
+
+ return method_start_unit_generic(message, m, JOB_START, false, error);
+}
+
+static int method_kill_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ /* We don't bother with GENERIC_UNIT_LOAD nor GENERIC_UNIT_VALIDATE_LOADED here, as it shouldn't
+ * matter whether a unit is loaded for killing any processes possibly in the unit's cgroup. */
+ return method_generic_unit_operation(message, userdata, error, bus_unit_method_kill, 0);
+}
+
+static int method_clean_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ /* Load the unit if necessary, in order to load it, and insist on the unit being loaded to be
+ * cleaned */
+ return method_generic_unit_operation(message, userdata, error, bus_unit_method_clean, GENERIC_UNIT_LOAD|GENERIC_UNIT_VALIDATE_LOADED);
+}
+
+static int method_freeze_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_generic_unit_operation(message, userdata, error, bus_unit_method_freeze, 0);
+}
+
+static int method_thaw_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_generic_unit_operation(message, userdata, error, bus_unit_method_thaw, 0);
+}
+
+static int method_reset_failed_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ /* Don't load the unit (because unloaded units can't be in failed state), and don't insist on the
+ * unit to be loaded properly (since a failed unit might have its unit file disappeared) */
+ return method_generic_unit_operation(message, userdata, error, bus_unit_method_reset_failed, 0);
+}
+
+static int method_set_unit_properties(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ /* Only change properties on fully loaded units, and load them in order to set properties */
+ return method_generic_unit_operation(message, userdata, error, bus_unit_method_set_properties, GENERIC_UNIT_LOAD|GENERIC_UNIT_VALIDATE_LOADED);
+}
+
+static int method_bind_mount_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ /* Only add mounts on fully loaded units */
+ return method_generic_unit_operation(message, userdata, error, bus_service_method_bind_mount, GENERIC_UNIT_VALIDATE_LOADED);
+}
+
+static int method_mount_image_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ /* Only add mounts on fully loaded units */
+ return method_generic_unit_operation(message, userdata, error, bus_service_method_mount_image, GENERIC_UNIT_VALIDATE_LOADED);
+}
+
+static int method_ref_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ /* Only allow reffing of fully loaded units, and make sure reffing a unit loads it. */
+ return method_generic_unit_operation(message, userdata, error, bus_unit_method_ref, GENERIC_UNIT_LOAD|GENERIC_UNIT_VALIDATE_LOADED);
+}
+
+static int method_unref_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ /* Dropping a ref OTOH should not require the unit to still be loaded. And since a reffed unit is a
+ * loaded unit there's no need to load the unit for unreffing it. */
+ return method_generic_unit_operation(message, userdata, error, bus_unit_method_unref, 0);
+}
+
+static int reply_unit_info(sd_bus_message *reply, Unit *u) {
+ _cleanup_free_ char *unit_path = NULL, *job_path = NULL;
+ Unit *following;
+
+ following = unit_following(u);
+
+ unit_path = unit_dbus_path(u);
+ if (!unit_path)
+ return -ENOMEM;
+
+ if (u->job) {
+ job_path = job_dbus_path(u->job);
+ if (!job_path)
+ return -ENOMEM;
+ }
+
+ return sd_bus_message_append(
+ reply, "(ssssssouso)",
+ u->id,
+ unit_description(u),
+ unit_load_state_to_string(u->load_state),
+ unit_active_state_to_string(unit_active_state(u)),
+ unit_sub_state_to_string(u),
+ following ? following->id : "",
+ unit_path,
+ u->job ? u->job->id : 0,
+ u->job ? job_type_to_string(u->job->type) : "",
+ empty_to_root(job_path));
+}
+
+static int method_list_units_by_names(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+ _cleanup_strv_free_ char **units = NULL;
+
+ assert(message);
+
+ r = sd_bus_message_read_strv(message, &units);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_new_method_return(message, &reply);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_open_container(reply, 'a', "(ssssssouso)");
+ if (r < 0)
+ return r;
+
+ STRV_FOREACH(unit, units) {
+ Unit *u;
+
+ if (!unit_name_is_valid(*unit, UNIT_NAME_ANY))
+ continue;
+
+ r = bus_load_unit_by_name(m, message, *unit, &u, error);
+ if (r < 0)
+ return r;
+
+ r = reply_unit_info(reply, u);
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(NULL, reply, NULL);
+}
+
+static int method_get_unit_processes(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ /* Don't load a unit (since it won't have any processes if it's not loaded), but don't insist on the
+ * unit being loaded (because even improperly loaded units might still have processes around */
+ return method_generic_unit_operation(message, userdata, error, bus_unit_method_get_processes, 0);
+}
+
+static int method_attach_processes_to_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ /* Don't allow attaching new processes to units that aren't loaded. Don't bother with loading a unit
+ * for this purpose though, as an unloaded unit is a stopped unit, and we don't allow attaching
+ * processes to stopped units anyway. */
+ return method_generic_unit_operation(message, userdata, error, bus_unit_method_attach_processes, GENERIC_UNIT_VALIDATE_LOADED);
+}
+
+static int transient_unit_from_message(
+ Manager *m,
+ sd_bus_message *message,
+ const char *name,
+ Unit **unit,
+ sd_bus_error *error) {
+
+ UnitType t;
+ Unit *u;
+ int r;
+
+ assert(m);
+ assert(message);
+ assert(name);
+
+ t = unit_name_to_type(name);
+ if (t < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Invalid unit name or type.");
+
+ if (!unit_vtable[t]->can_transient)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Unit type %s does not support transient units.",
+ unit_type_to_string(t));
+
+ r = manager_load_unit(m, name, NULL, error, &u);
+ if (r < 0)
+ return r;
+
+ if (!unit_is_pristine(u))
+ return sd_bus_error_setf(error, BUS_ERROR_UNIT_EXISTS,
+ "Unit %s was already loaded or has a fragment file.", name);
+
+ /* OK, the unit failed to load and is unreferenced, now let's
+ * fill in the transient data instead */
+ r = unit_make_transient(u);
+ if (r < 0)
+ return r;
+
+ /* Set our properties */
+ r = bus_unit_set_properties(u, message, UNIT_RUNTIME, false, error);
+ if (r < 0)
+ return r;
+
+ /* If the client asked for it, automatically add a reference to this unit. */
+ if (u->bus_track_add) {
+ r = bus_unit_track_add_sender(u, message);
+ if (r < 0)
+ return log_error_errno(r, "Failed to watch sender: %m");
+ }
+
+ /* Now load the missing bits of the unit we just created */
+ unit_add_to_load_queue(u);
+ manager_dispatch_load_queue(m);
+
+ *unit = u;
+
+ return 0;
+}
+
+static int transient_aux_units_from_message(
+ Manager *m,
+ sd_bus_message *message,
+ sd_bus_error *error) {
+
+ int r;
+
+ assert(m);
+ assert(message);
+
+ r = sd_bus_message_enter_container(message, 'a', "(sa(sv))");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_enter_container(message, 'r', "sa(sv)")) > 0) {
+ const char *name = NULL;
+ Unit *u;
+
+ r = sd_bus_message_read(message, "s", &name);
+ if (r < 0)
+ return r;
+
+ r = transient_unit_from_message(m, message, name, &u, error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+static int method_start_transient_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ const char *name, *smode;
+ Manager *m = ASSERT_PTR(userdata);
+ JobMode mode;
+ Unit *u;
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "start", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "ss", &name, &smode);
+ if (r < 0)
+ return r;
+
+ mode = job_mode_from_string(smode);
+ if (mode < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Job mode %s is invalid.", smode);
+
+ r = bus_verify_manage_units_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = transient_unit_from_message(m, message, name, &u, error);
+ if (r < 0)
+ return r;
+
+ r = transient_aux_units_from_message(m, message, error);
+ if (r < 0)
+ return r;
+
+ /* Finally, start it */
+ return bus_unit_queue_job(message, u, JOB_START, mode, 0, error);
+}
+
+static int method_get_job(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_free_ char *path = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ uint32_t id;
+ Job *j;
+ int r;
+
+ assert(message);
+
+ /* Anyone can call this method */
+
+ r = sd_bus_message_read(message, "u", &id);
+ if (r < 0)
+ return r;
+
+ j = manager_get_job(m, id);
+ if (!j)
+ return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_JOB, "Job %u does not exist.", (unsigned) id);
+
+ r = mac_selinux_unit_access_check(j->unit, message, "status", error);
+ if (r < 0)
+ return r;
+
+ path = job_dbus_path(j);
+ if (!path)
+ return -ENOMEM;
+
+ return sd_bus_reply_method_return(message, "o", path);
+}
+
+static int method_cancel_job(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ uint32_t id;
+ Job *j;
+ int r;
+
+ assert(message);
+
+ r = sd_bus_message_read(message, "u", &id);
+ if (r < 0)
+ return r;
+
+ j = manager_get_job(m, id);
+ if (!j)
+ return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_JOB, "Job %u does not exist.", (unsigned) id);
+
+ return bus_job_method_cancel(message, j, error);
+}
+
+static int method_clear_jobs(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "reload", error);
+ if (r < 0)
+ return r;
+
+ r = bus_verify_manage_units_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ manager_clear_jobs(m);
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+static int method_reset_failed(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "reload", error);
+ if (r < 0)
+ return r;
+
+ r = bus_verify_manage_units_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ manager_reset_failed(m);
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+static int list_units_filtered(sd_bus_message *message, void *userdata, sd_bus_error *error, char **states, char **patterns) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ const char *k;
+ Unit *u;
+ int r;
+
+ assert(message);
+
+ /* Anyone can call this method */
+
+ r = mac_selinux_access_check(message, "status", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_new_method_return(message, &reply);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_open_container(reply, 'a', "(ssssssouso)");
+ if (r < 0)
+ return r;
+
+ HASHMAP_FOREACH_KEY(u, k, m->units) {
+ if (k != u->id)
+ continue;
+
+ if (!strv_isempty(states) &&
+ !strv_contains(states, unit_load_state_to_string(u->load_state)) &&
+ !strv_contains(states, unit_active_state_to_string(unit_active_state(u))) &&
+ !strv_contains(states, unit_sub_state_to_string(u)))
+ continue;
+
+ if (!strv_isempty(patterns) &&
+ !strv_fnmatch_or_empty(patterns, u->id, FNM_NOESCAPE))
+ continue;
+
+ r = reply_unit_info(reply, u);
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(NULL, reply, NULL);
+}
+
+static int method_list_units(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return list_units_filtered(message, userdata, error, NULL, NULL);
+}
+
+static int method_list_units_filtered(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_strv_free_ char **states = NULL;
+ int r;
+
+ r = sd_bus_message_read_strv(message, &states);
+ if (r < 0)
+ return r;
+
+ return list_units_filtered(message, userdata, error, states, NULL);
+}
+
+static int method_list_units_by_patterns(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_strv_free_ char **states = NULL;
+ _cleanup_strv_free_ char **patterns = NULL;
+ int r;
+
+ r = sd_bus_message_read_strv(message, &states);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read_strv(message, &patterns);
+ if (r < 0)
+ return r;
+
+ return list_units_filtered(message, userdata, error, states, patterns);
+}
+
+static int method_list_jobs(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ Job *j;
+ int r;
+
+ assert(message);
+
+ /* Anyone can call this method */
+
+ r = mac_selinux_access_check(message, "status", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_new_method_return(message, &reply);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_open_container(reply, 'a', "(usssoo)");
+ if (r < 0)
+ return r;
+
+ HASHMAP_FOREACH(j, m->jobs) {
+ _cleanup_free_ char *unit_path = NULL, *job_path = NULL;
+
+ job_path = job_dbus_path(j);
+ if (!job_path)
+ return -ENOMEM;
+
+ unit_path = unit_dbus_path(j->unit);
+ if (!unit_path)
+ return -ENOMEM;
+
+ r = sd_bus_message_append(
+ reply, "(usssoo)",
+ j->id,
+ j->unit->id,
+ job_type_to_string(j->type),
+ job_state_to_string(j->state),
+ job_path,
+ unit_path);
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(NULL, reply, NULL);
+}
+
+static int method_subscribe(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ /* Anyone can call this method */
+
+ r = mac_selinux_access_check(message, "status", error);
+ if (r < 0)
+ return r;
+
+ if (sd_bus_message_get_bus(message) == m->api_bus) {
+
+ /* Note that direct bus connection subscribe by
+ * default, we only track peers on the API bus here */
+
+ if (!m->subscribed) {
+ r = sd_bus_track_new(sd_bus_message_get_bus(message), &m->subscribed, NULL, NULL);
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_bus_track_add_sender(m->subscribed, message);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return sd_bus_error_set(error, BUS_ERROR_ALREADY_SUBSCRIBED, "Client is already subscribed.");
+ }
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+static int method_unsubscribe(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ /* Anyone can call this method */
+
+ r = mac_selinux_access_check(message, "status", error);
+ if (r < 0)
+ return r;
+
+ if (sd_bus_message_get_bus(message) == m->api_bus) {
+ r = sd_bus_track_remove_sender(m->subscribed, message);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return sd_bus_error_set(error, BUS_ERROR_NOT_SUBSCRIBED, "Client is not subscribed.");
+ }
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+static int dump_impl(
+ sd_bus_message *message,
+ void *userdata,
+ sd_bus_error *error,
+ char **patterns,
+ int (*reply)(sd_bus_message *, char *)) {
+
+ _cleanup_free_ char *dump = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ /* 'status' access is the bare minimum always needed for this, as the policy might straight out
+ * forbid a client from querying any information from systemd, regardless of any rate limiting. */
+ r = mac_selinux_access_check(message, "status", error);
+ if (r < 0)
+ return r;
+
+ /* Rate limit reached? Check if the caller is privileged/allowed by policy to bypass this. We
+ * check the rate limit first to avoid the expensive roundtrip to polkit when not needed. */
+ if (!ratelimit_below(&m->dump_ratelimit)) {
+ /* We need a way for SELinux to constrain the operation when the rate limit is active, even
+ * if polkit would allow it, but we cannot easily add new named permissions, so we need to
+ * use an existing one. Reload/reexec are also slow but non-destructive/modifying
+ * operations, and can cause PID1 to stall. So it seems similar enough in terms of security
+ * considerations and impact, and thus use the same access check for dumps which, given the
+ * large amount of data to fetch, can stall PID1 for quite some time. */
+ r = mac_selinux_access_check(message, "reload", error);
+ if (r < 0)
+ goto ratelimited;
+
+ r = bus_verify_bypass_dump_ratelimit_async(m, message, error);
+ if (r < 0)
+ goto ratelimited;
+ if (r == 0)
+ /* No authorization for now, but the async polkit stuff will call us again when it
+ * has it */
+ return 1;
+ }
+
+ r = manager_get_dump_string(m, patterns, &dump);
+ if (r < 0)
+ return r;
+
+ return reply(message, dump);
+
+ratelimited:
+ log_warning("Dump request rejected due to rate limit on unprivileged callers, blocked for %s.",
+ FORMAT_TIMESPAN(ratelimit_left(&m->dump_ratelimit), USEC_PER_SEC));
+ return sd_bus_error_setf(error,
+ SD_BUS_ERROR_LIMITS_EXCEEDED,
+ "Dump request rejected due to rate limit on unprivileged callers, blocked for %s.",
+ FORMAT_TIMESPAN(ratelimit_left(&m->dump_ratelimit), USEC_PER_SEC));
+}
+
+static int reply_dump(sd_bus_message *message, char *dump) {
+ return sd_bus_reply_method_return(message, "s", dump);
+}
+
+static int method_dump(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return dump_impl(message, userdata, error, NULL, reply_dump);
+}
+
+static int reply_dump_by_fd(sd_bus_message *message, char *dump) {
+ _cleanup_close_ int fd = -1;
+
+ fd = acquire_data_fd(dump, strlen(dump), 0);
+ if (fd < 0)
+ return fd;
+
+ return sd_bus_reply_method_return(message, "h", fd);
+}
+
+static int method_dump_by_fd(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return dump_impl(message, userdata, error, NULL, reply_dump_by_fd);
+}
+
+static int dump_units_matching_patterns(
+ sd_bus_message *message,
+ void *userdata,
+ sd_bus_error *error,
+ int (*reply)(sd_bus_message *, char *)) {
+ _cleanup_strv_free_ char **patterns = NULL;
+ int r;
+
+ r = sd_bus_message_read_strv(message, &patterns);
+ if (r < 0)
+ return r;
+
+ return dump_impl(message, userdata, error, patterns, reply);
+}
+
+static int method_dump_units_matching_patterns(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return dump_units_matching_patterns(message, userdata, error, reply_dump);
+}
+
+static int method_dump_units_matching_patterns_by_fd(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return dump_units_matching_patterns(message, userdata, error, reply_dump_by_fd);
+}
+
+static int method_refuse_snapshot(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Support for snapshots has been removed.");
+}
+
+static int verify_run_space(const char *message, sd_bus_error *error) {
+ struct statvfs svfs;
+ uint64_t available;
+
+ if (statvfs("/run/systemd", &svfs) < 0)
+ return sd_bus_error_set_errnof(error, errno, "Failed to statvfs(/run/systemd): %m");
+
+ available = (uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize;
+
+ if (available < RELOAD_DISK_SPACE_MIN)
+ return sd_bus_error_setf(error,
+ BUS_ERROR_DISK_FULL,
+ "%s, not enough space available on /run/systemd. "
+ "Currently, %s are free, but a safety buffer of %s is enforced.",
+ message,
+ FORMAT_BYTES(available),
+ FORMAT_BYTES(RELOAD_DISK_SPACE_MIN));
+
+ return 0;
+}
+
+int verify_run_space_and_log(const char *message) {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ int r;
+
+ r = verify_run_space(message, &error);
+ if (r < 0)
+ return log_error_errno(r, "%s", bus_error_message(&error, r));
+
+ return 0;
+}
+
+static int method_reload(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = verify_run_space("Refusing to reload", error);
+ if (r < 0)
+ return r;
+
+ r = mac_selinux_access_check(message, "reload", error);
+ if (r < 0)
+ return r;
+
+ r = bus_verify_reload_daemon_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ /* Instead of sending the reply back right away, we just
+ * remember that we need to and then send it after the reload
+ * is finished. That way the caller knows when the reload
+ * finished. */
+
+ assert(!m->pending_reload_message);
+ r = sd_bus_message_new_method_return(message, &m->pending_reload_message);
+ if (r < 0)
+ return r;
+
+ m->objective = MANAGER_RELOAD;
+
+ return 1;
+}
+
+static int method_reexecute(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = verify_run_space("Refusing to reexecute", error);
+ if (r < 0)
+ return r;
+
+ r = mac_selinux_access_check(message, "reload", error);
+ if (r < 0)
+ return r;
+
+ r = bus_verify_reload_daemon_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ /* We don't send a reply back here, the client should
+ * just wait for us disconnecting. */
+
+ m->objective = MANAGER_REEXECUTE;
+ return 1;
+}
+
+static int method_exit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "halt", error);
+ if (r < 0)
+ return r;
+
+ /* Exit() (in contrast to SetExitCode()) is actually allowed even if
+ * we are running on the host. It will fall back on reboot() in
+ * systemd-shutdown if it cannot do the exit() because it isn't a
+ * container. */
+
+ m->objective = MANAGER_EXIT;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+static int method_reboot(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "reboot", error);
+ if (r < 0)
+ return r;
+
+ if (!MANAGER_IS_SYSTEM(m))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED,
+ "Reboot is only supported for system managers.");
+
+ m->objective = MANAGER_REBOOT;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+static int method_poweroff(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "halt", error);
+ if (r < 0)
+ return r;
+
+ if (!MANAGER_IS_SYSTEM(m))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED,
+ "Powering off is only supported for system managers.");
+
+ m->objective = MANAGER_POWEROFF;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+static int method_halt(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "halt", error);
+ if (r < 0)
+ return r;
+
+ if (!MANAGER_IS_SYSTEM(m))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED,
+ "Halt is only supported for system managers.");
+
+ m->objective = MANAGER_HALT;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+static int method_kexec(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "reboot", error);
+ if (r < 0)
+ return r;
+
+ if (!MANAGER_IS_SYSTEM(m))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED,
+ "KExec is only supported for system managers.");
+
+ m->objective = MANAGER_KEXEC;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+static int method_switch_root(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_free_ char *ri = NULL, *rt = NULL;
+ const char *root, *init;
+ Manager *m = ASSERT_PTR(userdata);
+ struct statvfs svfs;
+ uint64_t available;
+ int r;
+
+ assert(message);
+
+ if (statvfs("/run/systemd", &svfs) < 0)
+ return sd_bus_error_set_errnof(error, errno, "Failed to statvfs(/run/systemd): %m");
+
+ available = (uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize;
+
+ if (available < RELOAD_DISK_SPACE_MIN)
+ log_warning("Dangerously low amount of free space on /run/systemd, root switching might fail.\n"
+ "Currently, %s are free, but %s are suggested. Proceeding anyway.",
+ FORMAT_BYTES(available),
+ FORMAT_BYTES(RELOAD_DISK_SPACE_MIN));
+
+ r = mac_selinux_access_check(message, "reboot", error);
+ if (r < 0)
+ return r;
+
+ if (!MANAGER_IS_SYSTEM(m))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED,
+ "Root switching is only supported by system manager.");
+
+ r = sd_bus_message_read(message, "ss", &root, &init);
+ if (r < 0)
+ return r;
+
+ if (isempty(root))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "New root directory may not be the empty string.");
+ if (!path_is_absolute(root))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "New root path '%s' is not absolute.", root);
+ if (path_equal(root, "/"))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "New root directory cannot be the old root directory.");
+
+ /* Safety check */
+ if (isempty(init)) {
+ r = path_is_os_tree(root);
+ if (r < 0)
+ return sd_bus_error_set_errnof(error, r,
+ "Failed to determine whether root path '%s' contains an OS tree: %m",
+ root);
+ if (r == 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Specified switch root path '%s' does not seem to be an OS tree. os-release file is missing.",
+ root);
+ } else {
+ _cleanup_free_ char *chased = NULL;
+
+ if (!path_is_absolute(init))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Path to init binary '%s' not absolute.", init);
+
+ r = chase_symlinks(init, root, CHASE_PREFIX_ROOT|CHASE_TRAIL_SLASH, &chased, NULL);
+ if (r < 0)
+ return sd_bus_error_set_errnof(error, r,
+ "Could not resolve init executable %s: %m", init);
+
+ if (laccess(chased, X_OK) < 0) {
+ if (errno == EACCES)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Init binary %s is not executable.", init);
+
+ return sd_bus_error_set_errnof(error, r,
+ "Could not check whether init binary %s is executable: %m", init);
+ }
+ }
+
+ rt = strdup(root);
+ if (!rt)
+ return -ENOMEM;
+
+ if (!isempty(init)) {
+ ri = strdup(init);
+ if (!ri)
+ return -ENOMEM;
+ }
+
+ free_and_replace(m->switch_root, rt);
+ free_and_replace(m->switch_root_init, ri);
+
+ m->objective = MANAGER_SWITCH_ROOT;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+static int method_set_environment(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_strv_free_ char **plus = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "reload", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read_strv(message, &plus);
+ if (r < 0)
+ return r;
+ if (!strv_env_is_valid(plus))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid environment assignments");
+
+ r = bus_verify_set_environment_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = manager_client_environment_modify(m, NULL, plus);
+ if (r < 0)
+ return r;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+static int method_unset_environment(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_strv_free_ char **minus = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "reload", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read_strv(message, &minus);
+ if (r < 0)
+ return r;
+
+ if (!strv_env_name_or_assignment_is_valid(minus))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Invalid environment variable names or assignments");
+
+ r = bus_verify_set_environment_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = manager_client_environment_modify(m, minus, NULL);
+ if (r < 0)
+ return r;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+static int method_unset_and_set_environment(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_strv_free_ char **minus = NULL, **plus = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "reload", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read_strv(message, &minus);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read_strv(message, &plus);
+ if (r < 0)
+ return r;
+
+ if (!strv_env_name_or_assignment_is_valid(minus))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Invalid environment variable names or assignments");
+ if (!strv_env_is_valid(plus))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Invalid environment assignments");
+
+ r = bus_verify_set_environment_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = manager_client_environment_modify(m, minus, plus);
+ if (r < 0)
+ return r;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+static int method_set_exit_code(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ uint8_t code;
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "exit", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read_basic(message, 'y', &code);
+ if (r < 0)
+ return r;
+
+ if (MANAGER_IS_SYSTEM(m) && detect_container() <= 0)
+ return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "ExitCode can only be set for user service managers or in containers.");
+
+ m->return_value = code;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+static int method_lookup_dynamic_user_by_name(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ const char *name;
+ uid_t uid;
+ int r;
+
+ assert(message);
+
+ r = sd_bus_message_read_basic(message, 's', &name);
+ if (r < 0)
+ return r;
+
+ if (!MANAGER_IS_SYSTEM(m))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED,
+ "Dynamic users are only supported in the system instance.");
+ if (!valid_user_group_name(name, VALID_USER_RELAX))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "User name invalid: %s", name);
+
+ r = dynamic_user_lookup_name(m, name, &uid);
+ if (r == -ESRCH)
+ return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_DYNAMIC_USER,
+ "Dynamic user %s does not exist.", name);
+ if (r < 0)
+ return r;
+
+ return sd_bus_reply_method_return(message, "u", (uint32_t) uid);
+}
+
+static int method_lookup_dynamic_user_by_uid(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_free_ char *name = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ uid_t uid;
+ int r;
+
+ assert(message);
+
+ assert_cc(sizeof(uid_t) == sizeof(uint32_t));
+ r = sd_bus_message_read_basic(message, 'u', &uid);
+ if (r < 0)
+ return r;
+
+ if (!MANAGER_IS_SYSTEM(m))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED,
+ "Dynamic users are only supported in the system instance.");
+ if (!uid_is_valid(uid))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "User ID invalid: " UID_FMT, uid);
+
+ r = dynamic_user_lookup_uid(m, uid, &name);
+ if (r == -ESRCH)
+ return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_DYNAMIC_USER,
+ "Dynamic user ID " UID_FMT " does not exist.", uid);
+ if (r < 0)
+ return r;
+
+ return sd_bus_reply_method_return(message, "s", name);
+}
+
+static int method_get_dynamic_users(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ DynamicUser *d;
+ int r;
+
+ assert(message);
+
+ assert_cc(sizeof(uid_t) == sizeof(uint32_t));
+
+ if (!MANAGER_IS_SYSTEM(m))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED,
+ "Dynamic users are only supported in the system instance.");
+
+ r = sd_bus_message_new_method_return(message, &reply);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_open_container(reply, 'a', "(us)");
+ if (r < 0)
+ return r;
+
+ HASHMAP_FOREACH(d, m->dynamic_users) {
+ uid_t uid;
+
+ r = dynamic_user_current(d, &uid);
+ if (r == -EAGAIN) /* not realized yet? */
+ continue;
+ if (r < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_FAILED,
+ "Failed to look up a dynamic user.");
+
+ r = sd_bus_message_append(reply, "(us)", uid, d->name);
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(NULL, reply, NULL);
+}
+
+static int method_enqueue_marked_jobs(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "start", error);
+ if (r < 0)
+ return r;
+
+ r = bus_verify_manage_units_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ log_info("Queuing reload/restart jobs for marked units%s", special_glyph(SPECIAL_GLYPH_ELLIPSIS));
+
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+ r = sd_bus_message_new_method_return(message, &reply);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_open_container(reply, 'a', "o");
+ if (r < 0)
+ return r;
+
+ Unit *u;
+ char *k;
+ int ret = 0;
+ HASHMAP_FOREACH_KEY(u, k, m->units) {
+ /* ignore aliases */
+ if (u->id != k)
+ continue;
+
+ BusUnitQueueFlags flags;
+ if (FLAGS_SET(u->markers, 1u << UNIT_MARKER_NEEDS_RESTART))
+ flags = 0;
+ else if (FLAGS_SET(u->markers, 1u << UNIT_MARKER_NEEDS_RELOAD))
+ flags = BUS_UNIT_QUEUE_RELOAD_IF_POSSIBLE;
+ else
+ continue;
+
+ r = mac_selinux_unit_access_check(u, message, "start", error);
+ if (r >= 0)
+ r = bus_unit_queue_job_one(message, u,
+ JOB_TRY_RESTART, JOB_FAIL, flags,
+ reply, error);
+ if (r < 0) {
+ if (ERRNO_IS_RESOURCE(r))
+ return r;
+ if (ret >= 0)
+ ret = r;
+ sd_bus_error_free(error);
+ }
+ }
+
+ if (ret < 0)
+ return sd_bus_error_set_errnof(error, ret,
+ "Failed to enqueue some jobs, see logs for details: %m");
+
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(NULL, reply, NULL);
+}
+
+static int list_unit_files_by_patterns(sd_bus_message *message, void *userdata, sd_bus_error *error, char **states, char **patterns) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ UnitFileList *item;
+ Hashmap *h;
+ int r;
+
+ assert(message);
+
+ /* Anyone can call this method */
+
+ r = mac_selinux_access_check(message, "status", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_new_method_return(message, &reply);
+ if (r < 0)
+ return r;
+
+ h = hashmap_new(&string_hash_ops);
+ if (!h)
+ return -ENOMEM;
+
+ r = unit_file_get_list(m->unit_file_scope, NULL, h, states, patterns);
+ if (r < 0)
+ goto fail;
+
+ r = sd_bus_message_open_container(reply, 'a', "(ss)");
+ if (r < 0)
+ goto fail;
+
+ HASHMAP_FOREACH(item, h) {
+
+ r = sd_bus_message_append(reply, "(ss)", item->path, unit_file_state_to_string(item->state));
+ if (r < 0)
+ goto fail;
+ }
+
+ unit_file_list_free(h);
+
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(NULL, reply, NULL);
+
+fail:
+ unit_file_list_free(h);
+ return r;
+}
+
+static int method_list_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return list_unit_files_by_patterns(message, userdata, error, NULL, NULL);
+}
+
+static int method_list_unit_files_by_patterns(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_strv_free_ char **states = NULL;
+ _cleanup_strv_free_ char **patterns = NULL;
+ int r;
+
+ r = sd_bus_message_read_strv(message, &states);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read_strv(message, &patterns);
+ if (r < 0)
+ return r;
+
+ return list_unit_files_by_patterns(message, userdata, error, states, patterns);
+}
+
+static int method_get_unit_file_state(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ const char *name;
+ UnitFileState state;
+ int r;
+
+ assert(message);
+
+ /* Anyone can call this method */
+
+ r = mac_selinux_access_check(message, "status", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "s", &name);
+ if (r < 0)
+ return r;
+
+ r = unit_file_get_state(m->unit_file_scope, NULL, name, &state);
+ if (r < 0)
+ return r;
+
+ return sd_bus_reply_method_return(message, "s", unit_file_state_to_string(state));
+}
+
+static int method_get_default_target(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_free_ char *default_target = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ /* Anyone can call this method */
+
+ r = mac_selinux_access_check(message, "status", error);
+ if (r < 0)
+ return r;
+
+ r = unit_file_get_default(m->unit_file_scope, NULL, &default_target);
+ if (r == -ERFKILL)
+ sd_bus_error_setf(error, BUS_ERROR_UNIT_MASKED, "Unit file is masked.");
+ if (r < 0)
+ return r;
+
+ return sd_bus_reply_method_return(message, "s", default_target);
+}
+
+static int send_unit_files_changed(sd_bus *bus, void *userdata) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *message = NULL;
+ int r;
+
+ assert(bus);
+
+ r = sd_bus_message_new_signal(bus, &message,
+ "/org/freedesktop/systemd1",
+ "org.freedesktop.systemd1.Manager",
+ "UnitFilesChanged");
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(bus, message, NULL);
+}
+
+/* Create an error reply, using the error information from changes[]
+ * if possible, and fall back to generating an error from error code c.
+ * The error message only describes the first error.
+ */
+static int install_error(
+ sd_bus_error *error,
+ int c,
+ InstallChange *changes,
+ size_t n_changes) {
+
+ int r;
+
+ for (size_t i = 0; i < n_changes; i++)
+
+ /* When making changes here, make sure to also change install_changes_dump() in install.c. */
+
+ switch (changes[i].type) {
+ case 0 ... _INSTALL_CHANGE_TYPE_MAX: /* not errors */
+ break;
+
+ case -EEXIST:
+ if (changes[i].source)
+ r = sd_bus_error_setf(error, BUS_ERROR_UNIT_EXISTS,
+ "File %s already exists and is a symlink to %s.",
+ changes[i].path, changes[i].source);
+ else
+ r = sd_bus_error_setf(error, BUS_ERROR_UNIT_EXISTS,
+ "File %s already exists.",
+ changes[i].path);
+ goto found;
+
+ case -ERFKILL:
+ r = sd_bus_error_setf(error, BUS_ERROR_UNIT_MASKED,
+ "Unit file %s is masked.", changes[i].path);
+ goto found;
+
+ case -EADDRNOTAVAIL:
+ r = sd_bus_error_setf(error, BUS_ERROR_UNIT_GENERATED,
+ "Unit %s is transient or generated.", changes[i].path);
+ goto found;
+
+ case -ETXTBSY:
+ r = sd_bus_error_setf(error, BUS_ERROR_UNIT_BAD_PATH,
+ "File %s is under the systemd unit hierarchy already.", changes[i].path);
+ goto found;
+
+ case -EBADSLT:
+ r = sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING,
+ "Invalid specifier in %s.", changes[i].path);
+ goto found;
+
+ case -EIDRM:
+ r = sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING,
+ "Destination unit %s is a non-template unit.", changes[i].path);
+ goto found;
+
+ case -EUCLEAN:
+ r = sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING,
+ "\"%s\" is not a valid unit name.",
+ changes[i].path);
+ goto found;
+
+ case -ELOOP:
+ r = sd_bus_error_setf(error, BUS_ERROR_UNIT_LINKED,
+ "Refusing to operate on alias name or linked unit file: %s",
+ changes[i].path);
+ goto found;
+
+ case -EXDEV:
+ if (changes[i].source)
+ r = sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING,
+ "Cannot alias %s as %s.",
+ changes[i].source, changes[i].path);
+ else
+ r = sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING,
+ "Invalid unit reference %s.", changes[i].path);
+ goto found;
+
+ case -ENOENT:
+ r = sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT,
+ "Unit file %s does not exist.", changes[i].path);
+ goto found;
+
+ case -EUNATCH:
+ r = sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING,
+ "Cannot resolve specifiers in %s.", changes[i].path);
+ goto found;
+
+ default:
+ assert(changes[i].type < 0); /* other errors */
+ r = sd_bus_error_set_errnof(error, changes[i].type, "File %s: %m", changes[i].path);
+ goto found;
+ }
+
+ r = c < 0 ? c : -EINVAL;
+
+ found:
+ install_changes_free(changes, n_changes);
+ return r;
+}
+
+static int reply_install_changes_and_free(
+ Manager *m,
+ sd_bus_message *message,
+ int carries_install_info,
+ InstallChange *changes,
+ size_t n_changes,
+ sd_bus_error *error) {
+
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+ bool bad = false, good = false;
+ int r;
+
+ if (install_changes_have_modification(changes, n_changes)) {
+ r = bus_foreach_bus(m, NULL, send_unit_files_changed, NULL);
+ if (r < 0)
+ log_debug_errno(r, "Failed to send UnitFilesChanged signal: %m");
+ }
+
+ r = sd_bus_message_new_method_return(message, &reply);
+ if (r < 0)
+ goto fail;
+
+ if (carries_install_info >= 0) {
+ r = sd_bus_message_append(reply, "b", carries_install_info);
+ if (r < 0)
+ goto fail;
+ }
+
+ r = sd_bus_message_open_container(reply, 'a', "(sss)");
+ if (r < 0)
+ goto fail;
+
+ for (size_t i = 0; i < n_changes; i++) {
+
+ if (changes[i].type < 0) {
+ bad = true;
+ continue;
+ }
+
+ r = sd_bus_message_append(
+ reply, "(sss)",
+ install_change_type_to_string(changes[i].type),
+ changes[i].path,
+ changes[i].source);
+ if (r < 0)
+ goto fail;
+
+ good = true;
+ }
+
+ /* If there was a failed change, and no successful change, then return the first failure as proper
+ * method call error. */
+ if (bad && !good)
+ return install_error(error, 0, changes, n_changes);
+
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ goto fail;
+
+ install_changes_free(changes, n_changes);
+ return sd_bus_send(NULL, reply, NULL);
+
+fail:
+ install_changes_free(changes, n_changes);
+ return r;
+}
+
+static int method_enable_unit_files_generic(
+ sd_bus_message *message,
+ Manager *m,
+ int (*call)(LookupScope scope, UnitFileFlags flags, const char *root_dir, char *files[], InstallChange **changes, size_t *n_changes),
+ bool carries_install_info,
+ sd_bus_error *error) {
+
+ _cleanup_strv_free_ char **l = NULL;
+ InstallChange *changes = NULL;
+ size_t n_changes = 0;
+ UnitFileFlags flags;
+ int r;
+
+ assert(message);
+ assert(m);
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ if (sd_bus_message_is_method_call(message, NULL, "EnableUnitFilesWithFlags")) {
+ uint64_t raw_flags;
+
+ r = sd_bus_message_read(message, "t", &raw_flags);
+ if (r < 0)
+ return r;
+ if ((raw_flags & ~_UNIT_FILE_FLAGS_MASK_PUBLIC) != 0)
+ return -EINVAL;
+ flags = raw_flags;
+ } else {
+ int runtime, force;
+
+ r = sd_bus_message_read(message, "bb", &runtime, &force);
+ if (r < 0)
+ return r;
+ flags = unit_file_bools_to_flags(runtime, force);
+ }
+
+ r = bus_verify_manage_unit_files_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = call(m->unit_file_scope, flags, NULL, l, &changes, &n_changes);
+ if (r < 0)
+ return install_error(error, r, changes, n_changes);
+
+ return reply_install_changes_and_free(m, message, carries_install_info ? r : -1, changes, n_changes, error);
+}
+
+static int method_enable_unit_files_with_flags(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_enable_unit_files_generic(message, userdata, unit_file_enable, true, error);
+}
+
+static int method_enable_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_enable_unit_files_generic(message, userdata, unit_file_enable, true, error);
+}
+
+static int method_reenable_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_enable_unit_files_generic(message, userdata, unit_file_reenable, true, error);
+}
+
+static int method_link_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_enable_unit_files_generic(message, userdata, unit_file_link, false, error);
+}
+
+static int unit_file_preset_without_mode(LookupScope scope, UnitFileFlags flags, const char *root_dir, char **files, InstallChange **changes, size_t *n_changes) {
+ return unit_file_preset(scope, flags, root_dir, files, UNIT_FILE_PRESET_FULL, changes, n_changes);
+}
+
+static int method_preset_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_enable_unit_files_generic(message, userdata, unit_file_preset_without_mode, true, error);
+}
+
+static int method_mask_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_enable_unit_files_generic(message, userdata, unit_file_mask, false, error);
+}
+
+static int method_preset_unit_files_with_mode(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+
+ _cleanup_strv_free_ char **l = NULL;
+ InstallChange *changes = NULL;
+ size_t n_changes = 0;
+ Manager *m = ASSERT_PTR(userdata);
+ UnitFilePresetMode preset_mode;
+ int runtime, force, r;
+ UnitFileFlags flags;
+ const char *mode;
+
+ assert(message);
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "sbb", &mode, &runtime, &force);
+ if (r < 0)
+ return r;
+
+ flags = unit_file_bools_to_flags(runtime, force);
+
+ if (isempty(mode))
+ preset_mode = UNIT_FILE_PRESET_FULL;
+ else {
+ preset_mode = unit_file_preset_mode_from_string(mode);
+ if (preset_mode < 0)
+ return -EINVAL;
+ }
+
+ r = bus_verify_manage_unit_files_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = unit_file_preset(m->unit_file_scope, flags, NULL, l, preset_mode, &changes, &n_changes);
+ if (r < 0)
+ return install_error(error, r, changes, n_changes);
+
+ return reply_install_changes_and_free(m, message, r, changes, n_changes, error);
+}
+
+static int method_disable_unit_files_generic(
+ sd_bus_message *message,
+ Manager *m,
+ int (*call)(LookupScope scope, UnitFileFlags flags, const char *root_dir, char *files[], InstallChange **changes, size_t *n_changes),
+ sd_bus_error *error) {
+
+ _cleanup_strv_free_ char **l = NULL;
+ InstallChange *changes = NULL;
+ UnitFileFlags flags;
+ size_t n_changes = 0;
+ int r;
+
+ assert(message);
+ assert(m);
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ if (sd_bus_message_is_method_call(message, NULL, "DisableUnitFilesWithFlags")) {
+ uint64_t raw_flags;
+
+ r = sd_bus_message_read(message, "t", &raw_flags);
+ if (r < 0)
+ return r;
+ if ((raw_flags & ~_UNIT_FILE_FLAGS_MASK_PUBLIC) != 0 ||
+ FLAGS_SET(raw_flags, UNIT_FILE_FORCE))
+ return -EINVAL;
+ flags = raw_flags;
+ } else {
+ int runtime;
+
+ r = sd_bus_message_read(message, "b", &runtime);
+ if (r < 0)
+ return r;
+ flags = unit_file_bools_to_flags(runtime, false);
+ }
+
+ r = bus_verify_manage_unit_files_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = call(m->unit_file_scope, flags, NULL, l, &changes, &n_changes);
+ if (r < 0)
+ return install_error(error, r, changes, n_changes);
+
+ return reply_install_changes_and_free(m, message, -1, changes, n_changes, error);
+}
+
+static int method_disable_unit_files_with_flags(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_disable_unit_files_generic(message, userdata, unit_file_disable, error);
+}
+
+static int method_disable_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_disable_unit_files_generic(message, userdata, unit_file_disable, error);
+}
+
+static int method_unmask_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_disable_unit_files_generic(message, userdata, unit_file_unmask, error);
+}
+
+static int method_revert_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_strv_free_ char **l = NULL;
+ InstallChange *changes = NULL;
+ size_t n_changes = 0;
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ r = bus_verify_manage_unit_files_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = unit_file_revert(m->unit_file_scope, NULL, l, &changes, &n_changes);
+ if (r < 0)
+ return install_error(error, r, changes, n_changes);
+
+ return reply_install_changes_and_free(m, message, -1, changes, n_changes, error);
+}
+
+static int method_set_default_target(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ InstallChange *changes = NULL;
+ size_t n_changes = 0;
+ Manager *m = ASSERT_PTR(userdata);
+ const char *name;
+ int force, r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "enable", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "sb", &name, &force);
+ if (r < 0)
+ return r;
+
+ r = bus_verify_manage_unit_files_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = unit_file_set_default(m->unit_file_scope, force ? UNIT_FILE_FORCE : 0, NULL, name, &changes, &n_changes);
+ if (r < 0)
+ return install_error(error, r, changes, n_changes);
+
+ return reply_install_changes_and_free(m, message, -1, changes, n_changes, error);
+}
+
+static int method_preset_all_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ InstallChange *changes = NULL;
+ size_t n_changes = 0;
+ Manager *m = ASSERT_PTR(userdata);
+ UnitFilePresetMode preset_mode;
+ const char *mode;
+ UnitFileFlags flags;
+ int force, runtime, r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "enable", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "sbb", &mode, &runtime, &force);
+ if (r < 0)
+ return r;
+
+ flags = unit_file_bools_to_flags(runtime, force);
+
+ if (isempty(mode))
+ preset_mode = UNIT_FILE_PRESET_FULL;
+ else {
+ preset_mode = unit_file_preset_mode_from_string(mode);
+ if (preset_mode < 0)
+ return -EINVAL;
+ }
+
+ r = bus_verify_manage_unit_files_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = unit_file_preset_all(m->unit_file_scope, flags, NULL, preset_mode, &changes, &n_changes);
+ if (r < 0)
+ return install_error(error, r, changes, n_changes);
+
+ return reply_install_changes_and_free(m, message, -1, changes, n_changes, error);
+}
+
+static int method_add_dependency_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_strv_free_ char **l = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ InstallChange *changes = NULL;
+ size_t n_changes = 0;
+ int runtime, force, r;
+ char *target, *type;
+ UnitDependency dep;
+ UnitFileFlags flags;
+
+ assert(message);
+
+ r = bus_verify_manage_unit_files_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "ssbb", &target, &type, &runtime, &force);
+ if (r < 0)
+ return r;
+
+ flags = unit_file_bools_to_flags(runtime, force);
+
+ dep = unit_dependency_from_string(type);
+ if (dep < 0)
+ return -EINVAL;
+
+ r = unit_file_add_dependency(m->unit_file_scope, flags, NULL, l, target, dep, &changes, &n_changes);
+ if (r < 0)
+ return install_error(error, r, changes, n_changes);
+
+ return reply_install_changes_and_free(m, message, -1, changes, n_changes, error);
+}
+
+static int method_get_unit_file_links(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ InstallChange *changes = NULL;
+ size_t n_changes = 0, i;
+ const char *name;
+ int runtime, r;
+
+ r = sd_bus_message_read(message, "sb", &name, &runtime);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_new_method_return(message, &reply);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_open_container(reply, SD_BUS_TYPE_ARRAY, "s");
+ if (r < 0)
+ return r;
+
+ r = unit_file_disable(m->unit_file_scope,
+ UNIT_FILE_DRY_RUN | (runtime ? UNIT_FILE_RUNTIME : 0),
+ NULL, STRV_MAKE(name), &changes, &n_changes);
+ if (r < 0) {
+ log_error_errno(r, "Failed to get file links for %s: %m", name);
+ goto finish;
+ }
+
+ for (i = 0; i < n_changes; i++)
+ if (changes[i].type == INSTALL_CHANGE_UNLINK) {
+ r = sd_bus_message_append(reply, "s", changes[i].path);
+ if (r < 0)
+ goto finish;
+ }
+
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ goto finish;
+
+ r = sd_bus_send(NULL, reply, NULL);
+
+finish:
+ install_changes_free(changes, n_changes);
+ return r;
+}
+
+static int method_get_job_waiting(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ uint32_t id;
+ Job *j;
+ int r;
+
+ assert(message);
+
+ r = sd_bus_message_read(message, "u", &id);
+ if (r < 0)
+ return r;
+
+ j = manager_get_job(m, id);
+ if (!j)
+ return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_JOB, "Job %u does not exist.", (unsigned) id);
+
+ return bus_job_method_get_waiting_jobs(message, j, error);
+}
+
+static int method_abandon_scope(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ const char *name;
+ Unit *u;
+ int r;
+
+ assert(message);
+
+ r = sd_bus_message_read(message, "s", &name);
+ if (r < 0)
+ return r;
+
+ r = bus_get_unit_by_name(m, message, name, &u, error);
+ if (r < 0)
+ return r;
+
+ if (u->type != UNIT_SCOPE)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Unit '%s' is not a scope unit, refusing.", name);
+
+ return bus_scope_method_abandon(message, u, error);
+}
+
+static int method_set_show_status(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ ShowStatus mode = _SHOW_STATUS_INVALID;
+ const char *t;
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "reload", error);
+ if (r < 0)
+ return r;
+
+ r = bus_verify_set_environment_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = sd_bus_message_read(message, "s", &t);
+ if (r < 0)
+ return r;
+
+ if (!isempty(t)) {
+ mode = show_status_from_string(t);
+ if (mode < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Invalid show status '%s'", t);
+ }
+
+ manager_override_show_status(m, mode, "bus");
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+const sd_bus_vtable bus_manager_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+
+ SD_BUS_PROPERTY("Version", "s", property_get_version, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Features", "s", property_get_features, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Virtualization", "s", property_get_virtualization, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Architecture", "s", property_get_architecture, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Tainted", "s", property_get_tainted, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("FirmwareTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_FIRMWARE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("LoaderTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_LOADER]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("KernelTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_KERNEL]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("InitRDTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("UserspaceTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_USERSPACE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("FinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("SecurityStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_SECURITY_START]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("SecurityFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_SECURITY_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("GeneratorsStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_GENERATORS_START]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("GeneratorsFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_GENERATORS_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("UnitsLoadStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_UNITS_LOAD_START]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("UnitsLoadFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_UNITS_LOAD_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("UnitsLoadTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_UNITS_LOAD]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("InitRDSecurityStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_SECURITY_START]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("InitRDSecurityFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_SECURITY_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("InitRDGeneratorsStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_GENERATORS_START]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("InitRDGeneratorsFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_GENERATORS_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("InitRDUnitsLoadStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_START]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("InitRDUnitsLoadFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_WRITABLE_PROPERTY("LogLevel", "s", bus_property_get_log_level, property_set_log_level, 0, 0),
+ SD_BUS_WRITABLE_PROPERTY("LogTarget", "s", bus_property_get_log_target, property_set_log_target, 0, 0),
+ SD_BUS_PROPERTY("NNames", "u", property_get_hashmap_size, offsetof(Manager, units), 0),
+ SD_BUS_PROPERTY("NFailedUnits", "u", property_get_set_size, offsetof(Manager, failed_units), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("NJobs", "u", property_get_hashmap_size, offsetof(Manager, jobs), 0),
+ SD_BUS_PROPERTY("NInstalledJobs", "u", bus_property_get_unsigned, offsetof(Manager, n_installed_jobs), 0),
+ SD_BUS_PROPERTY("NFailedJobs", "u", bus_property_get_unsigned, offsetof(Manager, n_failed_jobs), 0),
+ SD_BUS_PROPERTY("Progress", "d", property_get_progress, 0, 0),
+ SD_BUS_PROPERTY("Environment", "as", property_get_environment, 0, 0),
+ SD_BUS_PROPERTY("ConfirmSpawn", "b", bus_property_get_bool, offsetof(Manager, confirm_spawn), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ShowStatus", "b", property_get_show_status, 0, 0),
+ SD_BUS_PROPERTY("UnitPath", "as", NULL, offsetof(Manager, lookup_paths.search_path), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultStandardOutput", "s", bus_property_get_exec_output, offsetof(Manager, default_std_output), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultStandardError", "s", bus_property_get_exec_output, offsetof(Manager, default_std_error), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("WatchdogDevice", "s", property_get_watchdog_device, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("WatchdogLastPingTimestamp", "t", property_get_watchdog_last_ping_realtime, 0, 0),
+ SD_BUS_PROPERTY("WatchdogLastPingTimestampMonotonic", "t", property_get_watchdog_last_ping_monotonic, 0, 0),
+ SD_BUS_WRITABLE_PROPERTY("RuntimeWatchdogUSec", "t", property_get_runtime_watchdog, property_set_runtime_watchdog, 0, 0),
+ SD_BUS_WRITABLE_PROPERTY("RuntimeWatchdogPreUSec", "t", property_get_pretimeout_watchdog, property_set_pretimeout_watchdog, 0, 0),
+ SD_BUS_WRITABLE_PROPERTY("RuntimeWatchdogPreGovernor", "s", property_get_pretimeout_watchdog_governor, property_set_pretimeout_watchdog_governor, 0, 0),
+ SD_BUS_WRITABLE_PROPERTY("RebootWatchdogUSec", "t", property_get_reboot_watchdog, property_set_reboot_watchdog, 0, 0),
+ /* The following item is an obsolete alias */
+ SD_BUS_WRITABLE_PROPERTY("ShutdownWatchdogUSec", "t", property_get_reboot_watchdog, property_set_reboot_watchdog, 0, SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_WRITABLE_PROPERTY("KExecWatchdogUSec", "t", property_get_kexec_watchdog, property_set_kexec_watchdog, 0, 0),
+ SD_BUS_WRITABLE_PROPERTY("ServiceWatchdogs", "b", bus_property_get_bool, bus_property_set_bool, offsetof(Manager, service_watchdogs), 0),
+ SD_BUS_PROPERTY("ControlGroup", "s", NULL, offsetof(Manager, cgroup_root), 0),
+ SD_BUS_PROPERTY("SystemState", "s", property_get_system_state, 0, 0),
+ SD_BUS_PROPERTY("ExitCode", "y", bus_property_get_unsigned, offsetof(Manager, return_value), 0),
+ SD_BUS_PROPERTY("DefaultTimerAccuracyUSec", "t", bus_property_get_usec, offsetof(Manager, default_timer_accuracy_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultTimeoutStartUSec", "t", bus_property_get_usec, offsetof(Manager, default_timeout_start_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultTimeoutStopUSec", "t", bus_property_get_usec, offsetof(Manager, default_timeout_stop_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultTimeoutAbortUSec", "t", property_get_default_timeout_abort_usec, 0, 0),
+ SD_BUS_PROPERTY("DefaultDeviceTimeoutUSec", "t", bus_property_get_usec, offsetof(Manager, default_device_timeout_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultRestartUSec", "t", bus_property_get_usec, offsetof(Manager, default_restart_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultStartLimitIntervalUSec", "t", bus_property_get_usec, offsetof(Manager, default_start_limit_interval), SD_BUS_VTABLE_PROPERTY_CONST),
+ /* The following two items are obsolete alias */
+ SD_BUS_PROPERTY("DefaultStartLimitIntervalSec", "t", bus_property_get_usec, offsetof(Manager, default_start_limit_interval), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_PROPERTY("DefaultStartLimitInterval", "t", bus_property_get_usec, offsetof(Manager, default_start_limit_interval), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_PROPERTY("DefaultStartLimitBurst", "u", bus_property_get_unsigned, offsetof(Manager, default_start_limit_burst), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultCPUAccounting", "b", bus_property_get_bool, offsetof(Manager, default_cpu_accounting), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultBlockIOAccounting", "b", bus_property_get_bool, offsetof(Manager, default_blockio_accounting), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultMemoryAccounting", "b", bus_property_get_bool, offsetof(Manager, default_memory_accounting), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultTasksAccounting", "b", bus_property_get_bool, offsetof(Manager, default_tasks_accounting), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitCPU", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_CPU]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitCPUSoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_CPU]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitFSIZE", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_FSIZE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitFSIZESoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_FSIZE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitDATA", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_DATA]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitDATASoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_DATA]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitSTACK", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_STACK]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitSTACKSoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_STACK]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitCORE", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_CORE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitCORESoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_CORE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitRSS", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_RSS]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitRSSSoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_RSS]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitNOFILE", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_NOFILE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitNOFILESoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_NOFILE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitAS", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_AS]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitASSoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_AS]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitNPROC", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_NPROC]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitNPROCSoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_NPROC]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitMEMLOCK", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_MEMLOCK]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitMEMLOCKSoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_MEMLOCK]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitLOCKS", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_LOCKS]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitLOCKSSoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_LOCKS]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitSIGPENDING", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_SIGPENDING]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitSIGPENDINGSoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_SIGPENDING]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitMSGQUEUE", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_MSGQUEUE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitMSGQUEUESoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_MSGQUEUE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitNICE", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_NICE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitNICESoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_NICE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitRTPRIO", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_RTPRIO]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitRTPRIOSoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_RTPRIO]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitRTTIME", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitRTTIMESoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultTasksMax", "t", bus_property_get_tasks_max, offsetof(Manager, default_tasks_max), 0),
+ SD_BUS_PROPERTY("TimerSlackNSec", "t", property_get_timer_slack_nsec, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultOOMPolicy", "s", bus_property_get_oom_policy, offsetof(Manager, default_oom_policy), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultOOMScoreAdjust", "i", property_get_oom_score_adjust, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CtrlAltDelBurstAction", "s", bus_property_get_emergency_action, offsetof(Manager, cad_burst_action), SD_BUS_VTABLE_PROPERTY_CONST),
+
+ SD_BUS_METHOD_WITH_ARGS("GetUnit",
+ SD_BUS_ARGS("s", name),
+ SD_BUS_RESULT("o", unit),
+ method_get_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("GetUnitByPID",
+ SD_BUS_ARGS("u", pid),
+ SD_BUS_RESULT("o", unit),
+ method_get_unit_by_pid,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("GetUnitByInvocationID",
+ SD_BUS_ARGS("ay", invocation_id),
+ SD_BUS_RESULT("o", unit),
+ method_get_unit_by_invocation_id,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("GetUnitByControlGroup",
+ SD_BUS_ARGS("s", cgroup),
+ SD_BUS_RESULT("o", unit),
+ method_get_unit_by_control_group,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("LoadUnit",
+ SD_BUS_ARGS("s", name),
+ SD_BUS_RESULT("o", unit),
+ method_load_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("StartUnit",
+ SD_BUS_ARGS("s", name, "s", mode),
+ SD_BUS_RESULT("o", job),
+ method_start_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("StartUnitWithFlags",
+ SD_BUS_ARGS("s", name, "s", mode, "t", flags),
+ SD_BUS_RESULT("o", job),
+ method_start_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("StartUnitReplace",
+ SD_BUS_ARGS("s", old_unit, "s", new_unit, "s", mode),
+ SD_BUS_RESULT("o", job),
+ method_start_unit_replace,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("StopUnit",
+ SD_BUS_ARGS("s", name, "s", mode),
+ SD_BUS_RESULT("o", job),
+ method_stop_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("ReloadUnit",
+ SD_BUS_ARGS("s", name, "s", mode),
+ SD_BUS_RESULT("o", job),
+ method_reload_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("RestartUnit",
+ SD_BUS_ARGS("s", name, "s", mode),
+ SD_BUS_RESULT("o", job),
+ method_restart_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("TryRestartUnit",
+ SD_BUS_ARGS("s", name, "s", mode),
+ SD_BUS_RESULT("o", job),
+ method_try_restart_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("ReloadOrRestartUnit",
+ SD_BUS_ARGS("s", name, "s", mode),
+ SD_BUS_RESULT("o", job),
+ method_reload_or_restart_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("ReloadOrTryRestartUnit",
+ SD_BUS_ARGS("s", name, "s", mode),
+ SD_BUS_RESULT("o", job),
+ method_reload_or_try_restart_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("EnqueueUnitJob",
+ SD_BUS_ARGS("s", name, "s", job_type, "s", job_mode),
+ SD_BUS_RESULT("u", job_id, "o", job_path, "s", unit_id, "o", unit_path, "s", job_type, "a(uosos)", affected_jobs),
+ method_enqueue_unit_job,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("KillUnit",
+ SD_BUS_ARGS("s", name, "s", whom, "i", signal),
+ SD_BUS_NO_RESULT,
+ method_kill_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("CleanUnit",
+ SD_BUS_ARGS("s", name, "as", mask),
+ SD_BUS_NO_RESULT,
+ method_clean_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("FreezeUnit",
+ SD_BUS_ARGS("s", name),
+ SD_BUS_NO_RESULT,
+ method_freeze_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("ThawUnit",
+ SD_BUS_ARGS("s", name),
+ SD_BUS_NO_RESULT,
+ method_thaw_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("ResetFailedUnit",
+ SD_BUS_ARGS("s", name),
+ SD_BUS_NO_RESULT,
+ method_reset_failed_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("SetUnitProperties",
+ SD_BUS_ARGS("s", name, "b", runtime, "a(sv)", properties),
+ SD_BUS_NO_RESULT,
+ method_set_unit_properties,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("BindMountUnit",
+ SD_BUS_ARGS("s", name, "s", source, "s", destination, "b", read_only, "b", mkdir),
+ SD_BUS_NO_RESULT,
+ method_bind_mount_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("MountImageUnit",
+ SD_BUS_ARGS("s", name, "s", source, "s", destination, "b", read_only, "b", mkdir, "a(ss)", options),
+ SD_BUS_NO_RESULT,
+ method_mount_image_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("RefUnit",
+ SD_BUS_ARGS("s", name),
+ SD_BUS_NO_RESULT,
+ method_ref_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("UnrefUnit",
+ SD_BUS_ARGS("s", name),
+ SD_BUS_NO_RESULT,
+ method_unref_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("StartTransientUnit",
+ SD_BUS_ARGS("s", name, "s", mode, "a(sv)", properties, "a(sa(sv))", aux),
+ SD_BUS_RESULT("o", job),
+ method_start_transient_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("GetUnitProcesses",
+ SD_BUS_ARGS("s", name),
+ SD_BUS_RESULT("a(sus)", processes),
+ method_get_unit_processes,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("AttachProcessesToUnit",
+ SD_BUS_ARGS("s", unit_name, "s", subcgroup, "au", pids),
+ SD_BUS_NO_RESULT,
+ method_attach_processes_to_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("AbandonScope",
+ SD_BUS_ARGS("s", name),
+ SD_BUS_NO_RESULT,
+ method_abandon_scope,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("GetJob",
+ SD_BUS_ARGS("u", id),
+ SD_BUS_RESULT("o", job),
+ method_get_job,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("GetJobAfter",
+ SD_BUS_ARGS("u", id),
+ SD_BUS_RESULT("a(usssoo)", jobs),
+ method_get_job_waiting,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("GetJobBefore",
+ SD_BUS_ARGS("u", id),
+ SD_BUS_RESULT("a(usssoo)", jobs),
+ method_get_job_waiting,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("CancelJob",
+ SD_BUS_ARGS("u", id),
+ SD_BUS_NO_RESULT,
+ method_cancel_job,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD("ClearJobs",
+ NULL,
+ NULL,
+ method_clear_jobs,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD("ResetFailed",
+ NULL,
+ NULL,
+ method_reset_failed,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("SetShowStatus",
+ SD_BUS_ARGS("s", mode),
+ SD_BUS_NO_RESULT,
+ method_set_show_status,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("ListUnits",
+ SD_BUS_NO_ARGS,
+ SD_BUS_RESULT("a(ssssssouso)", units),
+ method_list_units,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("ListUnitsFiltered",
+ SD_BUS_ARGS("as", states),
+ SD_BUS_RESULT("a(ssssssouso)", units),
+ method_list_units_filtered,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("ListUnitsByPatterns",
+ SD_BUS_ARGS("as", states, "as", patterns),
+ SD_BUS_RESULT("a(ssssssouso)", units),
+ method_list_units_by_patterns,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("ListUnitsByNames",
+ SD_BUS_ARGS("as", names),
+ SD_BUS_RESULT("a(ssssssouso)", units),
+ method_list_units_by_names,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("ListJobs",
+ SD_BUS_NO_ARGS,
+ SD_BUS_RESULT("a(usssoo)", jobs),
+ method_list_jobs,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD("Subscribe",
+ NULL,
+ NULL,
+ method_subscribe,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD("Unsubscribe",
+ NULL,
+ NULL,
+ method_unsubscribe,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("Dump",
+ SD_BUS_NO_ARGS,
+ SD_BUS_RESULT("s", output),
+ method_dump,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("DumpUnitsMatchingPatterns",
+ SD_BUS_ARGS("as", patterns),
+ SD_BUS_RESULT("s", output),
+ method_dump_units_matching_patterns,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("DumpByFileDescriptor",
+ SD_BUS_NO_ARGS,
+ SD_BUS_RESULT("h", fd),
+ method_dump_by_fd,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("DumpUnitsMatchingPatternsByFileDescriptor",
+ SD_BUS_ARGS("as", patterns),
+ SD_BUS_RESULT("h", fd),
+ method_dump_units_matching_patterns_by_fd,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("CreateSnapshot",
+ SD_BUS_ARGS("s", name, "b", cleanup),
+ SD_BUS_RESULT("o", unit),
+ method_refuse_snapshot,
+ SD_BUS_VTABLE_UNPRIVILEGED|SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_METHOD_WITH_ARGS("RemoveSnapshot",
+ SD_BUS_ARGS("s", name),
+ SD_BUS_NO_RESULT,
+ method_refuse_snapshot,
+ SD_BUS_VTABLE_UNPRIVILEGED|SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_METHOD("Reload",
+ NULL,
+ NULL,
+ method_reload,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD("Reexecute",
+ NULL,
+ NULL,
+ method_reexecute,
+ SD_BUS_VTABLE_UNPRIVILEGED|SD_BUS_VTABLE_METHOD_NO_REPLY),
+ SD_BUS_METHOD("Exit",
+ NULL,
+ NULL,
+ method_exit,
+ 0),
+ SD_BUS_METHOD("Reboot",
+ NULL,
+ NULL,
+ method_reboot,
+ SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)),
+ SD_BUS_METHOD("PowerOff",
+ NULL,
+ NULL,
+ method_poweroff,
+ SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)),
+ SD_BUS_METHOD("Halt",
+ NULL,
+ NULL,
+ method_halt,
+ SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)),
+ SD_BUS_METHOD("KExec",
+ NULL,
+ NULL,
+ method_kexec,
+ SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)),
+ SD_BUS_METHOD_WITH_ARGS("SwitchRoot",
+ SD_BUS_ARGS("s", new_root, "s", init),
+ SD_BUS_NO_RESULT,
+ method_switch_root,
+ SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)),
+ SD_BUS_METHOD_WITH_ARGS("SetEnvironment",
+ SD_BUS_ARGS("as", assignments),
+ SD_BUS_NO_RESULT,
+ method_set_environment,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("UnsetEnvironment",
+ SD_BUS_ARGS("as", names),
+ SD_BUS_NO_RESULT,
+ method_unset_environment,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("UnsetAndSetEnvironment",
+ SD_BUS_ARGS("as", names, "as", assignments),
+ SD_BUS_NO_RESULT,
+ method_unset_and_set_environment,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("EnqueueMarkedJobs",
+ SD_BUS_NO_ARGS,
+ SD_BUS_RESULT("ao", jobs),
+ method_enqueue_marked_jobs,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("ListUnitFiles",
+ SD_BUS_NO_ARGS,
+ SD_BUS_RESULT("a(ss)", unit_files),
+ method_list_unit_files,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("ListUnitFilesByPatterns",
+ SD_BUS_ARGS("as", states, "as", patterns),
+ SD_BUS_RESULT("a(ss)", unit_files),
+ method_list_unit_files_by_patterns,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("GetUnitFileState",
+ SD_BUS_ARGS("s", file),
+ SD_BUS_RESULT("s", state),
+ method_get_unit_file_state,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("EnableUnitFiles",
+ SD_BUS_ARGS("as", files, "b", runtime, "b", force),
+ SD_BUS_RESULT("b", carries_install_info, "a(sss)", changes),
+ method_enable_unit_files,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("DisableUnitFiles",
+ SD_BUS_ARGS("as", files, "b", runtime),
+ SD_BUS_RESULT("a(sss)", changes),
+ method_disable_unit_files,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("EnableUnitFilesWithFlags",
+ SD_BUS_ARGS("as", files, "t", flags),
+ SD_BUS_RESULT("b", carries_install_info, "a(sss)", changes),
+ method_enable_unit_files_with_flags,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("DisableUnitFilesWithFlags",
+ SD_BUS_ARGS("as", files, "t", flags),
+ SD_BUS_RESULT("a(sss)", changes),
+ method_disable_unit_files_with_flags,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("ReenableUnitFiles",
+ SD_BUS_ARGS("as", files, "b", runtime, "b", force),
+ SD_BUS_RESULT("b", carries_install_info, "a(sss)", changes),
+ method_reenable_unit_files,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("LinkUnitFiles",
+ SD_BUS_ARGS("as", files, "b", runtime, "b", force),
+ SD_BUS_RESULT("a(sss)", changes),
+ method_link_unit_files,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("PresetUnitFiles",
+ SD_BUS_ARGS("as", files, "b", runtime, "b", force),
+ SD_BUS_RESULT("b", carries_install_info, "a(sss)", changes),
+ method_preset_unit_files,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("PresetUnitFilesWithMode",
+ SD_BUS_ARGS("as", files, "s", mode, "b", runtime, "b", force),
+ SD_BUS_RESULT("b", carries_install_info, "a(sss)", changes),
+ method_preset_unit_files_with_mode,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("MaskUnitFiles",
+ SD_BUS_ARGS("as", files, "b", runtime, "b", force),
+ SD_BUS_RESULT("a(sss)", changes),
+ method_mask_unit_files,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("UnmaskUnitFiles",
+ SD_BUS_ARGS("as", files, "b", runtime),
+ SD_BUS_RESULT("a(sss)", changes),
+ method_unmask_unit_files,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("RevertUnitFiles",
+ SD_BUS_ARGS("as", files),
+ SD_BUS_RESULT("a(sss)", changes),
+ method_revert_unit_files,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("SetDefaultTarget",
+ SD_BUS_ARGS("s", name, "b", force),
+ SD_BUS_RESULT("a(sss)", changes),
+ method_set_default_target,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("GetDefaultTarget",
+ SD_BUS_NO_ARGS,
+ SD_BUS_RESULT("s", name),
+ method_get_default_target,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("PresetAllUnitFiles",
+ SD_BUS_ARGS("s", mode, "b", runtime, "b", force),
+ SD_BUS_RESULT("a(sss)", changes),
+ method_preset_all_unit_files,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("AddDependencyUnitFiles",
+ SD_BUS_ARGS("as", files, "s", target, "s", type, "b", runtime, "b", force),
+ SD_BUS_RESULT("a(sss)", changes),
+ method_add_dependency_unit_files,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("GetUnitFileLinks",
+ SD_BUS_ARGS("s", name, "b", runtime),
+ SD_BUS_RESULT("as", links),
+ method_get_unit_file_links,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("SetExitCode",
+ SD_BUS_ARGS("y", number),
+ SD_BUS_NO_RESULT,
+ method_set_exit_code,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("LookupDynamicUserByName",
+ SD_BUS_ARGS("s", name),
+ SD_BUS_RESULT("u", uid),
+ method_lookup_dynamic_user_by_name,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("LookupDynamicUserByUID",
+ SD_BUS_ARGS("u", uid),
+ SD_BUS_RESULT("s", name),
+ method_lookup_dynamic_user_by_uid,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("GetDynamicUsers",
+ SD_BUS_NO_ARGS,
+ SD_BUS_RESULT("a(us)", users),
+ method_get_dynamic_users,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+
+ SD_BUS_SIGNAL_WITH_ARGS("UnitNew",
+ SD_BUS_ARGS("s", id, "o", unit),
+ 0),
+ SD_BUS_SIGNAL_WITH_ARGS("UnitRemoved",
+ SD_BUS_ARGS("s", id, "o", unit),
+ 0),
+ SD_BUS_SIGNAL_WITH_ARGS("JobNew",
+ SD_BUS_ARGS("u", id, "o", job, "s", unit),
+ 0),
+ SD_BUS_SIGNAL_WITH_ARGS("JobRemoved",
+ SD_BUS_ARGS("u", id, "o", job, "s", unit, "s", result),
+ 0),
+ SD_BUS_SIGNAL_WITH_ARGS("StartupFinished",
+ SD_BUS_ARGS("t", firmware, "t", loader, "t", kernel, "t", initrd, "t", userspace, "t", total),
+ 0),
+ SD_BUS_SIGNAL("UnitFilesChanged", NULL, 0),
+ SD_BUS_SIGNAL_WITH_ARGS("Reloading",
+ SD_BUS_ARGS("b", active),
+ 0),
+
+ SD_BUS_VTABLE_END
+};
+
+const sd_bus_vtable bus_manager_log_control_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+
+ /* We define a private version of this interface here, since we want slightly different
+ * implementations for the setters. We'll still use the generic getters however, and we share the
+ * setters with the implementations for the Manager interface above (which pre-dates the generic
+ * service API interface). */
+
+ SD_BUS_WRITABLE_PROPERTY("LogLevel", "s", bus_property_get_log_level, property_set_log_level, 0, 0),
+ SD_BUS_WRITABLE_PROPERTY("LogTarget", "s", bus_property_get_log_target, property_set_log_target, 0, 0),
+ SD_BUS_PROPERTY("SyslogIdentifier", "s", bus_property_get_syslog_identifier, 0, 0),
+
+ SD_BUS_VTABLE_END,
+};
+
+static int send_finished(sd_bus *bus, void *userdata) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *message = NULL;
+ usec_t *times = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+
+ r = sd_bus_message_new_signal(bus,
+ &message,
+ "/org/freedesktop/systemd1",
+ "org.freedesktop.systemd1.Manager",
+ "StartupFinished");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(message, "tttttt", times[0], times[1], times[2], times[3], times[4], times[5]);
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(bus, message, NULL);
+}
+
+void bus_manager_send_finished(
+ Manager *m,
+ usec_t firmware_usec,
+ usec_t loader_usec,
+ usec_t kernel_usec,
+ usec_t initrd_usec,
+ usec_t userspace_usec,
+ usec_t total_usec) {
+
+ int r;
+
+ assert(m);
+
+ r = bus_foreach_bus(
+ m,
+ NULL,
+ send_finished,
+ (usec_t[6]) {
+ firmware_usec,
+ loader_usec,
+ kernel_usec,
+ initrd_usec,
+ userspace_usec,
+ total_usec
+ });
+ if (r < 0)
+ log_debug_errno(r, "Failed to send finished signal: %m");
+}
+
+static int send_reloading(sd_bus *bus, void *userdata) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *message = NULL;
+ int r;
+
+ assert(bus);
+
+ r = sd_bus_message_new_signal(bus, &message, "/org/freedesktop/systemd1", "org.freedesktop.systemd1.Manager", "Reloading");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(message, "b", PTR_TO_INT(userdata));
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(bus, message, NULL);
+}
+
+void bus_manager_send_reloading(Manager *m, bool active) {
+ int r;
+
+ assert(m);
+
+ r = bus_foreach_bus(m, NULL, send_reloading, INT_TO_PTR(active));
+ if (r < 0)
+ log_debug_errno(r, "Failed to send reloading signal: %m");
+}
+
+static int send_changed_signal(sd_bus *bus, void *userdata) {
+ assert(bus);
+
+ return sd_bus_emit_properties_changed_strv(bus,
+ "/org/freedesktop/systemd1",
+ "org.freedesktop.systemd1.Manager",
+ NULL);
+}
+
+void bus_manager_send_change_signal(Manager *m) {
+ int r;
+
+ assert(m);
+
+ r = bus_foreach_bus(m, NULL, send_changed_signal, NULL);
+ if (r < 0)
+ log_debug_errno(r, "Failed to send manager change signal: %m");
+}
diff --git a/src/core/dbus-manager.h b/src/core/dbus-manager.h
new file mode 100644
index 0000000..9b05080
--- /dev/null
+++ b/src/core/dbus-manager.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus-vtable.h"
+
+#include "manager.h"
+
+extern const sd_bus_vtable bus_manager_vtable[];
+extern const sd_bus_vtable bus_manager_log_control_vtable[];
+
+void bus_manager_send_finished(Manager *m, usec_t firmware_usec, usec_t loader_usec, usec_t kernel_usec, usec_t initrd_usec, usec_t userspace_usec, usec_t total_usec);
+void bus_manager_send_reloading(Manager *m, bool active);
+void bus_manager_send_change_signal(Manager *m);
+
+int verify_run_space_and_log(const char *message);
+
+int bus_property_get_oom_policy(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error);
+int bus_property_get_emergency_action(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error);
diff --git a/src/core/dbus-mount.c b/src/core/dbus-mount.c
new file mode 100644
index 0000000..73702b1
--- /dev/null
+++ b/src/core/dbus-mount.c
@@ -0,0 +1,155 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bus-get-properties.h"
+#include "dbus-cgroup.h"
+#include "dbus-execute.h"
+#include "dbus-kill.h"
+#include "dbus-mount.h"
+#include "dbus-util.h"
+#include "mount.h"
+#include "string-util.h"
+#include "unit.h"
+
+static const char *mount_get_what(const Mount *m) {
+ if (m->from_proc_self_mountinfo && m->parameters_proc_self_mountinfo.what)
+ return m->parameters_proc_self_mountinfo.what;
+ if (m->from_fragment && m->parameters_fragment.what)
+ return m->parameters_fragment.what;
+ return NULL;
+}
+
+static const char *mount_get_options(const Mount *m) {
+ if (m->from_proc_self_mountinfo && m->parameters_proc_self_mountinfo.options)
+ return m->parameters_proc_self_mountinfo.options;
+ if (m->from_fragment && m->parameters_fragment.options)
+ return m->parameters_fragment.options;
+ return NULL;
+}
+
+static const char *mount_get_fstype(const Mount *m) {
+ if (m->from_proc_self_mountinfo && m->parameters_proc_self_mountinfo.fstype)
+ return m->parameters_proc_self_mountinfo.fstype;
+ else if (m->from_fragment && m->parameters_fragment.fstype)
+ return m->parameters_fragment.fstype;
+ return NULL;
+}
+
+static BUS_DEFINE_PROPERTY_GET(property_get_what, "s", Mount, mount_get_what);
+static BUS_DEFINE_PROPERTY_GET(property_get_options, "s", Mount, mount_get_options);
+static BUS_DEFINE_PROPERTY_GET(property_get_type, "s", Mount, mount_get_fstype);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, mount_result, MountResult);
+
+const sd_bus_vtable bus_mount_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ SD_BUS_PROPERTY("Where", "s", NULL, offsetof(Mount, where), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("What", "s", property_get_what, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("Options","s", property_get_options, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("Type", "s", property_get_type, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("TimeoutUSec", "t", bus_property_get_usec, offsetof(Mount, timeout_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ControlPID", "u", bus_property_get_pid, offsetof(Mount, control_pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("DirectoryMode", "u", bus_property_get_mode, offsetof(Mount, directory_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SloppyOptions", "b", bus_property_get_bool, offsetof(Mount, sloppy_options), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LazyUnmount", "b", bus_property_get_bool, offsetof(Mount, lazy_unmount), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ForceUnmount", "b", bus_property_get_bool, offsetof(Mount, force_unmount), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ReadWriteOnly", "b", bus_property_get_bool, offsetof(Mount, read_write_only), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Mount, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("UID", "u", bus_property_get_uid, offsetof(Unit, ref_uid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("GID", "u", bus_property_get_gid, offsetof(Unit, ref_gid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ BUS_EXEC_COMMAND_VTABLE("ExecMount", offsetof(Mount, exec_command[MOUNT_EXEC_MOUNT]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_COMMAND_VTABLE("ExecUnmount", offsetof(Mount, exec_command[MOUNT_EXEC_UNMOUNT]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_COMMAND_VTABLE("ExecRemount", offsetof(Mount, exec_command[MOUNT_EXEC_REMOUNT]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ SD_BUS_VTABLE_END
+};
+
+static int bus_mount_set_transient_property(
+ Mount *m,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ Unit *u = UNIT(m);
+
+ assert(m);
+ assert(name);
+ assert(message);
+
+ flags |= UNIT_PRIVATE;
+
+ if (streq(name, "Where"))
+ return bus_set_transient_path(u, name, &m->where, message, flags, error);
+
+ if (streq(name, "What"))
+ return bus_set_transient_string(u, name, &m->parameters_fragment.what, message, flags, error);
+
+ if (streq(name, "Options"))
+ return bus_set_transient_string(u, name, &m->parameters_fragment.options, message, flags, error);
+
+ if (streq(name, "Type"))
+ return bus_set_transient_string(u, name, &m->parameters_fragment.fstype, message, flags, error);
+
+ if (streq(name, "TimeoutUSec"))
+ return bus_set_transient_usec_fix_0(u, name, &m->timeout_usec, message, flags, error);
+
+ if (streq(name, "DirectoryMode"))
+ return bus_set_transient_mode_t(u, name, &m->directory_mode, message, flags, error);
+
+ if (streq(name, "SloppyOptions"))
+ return bus_set_transient_bool(u, name, &m->sloppy_options, message, flags, error);
+
+ if (streq(name, "LazyUnmount"))
+ return bus_set_transient_bool(u, name, &m->lazy_unmount, message, flags, error);
+
+ if (streq(name, "ForceUnmount"))
+ return bus_set_transient_bool(u, name, &m->force_unmount, message, flags, error);
+
+ if (streq(name, "ReadWriteOnly"))
+ return bus_set_transient_bool(u, name, &m->read_write_only, message, flags, error);
+
+ return 0;
+}
+
+int bus_mount_set_property(
+ Unit *u,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ Mount *m = MOUNT(u);
+ int r;
+
+ assert(m);
+ assert(name);
+ assert(message);
+
+ r = bus_cgroup_set_property(u, &m->cgroup_context, name, message, flags, error);
+ if (r != 0)
+ return r;
+
+ if (u->transient && u->load_state == UNIT_STUB) {
+ /* This is a transient unit, let's load a little more */
+
+ r = bus_mount_set_transient_property(m, name, message, flags, error);
+ if (r != 0)
+ return r;
+
+ r = bus_exec_context_set_transient_property(u, &m->exec_context, name, message, flags, error);
+ if (r != 0)
+ return r;
+
+ r = bus_kill_context_set_transient_property(u, &m->kill_context, name, message, flags, error);
+ if (r != 0)
+ return r;
+ }
+
+ return 0;
+}
+
+int bus_mount_commit_properties(Unit *u) {
+ assert(u);
+
+ unit_realize_cgroup(u);
+
+ return 0;
+}
diff --git a/src/core/dbus-mount.h b/src/core/dbus-mount.h
new file mode 100644
index 0000000..5a848d3
--- /dev/null
+++ b/src/core/dbus-mount.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "unit.h"
+
+extern const sd_bus_vtable bus_mount_vtable[];
+
+int bus_mount_set_property(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_mount_commit_properties(Unit *u);
diff --git a/src/core/dbus-path.c b/src/core/dbus-path.c
new file mode 100644
index 0000000..65fb7d7
--- /dev/null
+++ b/src/core/dbus-path.c
@@ -0,0 +1,165 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "bus-get-properties.h"
+#include "dbus-path.h"
+#include "dbus-util.h"
+#include "list.h"
+#include "path.h"
+#include "path-util.h"
+#include "string-util.h"
+#include "unit.h"
+
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, path_result, PathResult);
+
+static int property_get_paths(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Path *p = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(spec, k, p->specs) {
+ r = sd_bus_message_append(reply, "(ss)", path_type_to_string(k->type), k->path);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+const sd_bus_vtable bus_path_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ SD_BUS_PROPERTY("Unit", "s", bus_property_get_triggered_unit, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Paths", "a(ss)", property_get_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("MakeDirectory", "b", bus_property_get_bool, offsetof(Path, make_directory), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DirectoryMode", "u", bus_property_get_mode, offsetof(Path, directory_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Path, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("TriggerLimitIntervalUSec", "t", bus_property_get_usec, offsetof(Path, trigger_limit.interval), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TriggerLimitBurst", "u", bus_property_get_unsigned, offsetof(Path, trigger_limit.burst), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_VTABLE_END
+};
+
+static int bus_path_set_transient_property(
+ Path *p,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ Unit *u = UNIT(p);
+ int r;
+
+ assert(p);
+ assert(name);
+ assert(message);
+
+ flags |= UNIT_PRIVATE;
+
+ if (streq(name, "MakeDirectory"))
+ return bus_set_transient_bool(u, name, &p->make_directory, message, flags, error);
+
+ if (streq(name, "DirectoryMode"))
+ return bus_set_transient_mode_t(u, name, &p->directory_mode, message, flags, error);
+
+ if (streq(name, "Paths")) {
+ const char *type_name, *path;
+ bool empty = true;
+
+ r = sd_bus_message_enter_container(message, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(ss)", &type_name, &path)) > 0) {
+ PathType t;
+
+ t = path_type_from_string(type_name);
+ if (t < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown path type: %s", type_name);
+
+ if (isempty(path))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path in %s is empty", type_name);
+
+ if (!path_is_absolute(path))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path in %s is not absolute: %s", type_name, path);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *k = NULL;
+ PathSpec *s;
+
+ k = strdup(path);
+ if (!k)
+ return -ENOMEM;
+
+ path_simplify(k);
+
+ s = new0(PathSpec, 1);
+ if (!s)
+ return -ENOMEM;
+
+ s->unit = u;
+ s->path = TAKE_PTR(k);
+ s->type = t;
+ s->inotify_fd = -1;
+
+ LIST_PREPEND(spec, p->specs, s);
+
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s", type_name, path);
+ }
+
+ empty = false;
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags) && empty) {
+ path_free_specs(p);
+ unit_write_settingf(u, flags, name, "PathExists=");
+ }
+
+ return 1;
+ }
+
+ if (streq(name, "TriggerLimitBurst"))
+ return bus_set_transient_unsigned(u, name, &p->trigger_limit.burst, message, flags, error);
+
+ if (streq(name, "TriggerLimitIntervalUSec"))
+ return bus_set_transient_usec(u, name, &p->trigger_limit.interval, message, flags, error);
+
+ return 0;
+}
+
+int bus_path_set_property(
+ Unit *u,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags mode,
+ sd_bus_error *error) {
+
+ Path *p = PATH(u);
+
+ assert(p);
+ assert(name);
+ assert(message);
+
+ if (u->transient && u->load_state == UNIT_STUB)
+ return bus_path_set_transient_property(p, name, message, mode, error);
+
+ return 0;
+}
diff --git a/src/core/dbus-path.h b/src/core/dbus-path.h
new file mode 100644
index 0000000..b5018b0
--- /dev/null
+++ b/src/core/dbus-path.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "unit.h"
+
+extern const sd_bus_vtable bus_path_vtable[];
+
+int bus_path_set_property(Unit *u, const char *name, sd_bus_message *i, UnitWriteFlags flags, sd_bus_error *error);
diff --git a/src/core/dbus-scope.c b/src/core/dbus-scope.c
new file mode 100644
index 0000000..7b07bb8
--- /dev/null
+++ b/src/core/dbus-scope.c
@@ -0,0 +1,274 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "bus-common-errors.h"
+#include "bus-get-properties.h"
+#include "dbus-cgroup.h"
+#include "dbus-kill.h"
+#include "dbus-manager.h"
+#include "dbus-scope.h"
+#include "dbus-unit.h"
+#include "dbus-util.h"
+#include "dbus.h"
+#include "scope.h"
+#include "selinux-access.h"
+#include "unit.h"
+
+int bus_scope_method_abandon(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Scope *s = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_unit_access_check(UNIT(s), message, "stop", error);
+ if (r < 0)
+ return r;
+
+ r = bus_verify_manage_units_async(UNIT(s)->manager, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = scope_abandon(s);
+ if (r == -ESTALE)
+ return sd_bus_error_setf(error, BUS_ERROR_SCOPE_NOT_RUNNING, "Scope %s is not running, cannot abandon.", UNIT(s)->id);
+ if (r < 0)
+ return r;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, scope_result, ScopeResult);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(oom_policy, OOMPolicy, oom_policy_from_string);
+
+const sd_bus_vtable bus_scope_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ SD_BUS_PROPERTY("Controller", "s", NULL, offsetof(Scope, controller), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("TimeoutStopUSec", "t", bus_property_get_usec, offsetof(Scope, timeout_stop_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Scope, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("RuntimeMaxUSec", "t", bus_property_get_usec, offsetof(Scope, runtime_max_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RuntimeRandomizedExtraUSec", "t", bus_property_get_usec, offsetof(Scope, runtime_rand_extra_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("OOMPolicy", "s", bus_property_get_oom_policy, offsetof(Scope, oom_policy), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_SIGNAL("RequestStop", NULL, 0),
+ SD_BUS_METHOD("Abandon", NULL, NULL, bus_scope_method_abandon, SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_VTABLE_END
+};
+
+static int bus_scope_set_transient_property(
+ Scope *s,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ Unit *u = UNIT(s);
+ int r;
+
+ assert(s);
+ assert(name);
+ assert(message);
+
+ flags |= UNIT_PRIVATE;
+
+ if (streq(name, "TimeoutStopUSec"))
+ return bus_set_transient_usec(u, name, &s->timeout_stop_usec, message, flags, error);
+
+ if (streq(name, "RuntimeMaxUSec"))
+ return bus_set_transient_usec(u, name, &s->runtime_max_usec, message, flags, error);
+
+ if (streq(name, "RuntimeRandomizedExtraUSec"))
+ return bus_set_transient_usec(u, name, &s->runtime_rand_extra_usec, message, flags, error);
+
+ if (streq(name, "OOMPolicy"))
+ return bus_set_transient_oom_policy(u, name, &s->oom_policy, message, flags, error);
+
+ if (streq(name, "PIDs")) {
+ _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+ unsigned n = 0;
+
+ r = sd_bus_message_enter_container(message, 'a', "u");
+ if (r < 0)
+ return r;
+
+ for (;;) {
+ uint32_t upid;
+ pid_t pid;
+
+ r = sd_bus_message_read(message, "u", &upid);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ if (upid == 0) {
+ if (!creds) {
+ r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds);
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_bus_creds_get_pid(creds, &pid);
+ if (r < 0)
+ return r;
+ } else
+ pid = (uid_t) upid;
+
+ r = unit_pid_attachable(u, pid, error);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ r = unit_watch_pid(u, pid, false);
+ if (r < 0 && r != -EEXIST)
+ return r;
+ }
+
+ n++;
+ }
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (n <= 0)
+ return -EINVAL;
+
+ return 1;
+
+ } else if (streq(name, "Controller")) {
+ const char *controller;
+
+ /* We can't support direct connections with this, as direct connections know no service or unique name
+ * concept, but the Controller field stores exactly that. */
+ if (sd_bus_message_get_bus(message) != u->manager->api_bus)
+ return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Sorry, Controller= logic only supported via the bus.");
+
+ r = sd_bus_message_read(message, "s", &controller);
+ if (r < 0)
+ return r;
+
+ if (!isempty(controller) && !sd_bus_service_name_is_valid(controller))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Controller '%s' is not a valid bus name.", controller);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ r = free_and_strdup(&s->controller, empty_to_null(controller));
+ if (r < 0)
+ return r;
+ }
+
+ return 1;
+ }
+
+ return 0;
+}
+
+int bus_scope_set_property(
+ Unit *u,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ Scope *s = SCOPE(u);
+ int r;
+
+ assert(s);
+ assert(name);
+ assert(message);
+
+ r = bus_cgroup_set_property(u, &s->cgroup_context, name, message, flags, error);
+ if (r != 0)
+ return r;
+
+ if (u->load_state == UNIT_STUB) {
+ /* While we are created we still accept PIDs */
+
+ r = bus_scope_set_transient_property(s, name, message, flags, error);
+ if (r != 0)
+ return r;
+
+ r = bus_kill_context_set_transient_property(u, &s->kill_context, name, message, flags, error);
+ if (r != 0)
+ return r;
+
+ if (streq(name, "User"))
+ return bus_set_transient_user_relaxed(u, name, &s->user, message, flags, error);
+
+ if (streq(name, "Group"))
+ return bus_set_transient_user_relaxed(u, name, &s->group, message, flags, error);
+ }
+
+ return 0;
+}
+
+int bus_scope_commit_properties(Unit *u) {
+ assert(u);
+
+ unit_realize_cgroup(u);
+
+ return 0;
+}
+
+int bus_scope_send_request_stop(Scope *s) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
+ _cleanup_free_ char *p = NULL;
+ int r;
+
+ assert(s);
+
+ if (!s->controller)
+ return 0;
+
+ p = unit_dbus_path(UNIT(s));
+ if (!p)
+ return -ENOMEM;
+
+ r = sd_bus_message_new_signal(
+ UNIT(s)->manager->api_bus,
+ &m,
+ p,
+ "org.freedesktop.systemd1.Scope",
+ "RequestStop");
+ if (r < 0)
+ return r;
+
+ return sd_bus_send_to(UNIT(s)->manager->api_bus, m, s->controller, NULL);
+}
+
+static int on_controller_gone(sd_bus_track *track, void *userdata) {
+ Scope *s = userdata;
+
+ assert(track);
+
+ if (s->controller) {
+ log_unit_debug(UNIT(s), "Controller %s disappeared from bus.", s->controller);
+ unit_add_to_dbus_queue(UNIT(s));
+ s->controller = mfree(s->controller);
+ }
+
+ s->controller_track = sd_bus_track_unref(s->controller_track);
+
+ return 0;
+}
+
+int bus_scope_track_controller(Scope *s) {
+ int r;
+
+ assert(s);
+
+ if (!s->controller || s->controller_track)
+ return 0;
+
+ r = sd_bus_track_new(UNIT(s)->manager->api_bus, &s->controller_track, on_controller_gone, s);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_track_add_name(s->controller_track, s->controller);
+ if (r < 0) {
+ s->controller_track = sd_bus_track_unref(s->controller_track);
+ return r;
+ }
+
+ return 0;
+}
diff --git a/src/core/dbus-scope.h b/src/core/dbus-scope.h
new file mode 100644
index 0000000..8f1bc02
--- /dev/null
+++ b/src/core/dbus-scope.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "scope.h"
+#include "unit.h"
+
+extern const sd_bus_vtable bus_scope_vtable[];
+
+int bus_scope_set_property(Unit *u, const char *name, sd_bus_message *i, UnitWriteFlags flags, sd_bus_error *error);
+int bus_scope_commit_properties(Unit *u);
+
+int bus_scope_send_request_stop(Scope *s);
+
+int bus_scope_method_abandon(sd_bus_message *message, void *userdata, sd_bus_error *error);
+
+int bus_scope_track_controller(Scope *s);
diff --git a/src/core/dbus-service.c b/src/core/dbus-service.c
new file mode 100644
index 0000000..0fb4f43
--- /dev/null
+++ b/src/core/dbus-service.c
@@ -0,0 +1,604 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fcntl.h>
+
+#include "alloc-util.h"
+#include "async.h"
+#include "bus-get-properties.h"
+#include "dbus-cgroup.h"
+#include "dbus-execute.h"
+#include "dbus-kill.h"
+#include "dbus-manager.h"
+#include "dbus-service.h"
+#include "dbus-util.h"
+#include "execute.h"
+#include "exit-status.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "locale-util.h"
+#include "mount-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "selinux-access.h"
+#include "service.h"
+#include "signal-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit.h"
+
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_type, service_type, ServiceType);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exit_type, service_exit_type, ServiceExitType);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, service_result, ServiceResult);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_restart, service_restart, ServiceRestart);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_notify_access, notify_access, NotifyAccess);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_emergency_action, emergency_action, EmergencyAction);
+static BUS_DEFINE_PROPERTY_GET(property_get_timeout_abort_usec, "t", Service, service_timeout_abort_usec);
+static BUS_DEFINE_PROPERTY_GET(property_get_watchdog_usec, "t", Service, service_get_watchdog_usec);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_timeout_failure_mode, service_timeout_failure_mode, ServiceTimeoutFailureMode);
+
+static int property_get_exit_status_set(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ const ExitStatusSet *status_set = ASSERT_PTR(userdata);
+ unsigned n;
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'r', "aiai");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_open_container(reply, 'a', "i");
+ if (r < 0)
+ return r;
+
+ BITMAP_FOREACH(n, &status_set->status) {
+ assert(n < 256);
+
+ r = sd_bus_message_append_basic(reply, 'i', &n);
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_open_container(reply, 'a', "i");
+ if (r < 0)
+ return r;
+
+ BITMAP_FOREACH(n, &status_set->signal) {
+ const char *str;
+
+ str = signal_to_string(n);
+ if (!str)
+ continue;
+
+ r = sd_bus_message_append_basic(reply, 'i', &n);
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int bus_service_method_mount(sd_bus_message *message, void *userdata, sd_bus_error *error, bool is_image) {
+ _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
+ const char *dest, *src, *propagate_directory;
+ int read_only, make_file_or_directory;
+ Unit *u = ASSERT_PTR(userdata);
+ ExecContext *c;
+ pid_t unit_pid;
+ int r;
+
+ assert(message);
+
+ if (!MANAGER_IS_SYSTEM(u->manager))
+ return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Adding bind mounts at runtime is only supported for system managers.");
+
+ r = mac_selinux_unit_access_check(u, message, "start", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "ssbb", &src, &dest, &read_only, &make_file_or_directory);
+ if (r < 0)
+ return r;
+
+ if (!path_is_absolute(src) || !path_is_normalized(src))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Source path must be absolute and normalized.");
+
+ if (!is_image && isempty(dest))
+ dest = src;
+ else if (!path_is_absolute(dest) || !path_is_normalized(dest))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path must be absolute and normalized.");
+
+ if (is_image) {
+ r = bus_read_mount_options(message, error, &options, NULL, "");
+ if (r < 0)
+ return r;
+ }
+
+ r = bus_verify_manage_units_async_full(
+ u,
+ is_image ? "mount-image" : "bind-mount",
+ CAP_SYS_ADMIN,
+ N_("Authentication is required to mount on '$(unit)'."),
+ true,
+ message,
+ error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ if (u->type != UNIT_SERVICE)
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Unit is not of type .service");
+
+ /* If it would be dropped at startup time, return an error. The context should always be available, but
+ * there's an assert in exec_needs_mount_namespace, so double-check just in case. */
+ c = unit_get_exec_context(u);
+ if (!c)
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot access unit execution context");
+ if (path_startswith_strv(dest, c->inaccessible_paths))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s is not accessible to this unit", dest);
+
+ /* Ensure that the unit was started in a private mount namespace */
+ if (!exec_needs_mount_namespace(c, NULL, unit_get_exec_runtime(u)))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Unit not running in private mount namespace, cannot activate bind mount");
+
+ unit_pid = unit_main_pid(u);
+ if (unit_pid == 0 || !UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u)))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Unit is not running");
+
+ propagate_directory = strjoina("/run/systemd/propagate/", u->id);
+ if (is_image)
+ r = mount_image_in_namespace(unit_pid,
+ propagate_directory,
+ "/run/systemd/incoming/",
+ src, dest, read_only, make_file_or_directory, options);
+ else
+ r = bind_mount_in_namespace(unit_pid,
+ propagate_directory,
+ "/run/systemd/incoming/",
+ src, dest, read_only, make_file_or_directory);
+ if (r < 0)
+ return sd_bus_error_set_errnof(error, r, "Failed to mount %s on %s in unit's namespace: %m", src, dest);
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+int bus_service_method_bind_mount(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return bus_service_method_mount(message, userdata, error, false);
+}
+
+int bus_service_method_mount_image(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return bus_service_method_mount(message, userdata, error, true);
+}
+
+#if __SIZEOF_SIZE_T__ == 8
+static int property_get_size_as_uint32(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ size_t *value = ASSERT_PTR(userdata);
+ uint32_t sz = *value >= UINT32_MAX ? UINT32_MAX : (uint32_t) *value;
+
+ /* Returns a size_t as a D-Bus "u" type, i.e. as 32bit value, even if size_t is 64bit. We'll saturate if it doesn't fit. */
+
+ return sd_bus_message_append_basic(reply, 'u', &sz);
+}
+#elif __SIZEOF_SIZE_T__ == 4
+#define property_get_size_as_uint32 ((sd_bus_property_get_t) NULL)
+#else
+#error "Unexpected size of size_t"
+#endif
+
+const sd_bus_vtable bus_service_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ SD_BUS_PROPERTY("Type", "s", property_get_type, offsetof(Service, type), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ExitType", "s", property_get_exit_type, offsetof(Service, exit_type), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Restart", "s", property_get_restart, offsetof(Service, restart), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PIDFile", "s", NULL, offsetof(Service, pid_file), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("NotifyAccess", "s", property_get_notify_access, offsetof(Service, notify_access), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RestartUSec", "t", bus_property_get_usec, offsetof(Service, restart_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TimeoutStartUSec", "t", bus_property_get_usec, offsetof(Service, timeout_start_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TimeoutStopUSec", "t", bus_property_get_usec, offsetof(Service, timeout_stop_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TimeoutAbortUSec", "t", property_get_timeout_abort_usec, 0, 0),
+ SD_BUS_PROPERTY("TimeoutStartFailureMode", "s", property_get_timeout_failure_mode, offsetof(Service, timeout_start_failure_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TimeoutStopFailureMode", "s", property_get_timeout_failure_mode, offsetof(Service, timeout_stop_failure_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RuntimeMaxUSec", "t", bus_property_get_usec, offsetof(Service, runtime_max_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RuntimeRandomizedExtraUSec", "t", bus_property_get_usec, offsetof(Service, runtime_rand_extra_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("WatchdogUSec", "t", property_get_watchdog_usec, 0, 0),
+ BUS_PROPERTY_DUAL_TIMESTAMP("WatchdogTimestamp", offsetof(Service, watchdog_timestamp), 0),
+ SD_BUS_PROPERTY("PermissionsStartOnly", "b", bus_property_get_bool, offsetof(Service, permissions_start_only), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), /* 😷 deprecated */
+ SD_BUS_PROPERTY("RootDirectoryStartOnly", "b", bus_property_get_bool, offsetof(Service, root_directory_start_only), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RemainAfterExit", "b", bus_property_get_bool, offsetof(Service, remain_after_exit), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("GuessMainPID", "b", bus_property_get_bool, offsetof(Service, guess_main_pid), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RestartPreventExitStatus", "(aiai)", property_get_exit_status_set, offsetof(Service, restart_prevent_status), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RestartForceExitStatus", "(aiai)", property_get_exit_status_set, offsetof(Service, restart_force_status), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SuccessExitStatus", "(aiai)", property_get_exit_status_set, offsetof(Service, success_status), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("MainPID", "u", bus_property_get_pid, offsetof(Service, main_pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("ControlPID", "u", bus_property_get_pid, offsetof(Service, control_pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("BusName", "s", NULL, offsetof(Service, bus_name), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("FileDescriptorStoreMax", "u", bus_property_get_unsigned, offsetof(Service, n_fd_store_max), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("NFileDescriptorStore", "u", property_get_size_as_uint32, offsetof(Service, n_fd_store), 0),
+ SD_BUS_PROPERTY("StatusText", "s", NULL, offsetof(Service, status_text), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("StatusErrno", "i", bus_property_get_int, offsetof(Service, status_errno), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Service, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("ReloadResult", "s", property_get_result, offsetof(Service, reload_result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("CleanResult", "s", property_get_result, offsetof(Service, clean_result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("USBFunctionDescriptors", "s", NULL, offsetof(Service, usb_function_descriptors), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("USBFunctionStrings", "s", NULL, offsetof(Service, usb_function_strings), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("UID", "u", bus_property_get_uid, offsetof(Unit, ref_uid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("GID", "u", bus_property_get_gid, offsetof(Unit, ref_gid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("NRestarts", "u", bus_property_get_unsigned, offsetof(Service, n_restarts), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("OOMPolicy", "s", bus_property_get_oom_policy, offsetof(Service, oom_policy), SD_BUS_VTABLE_PROPERTY_CONST),
+
+ BUS_EXEC_STATUS_VTABLE("ExecMain", offsetof(Service, main_exec_status), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ BUS_EXEC_COMMAND_LIST_VTABLE("ExecCondition", offsetof(Service, exec_command[SERVICE_EXEC_CONDITION]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecConditionEx", offsetof(Service, exec_command[SERVICE_EXEC_CONDITION]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_COMMAND_LIST_VTABLE("ExecStartPre", offsetof(Service, exec_command[SERVICE_EXEC_START_PRE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecStartPreEx", offsetof(Service, exec_command[SERVICE_EXEC_START_PRE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_COMMAND_LIST_VTABLE("ExecStart", offsetof(Service, exec_command[SERVICE_EXEC_START]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecStartEx", offsetof(Service, exec_command[SERVICE_EXEC_START]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_COMMAND_LIST_VTABLE("ExecStartPost", offsetof(Service, exec_command[SERVICE_EXEC_START_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecStartPostEx", offsetof(Service, exec_command[SERVICE_EXEC_START_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_COMMAND_LIST_VTABLE("ExecReload", offsetof(Service, exec_command[SERVICE_EXEC_RELOAD]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecReloadEx", offsetof(Service, exec_command[SERVICE_EXEC_RELOAD]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_COMMAND_LIST_VTABLE("ExecStop", offsetof(Service, exec_command[SERVICE_EXEC_STOP]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecStopEx", offsetof(Service, exec_command[SERVICE_EXEC_STOP]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_COMMAND_LIST_VTABLE("ExecStopPost", offsetof(Service, exec_command[SERVICE_EXEC_STOP_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecStopPostEx", offsetof(Service, exec_command[SERVICE_EXEC_STOP_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+
+ SD_BUS_METHOD_WITH_ARGS("BindMount",
+ SD_BUS_ARGS("s", source, "s", destination, "b", read_only, "b", mkdir),
+ SD_BUS_NO_RESULT,
+ bus_service_method_bind_mount,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+
+ SD_BUS_METHOD_WITH_ARGS("MountImage",
+ SD_BUS_ARGS("s", source, "s", destination, "b", read_only, "b", mkdir, "a(ss)", options),
+ SD_BUS_NO_RESULT,
+ bus_service_method_mount_image,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+
+ /* The following four are obsolete, and thus marked hidden here. They moved into the Unit interface */
+ SD_BUS_PROPERTY("StartLimitInterval", "t", bus_property_get_usec, offsetof(Unit, start_ratelimit.interval), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_PROPERTY("StartLimitBurst", "u", bus_property_get_unsigned, offsetof(Unit, start_ratelimit.burst), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_PROPERTY("StartLimitAction", "s", property_get_emergency_action, offsetof(Unit, start_limit_action), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_PROPERTY("FailureAction", "s", property_get_emergency_action, offsetof(Unit, failure_action), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_PROPERTY("RebootArgument", "s", NULL, offsetof(Unit, reboot_arg), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_VTABLE_END
+};
+
+static int bus_set_transient_exit_status(
+ Unit *u,
+ const char *name,
+ ExitStatusSet *status_set,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ const int32_t *status, *signal;
+ size_t n_status, n_signal, i;
+ int r;
+
+ r = sd_bus_message_enter_container(message, 'r', "aiai");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read_array(message, 'i', (const void **) &status, &n_status);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read_array(message, 'i', (const void **) &signal, &n_signal);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ n_status /= sizeof(int32_t);
+ n_signal /= sizeof(int32_t);
+
+ if (n_status == 0 && n_signal == 0 && !UNIT_WRITE_FLAGS_NOOP(flags)) {
+ exit_status_set_free(status_set);
+ unit_write_settingf(u, flags, name, "%s=", name);
+ return 1;
+ }
+
+ for (i = 0; i < n_status; i++) {
+ if (status[i] < 0 || status[i] > 255)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid status code in %s: %"PRIi32, name, status[i]);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ r = bitmap_set(&status_set->status, status[i]);
+ if (r < 0)
+ return r;
+
+ unit_write_settingf(u, flags, name, "%s=%"PRIi32, name, status[i]);
+ }
+ }
+
+ for (i = 0; i < n_signal; i++) {
+ const char *str;
+
+ str = signal_to_string((int) signal[i]);
+ if (!str)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid signal in %s: %"PRIi32, name, signal[i]);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ r = bitmap_set(&status_set->signal, signal[i]);
+ if (r < 0)
+ return r;
+
+ unit_write_settingf(u, flags, name, "%s=%s", name, str);
+ }
+ }
+
+ return 1;
+}
+
+static int bus_set_transient_std_fd(
+ Unit *u,
+ const char *name,
+ int *p,
+ bool *b,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ int fd, r;
+
+ assert(p);
+ assert(b);
+
+ r = sd_bus_message_read(message, "h", &fd);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ int copy;
+
+ copy = fcntl(fd, F_DUPFD_CLOEXEC, 3);
+ if (copy < 0)
+ return -errno;
+
+ asynchronous_close(*p);
+ *p = copy;
+ *b = true;
+ }
+
+ return 1;
+}
+static BUS_DEFINE_SET_TRANSIENT_PARSE(notify_access, NotifyAccess, notify_access_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(service_type, ServiceType, service_type_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(service_exit_type, ServiceExitType, service_exit_type_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(service_restart, ServiceRestart, service_restart_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(oom_policy, OOMPolicy, oom_policy_from_string);
+static BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(bus_name, sd_bus_service_name_is_valid);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(timeout_failure_mode, ServiceTimeoutFailureMode, service_timeout_failure_mode_from_string);
+
+static int bus_service_set_transient_property(
+ Service *s,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ Unit *u = UNIT(s);
+ ServiceExecCommand ci;
+ int r;
+
+ assert(s);
+ assert(name);
+ assert(message);
+
+ flags |= UNIT_PRIVATE;
+
+ if (streq(name, "PermissionsStartOnly"))
+ return bus_set_transient_bool(u, name, &s->permissions_start_only, message, flags, error);
+
+ if (streq(name, "RootDirectoryStartOnly"))
+ return bus_set_transient_bool(u, name, &s->root_directory_start_only, message, flags, error);
+
+ if (streq(name, "RemainAfterExit"))
+ return bus_set_transient_bool(u, name, &s->remain_after_exit, message, flags, error);
+
+ if (streq(name, "GuessMainPID"))
+ return bus_set_transient_bool(u, name, &s->guess_main_pid, message, flags, error);
+
+ if (streq(name, "Type"))
+ return bus_set_transient_service_type(u, name, &s->type, message, flags, error);
+
+ if (streq(name, "ExitType"))
+ return bus_set_transient_service_exit_type(u, name, &s->exit_type, message, flags, error);
+
+ if (streq(name, "OOMPolicy"))
+ return bus_set_transient_oom_policy(u, name, &s->oom_policy, message, flags, error);
+
+ if (streq(name, "RestartUSec"))
+ return bus_set_transient_usec(u, name, &s->restart_usec, message, flags, error);
+
+ if (streq(name, "TimeoutStartUSec")) {
+ r = bus_set_transient_usec(u, name, &s->timeout_start_usec, message, flags, error);
+ if (r >= 0 && !UNIT_WRITE_FLAGS_NOOP(flags))
+ s->start_timeout_defined = true;
+
+ return r;
+ }
+
+ if (streq(name, "TimeoutStopUSec"))
+ return bus_set_transient_usec(u, name, &s->timeout_stop_usec, message, flags, error);
+
+ if (streq(name, "TimeoutAbortUSec")) {
+ r = bus_set_transient_usec(u, name, &s->timeout_abort_usec, message, flags, error);
+ if (r >= 0 && !UNIT_WRITE_FLAGS_NOOP(flags))
+ s->timeout_abort_set = true;
+ return r;
+ }
+
+ if (streq(name, "TimeoutStartFailureMode"))
+ return bus_set_transient_timeout_failure_mode(u, name, &s->timeout_start_failure_mode, message, flags, error);
+
+ if (streq(name, "TimeoutStopFailureMode"))
+ return bus_set_transient_timeout_failure_mode(u, name, &s->timeout_stop_failure_mode, message, flags, error);
+
+ if (streq(name, "RuntimeMaxUSec"))
+ return bus_set_transient_usec(u, name, &s->runtime_max_usec, message, flags, error);
+
+ if (streq(name, "RuntimeRandomizedExtraUSec"))
+ return bus_set_transient_usec(u, name, &s->runtime_rand_extra_usec, message, flags, error);
+
+ if (streq(name, "WatchdogUSec"))
+ return bus_set_transient_usec(u, name, &s->watchdog_usec, message, flags, error);
+
+ if (streq(name, "FileDescriptorStoreMax"))
+ return bus_set_transient_unsigned(u, name, &s->n_fd_store_max, message, flags, error);
+
+ if (streq(name, "NotifyAccess"))
+ return bus_set_transient_notify_access(u, name, &s->notify_access, message, flags, error);
+
+ if (streq(name, "PIDFile")) {
+ _cleanup_free_ char *n = NULL;
+ const char *v, *e;
+
+ r = sd_bus_message_read(message, "s", &v);
+ if (r < 0)
+ return r;
+
+ if (!isempty(v)) {
+ n = path_make_absolute(v, u->manager->prefix[EXEC_DIRECTORY_RUNTIME]);
+ if (!n)
+ return -ENOMEM;
+
+ path_simplify(n);
+
+ if (!path_is_normalized(n))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "PIDFile= path '%s' is not valid", n);
+
+ e = path_startswith(n, "/var/run/");
+ if (e) {
+ char *z;
+
+ z = path_join("/run", e);
+ if (!z)
+ return log_oom();
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags))
+ log_unit_notice(u, "Transient unit's PIDFile= property references path below legacy directory /var/run, updating %s %s %s; please update client accordingly.",
+ n, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), z);
+
+ free_and_replace(n, z);
+ }
+ }
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ free_and_replace(s->pid_file, n);
+ unit_write_settingf(u, flags, name, "%s=%s", name, strempty(s->pid_file));
+ }
+
+ return 1;
+ }
+
+ if (streq(name, "USBFunctionDescriptors"))
+ return bus_set_transient_path(u, name, &s->usb_function_descriptors, message, flags, error);
+
+ if (streq(name, "USBFunctionStrings"))
+ return bus_set_transient_path(u, name, &s->usb_function_strings, message, flags, error);
+
+ if (streq(name, "BusName"))
+ return bus_set_transient_bus_name(u, name, &s->bus_name, message, flags, error);
+
+ if (streq(name, "Restart"))
+ return bus_set_transient_service_restart(u, name, &s->restart, message, flags, error);
+
+ if (streq(name, "RestartPreventExitStatus"))
+ return bus_set_transient_exit_status(u, name, &s->restart_prevent_status, message, flags, error);
+
+ if (streq(name, "RestartForceExitStatus"))
+ return bus_set_transient_exit_status(u, name, &s->restart_force_status, message, flags, error);
+
+ if (streq(name, "SuccessExitStatus"))
+ return bus_set_transient_exit_status(u, name, &s->success_status, message, flags, error);
+
+ ci = service_exec_command_from_string(name);
+ ci = (ci >= 0) ? ci : service_exec_ex_command_from_string(name);
+ if (ci >= 0)
+ return bus_set_transient_exec_command(u, name, &s->exec_command[ci], message, flags, error);
+
+ if (streq(name, "StandardInputFileDescriptor"))
+ return bus_set_transient_std_fd(u, name, &s->stdin_fd, &s->exec_context.stdio_as_fds, message, flags, error);
+
+ if (streq(name, "StandardOutputFileDescriptor"))
+ return bus_set_transient_std_fd(u, name, &s->stdout_fd, &s->exec_context.stdio_as_fds, message, flags, error);
+
+ if (streq(name, "StandardErrorFileDescriptor"))
+ return bus_set_transient_std_fd(u, name, &s->stderr_fd, &s->exec_context.stdio_as_fds, message, flags, error);
+
+ return 0;
+}
+
+int bus_service_set_property(
+ Unit *u,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ Service *s = SERVICE(u);
+ int r;
+
+ assert(s);
+ assert(name);
+ assert(message);
+
+ r = bus_cgroup_set_property(u, &s->cgroup_context, name, message, flags, error);
+ if (r != 0)
+ return r;
+
+ if (u->transient && u->load_state == UNIT_STUB) {
+ /* This is a transient unit, let's load a little more */
+
+ r = bus_service_set_transient_property(s, name, message, flags, error);
+ if (r != 0)
+ return r;
+
+ r = bus_exec_context_set_transient_property(u, &s->exec_context, name, message, flags, error);
+ if (r != 0)
+ return r;
+
+ r = bus_kill_context_set_transient_property(u, &s->kill_context, name, message, flags, error);
+ if (r != 0)
+ return r;
+ }
+
+ return 0;
+}
+
+int bus_service_commit_properties(Unit *u) {
+ assert(u);
+
+ unit_realize_cgroup(u);
+
+ return 0;
+}
diff --git a/src/core/dbus-service.h b/src/core/dbus-service.h
new file mode 100644
index 0000000..9a05465
--- /dev/null
+++ b/src/core/dbus-service.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "unit.h"
+
+extern const sd_bus_vtable bus_service_vtable[];
+
+int bus_service_set_property(Unit *u, const char *name, sd_bus_message *i, UnitWriteFlags flags, sd_bus_error *error);
+int bus_service_method_bind_mount(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_service_method_mount_image(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_service_commit_properties(Unit *u);
diff --git a/src/core/dbus-slice.c b/src/core/dbus-slice.c
new file mode 100644
index 0000000..de41d65
--- /dev/null
+++ b/src/core/dbus-slice.c
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "dbus-cgroup.h"
+#include "dbus-slice.h"
+#include "slice.h"
+#include "unit.h"
+
+const sd_bus_vtable bus_slice_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ SD_BUS_VTABLE_END
+};
+
+int bus_slice_set_property(
+ Unit *u,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ Slice *s = SLICE(u);
+
+ assert(name);
+ assert(u);
+
+ return bus_cgroup_set_property(u, &s->cgroup_context, name, message, flags, error);
+}
+
+int bus_slice_commit_properties(Unit *u) {
+ assert(u);
+
+ unit_realize_cgroup(u);
+
+ return 0;
+}
diff --git a/src/core/dbus-slice.h b/src/core/dbus-slice.h
new file mode 100644
index 0000000..eb71916
--- /dev/null
+++ b/src/core/dbus-slice.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "unit.h"
+
+extern const sd_bus_vtable bus_slice_vtable[];
+
+int bus_slice_set_property(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_slice_commit_properties(Unit *u);
diff --git a/src/core/dbus-socket.c b/src/core/dbus-socket.c
new file mode 100644
index 0000000..536483f
--- /dev/null
+++ b/src/core/dbus-socket.c
@@ -0,0 +1,482 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "bus-get-properties.h"
+#include "dbus-cgroup.h"
+#include "dbus-execute.h"
+#include "dbus-kill.h"
+#include "dbus-socket.h"
+#include "dbus-util.h"
+#include "fd-util.h"
+#include "ip-protocol-list.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "socket.h"
+#include "socket-netlink.h"
+#include "socket-util.h"
+#include "string-util.h"
+#include "unit.h"
+
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, socket_result, SocketResult);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_bind_ipv6_only, socket_address_bind_ipv6_only, SocketAddressBindIPv6Only);
+static BUS_DEFINE_PROPERTY_GET(property_get_fdname, "s", Socket, socket_fdname);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_timestamping, socket_timestamping, SocketTimestamping);
+
+static int property_get_listen(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Socket *s = SOCKET(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+ assert(s);
+
+ r = sd_bus_message_open_container(reply, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(port, p, s->ports) {
+ _cleanup_free_ char *address = NULL;
+ const char *a;
+
+ switch (p->type) {
+ case SOCKET_SOCKET: {
+ r = socket_address_print(&p->address, &address);
+ if (r)
+ return r;
+
+ a = address;
+ break;
+ }
+
+ case SOCKET_SPECIAL:
+ case SOCKET_MQUEUE:
+ case SOCKET_FIFO:
+ case SOCKET_USB_FUNCTION:
+ a = p->path;
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ r = sd_bus_message_append(reply, "(ss)", socket_port_type_to_string(p), a);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+const sd_bus_vtable bus_socket_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ SD_BUS_PROPERTY("BindIPv6Only", "s", property_get_bind_ipv6_only, offsetof(Socket, bind_ipv6_only), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Backlog", "u", bus_property_get_unsigned, offsetof(Socket, backlog), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TimeoutUSec", "t", bus_property_get_usec, offsetof(Socket, timeout_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("BindToDevice", "s", NULL, offsetof(Socket, bind_to_device), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SocketUser", "s", NULL, offsetof(Socket, user), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SocketGroup", "s", NULL, offsetof(Socket, group), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SocketMode", "u", bus_property_get_mode, offsetof(Socket, socket_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DirectoryMode", "u", bus_property_get_mode, offsetof(Socket, directory_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Accept", "b", bus_property_get_bool, offsetof(Socket, accept), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("FlushPending", "b", bus_property_get_bool, offsetof(Socket, flush_pending), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Writable", "b", bus_property_get_bool, offsetof(Socket, writable), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("KeepAlive", "b", bus_property_get_bool, offsetof(Socket, keep_alive), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("KeepAliveTimeUSec", "t", bus_property_get_usec, offsetof(Socket, keep_alive_time), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("KeepAliveIntervalUSec", "t", bus_property_get_usec, offsetof(Socket, keep_alive_interval), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("KeepAliveProbes", "u", bus_property_get_unsigned, offsetof(Socket, keep_alive_cnt), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DeferAcceptUSec" , "t", bus_property_get_usec, offsetof(Socket, defer_accept), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("NoDelay", "b", bus_property_get_bool, offsetof(Socket, no_delay), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Priority", "i", bus_property_get_int, offsetof(Socket, priority), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ReceiveBuffer", "t", bus_property_get_size, offsetof(Socket, receive_buffer), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SendBuffer", "t", bus_property_get_size, offsetof(Socket, send_buffer), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("IPTOS", "i", bus_property_get_int, offsetof(Socket, ip_tos), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("IPTTL", "i", bus_property_get_int, offsetof(Socket, ip_ttl), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PipeSize", "t", bus_property_get_size, offsetof(Socket, pipe_size), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("FreeBind", "b", bus_property_get_bool, offsetof(Socket, free_bind), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Transparent", "b", bus_property_get_bool, offsetof(Socket, transparent), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Broadcast", "b", bus_property_get_bool, offsetof(Socket, broadcast), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PassCredentials", "b", bus_property_get_bool, offsetof(Socket, pass_cred), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PassSecurity", "b", bus_property_get_bool, offsetof(Socket, pass_sec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PassPacketInfo", "b", bus_property_get_bool, offsetof(Socket, pass_pktinfo), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Timestamping", "s", property_get_timestamping, offsetof(Socket, timestamping), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RemoveOnStop", "b", bus_property_get_bool, offsetof(Socket, remove_on_stop), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Listen", "a(ss)", property_get_listen, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Symlinks", "as", NULL, offsetof(Socket, symlinks), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Mark", "i", bus_property_get_int, offsetof(Socket, mark), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("MaxConnections", "u", bus_property_get_unsigned, offsetof(Socket, max_connections), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("MaxConnectionsPerSource", "u", bus_property_get_unsigned, offsetof(Socket, max_connections_per_source), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("MessageQueueMaxMessages", "x", bus_property_get_long, offsetof(Socket, mq_maxmsg), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("MessageQueueMessageSize", "x", bus_property_get_long, offsetof(Socket, mq_msgsize), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TCPCongestion", "s", NULL, offsetof(Socket, tcp_congestion), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ReusePort", "b", bus_property_get_bool, offsetof(Socket, reuse_port), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SmackLabel", "s", NULL, offsetof(Socket, smack), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SmackLabelIPIn", "s", NULL, offsetof(Socket, smack_ip_in), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SmackLabelIPOut", "s", NULL, offsetof(Socket, smack_ip_out), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ControlPID", "u", bus_property_get_pid, offsetof(Socket, control_pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Socket, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("NConnections", "u", bus_property_get_unsigned, offsetof(Socket, n_connections), 0),
+ SD_BUS_PROPERTY("NAccepted", "u", bus_property_get_unsigned, offsetof(Socket, n_accepted), 0),
+ SD_BUS_PROPERTY("NRefused", "u", bus_property_get_unsigned, offsetof(Socket, n_refused), 0),
+ SD_BUS_PROPERTY("FileDescriptorName", "s", property_get_fdname, 0, 0),
+ SD_BUS_PROPERTY("SocketProtocol", "i", bus_property_get_int, offsetof(Socket, socket_protocol), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TriggerLimitIntervalUSec", "t", bus_property_get_usec, offsetof(Socket, trigger_limit.interval), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TriggerLimitBurst", "u", bus_property_get_unsigned, offsetof(Socket, trigger_limit.burst), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("UID", "u", bus_property_get_uid, offsetof(Unit, ref_uid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("GID", "u", bus_property_get_gid, offsetof(Unit, ref_gid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ BUS_EXEC_COMMAND_LIST_VTABLE("ExecStartPre", offsetof(Socket, exec_command[SOCKET_EXEC_START_PRE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_COMMAND_LIST_VTABLE("ExecStartPost", offsetof(Socket, exec_command[SOCKET_EXEC_START_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_COMMAND_LIST_VTABLE("ExecStopPre", offsetof(Socket, exec_command[SOCKET_EXEC_STOP_PRE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_COMMAND_LIST_VTABLE("ExecStopPost", offsetof(Socket, exec_command[SOCKET_EXEC_STOP_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ SD_BUS_VTABLE_END
+};
+
+static bool check_size_t_truncation(uint64_t t) {
+ return (size_t) t == t;
+}
+
+static const char* socket_protocol_to_string(int32_t i) {
+ if (i == IPPROTO_IP)
+ return "";
+
+ if (!IN_SET(i, IPPROTO_UDPLITE, IPPROTO_SCTP))
+ return NULL;
+
+ return ip_protocol_to_name(i);
+}
+
+static BUS_DEFINE_SET_TRANSIENT(int, "i", int32_t, int, "%" PRIi32);
+static BUS_DEFINE_SET_TRANSIENT(message_queue, "x", int64_t, long, "%" PRIi64);
+static BUS_DEFINE_SET_TRANSIENT_IS_VALID(size_t_check_truncation, "t", uint64_t, size_t, "%" PRIu64, check_size_t_truncation);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(bind_ipv6_only, SocketAddressBindIPv6Only, socket_address_bind_ipv6_only_or_bool_from_string);
+static BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(fdname, fdname_is_valid);
+static BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(ifname, ifname_valid);
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(ip_tos, "i", int32_t, int, "%" PRIi32, ip_tos_to_string_alloc);
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING(socket_protocol, "i", int32_t, int, "%" PRIi32, socket_protocol_to_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(socket_timestamping, SocketTimestamping, socket_timestamping_from_string_harder);
+
+static int bus_socket_set_transient_property(
+ Socket *s,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ SocketExecCommand ci;
+ Unit *u = UNIT(s);
+ int r;
+
+ assert(s);
+ assert(name);
+ assert(message);
+
+ flags |= UNIT_PRIVATE;
+
+ if (streq(name, "Accept"))
+ return bus_set_transient_bool(u, name, &s->accept, message, flags, error);
+
+ if (streq(name, "FlushPending"))
+ return bus_set_transient_bool(u, name, &s->flush_pending, message, flags, error);
+
+ if (streq(name, "Writable"))
+ return bus_set_transient_bool(u, name, &s->writable, message, flags, error);
+
+ if (streq(name, "KeepAlive"))
+ return bus_set_transient_bool(u, name, &s->keep_alive, message, flags, error);
+
+ if (streq(name, "NoDelay"))
+ return bus_set_transient_bool(u, name, &s->no_delay, message, flags, error);
+
+ if (streq(name, "FreeBind"))
+ return bus_set_transient_bool(u, name, &s->free_bind, message, flags, error);
+
+ if (streq(name, "Transparent"))
+ return bus_set_transient_bool(u, name, &s->transparent, message, flags, error);
+
+ if (streq(name, "Broadcast"))
+ return bus_set_transient_bool(u, name, &s->broadcast, message, flags, error);
+
+ if (streq(name, "PassCredentials"))
+ return bus_set_transient_bool(u, name, &s->pass_cred, message, flags, error);
+
+ if (streq(name, "PassSecurity"))
+ return bus_set_transient_bool(u, name, &s->pass_sec, message, flags, error);
+
+ if (streq(name, "PassPacketInfo"))
+ return bus_set_transient_bool(u, name, &s->pass_pktinfo, message, flags, error);
+
+ if (streq(name, "Timestamping"))
+ return bus_set_transient_socket_timestamping(u, name, &s->timestamping, message, flags, error);
+
+ if (streq(name, "ReusePort"))
+ return bus_set_transient_bool(u, name, &s->reuse_port, message, flags, error);
+
+ if (streq(name, "RemoveOnStop"))
+ return bus_set_transient_bool(u, name, &s->remove_on_stop, message, flags, error);
+
+ if (streq(name, "SELinuxContextFromNet"))
+ return bus_set_transient_bool(u, name, &s->selinux_context_from_net, message, flags, error);
+
+ if (streq(name, "Priority"))
+ return bus_set_transient_int(u, name, &s->priority, message, flags, error);
+
+ if (streq(name, "IPTTL"))
+ return bus_set_transient_int(u, name, &s->ip_ttl, message, flags, error);
+
+ if (streq(name, "Mark"))
+ return bus_set_transient_int(u, name, &s->mark, message, flags, error);
+
+ if (streq(name, "Backlog"))
+ return bus_set_transient_unsigned(u, name, &s->backlog, message, flags, error);
+
+ if (streq(name, "MaxConnections"))
+ return bus_set_transient_unsigned(u, name, &s->max_connections, message, flags, error);
+
+ if (streq(name, "MaxConnectionsPerSource"))
+ return bus_set_transient_unsigned(u, name, &s->max_connections_per_source, message, flags, error);
+
+ if (streq(name, "KeepAliveProbes"))
+ return bus_set_transient_unsigned(u, name, &s->keep_alive_cnt, message, flags, error);
+
+ if (streq(name, "TriggerLimitBurst"))
+ return bus_set_transient_unsigned(u, name, &s->trigger_limit.burst, message, flags, error);
+
+ if (streq(name, "SocketMode"))
+ return bus_set_transient_mode_t(u, name, &s->socket_mode, message, flags, error);
+
+ if (streq(name, "DirectoryMode"))
+ return bus_set_transient_mode_t(u, name, &s->directory_mode, message, flags, error);
+
+ if (streq(name, "MessageQueueMaxMessages"))
+ return bus_set_transient_message_queue(u, name, &s->mq_maxmsg, message, flags, error);
+
+ if (streq(name, "MessageQueueMessageSize"))
+ return bus_set_transient_message_queue(u, name, &s->mq_msgsize, message, flags, error);
+
+ if (streq(name, "TimeoutUSec"))
+ return bus_set_transient_usec_fix_0(u, name, &s->timeout_usec, message, flags, error);
+
+ if (streq(name, "KeepAliveTimeUSec"))
+ return bus_set_transient_usec(u, name, &s->keep_alive_time, message, flags, error);
+
+ if (streq(name, "KeepAliveIntervalUSec"))
+ return bus_set_transient_usec(u, name, &s->keep_alive_interval, message, flags, error);
+
+ if (streq(name, "DeferAcceptUSec"))
+ return bus_set_transient_usec(u, name, &s->defer_accept, message, flags, error);
+
+ if (streq(name, "TriggerLimitIntervalUSec"))
+ return bus_set_transient_usec(u, name, &s->trigger_limit.interval, message, flags, error);
+
+ if (streq(name, "SmackLabel"))
+ return bus_set_transient_string(u, name, &s->smack, message, flags, error);
+
+ if (streq(name, "SmackLabelIPin"))
+ return bus_set_transient_string(u, name, &s->smack_ip_in, message, flags, error);
+
+ if (streq(name, "SmackLabelIPOut"))
+ return bus_set_transient_string(u, name, &s->smack_ip_out, message, flags, error);
+
+ if (streq(name, "TCPCongestion"))
+ return bus_set_transient_string(u, name, &s->tcp_congestion, message, flags, error);
+
+ if (streq(name, "FileDescriptorName"))
+ return bus_set_transient_fdname(u, name, &s->fdname, message, flags, error);
+
+ if (streq(name, "SocketUser"))
+ return bus_set_transient_user_relaxed(u, name, &s->user, message, flags, error);
+
+ if (streq(name, "SocketGroup"))
+ return bus_set_transient_user_relaxed(u, name, &s->group, message, flags, error);
+
+ if (streq(name, "BindIPv6Only"))
+ return bus_set_transient_bind_ipv6_only(u, name, &s->bind_ipv6_only, message, flags, error);
+
+ if (streq(name, "ReceiveBuffer"))
+ return bus_set_transient_size_t_check_truncation(u, name, &s->receive_buffer, message, flags, error);
+
+ if (streq(name, "SendBuffer"))
+ return bus_set_transient_size_t_check_truncation(u, name, &s->send_buffer, message, flags, error);
+
+ if (streq(name, "PipeSize"))
+ return bus_set_transient_size_t_check_truncation(u, name, &s->pipe_size, message, flags, error);
+
+ if (streq(name, "BindToDevice"))
+ return bus_set_transient_ifname(u, name, &s->bind_to_device, message, flags, error);
+
+ if (streq(name, "IPTOS"))
+ return bus_set_transient_ip_tos(u, name, &s->ip_tos, message, flags, error);
+
+ if (streq(name, "SocketProtocol"))
+ return bus_set_transient_socket_protocol(u, name, &s->socket_protocol, message, flags, error);
+
+ ci = socket_exec_command_from_string(name);
+ if (ci >= 0)
+ return bus_set_transient_exec_command(u, name,
+ &s->exec_command[ci],
+ message, flags, error);
+
+ if (streq(name, "Symlinks")) {
+ _cleanup_strv_free_ char **l = NULL;
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ STRV_FOREACH(p, l)
+ if (!path_is_absolute(*p))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Symlink path is not absolute: %s", *p);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (strv_isempty(l)) {
+ s->symlinks = strv_free(s->symlinks);
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=", name);
+ } else {
+ _cleanup_free_ char *joined = NULL;
+
+ r = strv_extend_strv(&s->symlinks, l, true);
+ if (r < 0)
+ return -ENOMEM;
+
+ joined = strv_join(l, " ");
+ if (!joined)
+ return -ENOMEM;
+
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s", name, joined);
+ }
+ }
+
+ return 1;
+
+ } else if (streq(name, "Listen")) {
+ const char *t, *a;
+ bool empty = true;
+
+ r = sd_bus_message_enter_container(message, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(ss)", &t, &a)) > 0) {
+ _cleanup_(socket_port_freep) SocketPort *p = NULL;
+
+ p = new(SocketPort, 1);
+ if (!p)
+ return log_oom();
+
+ *p = (SocketPort) {
+ .fd = -1,
+ .socket = s,
+ };
+
+ p->type = socket_port_type_from_string(t);
+ if (p->type < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown Socket type: %s", t);
+
+ if (p->type != SOCKET_SOCKET) {
+ if (!path_is_absolute(a) || !path_is_valid(a))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid socket path: %s", a);
+
+ p->path = strdup(a);
+ if (!p->path)
+ return log_oom();
+
+ path_simplify(p->path);
+
+ } else if (streq(t, "Netlink")) {
+ r = socket_address_parse_netlink(&p->address, a);
+ if (r < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid netlink address: %s", a);
+
+ } else {
+ r = socket_address_parse(&p->address, a);
+ if (r < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid address: %s", a);
+
+ p->address.type = socket_address_type_from_string(t);
+ if (p->address.type < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid address type: %s", t);
+
+ if (socket_address_family(&p->address) != AF_UNIX && p->address.type == SOCK_SEQPACKET)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Address family not supported: %s", a);
+ }
+
+ empty = false;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ LIST_APPEND(port, s->ports, TAKE_PTR(p));
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "Listen%s=%s", t, a);
+ }
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags) && empty) {
+ socket_free_ports(s);
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "ListenStream=");
+ }
+
+ return 1;
+ }
+
+ return 0;
+}
+
+int bus_socket_set_property(
+ Unit *u,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ Socket *s = SOCKET(u);
+ int r;
+
+ assert(s);
+ assert(name);
+ assert(message);
+
+ assert(s);
+ assert(name);
+ assert(message);
+
+ r = bus_cgroup_set_property(u, &s->cgroup_context, name, message, flags, error);
+ if (r != 0)
+ return r;
+
+ if (u->transient && u->load_state == UNIT_STUB) {
+ /* This is a transient unit, let's load a little more */
+
+ r = bus_socket_set_transient_property(s, name, message, flags, error);
+ if (r != 0)
+ return r;
+
+ r = bus_exec_context_set_transient_property(u, &s->exec_context, name, message, flags, error);
+ if (r != 0)
+ return r;
+
+ r = bus_kill_context_set_transient_property(u, &s->kill_context, name, message, flags, error);
+ if (r != 0)
+ return r;
+ }
+
+ return 0;
+}
+
+int bus_socket_commit_properties(Unit *u) {
+ assert(u);
+
+ unit_realize_cgroup(u);
+
+ return 0;
+}
diff --git a/src/core/dbus-socket.h b/src/core/dbus-socket.h
new file mode 100644
index 0000000..f9f36a2
--- /dev/null
+++ b/src/core/dbus-socket.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "unit.h"
+
+extern const sd_bus_vtable bus_socket_vtable[];
+
+int bus_socket_set_property(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_socket_commit_properties(Unit *u);
diff --git a/src/core/dbus-swap.c b/src/core/dbus-swap.c
new file mode 100644
index 0000000..0fa8dd1
--- /dev/null
+++ b/src/core/dbus-swap.c
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+/***
+ Copyright © 2010 Maarten Lankhorst
+***/
+
+#include "bus-get-properties.h"
+#include "dbus-cgroup.h"
+#include "dbus-execute.h"
+#include "dbus-swap.h"
+#include "string-util.h"
+#include "swap.h"
+#include "unit.h"
+
+static int swap_get_priority(Swap *s) {
+ assert(s);
+
+ if (s->from_proc_swaps && s->parameters_proc_swaps.priority_set)
+ return s->parameters_proc_swaps.priority;
+
+ if (s->from_fragment && s->parameters_fragment.priority_set)
+ return s->parameters_fragment.priority;
+
+ return -1;
+}
+
+static const char *swap_get_options(Swap *s) {
+ assert(s);
+
+ if (s->from_fragment)
+ return s->parameters_fragment.options;
+
+ return NULL;
+}
+
+static BUS_DEFINE_PROPERTY_GET(property_get_priority, "i", Swap, swap_get_priority);
+static BUS_DEFINE_PROPERTY_GET(property_get_options, "s", Swap, swap_get_options);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, swap_result, SwapResult);
+
+const sd_bus_vtable bus_swap_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ SD_BUS_PROPERTY("What", "s", NULL, offsetof(Swap, what), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("Priority", "i", property_get_priority, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("Options", "s", property_get_options, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("TimeoutUSec", "t", bus_property_get_usec, offsetof(Swap, timeout_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ControlPID", "u", bus_property_get_pid, offsetof(Swap, control_pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Swap, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("UID", "u", bus_property_get_uid, offsetof(Unit, ref_uid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("GID", "u", bus_property_get_gid, offsetof(Unit, ref_gid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ BUS_EXEC_COMMAND_VTABLE("ExecActivate", offsetof(Swap, exec_command[SWAP_EXEC_ACTIVATE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_COMMAND_VTABLE("ExecDeactivate", offsetof(Swap, exec_command[SWAP_EXEC_DEACTIVATE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ SD_BUS_VTABLE_END
+};
+
+int bus_swap_set_property(
+ Unit *u,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ Swap *s = SWAP(u);
+
+ assert(s);
+ assert(name);
+ assert(message);
+
+ return bus_cgroup_set_property(u, &s->cgroup_context, name, message, flags, error);
+}
+
+int bus_swap_commit_properties(Unit *u) {
+ assert(u);
+
+ unit_realize_cgroup(u);
+
+ return 0;
+}
diff --git a/src/core/dbus-swap.h b/src/core/dbus-swap.h
new file mode 100644
index 0000000..9d651b5
--- /dev/null
+++ b/src/core/dbus-swap.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+/***
+ Copyright © 2010 Maarten Lankhorst
+***/
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "unit.h"
+
+extern const sd_bus_vtable bus_swap_vtable[];
+
+int bus_swap_set_property(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_swap_commit_properties(Unit *u);
diff --git a/src/core/dbus-target.c b/src/core/dbus-target.c
new file mode 100644
index 0000000..e979fb7
--- /dev/null
+++ b/src/core/dbus-target.c
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "dbus-target.h"
+#include "unit.h"
+
+const sd_bus_vtable bus_target_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ SD_BUS_VTABLE_END
+};
diff --git a/src/core/dbus-target.h b/src/core/dbus-target.h
new file mode 100644
index 0000000..fedd4a9
--- /dev/null
+++ b/src/core/dbus-target.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus-vtable.h"
+
+extern const sd_bus_vtable bus_target_vtable[];
diff --git a/src/core/dbus-timer.c b/src/core/dbus-timer.c
new file mode 100644
index 0000000..80dd40a
--- /dev/null
+++ b/src/core/dbus-timer.c
@@ -0,0 +1,376 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "bus-get-properties.h"
+#include "dbus-timer.h"
+#include "dbus-util.h"
+#include "strv.h"
+#include "timer.h"
+#include "unit.h"
+
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, timer_result, TimerResult);
+
+static int property_get_monotonic_timers(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Timer *t = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(stt)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(value, v, t->values) {
+ _cleanup_free_ char *buf = NULL;
+ const char *s;
+ size_t l;
+
+ if (v->base == TIMER_CALENDAR)
+ continue;
+
+ s = timer_base_to_string(v->base);
+ assert(endswith(s, "Sec"));
+
+ /* s/Sec/USec/ */
+ l = strlen(s);
+ buf = new(char, l+2);
+ if (!buf)
+ return -ENOMEM;
+
+ memcpy(buf, s, l-3);
+ memcpy(buf+l-3, "USec", 5);
+
+ r = sd_bus_message_append(reply, "(stt)", buf, v->value, v->next_elapse);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_calendar_timers(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Timer *t = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(sst)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(value, v, t->values) {
+ _cleanup_free_ char *buf = NULL;
+
+ if (v->base != TIMER_CALENDAR)
+ continue;
+
+ r = calendar_spec_to_string(v->calendar_spec, &buf);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(reply, "(sst)", timer_base_to_string(v->base), buf, v->next_elapse);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_next_elapse_monotonic(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Timer *t = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ return sd_bus_message_append(reply, "t",
+ (uint64_t) usec_shift_clock(t->next_elapse_monotonic_or_boottime,
+ TIMER_MONOTONIC_CLOCK(t), CLOCK_MONOTONIC));
+}
+
+const sd_bus_vtable bus_timer_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ SD_BUS_PROPERTY("Unit", "s", bus_property_get_triggered_unit, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TimersMonotonic", "a(stt)", property_get_monotonic_timers, 0, SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ SD_BUS_PROPERTY("TimersCalendar", "a(sst)", property_get_calendar_timers, 0, SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ SD_BUS_PROPERTY("OnClockChange", "b", bus_property_get_bool, offsetof(Timer, on_clock_change), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("OnTimezoneChange", "b", bus_property_get_bool, offsetof(Timer, on_timezone_change), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("NextElapseUSecRealtime", "t", bus_property_get_usec, offsetof(Timer, next_elapse_realtime), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("NextElapseUSecMonotonic", "t", property_get_next_elapse_monotonic, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ BUS_PROPERTY_DUAL_TIMESTAMP("LastTriggerUSec", offsetof(Timer, last_trigger), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Timer, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("AccuracyUSec", "t", bus_property_get_usec, offsetof(Timer, accuracy_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RandomizedDelayUSec", "t", bus_property_get_usec, offsetof(Timer, random_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("FixedRandomDelay", "b", bus_property_get_bool, offsetof(Timer, fixed_random_delay), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Persistent", "b", bus_property_get_bool, offsetof(Timer, persistent), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("WakeSystem", "b", bus_property_get_bool, offsetof(Timer, wake_system), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RemainAfterElapse", "b", bus_property_get_bool, offsetof(Timer, remain_after_elapse), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_VTABLE_END
+};
+
+static int timer_add_one_monotonic_spec(
+ Timer *t,
+ const char *name,
+ TimerBase base,
+ UnitWriteFlags flags,
+ usec_t usec,
+ sd_bus_error *error) {
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ TimerValue *v;
+
+ unit_write_settingf(UNIT(t), flags|UNIT_ESCAPE_SPECIFIERS, name,
+ "%s=%s",
+ timer_base_to_string(base),
+ FORMAT_TIMESPAN(usec, USEC_PER_MSEC));
+
+ v = new(TimerValue, 1);
+ if (!v)
+ return -ENOMEM;
+
+ *v = (TimerValue) {
+ .base = base,
+ .value = usec,
+ };
+
+ LIST_PREPEND(value, t->values, v);
+ }
+
+ return 1;
+}
+
+static int timer_add_one_calendar_spec(
+ Timer *t,
+ const char *name,
+ TimerBase base,
+ UnitWriteFlags flags,
+ const char *str,
+ sd_bus_error *error) {
+
+ _cleanup_(calendar_spec_freep) CalendarSpec *c = NULL;
+ int r;
+
+ r = calendar_spec_from_string(str, &c);
+ if (r == -EINVAL)
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid calendar spec");
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ unit_write_settingf(UNIT(t), flags|UNIT_ESCAPE_SPECIFIERS, name,
+ "%s=%s", timer_base_to_string(base), str);
+
+ TimerValue *v = new(TimerValue, 1);
+ if (!v)
+ return -ENOMEM;
+
+ *v = (TimerValue) {
+ .base = base,
+ .calendar_spec = TAKE_PTR(c),
+ };
+
+ LIST_PREPEND(value, t->values, v);
+ }
+
+ return 1;
+};
+
+static int bus_timer_set_transient_property(
+ Timer *t,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ Unit *u = UNIT(t);
+ int r;
+
+ assert(t);
+ assert(name);
+ assert(message);
+
+ flags |= UNIT_PRIVATE;
+
+ if (streq(name, "AccuracyUSec"))
+ return bus_set_transient_usec(u, name, &t->accuracy_usec, message, flags, error);
+
+ if (streq(name, "AccuracySec")) {
+ log_notice("Client is using obsolete AccuracySec= transient property, please use AccuracyUSec= instead.");
+ return bus_set_transient_usec(u, "AccuracyUSec", &t->accuracy_usec, message, flags, error);
+ }
+
+ if (streq(name, "RandomizedDelayUSec"))
+ return bus_set_transient_usec(u, name, &t->random_usec, message, flags, error);
+
+ if (streq(name, "FixedRandomDelay"))
+ return bus_set_transient_bool(u, name, &t->fixed_random_delay, message, flags, error);
+
+ if (streq(name, "WakeSystem"))
+ return bus_set_transient_bool(u, name, &t->wake_system, message, flags, error);
+
+ if (streq(name, "Persistent"))
+ return bus_set_transient_bool(u, name, &t->persistent, message, flags, error);
+
+ if (streq(name, "RemainAfterElapse"))
+ return bus_set_transient_bool(u, name, &t->remain_after_elapse, message, flags, error);
+
+ if (streq(name, "OnTimezoneChange"))
+ return bus_set_transient_bool(u, name, &t->on_timezone_change, message, flags, error);
+
+ if (streq(name, "OnClockChange"))
+ return bus_set_transient_bool(u, name, &t->on_clock_change, message, flags, error);
+
+ if (streq(name, "TimersMonotonic")) {
+ const char *base_name;
+ usec_t usec;
+ bool empty = true;
+
+ r = sd_bus_message_enter_container(message, 'a', "(st)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(st)", &base_name, &usec)) > 0) {
+ TimerBase b;
+
+ b = timer_base_from_string(base_name);
+ if (b < 0 || b == TIMER_CALENDAR)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Invalid timer base: %s", base_name);
+
+ r = timer_add_one_monotonic_spec(t, name, b, flags, usec, error);
+ if (r < 0)
+ return r;
+
+ empty = false;
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags) && empty) {
+ timer_free_values(t);
+ unit_write_setting(u, flags, name, "OnActiveSec=");
+ }
+
+ return 1;
+
+ } else if (streq(name, "TimersCalendar")) {
+ const char *base_name, *str;
+ bool empty = true;
+
+ r = sd_bus_message_enter_container(message, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(ss)", &base_name, &str)) > 0) {
+ TimerBase b;
+
+ b = timer_base_from_string(base_name);
+ if (b != TIMER_CALENDAR)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Invalid timer base: %s", base_name);
+
+ r = timer_add_one_calendar_spec(t, name, b, flags, str, error);
+ if (r < 0)
+ return r;
+
+ empty = false;
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags) && empty) {
+ timer_free_values(t);
+ unit_write_setting(u, flags, name, "OnCalendar=");
+ }
+
+ return 1;
+
+ } else if (STR_IN_SET(name,
+ "OnActiveSec",
+ "OnBootSec",
+ "OnStartupSec",
+ "OnUnitActiveSec",
+ "OnUnitInactiveSec")) {
+
+ TimerBase b;
+ usec_t usec;
+
+ log_notice("Client is using obsolete %s= transient property, please use TimersMonotonic= instead.", name);
+
+ b = timer_base_from_string(name);
+ if (b < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown timer base %s", name);
+
+ r = sd_bus_message_read(message, "t", &usec);
+ if (r < 0)
+ return r;
+
+ return timer_add_one_monotonic_spec(t, name, b, flags, usec, error);
+
+ } else if (streq(name, "OnCalendar")) {
+
+ const char *str;
+
+ log_notice("Client is using obsolete %s= transient property, please use TimersCalendar= instead.", name);
+
+ r = sd_bus_message_read(message, "s", &str);
+ if (r < 0)
+ return r;
+
+ return timer_add_one_calendar_spec(t, name, TIMER_CALENDAR, flags, str, error);
+ }
+
+ return 0;
+}
+
+int bus_timer_set_property(
+ Unit *u,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags mode,
+ sd_bus_error *error) {
+
+ Timer *t = TIMER(u);
+
+ assert(t);
+ assert(name);
+ assert(message);
+
+ if (u->transient && u->load_state == UNIT_STUB)
+ return bus_timer_set_transient_property(t, name, message, mode, error);
+
+ return 0;
+}
diff --git a/src/core/dbus-timer.h b/src/core/dbus-timer.h
new file mode 100644
index 0000000..ac436f1
--- /dev/null
+++ b/src/core/dbus-timer.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "unit.h"
+
+extern const sd_bus_vtable bus_timer_vtable[];
+
+int bus_timer_set_property(Unit *u, const char *name, sd_bus_message *i, UnitWriteFlags flags, sd_bus_error *error);
diff --git a/src/core/dbus-unit.c b/src/core/dbus-unit.c
new file mode 100644
index 0000000..e2398c8
--- /dev/null
+++ b/src/core/dbus-unit.c
@@ -0,0 +1,2579 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "sd-bus.h"
+
+#include "alloc-util.h"
+#include "bpf-firewall.h"
+#include "bus-common-errors.h"
+#include "bus-get-properties.h"
+#include "bus-polkit.h"
+#include "cgroup-util.h"
+#include "condition.h"
+#include "dbus-job.h"
+#include "dbus-manager.h"
+#include "dbus-unit.h"
+#include "dbus-util.h"
+#include "dbus.h"
+#include "fd-util.h"
+#include "install.h"
+#include "locale-util.h"
+#include "log.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "selinux-access.h"
+#include "signal-util.h"
+#include "special.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "user-util.h"
+#include "web-util.h"
+
+static bool unit_can_start_refuse_manual(Unit *u) {
+ return unit_can_start(u) && !u->refuse_manual_start;
+}
+
+static bool unit_can_stop_refuse_manual(Unit *u) {
+ return unit_can_stop(u) && !u->refuse_manual_stop;
+}
+
+static bool unit_can_isolate_refuse_manual(Unit *u) {
+ return unit_can_isolate(u) && !u->refuse_manual_start;
+}
+
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_collect_mode, collect_mode, CollectMode);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_load_state, unit_load_state, UnitLoadState);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_job_mode, job_mode, JobMode);
+static BUS_DEFINE_PROPERTY_GET(property_get_description, "s", Unit, unit_description);
+static BUS_DEFINE_PROPERTY_GET2(property_get_active_state, "s", Unit, unit_active_state, unit_active_state_to_string);
+static BUS_DEFINE_PROPERTY_GET2(property_get_freezer_state, "s", Unit, unit_freezer_state, freezer_state_to_string);
+static BUS_DEFINE_PROPERTY_GET(property_get_sub_state, "s", Unit, unit_sub_state_to_string);
+static BUS_DEFINE_PROPERTY_GET2(property_get_unit_file_state, "s", Unit, unit_get_unit_file_state, unit_file_state_to_string);
+static BUS_DEFINE_PROPERTY_GET(property_get_can_reload, "b", Unit, unit_can_reload);
+static BUS_DEFINE_PROPERTY_GET(property_get_can_start, "b", Unit, unit_can_start_refuse_manual);
+static BUS_DEFINE_PROPERTY_GET(property_get_can_stop, "b", Unit, unit_can_stop_refuse_manual);
+static BUS_DEFINE_PROPERTY_GET(property_get_can_isolate, "b", Unit, unit_can_isolate_refuse_manual);
+static BUS_DEFINE_PROPERTY_GET(property_get_can_freeze, "b", Unit, unit_can_freeze);
+static BUS_DEFINE_PROPERTY_GET(property_get_need_daemon_reload, "b", Unit, unit_need_daemon_reload);
+static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_empty_strv, "as", 0);
+
+static int property_get_can_clean(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Unit *u = userdata;
+ ExecCleanMask mask;
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = unit_can_clean(u, &mask);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_open_container(reply, 'a', "s");
+ if (r < 0)
+ return r;
+
+ for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
+ if (!FLAGS_SET(mask, 1U << t))
+ continue;
+
+ r = sd_bus_message_append(reply, "s", exec_resource_type_to_string(t));
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_names(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Unit *u = ASSERT_PTR(userdata);
+ const char *t;
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "s");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(reply, "s", u->id);
+ if (r < 0)
+ return r;
+
+ SET_FOREACH(t, u->aliases) {
+ r = sd_bus_message_append(reply, "s", t);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_following(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Unit *u = userdata, *f;
+
+ assert(bus);
+ assert(reply);
+ assert(u);
+
+ f = unit_following(u);
+ return sd_bus_message_append(reply, "s", f ? f->id : NULL);
+}
+
+static int property_get_dependencies(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Unit *u = userdata, *other;
+ UnitDependency d;
+ Hashmap *deps;
+ void *v;
+ int r;
+
+ assert(bus);
+ assert(reply);
+ assert(u);
+
+ d = unit_dependency_from_string(property);
+ assert_se(d >= 0);
+
+ deps = unit_get_dependencies(u, d);
+
+ r = sd_bus_message_open_container(reply, 'a', "s");
+ if (r < 0)
+ return r;
+
+ HASHMAP_FOREACH_KEY(v, other, deps) {
+ r = sd_bus_message_append(reply, "s", other->id);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_requires_mounts_for(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Hashmap **h = ASSERT_PTR(userdata);
+ const char *p;
+ void *v;
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "s");
+ if (r < 0)
+ return r;
+
+ HASHMAP_FOREACH_KEY(v, p, *h) {
+ r = sd_bus_message_append(reply, "s", p);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_unit_file_preset(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = unit_get_unit_file_preset(u);
+
+ return sd_bus_message_append(reply, "s",
+ r < 0 ? NULL:
+ r > 0 ? "enabled" : "disabled");
+}
+
+static int property_get_job(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ _cleanup_free_ char *p = NULL;
+ Job **j = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ if (!*j)
+ return sd_bus_message_append(reply, "(uo)", 0, "/");
+
+ p = job_dbus_path(*j);
+ if (!p)
+ return -ENOMEM;
+
+ return sd_bus_message_append(reply, "(uo)", (*j)->id, p);
+}
+
+static int property_get_conditions(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ const char *(*to_string)(ConditionType type) = NULL;
+ Condition **list = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ to_string = streq(property, "Asserts") ? assert_type_to_string : condition_type_to_string;
+
+ r = sd_bus_message_open_container(reply, 'a', "(sbbsi)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(conditions, c, *list) {
+ int tristate;
+
+ tristate =
+ c->result == CONDITION_UNTESTED ? 0 :
+ c->result == CONDITION_SUCCEEDED ? 1 : -1;
+
+ r = sd_bus_message_append(reply, "(sbbsi)",
+ to_string(c->type),
+ c->trigger, c->negate,
+ c->parameter, tristate);
+ if (r < 0)
+ return r;
+
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_load_error(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ _cleanup_(sd_bus_error_free) sd_bus_error e = SD_BUS_ERROR_NULL;
+ Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = bus_unit_validate_load_state(u, &e);
+ if (r < 0)
+ return sd_bus_message_append(reply, "(ss)", e.name, e.message);
+
+ return sd_bus_message_append(reply, "(ss)", NULL, NULL);
+}
+
+static int property_get_markers(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ unsigned *markers = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "s");
+ if (r < 0)
+ return r;
+
+ /* Make sure out values fit in the bitfield. */
+ assert_cc(_UNIT_MARKER_MAX <= sizeof(((Unit){}).markers) * 8);
+
+ for (UnitMarker m = 0; m < _UNIT_MARKER_MAX; m++)
+ if (FLAGS_SET(*markers, 1u << m)) {
+ r = sd_bus_message_append(reply, "s", unit_marker_to_string(m));
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static const char *const polkit_message_for_job[_JOB_TYPE_MAX] = {
+ [JOB_START] = N_("Authentication is required to start '$(unit)'."),
+ [JOB_STOP] = N_("Authentication is required to stop '$(unit)'."),
+ [JOB_RELOAD] = N_("Authentication is required to reload '$(unit)'."),
+ [JOB_RESTART] = N_("Authentication is required to restart '$(unit)'."),
+ [JOB_TRY_RESTART] = N_("Authentication is required to restart '$(unit)'."),
+};
+
+int bus_unit_method_start_generic(
+ sd_bus_message *message,
+ Unit *u,
+ JobType job_type,
+ bool reload_if_possible,
+ sd_bus_error *error) {
+
+ BusUnitQueueFlags job_flags = reload_if_possible ? BUS_UNIT_QUEUE_RELOAD_IF_POSSIBLE : 0;
+ const char *smode, *verb;
+ JobMode mode;
+ int r;
+
+ assert(message);
+ assert(u);
+ assert(job_type >= 0 && job_type < _JOB_TYPE_MAX);
+
+ r = mac_selinux_unit_access_check(
+ u, message,
+ job_type_to_access_method(job_type),
+ error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "s", &smode);
+ if (r < 0)
+ return r;
+
+ mode = job_mode_from_string(smode);
+ if (mode < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Job mode %s invalid", smode);
+
+ if (reload_if_possible)
+ verb = strjoina("reload-or-", job_type_to_string(job_type));
+ else
+ verb = job_type_to_string(job_type);
+
+ if (sd_bus_message_is_method_call(message, NULL, "StartUnitWithFlags")) {
+ uint64_t input_flags = 0;
+
+ r = sd_bus_message_read(message, "t", &input_flags);
+ if (r < 0)
+ return r;
+ /* Let clients know that this version doesn't support any flags at the moment. */
+ if (input_flags != 0)
+ return sd_bus_reply_method_errorf(message, SD_BUS_ERROR_INVALID_ARGS,
+ "Invalid 'flags' parameter '%" PRIu64 "'",
+ input_flags);
+ }
+
+ r = bus_verify_manage_units_async_full(
+ u,
+ verb,
+ CAP_SYS_ADMIN,
+ polkit_message_for_job[job_type],
+ true,
+ message,
+ error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ return bus_unit_queue_job(message, u, job_type, mode, job_flags, error);
+}
+
+static int bus_unit_method_start(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return bus_unit_method_start_generic(message, userdata, JOB_START, false, error);
+}
+
+static int bus_unit_method_stop(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return bus_unit_method_start_generic(message, userdata, JOB_STOP, false, error);
+}
+
+static int bus_unit_method_reload(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return bus_unit_method_start_generic(message, userdata, JOB_RELOAD, false, error);
+}
+
+static int bus_unit_method_restart(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return bus_unit_method_start_generic(message, userdata, JOB_RESTART, false, error);
+}
+
+static int bus_unit_method_try_restart(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return bus_unit_method_start_generic(message, userdata, JOB_TRY_RESTART, false, error);
+}
+
+static int bus_unit_method_reload_or_restart(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return bus_unit_method_start_generic(message, userdata, JOB_RESTART, true, error);
+}
+
+static int bus_unit_method_reload_or_try_restart(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return bus_unit_method_start_generic(message, userdata, JOB_TRY_RESTART, true, error);
+}
+
+int bus_unit_method_enqueue_job(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ BusUnitQueueFlags flags = BUS_UNIT_QUEUE_VERBOSE_REPLY;
+ const char *jtype, *smode;
+ Unit *u = ASSERT_PTR(userdata);
+ JobType type;
+ JobMode mode;
+ int r;
+
+ assert(message);
+
+ r = sd_bus_message_read(message, "ss", &jtype, &smode);
+ if (r < 0)
+ return r;
+
+ /* Parse the two magic reload types "reload-or-…" manually */
+ if (streq(jtype, "reload-or-restart")) {
+ type = JOB_RESTART;
+ flags |= BUS_UNIT_QUEUE_RELOAD_IF_POSSIBLE;
+ } else if (streq(jtype, "reload-or-try-restart")) {
+ type = JOB_TRY_RESTART;
+ flags |= BUS_UNIT_QUEUE_RELOAD_IF_POSSIBLE;
+ } else {
+ /* And the rest generically */
+ type = job_type_from_string(jtype);
+ if (type < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Job type %s invalid", jtype);
+ }
+
+ mode = job_mode_from_string(smode);
+ if (mode < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Job mode %s invalid", smode);
+
+ r = mac_selinux_unit_access_check(
+ u, message,
+ job_type_to_access_method(type),
+ error);
+ if (r < 0)
+ return r;
+
+ r = bus_verify_manage_units_async_full(
+ u,
+ jtype,
+ CAP_SYS_ADMIN,
+ polkit_message_for_job[type],
+ true,
+ message,
+ error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ return bus_unit_queue_job(message, u, type, mode, flags, error);
+}
+
+int bus_unit_method_kill(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Unit *u = ASSERT_PTR(userdata);
+ const char *swho;
+ int32_t signo;
+ KillWho who;
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_unit_access_check(u, message, "stop", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "si", &swho, &signo);
+ if (r < 0)
+ return r;
+
+ if (isempty(swho))
+ who = KILL_ALL;
+ else {
+ who = kill_who_from_string(swho);
+ if (who < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid who argument %s", swho);
+ }
+
+ if (!SIGNAL_VALID(signo))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Signal number out of range.");
+
+ r = bus_verify_manage_units_async_full(
+ u,
+ "kill",
+ CAP_KILL,
+ N_("Authentication is required to send a UNIX signal to the processes of '$(unit)'."),
+ true,
+ message,
+ error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = unit_kill(u, who, signo, error);
+ if (r < 0)
+ return r;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+int bus_unit_method_reset_failed(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_unit_access_check(u, message, "reload", error);
+ if (r < 0)
+ return r;
+
+ r = bus_verify_manage_units_async_full(
+ u,
+ "reset-failed",
+ CAP_SYS_ADMIN,
+ N_("Authentication is required to reset the \"failed\" state of '$(unit)'."),
+ true,
+ message,
+ error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ unit_reset_failed(u);
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+int bus_unit_method_set_properties(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Unit *u = ASSERT_PTR(userdata);
+ int runtime, r;
+
+ assert(message);
+
+ r = mac_selinux_unit_access_check(u, message, "start", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "b", &runtime);
+ if (r < 0)
+ return r;
+
+ r = bus_verify_manage_units_async_full(
+ u,
+ "set-property",
+ CAP_SYS_ADMIN,
+ N_("Authentication is required to set properties on '$(unit)'."),
+ true,
+ message,
+ error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = bus_unit_set_properties(u, message, runtime ? UNIT_RUNTIME : UNIT_PERSISTENT, true, error);
+ if (r < 0)
+ return r;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+int bus_unit_method_ref(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_unit_access_check(u, message, "start", error);
+ if (r < 0)
+ return r;
+
+ r = bus_verify_manage_units_async_full(
+ u,
+ "ref",
+ CAP_SYS_ADMIN,
+ NULL,
+ false,
+ message,
+ error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = bus_unit_track_add_sender(u, message);
+ if (r < 0)
+ return r;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+int bus_unit_method_unref(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = bus_unit_track_remove_sender(u, message);
+ if (r == -EUNATCH)
+ return sd_bus_error_set(error, BUS_ERROR_NOT_REFERENCED, "Unit has not been referenced yet.");
+ if (r < 0)
+ return r;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+int bus_unit_method_clean(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ ExecCleanMask mask = 0;
+ Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_unit_access_check(u, message, "stop", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_enter_container(message, 'a', "s");
+ if (r < 0)
+ return r;
+
+ for (;;) {
+ const char *i;
+
+ r = sd_bus_message_read(message, "s", &i);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ if (streq(i, "all"))
+ mask |= EXEC_CLEAN_ALL;
+ else {
+ ExecDirectoryType t;
+
+ t = exec_resource_type_from_string(i);
+ if (t < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid resource type: %s", i);
+
+ mask |= 1U << t;
+ }
+ }
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ r = bus_verify_manage_units_async_full(
+ u,
+ "clean",
+ CAP_DAC_OVERRIDE,
+ N_("Authentication is required to delete files and directories associated with '$(unit)'."),
+ true,
+ message,
+ error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = unit_clean(u, mask);
+ if (r == -EOPNOTSUPP)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Unit '%s' does not support cleaning.", u->id);
+ if (r == -EUNATCH)
+ return sd_bus_error_set(error, BUS_ERROR_NOTHING_TO_CLEAN, "No matching resources found.");
+ if (r == -EBUSY)
+ return sd_bus_error_set(error, BUS_ERROR_UNIT_BUSY, "Unit is not inactive or has pending job.");
+ if (r < 0)
+ return r;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+static int bus_unit_method_freezer_generic(sd_bus_message *message, void *userdata, sd_bus_error *error, FreezerAction action) {
+ const char* perm;
+ int (*method)(Unit*);
+ Unit *u = ASSERT_PTR(userdata);
+ bool reply_no_delay = false;
+ int r;
+
+ assert(message);
+ assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW));
+
+ if (action == FREEZER_FREEZE) {
+ perm = "stop";
+ method = unit_freeze;
+ } else {
+ perm = "start";
+ method = unit_thaw;
+ }
+
+ r = mac_selinux_unit_access_check(u, message, perm, error);
+ if (r < 0)
+ return r;
+
+ r = bus_verify_manage_units_async_full(
+ u,
+ perm,
+ CAP_SYS_ADMIN,
+ N_("Authentication is required to freeze or thaw the processes of '$(unit)' unit."),
+ true,
+ message,
+ error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = method(u);
+ if (r == -EOPNOTSUPP)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Unit '%s' does not support freezing.", u->id);
+ if (r == -EBUSY)
+ return sd_bus_error_set(error, BUS_ERROR_UNIT_BUSY, "Unit has a pending job.");
+ if (r == -EHOSTDOWN)
+ return sd_bus_error_set(error, BUS_ERROR_UNIT_INACTIVE, "Unit is inactive.");
+ if (r == -EALREADY)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_FAILED, "Previously requested freezer operation for unit '%s' is still in progress.", u->id);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ reply_no_delay = true;
+
+ if (u->pending_freezer_message) {
+ bus_unit_send_pending_freezer_message(u, true);
+ assert(!u->pending_freezer_message);
+ }
+
+ u->pending_freezer_message = sd_bus_message_ref(message);
+
+ if (reply_no_delay) {
+ r = bus_unit_send_pending_freezer_message(u, false);
+ if (r < 0)
+ return r;
+ }
+
+ return 1;
+}
+
+int bus_unit_method_thaw(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return bus_unit_method_freezer_generic(message, userdata, error, FREEZER_THAW);
+}
+
+int bus_unit_method_freeze(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return bus_unit_method_freezer_generic(message, userdata, error, FREEZER_FREEZE);
+}
+
+static int property_get_refs(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Unit *u = userdata;
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "s");
+ if (r < 0)
+ return r;
+
+ for (const char *i = sd_bus_track_first(u->bus_track); i; i = sd_bus_track_next(u->bus_track)) {
+ int c;
+
+ c = sd_bus_track_count_name(u->bus_track, i);
+ if (c < 0)
+ return c;
+
+ /* Add the item multiple times if the ref count for each is above 1 */
+ for (int k = 0; k < c; k++) {
+ r = sd_bus_message_append(reply, "s", i);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+const sd_bus_vtable bus_unit_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+
+ SD_BUS_PROPERTY("Id", "s", NULL, offsetof(Unit, id), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Names", "as", property_get_names, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Following", "s", property_get_following, 0, 0),
+ SD_BUS_PROPERTY("Requires", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Requisite", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Wants", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("BindsTo", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PartOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Upholds", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RequiredBy", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RequisiteOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("WantedBy", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("BoundBy", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("UpheldBy", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ConsistsOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Conflicts", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ConflictedBy", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Before", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("After", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("OnSuccess", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("OnSuccessOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("OnFailure", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("OnFailureOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Triggers", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TriggeredBy", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PropagatesReloadTo", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ReloadPropagatedFrom", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PropagatesStopTo", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("StopPropagatedFrom", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("JoinsNamespaceOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SliceOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RequiresMountsFor", "as", property_get_requires_mounts_for, offsetof(Unit, requires_mounts_for), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Documentation", "as", NULL, offsetof(Unit, documentation), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Description", "s", property_get_description, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("AccessSELinuxContext", "s", NULL, offsetof(Unit, access_selinux_context), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LoadState", "s", property_get_load_state, offsetof(Unit, load_state), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ActiveState", "s", property_get_active_state, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("FreezerState", "s", property_get_freezer_state, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("SubState", "s", property_get_sub_state, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("FragmentPath", "s", NULL, offsetof(Unit, fragment_path), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SourcePath", "s", NULL, offsetof(Unit, source_path), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DropInPaths", "as", NULL, offsetof(Unit, dropin_paths), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("UnitFileState", "s", property_get_unit_file_state, 0, 0),
+ SD_BUS_PROPERTY("UnitFilePreset", "s", property_get_unit_file_preset, 0, 0),
+ BUS_PROPERTY_DUAL_TIMESTAMP("StateChangeTimestamp", offsetof(Unit, state_change_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ BUS_PROPERTY_DUAL_TIMESTAMP("InactiveExitTimestamp", offsetof(Unit, inactive_exit_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ BUS_PROPERTY_DUAL_TIMESTAMP("ActiveEnterTimestamp", offsetof(Unit, active_enter_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ BUS_PROPERTY_DUAL_TIMESTAMP("ActiveExitTimestamp", offsetof(Unit, active_exit_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ BUS_PROPERTY_DUAL_TIMESTAMP("InactiveEnterTimestamp", offsetof(Unit, inactive_enter_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("CanStart", "b", property_get_can_start, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CanStop", "b", property_get_can_stop, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CanReload", "b", property_get_can_reload, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CanIsolate", "b", property_get_can_isolate, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CanClean", "as", property_get_can_clean, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CanFreeze", "b", property_get_can_freeze, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Job", "(uo)", property_get_job, offsetof(Unit, job), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("StopWhenUnneeded", "b", bus_property_get_bool, offsetof(Unit, stop_when_unneeded), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RefuseManualStart", "b", bus_property_get_bool, offsetof(Unit, refuse_manual_start), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RefuseManualStop", "b", bus_property_get_bool, offsetof(Unit, refuse_manual_stop), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("AllowIsolate", "b", bus_property_get_bool, offsetof(Unit, allow_isolate), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultDependencies", "b", bus_property_get_bool, offsetof(Unit, default_dependencies), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("OnSuccesJobMode", "s", property_get_job_mode, offsetof(Unit, on_success_job_mode), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), /* deprecated */
+ SD_BUS_PROPERTY("OnSuccessJobMode", "s", property_get_job_mode, offsetof(Unit, on_success_job_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("OnFailureJobMode", "s", property_get_job_mode, offsetof(Unit, on_failure_job_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("IgnoreOnIsolate", "b", bus_property_get_bool, offsetof(Unit, ignore_on_isolate), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("NeedDaemonReload", "b", property_get_need_daemon_reload, 0, 0),
+ SD_BUS_PROPERTY("Markers", "as", property_get_markers, offsetof(Unit, markers), 0),
+ SD_BUS_PROPERTY("JobTimeoutUSec", "t", bus_property_get_usec, offsetof(Unit, job_timeout), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("JobRunningTimeoutUSec", "t", bus_property_get_usec, offsetof(Unit, job_running_timeout), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("JobTimeoutAction", "s", bus_property_get_emergency_action, offsetof(Unit, job_timeout_action), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("JobTimeoutRebootArgument", "s", NULL, offsetof(Unit, job_timeout_reboot_arg), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ConditionResult", "b", bus_property_get_bool, offsetof(Unit, condition_result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("AssertResult", "b", bus_property_get_bool, offsetof(Unit, assert_result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ BUS_PROPERTY_DUAL_TIMESTAMP("ConditionTimestamp", offsetof(Unit, condition_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ BUS_PROPERTY_DUAL_TIMESTAMP("AssertTimestamp", offsetof(Unit, assert_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("Conditions", "a(sbbsi)", property_get_conditions, offsetof(Unit, conditions), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ SD_BUS_PROPERTY("Asserts", "a(sbbsi)", property_get_conditions, offsetof(Unit, asserts), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ SD_BUS_PROPERTY("LoadError", "(ss)", property_get_load_error, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Transient", "b", bus_property_get_bool, offsetof(Unit, transient), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Perpetual", "b", bus_property_get_bool, offsetof(Unit, perpetual), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("StartLimitIntervalUSec", "t", bus_property_get_usec, offsetof(Unit, start_ratelimit.interval), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("StartLimitBurst", "u", bus_property_get_unsigned, offsetof(Unit, start_ratelimit.burst), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("StartLimitAction", "s", bus_property_get_emergency_action, offsetof(Unit, start_limit_action), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("FailureAction", "s", bus_property_get_emergency_action, offsetof(Unit, failure_action), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("FailureActionExitStatus", "i", bus_property_get_int, offsetof(Unit, failure_action_exit_status), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SuccessAction", "s", bus_property_get_emergency_action, offsetof(Unit, success_action), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SuccessActionExitStatus", "i", bus_property_get_int, offsetof(Unit, success_action_exit_status), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RebootArgument", "s", NULL, offsetof(Unit, reboot_arg), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("InvocationID", "ay", bus_property_get_id128, offsetof(Unit, invocation_id), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("CollectMode", "s", property_get_collect_mode, offsetof(Unit, collect_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Refs", "as", property_get_refs, 0, 0),
+ SD_BUS_PROPERTY("ActivationDetails", "a(ss)", bus_property_get_activation_details, offsetof(Unit, activation_details), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+
+ SD_BUS_METHOD_WITH_ARGS("Start",
+ SD_BUS_ARGS("s", mode),
+ SD_BUS_RESULT("o", job),
+ bus_unit_method_start,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("Stop",
+ SD_BUS_ARGS("s", mode),
+ SD_BUS_RESULT("o", job),
+ bus_unit_method_stop,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("Reload",
+ SD_BUS_ARGS("s", mode),
+ SD_BUS_RESULT("o", job),
+ bus_unit_method_reload,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("Restart",
+ SD_BUS_ARGS("s", mode),
+ SD_BUS_RESULT("o", job),
+ bus_unit_method_restart,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("TryRestart",
+ SD_BUS_ARGS("s", mode),
+ SD_BUS_RESULT("o", job),
+ bus_unit_method_try_restart,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("ReloadOrRestart",
+ SD_BUS_ARGS("s", mode),
+ SD_BUS_RESULT("o", job),
+ bus_unit_method_reload_or_restart,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("ReloadOrTryRestart",
+ SD_BUS_ARGS("s", mode),
+ SD_BUS_RESULT("o", job),
+ bus_unit_method_reload_or_try_restart,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("EnqueueJob",
+ SD_BUS_ARGS("s", job_type, "s", job_mode),
+ SD_BUS_RESULT("u", job_id, "o", job_path, "s", unit_id, "o", unit_path, "s", job_type, "a(uosos)", affected_jobs),
+ bus_unit_method_enqueue_job,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("Kill",
+ SD_BUS_ARGS("s", whom, "i", signal),
+ SD_BUS_NO_RESULT,
+ bus_unit_method_kill,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD("ResetFailed",
+ NULL,
+ NULL,
+ bus_unit_method_reset_failed,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("SetProperties",
+ SD_BUS_ARGS("b", runtime, "a(sv)", properties),
+ SD_BUS_NO_RESULT,
+ bus_unit_method_set_properties,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD("Ref",
+ NULL,
+ NULL,
+ bus_unit_method_ref,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD("Unref",
+ NULL,
+ NULL,
+ bus_unit_method_unref,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("Clean",
+ SD_BUS_ARGS("as", mask),
+ SD_BUS_NO_RESULT,
+ bus_unit_method_clean,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD("Freeze",
+ NULL,
+ NULL,
+ bus_unit_method_freeze,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD("Thaw",
+ NULL,
+ NULL,
+ bus_unit_method_thaw,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+
+ /* For dependency types we don't support anymore always return an empty array */
+ SD_BUS_PROPERTY("RequiresOverridable", "as", property_get_empty_strv, 0, SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_PROPERTY("RequisiteOverridable", "as", property_get_empty_strv, 0, SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_PROPERTY("RequiredByOverridable", "as", property_get_empty_strv, 0, SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_PROPERTY("RequisiteOfOverridable", "as", property_get_empty_strv, 0, SD_BUS_VTABLE_HIDDEN),
+ /* Obsolete alias names */
+ SD_BUS_PROPERTY("StartLimitInterval", "t", bus_property_get_usec, offsetof(Unit, start_ratelimit.interval), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_PROPERTY("StartLimitIntervalSec", "t", bus_property_get_usec, offsetof(Unit, start_ratelimit.interval), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+
+ SD_BUS_VTABLE_END
+};
+
+static int property_get_slice(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Unit *u = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ return sd_bus_message_append(reply, "s", unit_slice_name(u));
+}
+
+static int property_get_current_memory(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ uint64_t sz = UINT64_MAX;
+ Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = unit_get_memory_current(u, &sz);
+ if (r < 0 && r != -ENODATA)
+ log_unit_warning_errno(u, r, "Failed to get current memory usage from cgroup: %m");
+
+ return sd_bus_message_append(reply, "t", sz);
+}
+
+static int property_get_available_memory(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ uint64_t sz = UINT64_MAX;
+ Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = unit_get_memory_available(u, &sz);
+ if (r < 0 && r != -ENODATA)
+ log_unit_warning_errno(u, r, "Failed to get total available memory from cgroup: %m");
+
+ return sd_bus_message_append(reply, "t", sz);
+}
+
+static int property_get_current_tasks(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ uint64_t cn = UINT64_MAX;
+ Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = unit_get_tasks_current(u, &cn);
+ if (r < 0 && r != -ENODATA)
+ log_unit_warning_errno(u, r, "Failed to get pids.current attribute: %m");
+
+ return sd_bus_message_append(reply, "t", cn);
+}
+
+static int property_get_cpu_usage(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ nsec_t ns = NSEC_INFINITY;
+ Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = unit_get_cpu_usage(u, &ns);
+ if (r < 0 && r != -ENODATA)
+ log_unit_warning_errno(u, r, "Failed to get cpuacct.usage attribute: %m");
+
+ return sd_bus_message_append(reply, "t", ns);
+}
+
+static int property_get_cpuset_cpus(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Unit *u = ASSERT_PTR(userdata);
+ _cleanup_(cpu_set_reset) CPUSet cpus = {};
+ _cleanup_free_ uint8_t *array = NULL;
+ size_t allocated;
+
+ assert(bus);
+ assert(reply);
+
+ (void) unit_get_cpuset(u, &cpus, "cpuset.cpus.effective");
+ (void) cpu_set_to_dbus(&cpus, &array, &allocated);
+ return sd_bus_message_append_array(reply, 'y', array, allocated);
+}
+
+static int property_get_cpuset_mems(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Unit *u = ASSERT_PTR(userdata);
+ _cleanup_(cpu_set_reset) CPUSet mems = {};
+ _cleanup_free_ uint8_t *array = NULL;
+ size_t allocated;
+
+ assert(bus);
+ assert(reply);
+
+ (void) unit_get_cpuset(u, &mems, "cpuset.mems.effective");
+ (void) cpu_set_to_dbus(&mems, &array, &allocated);
+ return sd_bus_message_append_array(reply, 'y', array, allocated);
+}
+
+static int property_get_cgroup(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Unit *u = ASSERT_PTR(userdata);
+ const char *t = NULL;
+
+ assert(bus);
+ assert(reply);
+
+ /* Three cases: a) u->cgroup_path is NULL, in which case the
+ * unit has no control group, which we report as the empty
+ * string. b) u->cgroup_path is the empty string, which
+ * indicates the root cgroup, which we report as "/". c) all
+ * other cases we report as-is. */
+
+ if (u->cgroup_path)
+ t = empty_to_root(u->cgroup_path);
+
+ return sd_bus_message_append(reply, "s", t);
+}
+
+static int append_process(sd_bus_message *reply, const char *p, pid_t pid, Set *pids) {
+ _cleanup_free_ char *buf = NULL, *cmdline = NULL;
+ int r;
+
+ assert(reply);
+ assert(pid > 0);
+
+ r = set_put(pids, PID_TO_PTR(pid));
+ if (IN_SET(r, 0, -EEXIST))
+ return 0;
+ if (r < 0)
+ return r;
+
+ if (!p) {
+ r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &buf);
+ if (r == -ESRCH)
+ return 0;
+ if (r < 0)
+ return r;
+
+ p = buf;
+ }
+
+ (void) get_process_cmdline(pid, SIZE_MAX,
+ PROCESS_CMDLINE_COMM_FALLBACK | PROCESS_CMDLINE_QUOTE,
+ &cmdline);
+
+ return sd_bus_message_append(reply,
+ "(sus)",
+ p,
+ (uint32_t) pid,
+ cmdline);
+}
+
+static int append_cgroup(sd_bus_message *reply, const char *p, Set *pids) {
+ _cleanup_closedir_ DIR *d = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+ int r;
+
+ assert(reply);
+ assert(p);
+
+ r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, p, &f);
+ if (r == -ENOENT)
+ return 0;
+ if (r < 0)
+ return r;
+
+ for (;;) {
+ pid_t pid;
+
+ /* libvirt / qemu uses threaded mode and cgroup.procs cannot be read at the lower levels.
+ * From https://docs.kernel.org/admin-guide/cgroup-v2.html#threads,
+ * “cgroup.procs” in a threaded domain cgroup contains the PIDs of all processes in
+ * the subtree and is not readable in the subtree proper. */
+ r = cg_read_pid(f, &pid);
+ if (IN_SET(r, 0, -EOPNOTSUPP))
+ break;
+ if (r < 0)
+ return r;
+
+ if (is_kernel_thread(pid) > 0)
+ continue;
+
+ r = append_process(reply, p, pid, pids);
+ if (r < 0)
+ return r;
+ }
+
+ r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, p, &d);
+ if (r == -ENOENT)
+ return 0;
+ if (r < 0)
+ return r;
+
+ for (;;) {
+ _cleanup_free_ char *g = NULL, *j = NULL;
+
+ r = cg_read_subgroup(d, &g);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ j = path_join(empty_to_root(p), g);
+ if (!j)
+ return -ENOMEM;
+
+ r = append_cgroup(reply, j, pids);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+int bus_unit_method_get_processes(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+ _cleanup_set_free_ Set *pids = NULL;
+ Unit *u = userdata;
+ pid_t pid;
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_unit_access_check(u, message, "status", error);
+ if (r < 0)
+ return r;
+
+ pids = set_new(NULL);
+ if (!pids)
+ return -ENOMEM;
+
+ r = sd_bus_message_new_method_return(message, &reply);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_open_container(reply, 'a', "(sus)");
+ if (r < 0)
+ return r;
+
+ if (u->cgroup_path) {
+ r = append_cgroup(reply, u->cgroup_path, pids);
+ if (r < 0)
+ return r;
+ }
+
+ /* The main and control pids might live outside of the cgroup, hence fetch them separately */
+ pid = unit_main_pid(u);
+ if (pid > 0) {
+ r = append_process(reply, NULL, pid, pids);
+ if (r < 0)
+ return r;
+ }
+
+ pid = unit_control_pid(u);
+ if (pid > 0) {
+ r = append_process(reply, NULL, pid, pids);
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(NULL, reply, NULL);
+}
+
+static int property_get_ip_counter(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ static const char *const table[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = {
+ [CGROUP_IP_INGRESS_BYTES] = "IPIngressBytes",
+ [CGROUP_IP_EGRESS_BYTES] = "IPEgressBytes",
+ [CGROUP_IP_INGRESS_PACKETS] = "IPIngressPackets",
+ [CGROUP_IP_EGRESS_PACKETS] = "IPEgressPackets",
+ };
+
+ uint64_t value = UINT64_MAX;
+ Unit *u = ASSERT_PTR(userdata);
+ ssize_t metric;
+
+ assert(bus);
+ assert(reply);
+ assert(property);
+
+ assert_se((metric = string_table_lookup(table, ELEMENTSOF(table), property)) >= 0);
+ (void) unit_get_ip_accounting(u, metric, &value);
+ return sd_bus_message_append(reply, "t", value);
+}
+
+static int property_get_io_counter(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ static const char *const table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
+ [CGROUP_IO_READ_BYTES] = "IOReadBytes",
+ [CGROUP_IO_WRITE_BYTES] = "IOWriteBytes",
+ [CGROUP_IO_READ_OPERATIONS] = "IOReadOperations",
+ [CGROUP_IO_WRITE_OPERATIONS] = "IOWriteOperations",
+ };
+
+ uint64_t value = UINT64_MAX;
+ Unit *u = ASSERT_PTR(userdata);
+ ssize_t metric;
+
+ assert(bus);
+ assert(reply);
+ assert(property);
+
+ assert_se((metric = string_table_lookup(table, ELEMENTSOF(table), property)) >= 0);
+ (void) unit_get_io_accounting(u, metric, false, &value);
+ return sd_bus_message_append(reply, "t", value);
+}
+
+int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+
+ _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+ _cleanup_set_free_ Set *pids = NULL;
+ Unit *u = userdata;
+ const char *path;
+ int r;
+
+ assert(message);
+
+ /* This migrates the processes with the specified PIDs into the cgroup of this unit, optionally below a
+ * specified cgroup path. Obviously this only works for units that actually maintain a cgroup
+ * representation. If a process is already in the cgroup no operation is executed – in this case the specified
+ * subcgroup path has no effect! */
+
+ r = mac_selinux_unit_access_check(u, message, "start", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "s", &path);
+ if (r < 0)
+ return r;
+
+ path = empty_to_null(path);
+ if (path) {
+ if (!path_is_absolute(path))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Control group path is not absolute: %s", path);
+
+ if (!path_is_normalized(path))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Control group path is not normalized: %s", path);
+ }
+
+ if (!unit_cgroup_delegate(u))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Process migration not available on non-delegated units.");
+
+ if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u)))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Unit is not active, refusing.");
+
+ r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID|SD_BUS_CREDS_PID, &creds);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_enter_container(message, 'a', "u");
+ if (r < 0)
+ return r;
+ for (;;) {
+ uid_t process_uid, sender_uid;
+ uint32_t upid;
+ pid_t pid;
+
+ r = sd_bus_message_read(message, "u", &upid);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ if (upid == 0) {
+ r = sd_bus_creds_get_pid(creds, &pid);
+ if (r < 0)
+ return r;
+ } else
+ pid = (uid_t) upid;
+
+ /* Filter out duplicates */
+ if (set_contains(pids, PID_TO_PTR(pid)))
+ continue;
+
+ /* Check if this process is suitable for attaching to this unit */
+ r = unit_pid_attachable(u, pid, error);
+ if (r < 0)
+ return r;
+
+ /* Let's query the sender's UID, so that we can make our security decisions */
+ r = sd_bus_creds_get_euid(creds, &sender_uid);
+ if (r < 0)
+ return r;
+
+ /* Let's validate security: if the sender is root, then all is OK. If the sender is any other unit,
+ * then the process' UID and the target unit's UID have to match the sender's UID */
+ if (sender_uid != 0 && sender_uid != getuid()) {
+ r = get_process_uid(pid, &process_uid);
+ if (r < 0)
+ return sd_bus_error_set_errnof(error, r, "Failed to retrieve process UID: %m");
+
+ if (process_uid != sender_uid)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Process " PID_FMT " not owned by client's UID. Refusing.", pid);
+ if (process_uid != u->ref_uid)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Process " PID_FMT " not owned by target unit's UID. Refusing.", pid);
+ }
+
+ if (!pids) {
+ pids = set_new(NULL);
+ if (!pids)
+ return -ENOMEM;
+ }
+
+ r = set_put(pids, PID_TO_PTR(pid));
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ r = unit_attach_pids_to_cgroup(u, pids, path);
+ if (r < 0)
+ return sd_bus_error_set_errnof(error, r, "Failed to attach processes to control group: %m");
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+const sd_bus_vtable bus_unit_cgroup_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ SD_BUS_PROPERTY("Slice", "s", property_get_slice, 0, 0),
+ SD_BUS_PROPERTY("ControlGroup", "s", property_get_cgroup, 0, 0),
+ SD_BUS_PROPERTY("ControlGroupId", "t", NULL, offsetof(Unit, cgroup_id), 0),
+ SD_BUS_PROPERTY("MemoryCurrent", "t", property_get_current_memory, 0, 0),
+ SD_BUS_PROPERTY("MemoryAvailable", "t", property_get_available_memory, 0, 0),
+ SD_BUS_PROPERTY("CPUUsageNSec", "t", property_get_cpu_usage, 0, 0),
+ SD_BUS_PROPERTY("EffectiveCPUs", "ay", property_get_cpuset_cpus, 0, 0),
+ SD_BUS_PROPERTY("EffectiveMemoryNodes", "ay", property_get_cpuset_mems, 0, 0),
+ SD_BUS_PROPERTY("TasksCurrent", "t", property_get_current_tasks, 0, 0),
+ SD_BUS_PROPERTY("IPIngressBytes", "t", property_get_ip_counter, 0, 0),
+ SD_BUS_PROPERTY("IPIngressPackets", "t", property_get_ip_counter, 0, 0),
+ SD_BUS_PROPERTY("IPEgressBytes", "t", property_get_ip_counter, 0, 0),
+ SD_BUS_PROPERTY("IPEgressPackets", "t", property_get_ip_counter, 0, 0),
+ SD_BUS_PROPERTY("IOReadBytes", "t", property_get_io_counter, 0, 0),
+ SD_BUS_PROPERTY("IOReadOperations", "t", property_get_io_counter, 0, 0),
+ SD_BUS_PROPERTY("IOWriteBytes", "t", property_get_io_counter, 0, 0),
+ SD_BUS_PROPERTY("IOWriteOperations", "t", property_get_io_counter, 0, 0),
+
+ SD_BUS_METHOD_WITH_ARGS("GetProcesses",
+ SD_BUS_NO_ARGS,
+ SD_BUS_ARGS("a(sus)", processes),
+ bus_unit_method_get_processes,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+
+ SD_BUS_METHOD_WITH_ARGS("AttachProcesses",
+ SD_BUS_ARGS("s", subcgroup, "au", pids),
+ SD_BUS_NO_RESULT,
+ bus_unit_method_attach_processes,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+
+ SD_BUS_VTABLE_END
+};
+
+static int send_new_signal(sd_bus *bus, void *userdata) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
+ _cleanup_free_ char *p = NULL;
+ Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+
+ p = unit_dbus_path(u);
+ if (!p)
+ return -ENOMEM;
+
+ r = sd_bus_message_new_signal(
+ bus,
+ &m,
+ "/org/freedesktop/systemd1",
+ "org.freedesktop.systemd1.Manager",
+ "UnitNew");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(m, "so", u->id, p);
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(bus, m, NULL);
+}
+
+static int send_changed_signal(sd_bus *bus, void *userdata) {
+ _cleanup_free_ char *p = NULL;
+ Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+
+ p = unit_dbus_path(u);
+ if (!p)
+ return -ENOMEM;
+
+ /* Send a properties changed signal. First for the specific
+ * type, then for the generic unit. The clients may rely on
+ * this order to get atomic behavior if needed. */
+
+ r = sd_bus_emit_properties_changed_strv(
+ bus, p,
+ unit_dbus_interface_from_type(u->type),
+ NULL);
+ if (r < 0)
+ return r;
+
+ return sd_bus_emit_properties_changed_strv(
+ bus, p,
+ "org.freedesktop.systemd1.Unit",
+ NULL);
+}
+
+void bus_unit_send_change_signal(Unit *u) {
+ int r;
+ assert(u);
+
+ if (u->in_dbus_queue) {
+ LIST_REMOVE(dbus_queue, u->manager->dbus_unit_queue, u);
+ u->in_dbus_queue = false;
+ }
+
+ if (!u->id)
+ return;
+
+ r = bus_foreach_bus(u->manager, u->bus_track, u->sent_dbus_new_signal ? send_changed_signal : send_new_signal, u);
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to send unit change signal for %s: %m", u->id);
+
+ u->sent_dbus_new_signal = true;
+}
+
+void bus_unit_send_pending_change_signal(Unit *u, bool including_new) {
+
+ /* Sends out any pending change signals, but only if they really are pending. This call is used when we are
+ * about to change state in order to force out a PropertiesChanged signal beforehand if there was one pending
+ * so that clients can follow the full state transition */
+
+ if (!u->in_dbus_queue) /* If not enqueued, don't bother */
+ return;
+
+ if (!u->sent_dbus_new_signal && !including_new) /* If the unit was never announced, don't bother, it's fine if
+ * the unit appears in the new state right-away (except if the
+ * caller explicitly asked us to send it anyway) */
+ return;
+
+ if (MANAGER_IS_RELOADING(u->manager)) /* Don't generate unnecessary PropertiesChanged signals for the same unit
+ * when we are reloading. */
+ return;
+
+ bus_unit_send_change_signal(u);
+}
+
+int bus_unit_send_pending_freezer_message(Unit *u, bool cancelled) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+ int r;
+
+ assert(u);
+
+ if (!u->pending_freezer_message)
+ return 0;
+
+ if (cancelled)
+ r = sd_bus_message_new_method_error(
+ u->pending_freezer_message,
+ &reply,
+ &SD_BUS_ERROR_MAKE_CONST(
+ BUS_ERROR_FREEZE_CANCELLED, "Freeze operation aborted"));
+ else
+ r = sd_bus_message_new_method_return(u->pending_freezer_message, &reply);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_send(NULL, reply, NULL);
+ if (r < 0)
+ log_warning_errno(r, "Failed to send queued message, ignoring: %m");
+
+ u->pending_freezer_message = sd_bus_message_unref(u->pending_freezer_message);
+
+ return 0;
+}
+
+static int send_removed_signal(sd_bus *bus, void *userdata) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
+ _cleanup_free_ char *p = NULL;
+ Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+
+ p = unit_dbus_path(u);
+ if (!p)
+ return -ENOMEM;
+
+ r = sd_bus_message_new_signal(
+ bus,
+ &m,
+ "/org/freedesktop/systemd1",
+ "org.freedesktop.systemd1.Manager",
+ "UnitRemoved");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(m, "so", u->id, p);
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(bus, m, NULL);
+}
+
+void bus_unit_send_removed_signal(Unit *u) {
+ int r;
+ assert(u);
+
+ if (!u->sent_dbus_new_signal || u->in_dbus_queue)
+ bus_unit_send_change_signal(u);
+
+ if (!u->id)
+ return;
+
+ r = bus_foreach_bus(u->manager, u->bus_track, send_removed_signal, u);
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to send unit remove signal for %s: %m", u->id);
+}
+
+int bus_unit_queue_job_one(
+ sd_bus_message *message,
+ Unit *u,
+ JobType type,
+ JobMode mode,
+ BusUnitQueueFlags flags,
+ sd_bus_message *reply,
+ sd_bus_error *error) {
+
+ _cleanup_set_free_ Set *affected = NULL;
+ _cleanup_free_ char *job_path = NULL, *unit_path = NULL;
+ Job *j, *a;
+ int r;
+
+ if (FLAGS_SET(flags, BUS_UNIT_QUEUE_VERBOSE_REPLY)) {
+ affected = set_new(NULL);
+ if (!affected)
+ return -ENOMEM;
+ }
+
+ r = manager_add_job(u->manager, type, u, mode, affected, error, &j);
+ if (r < 0)
+ return r;
+
+ r = bus_job_track_sender(j, message);
+ if (r < 0)
+ return r;
+
+ /* Before we send the method reply, force out the announcement JobNew for this job */
+ bus_job_send_pending_change_signal(j, true);
+
+ job_path = job_dbus_path(j);
+ if (!job_path)
+ return -ENOMEM;
+
+ /* The classic response is just a job object path */
+ if (!FLAGS_SET(flags, BUS_UNIT_QUEUE_VERBOSE_REPLY))
+ return sd_bus_message_append(reply, "o", job_path);
+
+ /* In verbose mode respond with the anchor job plus everything that has been affected */
+
+ unit_path = unit_dbus_path(j->unit);
+ if (!unit_path)
+ return -ENOMEM;
+
+ r = sd_bus_message_append(reply, "uosos",
+ j->id, job_path,
+ j->unit->id, unit_path,
+ job_type_to_string(j->type));
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_open_container(reply, 'a', "(uosos)");
+ if (r < 0)
+ return r;
+
+ SET_FOREACH(a, affected) {
+ if (a->id == j->id)
+ continue;
+
+ /* Free paths from previous iteration */
+ job_path = mfree(job_path);
+ unit_path = mfree(unit_path);
+
+ job_path = job_dbus_path(a);
+ if (!job_path)
+ return -ENOMEM;
+
+ unit_path = unit_dbus_path(a->unit);
+ if (!unit_path)
+ return -ENOMEM;
+
+ r = sd_bus_message_append(reply, "(uosos)",
+ a->id, job_path,
+ a->unit->id, unit_path,
+ job_type_to_string(a->type));
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+int bus_unit_queue_job(
+ sd_bus_message *message,
+ Unit *u,
+ JobType type,
+ JobMode mode,
+ BusUnitQueueFlags flags,
+ sd_bus_error *error) {
+
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+ int r;
+
+ assert(message);
+ assert(u);
+ assert(type >= 0 && type < _JOB_TYPE_MAX);
+ assert(mode >= 0 && mode < _JOB_MODE_MAX);
+
+ r = mac_selinux_unit_access_check(
+ u, message,
+ job_type_to_access_method(type),
+ error);
+ if (r < 0)
+ return r;
+
+ if (FLAGS_SET(flags, BUS_UNIT_QUEUE_RELOAD_IF_POSSIBLE) && unit_can_reload(u)) {
+ if (type == JOB_RESTART)
+ type = JOB_RELOAD_OR_START;
+ else if (type == JOB_TRY_RESTART)
+ type = JOB_TRY_RELOAD;
+ }
+
+ if (type == JOB_STOP &&
+ IN_SET(u->load_state, UNIT_NOT_FOUND, UNIT_ERROR, UNIT_BAD_SETTING) &&
+ unit_active_state(u) == UNIT_INACTIVE)
+ return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Unit %s not loaded.", u->id);
+
+ if ((type == JOB_START && u->refuse_manual_start) ||
+ (type == JOB_STOP && u->refuse_manual_stop) ||
+ (IN_SET(type, JOB_RESTART, JOB_TRY_RESTART) && (u->refuse_manual_start || u->refuse_manual_stop)) ||
+ (type == JOB_RELOAD_OR_START && job_type_collapse(type, u) == JOB_START && u->refuse_manual_start))
+ return sd_bus_error_setf(error, BUS_ERROR_ONLY_BY_DEPENDENCY, "Operation refused, unit %s may be requested by dependency only (it is configured to refuse manual start/stop).", u->id);
+
+ r = sd_bus_message_new_method_return(message, &reply);
+ if (r < 0)
+ return r;
+
+ r = bus_unit_queue_job_one(message, u, type, mode, flags, reply, error);
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(NULL, reply, NULL);
+}
+
+static int bus_unit_set_live_property(
+ Unit *u,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ int r;
+
+ assert(u);
+ assert(name);
+ assert(message);
+
+ /* Handles setting properties both "live" (i.e. at any time during runtime), and during creation (for
+ * transient units that are being created). */
+
+ if (streq(name, "Description")) {
+ const char *d;
+
+ r = sd_bus_message_read(message, "s", &d);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ r = unit_set_description(u, d);
+ if (r < 0)
+ return r;
+
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "Description=%s", d);
+ }
+
+ return 1;
+ }
+
+ /* A setting that only applies to active units. We don't actually write this to /run, this state is
+ * managed internally. "+foo" sets flag foo, "-foo" unsets flag foo, just "foo" resets flags to
+ * foo. The last type cannot be mixed with "+" or "-". */
+
+ if (streq(name, "Markers")) {
+ unsigned settings = 0, mask = 0;
+ bool some_plus_minus = false, some_absolute = false;
+
+ r = sd_bus_message_enter_container(message, 'a', "s");
+ if (r < 0)
+ return r;
+
+ for (;;) {
+ const char *word;
+ bool b;
+
+ r = sd_bus_message_read(message, "s", &word);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ if (IN_SET(word[0], '+', '-')) {
+ b = word[0] == '+';
+ word++;
+ some_plus_minus = true;
+ } else {
+ b = true;
+ some_absolute = true;
+ }
+
+ UnitMarker m = unit_marker_from_string(word);
+ if (m < 0)
+ return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING,
+ "Unknown marker \"%s\".", word);
+
+ SET_FLAG(settings, 1u << m, b);
+ SET_FLAG(mask, 1u << m, true);
+ }
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (some_plus_minus && some_absolute)
+ return sd_bus_error_set(error, BUS_ERROR_BAD_UNIT_SETTING, "Bad marker syntax.");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (some_absolute)
+ u->markers = settings;
+ else
+ u->markers = settings | (u->markers & ~mask);
+ }
+
+ return 1;
+ }
+
+ return 0;
+}
+
+static int bus_set_transient_emergency_action(
+ Unit *u,
+ const char *name,
+ EmergencyAction *p,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ const char *s;
+ EmergencyAction v;
+ int r;
+ bool system;
+
+ assert(p);
+
+ r = sd_bus_message_read(message, "s", &s);
+ if (r < 0)
+ return r;
+
+ system = MANAGER_IS_SYSTEM(u->manager);
+ r = parse_emergency_action(s, system, &v);
+ if (r < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ r == -EOPNOTSUPP ? "%s setting invalid for manager type: %s"
+ : "Invalid %s setting: %s",
+ name, s);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ *p = v;
+ unit_write_settingf(u, flags, name,
+ "%s=%s", name, s);
+ }
+
+ return 1;
+}
+
+static int bus_set_transient_exit_status(
+ Unit *u,
+ const char *name,
+ int *p,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ int32_t k;
+ int r;
+
+ assert(p);
+
+ r = sd_bus_message_read(message, "i", &k);
+ if (r < 0)
+ return r;
+
+ if (k > 255)
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Exit status must be in range 0…255 or negative.");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ *p = k < 0 ? -1 : k;
+
+ if (k < 0)
+ unit_write_settingf(u, flags, name, "%s=", name);
+ else
+ unit_write_settingf(u, flags, name, "%s=%i", name, k);
+ }
+
+ return 1;
+}
+
+static BUS_DEFINE_SET_TRANSIENT_PARSE(collect_mode, CollectMode, collect_mode_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(job_mode, JobMode, job_mode_from_string);
+
+static int bus_set_transient_conditions(
+ Unit *u,
+ const char *name,
+ Condition **list,
+ bool is_condition,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ const char *type_name, *param;
+ int trigger, negate, r;
+ bool empty = true;
+
+ assert(list);
+
+ r = sd_bus_message_enter_container(message, 'a', "(sbbs)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(sbbs)", &type_name, &trigger, &negate, &param)) > 0) {
+ ConditionType t;
+
+ t = is_condition ? condition_type_from_string(type_name) : assert_type_from_string(type_name);
+ if (t < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid condition type: %s", type_name);
+
+ if (isempty(param))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Condition parameter in %s is empty", type_name);
+
+ if (condition_takes_path(t) && !path_is_absolute(param))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path in condition %s is not absolute: %s", type_name, param);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ Condition *c;
+
+ c = condition_new(t, param, trigger, negate);
+ if (!c)
+ return -ENOMEM;
+
+ LIST_PREPEND(conditions, *list, c);
+
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name,
+ "%s=%s%s%s", type_name,
+ trigger ? "|" : "", negate ? "!" : "", param);
+ }
+
+ empty = false;
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags) && empty) {
+ *list = condition_free_list(*list);
+ unit_write_settingf(u, flags, name, "%sNull=", is_condition ? "Condition" : "Assert");
+ }
+
+ return 1;
+}
+
+static int bus_unit_set_transient_property(
+ Unit *u,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ UnitDependency d;
+ int r;
+
+ assert(u);
+ assert(name);
+ assert(message);
+
+ /* Handles settings when transient units are created. This settings cannot be altered anymore after
+ * the unit has been created. */
+
+ if (streq(name, "SourcePath"))
+ return bus_set_transient_path(u, name, &u->source_path, message, flags, error);
+
+ if (streq(name, "StopWhenUnneeded"))
+ return bus_set_transient_bool(u, name, &u->stop_when_unneeded, message, flags, error);
+
+ if (streq(name, "RefuseManualStart"))
+ return bus_set_transient_bool(u, name, &u->refuse_manual_start, message, flags, error);
+
+ if (streq(name, "RefuseManualStop"))
+ return bus_set_transient_bool(u, name, &u->refuse_manual_stop, message, flags, error);
+
+ if (streq(name, "AllowIsolate"))
+ return bus_set_transient_bool(u, name, &u->allow_isolate, message, flags, error);
+
+ if (streq(name, "DefaultDependencies"))
+ return bus_set_transient_bool(u, name, &u->default_dependencies, message, flags, error);
+
+ if (streq(name, "OnSuccessJobMode"))
+ return bus_set_transient_job_mode(u, name, &u->on_success_job_mode, message, flags, error);
+
+ if (streq(name, "OnFailureJobMode"))
+ return bus_set_transient_job_mode(u, name, &u->on_failure_job_mode, message, flags, error);
+
+ if (streq(name, "IgnoreOnIsolate"))
+ return bus_set_transient_bool(u, name, &u->ignore_on_isolate, message, flags, error);
+
+ if (streq(name, "JobTimeoutUSec")) {
+ r = bus_set_transient_usec_fix_0(u, name, &u->job_timeout, message, flags, error);
+ if (r >= 0 && !UNIT_WRITE_FLAGS_NOOP(flags) && !u->job_running_timeout_set)
+ u->job_running_timeout = u->job_timeout;
+ }
+
+ if (streq(name, "JobRunningTimeoutUSec")) {
+ r = bus_set_transient_usec_fix_0(u, name, &u->job_running_timeout, message, flags, error);
+ if (r >= 0 && !UNIT_WRITE_FLAGS_NOOP(flags))
+ u->job_running_timeout_set = true;
+
+ return r;
+ }
+
+ if (streq(name, "JobTimeoutAction"))
+ return bus_set_transient_emergency_action(u, name, &u->job_timeout_action, message, flags, error);
+
+ if (streq(name, "JobTimeoutRebootArgument"))
+ return bus_set_transient_string(u, name, &u->job_timeout_reboot_arg, message, flags, error);
+
+ if (streq(name, "StartLimitIntervalUSec"))
+ return bus_set_transient_usec(u, name, &u->start_ratelimit.interval, message, flags, error);
+
+ if (streq(name, "StartLimitBurst"))
+ return bus_set_transient_unsigned(u, name, &u->start_ratelimit.burst, message, flags, error);
+
+ if (streq(name, "StartLimitAction"))
+ return bus_set_transient_emergency_action(u, name, &u->start_limit_action, message, flags, error);
+
+ if (streq(name, "FailureAction"))
+ return bus_set_transient_emergency_action(u, name, &u->failure_action, message, flags, error);
+
+ if (streq(name, "SuccessAction"))
+ return bus_set_transient_emergency_action(u, name, &u->success_action, message, flags, error);
+
+ if (streq(name, "FailureActionExitStatus"))
+ return bus_set_transient_exit_status(u, name, &u->failure_action_exit_status, message, flags, error);
+
+ if (streq(name, "SuccessActionExitStatus"))
+ return bus_set_transient_exit_status(u, name, &u->success_action_exit_status, message, flags, error);
+
+ if (streq(name, "RebootArgument"))
+ return bus_set_transient_string(u, name, &u->reboot_arg, message, flags, error);
+
+ if (streq(name, "CollectMode"))
+ return bus_set_transient_collect_mode(u, name, &u->collect_mode, message, flags, error);
+
+ if (streq(name, "Conditions"))
+ return bus_set_transient_conditions(u, name, &u->conditions, true, message, flags, error);
+
+ if (streq(name, "Asserts"))
+ return bus_set_transient_conditions(u, name, &u->asserts, false, message, flags, error);
+
+ if (streq(name, "Documentation")) {
+ _cleanup_strv_free_ char **l = NULL;
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ STRV_FOREACH(p, l)
+ if (!documentation_url_is_valid(*p))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid URL in %s: %s", name, *p);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (strv_isempty(l)) {
+ u->documentation = strv_free(u->documentation);
+ unit_write_settingf(u, flags, name, "%s=", name);
+ } else {
+ strv_extend_strv(&u->documentation, l, false);
+
+ STRV_FOREACH(p, l)
+ unit_write_settingf(u, flags, name, "%s=%s", name, *p);
+ }
+ }
+
+ return 1;
+
+ } else if (streq(name, "Slice")) {
+ Unit *slice;
+ const char *s;
+
+ if (!UNIT_HAS_CGROUP_CONTEXT(u))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "The slice property is only available for units with control groups.");
+ if (u->type == UNIT_SLICE)
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Slice may not be set for slice units.");
+ if (unit_has_name(u, SPECIAL_INIT_SCOPE))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot set slice for init.scope");
+
+ r = sd_bus_message_read(message, "s", &s);
+ if (r < 0)
+ return r;
+
+ if (!unit_name_is_valid(s, UNIT_NAME_PLAIN))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid unit name '%s'", s);
+
+ /* Note that we do not dispatch the load queue here yet, as we don't want our own transient unit to be
+ * loaded while we are still setting it up. Or in other words, we use manager_load_unit_prepare()
+ * instead of manager_load_unit() on purpose, here. */
+ r = manager_load_unit_prepare(u->manager, s, NULL, error, &slice);
+ if (r < 0)
+ return r;
+
+ if (slice->type != UNIT_SLICE)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unit name '%s' is not a slice", s);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ r = unit_set_slice(u, slice);
+ if (r < 0)
+ return r;
+
+ unit_write_settingf(u, flags|UNIT_PRIVATE, name, "Slice=%s", s);
+ }
+
+ return 1;
+
+ } else if (streq(name, "RequiresMountsFor")) {
+ _cleanup_strv_free_ char **l = NULL;
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ STRV_FOREACH(p, l) {
+ path_simplify(*p);
+
+ if (!path_is_absolute(*p))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path specified in %s is not absolute: %s", name, *p);
+
+ if (!path_is_valid(*p))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path specified in %s has invalid length: %s", name, *p);
+
+ if (!path_is_normalized(*p))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path specified in %s is not normalized: %s", name, *p);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ r = unit_require_mounts_for(u, *p, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Failed to add required mount \"%s\": %m", *p);
+
+ unit_write_settingf(u, flags, name, "%s=%s", name, *p);
+ }
+ }
+
+ return 1;
+ }
+
+ if (streq(name, "RequiresOverridable"))
+ d = UNIT_REQUIRES; /* redirect for obsolete unit dependency type */
+ else if (streq(name, "RequisiteOverridable"))
+ d = UNIT_REQUISITE; /* same here */
+ else
+ d = unit_dependency_from_string(name);
+
+ if (d >= 0) {
+ const char *other;
+
+ if (!IN_SET(d,
+ UNIT_REQUIRES,
+ UNIT_REQUISITE,
+ UNIT_WANTS,
+ UNIT_BINDS_TO,
+ UNIT_PART_OF,
+ UNIT_UPHOLDS,
+ UNIT_CONFLICTS,
+ UNIT_BEFORE,
+ UNIT_AFTER,
+ UNIT_ON_SUCCESS,
+ UNIT_ON_FAILURE,
+ UNIT_PROPAGATES_RELOAD_TO,
+ UNIT_RELOAD_PROPAGATED_FROM,
+ UNIT_PROPAGATES_STOP_TO,
+ UNIT_STOP_PROPAGATED_FROM,
+ UNIT_JOINS_NAMESPACE_OF))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Dependency type %s may not be created transiently.", unit_dependency_to_string(d));
+
+ r = sd_bus_message_enter_container(message, 'a', "s");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "s", &other)) > 0) {
+ if (!unit_name_is_valid(other, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid unit name %s", other);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *label = NULL;
+
+ r = unit_add_dependency_by_name(u, d, other, true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+
+ label = strjoin(name, "-", other);
+ if (!label)
+ return -ENOMEM;
+
+ unit_write_settingf(u, flags, label, "%s=%s", unit_dependency_to_string(d), other);
+ }
+
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ return 1;
+
+ } else if (streq(name, "AddRef")) {
+
+ int b;
+
+ /* Why is this called "AddRef" rather than just "Ref", or "Reference"? There's already a "Ref()" method
+ * on the Unit interface, and it's probably not a good idea to expose a property and a method on the
+ * same interface (well, strictly speaking AddRef isn't exposed as full property, we just read it for
+ * transient units, but still). And "References" and "ReferencedBy" is already used as unit reference
+ * dependency type, hence let's not confuse things with that.
+ *
+ * Note that we don't actually add the reference to the bus track. We do that only after the setup of
+ * the transient unit is complete, so that setting this property multiple times in the same transient
+ * unit creation call doesn't count as individual references. */
+
+ r = sd_bus_message_read(message, "b", &b);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags))
+ u->bus_track_add = b;
+
+ return 1;
+ }
+
+ return 0;
+}
+
+int bus_unit_set_properties(
+ Unit *u,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ bool commit,
+ sd_bus_error *error) {
+
+ bool for_real = false;
+ unsigned n = 0;
+ int r;
+
+ assert(u);
+ assert(message);
+
+ /* We iterate through the array twice. First run we just check
+ * if all passed data is valid, second run actually applies
+ * it. This is to implement transaction-like behaviour without
+ * actually providing full transactions. */
+
+ r = sd_bus_message_enter_container(message, 'a', "(sv)");
+ if (r < 0)
+ goto error;
+
+ for (;;) {
+ const char *name;
+ UnitWriteFlags f;
+
+ r = sd_bus_message_enter_container(message, 'r', "sv");
+ if (r < 0)
+ goto error;
+ if (r == 0) {
+ if (for_real || UNIT_WRITE_FLAGS_NOOP(flags))
+ break;
+
+ /* Reached EOF. Let's try again, and this time for realz... */
+ r = sd_bus_message_rewind(message, false);
+ if (r < 0)
+ goto error;
+
+ for_real = true;
+ continue;
+ }
+
+ r = sd_bus_message_read(message, "s", &name);
+ if (r < 0)
+ goto error;
+
+ r = sd_bus_message_enter_container(message, 'v', NULL);
+ if (r < 0)
+ goto error;
+
+ /* If not for real, then mask out the two target flags */
+ f = for_real ? flags : (flags & ~(UNIT_RUNTIME|UNIT_PERSISTENT));
+
+ if (UNIT_VTABLE(u)->bus_set_property)
+ r = UNIT_VTABLE(u)->bus_set_property(u, name, message, f, error);
+ else
+ r = 0;
+ if (r == 0 && u->transient && u->load_state == UNIT_STUB)
+ r = bus_unit_set_transient_property(u, name, message, f, error);
+ if (r == 0)
+ r = bus_unit_set_live_property(u, name, message, f, error);
+ if (r < 0)
+ goto error;
+
+ if (r == 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_PROPERTY_READ_ONLY,
+ "Cannot set property %s, or unknown property.", name);
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ goto error;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ goto error;
+
+ n += for_real;
+ }
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ goto error;
+
+ if (commit && n > 0 && UNIT_VTABLE(u)->bus_commit_properties)
+ UNIT_VTABLE(u)->bus_commit_properties(u);
+
+ return n;
+
+ error:
+ /* Pretty much any of the calls above can fail if the message is not formed properly
+ * or if it has unexpected contents. Fill in a more informative error message here. */
+ if (sd_bus_error_is_set(error))
+ return r;
+ return sd_bus_error_set_errnof(error, r,
+ r == -ENXIO ? "Failed to set unit properties: Unexpected message contents"
+ : "Failed to set unit properties: %m");
+}
+
+int bus_unit_validate_load_state(Unit *u, sd_bus_error *error) {
+ assert(u);
+
+ /* Generates a pretty error if a unit isn't properly loaded. */
+
+ switch (u->load_state) {
+
+ case UNIT_LOADED:
+ return 0;
+
+ case UNIT_NOT_FOUND:
+ return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Unit %s not found.", u->id);
+
+ case UNIT_BAD_SETTING:
+ return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING, "Unit %s has a bad unit file setting.", u->id);
+
+ case UNIT_ERROR: /* Only show .load_error in UNIT_ERROR state */
+ return sd_bus_error_set_errnof(error, u->load_error,
+ "Unit %s failed to load properly, please adjust/correct and reload service manager: %m", u->id);
+
+ case UNIT_MASKED:
+ return sd_bus_error_setf(error, BUS_ERROR_UNIT_MASKED, "Unit %s is masked.", u->id);
+
+ case UNIT_STUB:
+ case UNIT_MERGED:
+ default:
+ return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Unexpected load state of unit %s", u->id);
+ }
+}
+
+static int bus_unit_track_handler(sd_bus_track *t, void *userdata) {
+ Unit *u = ASSERT_PTR(userdata);
+
+ assert(t);
+
+ u->bus_track = sd_bus_track_unref(u->bus_track); /* make sure we aren't called again */
+
+ /* If the client that tracks us disappeared, then there's reason to believe that the cgroup is empty now too,
+ * let's see */
+ unit_add_to_cgroup_empty_queue(u);
+
+ /* Also add the unit to the GC queue, after all if the client left it might be time to GC this unit */
+ unit_add_to_gc_queue(u);
+
+ return 0;
+}
+
+static int bus_unit_allocate_bus_track(Unit *u) {
+ int r;
+
+ assert(u);
+
+ if (u->bus_track)
+ return 0;
+
+ r = sd_bus_track_new(u->manager->api_bus, &u->bus_track, bus_unit_track_handler, u);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_track_set_recursive(u->bus_track, true);
+ if (r < 0) {
+ u->bus_track = sd_bus_track_unref(u->bus_track);
+ return r;
+ }
+
+ return 0;
+}
+
+int bus_unit_track_add_name(Unit *u, const char *name) {
+ int r;
+
+ assert(u);
+
+ r = bus_unit_allocate_bus_track(u);
+ if (r < 0)
+ return r;
+
+ return sd_bus_track_add_name(u->bus_track, name);
+}
+
+int bus_unit_track_add_sender(Unit *u, sd_bus_message *m) {
+ int r;
+
+ assert(u);
+
+ r = bus_unit_allocate_bus_track(u);
+ if (r < 0)
+ return r;
+
+ return sd_bus_track_add_sender(u->bus_track, m);
+}
+
+int bus_unit_track_remove_sender(Unit *u, sd_bus_message *m) {
+ assert(u);
+
+ /* If we haven't allocated the bus track object yet, then there's definitely no reference taken yet,
+ * return an error */
+ if (!u->bus_track)
+ return -EUNATCH;
+
+ return sd_bus_track_remove_sender(u->bus_track, m);
+}
diff --git a/src/core/dbus-unit.h b/src/core/dbus-unit.h
new file mode 100644
index 0000000..6b7828e
--- /dev/null
+++ b/src/core/dbus-unit.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+
+#include "unit.h"
+
+extern const sd_bus_vtable bus_unit_vtable[];
+extern const sd_bus_vtable bus_unit_cgroup_vtable[];
+
+void bus_unit_send_change_signal(Unit *u);
+void bus_unit_send_pending_change_signal(Unit *u, bool including_new);
+int bus_unit_send_pending_freezer_message(Unit *u, bool cancelled);
+void bus_unit_send_removed_signal(Unit *u);
+
+int bus_unit_method_start_generic(sd_bus_message *message, Unit *u, JobType job_type, bool reload_if_possible, sd_bus_error *error);
+int bus_unit_method_enqueue_job(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_unit_method_kill(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_unit_method_reset_failed(sd_bus_message *message, void *userdata, sd_bus_error *error);
+
+int bus_unit_set_properties(Unit *u, sd_bus_message *message, UnitWriteFlags flags, bool commit, sd_bus_error *error);
+int bus_unit_method_set_properties(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_unit_method_get_processes(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_unit_method_ref(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_unit_method_unref(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_unit_method_clean(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_unit_method_freeze(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_unit_method_thaw(sd_bus_message *message, void *userdata, sd_bus_error *error);
+
+typedef enum BusUnitQueueFlags {
+ BUS_UNIT_QUEUE_RELOAD_IF_POSSIBLE = 1 << 0,
+ BUS_UNIT_QUEUE_VERBOSE_REPLY = 1 << 1,
+} BusUnitQueueFlags;
+
+int bus_unit_queue_job_one(
+ sd_bus_message *message,
+ Unit *u,
+ JobType type,
+ JobMode mode,
+ BusUnitQueueFlags flags,
+ sd_bus_message *reply,
+ sd_bus_error *error);
+int bus_unit_queue_job(
+ sd_bus_message *message,
+ Unit *u,
+ JobType type,
+ JobMode mode,
+ BusUnitQueueFlags flags,
+ sd_bus_error *error);
+int bus_unit_validate_load_state(Unit *u, sd_bus_error *error);
+
+int bus_unit_track_add_name(Unit *u, const char *name);
+int bus_unit_track_add_sender(Unit *u, sd_bus_message *m);
+int bus_unit_track_remove_sender(Unit *u, sd_bus_message *m);
diff --git a/src/core/dbus-util.c b/src/core/dbus-util.c
new file mode 100644
index 0000000..ea044b5
--- /dev/null
+++ b/src/core/dbus-util.c
@@ -0,0 +1,262 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bus-polkit.h"
+#include "bus-util.h"
+#include "dbus-util.h"
+#include "escape.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "unit-printf.h"
+#include "user-util.h"
+#include "unit.h"
+
+int bus_property_get_triggered_unit(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Unit *u = userdata, *trigger;
+
+ assert(bus);
+ assert(reply);
+ assert(u);
+
+ trigger = UNIT_TRIGGER(u);
+
+ return sd_bus_message_append(reply, "s", trigger ? trigger->id : NULL);
+}
+
+BUS_DEFINE_SET_TRANSIENT(mode_t, "u", uint32_t, mode_t, "%04o");
+BUS_DEFINE_SET_TRANSIENT(unsigned, "u", uint32_t, unsigned, "%" PRIu32);
+
+static inline bool valid_user_group_name_or_id_relaxed(const char *u) {
+ return valid_user_group_name(u, VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX);
+}
+
+BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(user_relaxed, valid_user_group_name_or_id_relaxed);
+BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(path, path_is_absolute);
+
+int bus_set_transient_string(
+ Unit *u,
+ const char *name,
+ char **p,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ const char *v;
+ int r;
+
+ assert(p);
+
+ r = sd_bus_message_read(message, "s", &v);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ r = free_and_strdup(p, empty_to_null(v));
+ if (r < 0)
+ return r;
+
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name,
+ "%s=%s", name, strempty(v));
+ }
+
+ return 1;
+}
+
+int bus_set_transient_bool(
+ Unit *u,
+ const char *name,
+ bool *p,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ int v, r;
+
+ assert(p);
+
+ r = sd_bus_message_read(message, "b", &v);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ *p = v;
+ unit_write_settingf(u, flags, name, "%s=%s", name, yes_no(v));
+ }
+
+ return 1;
+}
+
+int bus_set_transient_usec_internal(
+ Unit *u,
+ const char *name,
+ usec_t *p,
+ bool fix_0,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ uint64_t v;
+ int r;
+
+ assert(p);
+
+ r = sd_bus_message_read(message, "t", &v);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (fix_0)
+ *p = v != 0 ? v: USEC_INFINITY;
+ else
+ *p = v;
+
+ char *n = strndupa_safe(name, strlen(name) - 4);
+ unit_write_settingf(u, flags, name, "%sSec=%s", n, FORMAT_TIMESPAN(v, USEC_PER_MSEC));
+ }
+
+ return 1;
+}
+
+int bus_verify_manage_units_async_full(
+ Unit *u,
+ const char *verb,
+ int capability,
+ const char *polkit_message,
+ bool interactive,
+ sd_bus_message *call,
+ sd_bus_error *error) {
+
+ const char *details[9] = {
+ "unit", u->id,
+ "verb", verb,
+ };
+
+ if (polkit_message) {
+ details[4] = "polkit.message";
+ details[5] = polkit_message;
+ details[6] = "polkit.gettext_domain";
+ details[7] = GETTEXT_PACKAGE;
+ }
+
+ return bus_verify_polkit_async(
+ call,
+ capability,
+ "org.freedesktop.systemd1.manage-units",
+ details,
+ interactive,
+ UID_INVALID,
+ &u->manager->polkit_registry,
+ error);
+}
+
+/* ret_format_str is an accumulator, so if it has any pre-existing content, new options will be appended to it */
+int bus_read_mount_options(
+ sd_bus_message *message,
+ sd_bus_error *error,
+ MountOptions **ret_options,
+ char **ret_format_str,
+ const char *separator) {
+
+ _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
+ _cleanup_free_ char *format_str = NULL;
+ const char *mount_options, *partition;
+ int r;
+
+ assert(message);
+ assert(ret_options);
+ assert(separator);
+
+ r = sd_bus_message_enter_container(message, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(ss)", &partition, &mount_options)) > 0) {
+ _cleanup_free_ char *escaped = NULL;
+ _cleanup_free_ MountOptions *o = NULL;
+ PartitionDesignator partition_designator;
+
+ if (chars_intersect(mount_options, WHITESPACE))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Invalid mount options string, contains whitespace character(s): %s", mount_options);
+
+ partition_designator = partition_designator_from_string(partition);
+ if (partition_designator < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid partition name %s", partition);
+
+ /* Need to store the options with the escapes, so that they can be parsed again */
+ escaped = shell_escape(mount_options, ":");
+ if (!escaped)
+ return -ENOMEM;
+
+ if (!strextend_with_separator(&format_str, separator, partition, ":", escaped))
+ return -ENOMEM;
+
+ o = new(MountOptions, 1);
+ if (!o)
+ return -ENOMEM;
+ *o = (MountOptions) {
+ .partition_designator = partition_designator,
+ .options = strdup(mount_options),
+ };
+ if (!o->options)
+ return -ENOMEM;
+ LIST_APPEND(mount_options, options, TAKE_PTR(o));
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (options) {
+ if (ret_format_str) {
+ char *final = strjoin(*ret_format_str, !isempty(*ret_format_str) ? separator : "", format_str);
+ if (!final)
+ return -ENOMEM;
+ free_and_replace(*ret_format_str, final);
+ }
+ LIST_JOIN(mount_options, *ret_options, options);
+ }
+
+ return 0;
+}
+
+int bus_property_get_activation_details(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ActivationDetails **details = ASSERT_PTR(userdata);
+ _cleanup_strv_free_ char **pairs = NULL;
+ int r;
+
+ assert(reply);
+
+ r = activation_details_append_pair(*details, &pairs);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_open_container(reply, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ STRV_FOREACH_PAIR(key, value, pairs) {
+ r = sd_bus_message_append(reply, "(ss)", *key, *value);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
diff --git a/src/core/dbus-util.h b/src/core/dbus-util.h
new file mode 100644
index 0000000..e12631a
--- /dev/null
+++ b/src/core/dbus-util.h
@@ -0,0 +1,255 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+
+#include "dissect-image.h"
+#include "unit.h"
+
+int bus_property_get_triggered_unit(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error);
+
+#define BUS_DEFINE_SET_TRANSIENT(function, bus_type, type, cast_type, fmt) \
+ int bus_set_transient_##function( \
+ Unit *u, \
+ const char *name, \
+ cast_type *p, \
+ sd_bus_message *message, \
+ UnitWriteFlags flags, \
+ sd_bus_error *error) { \
+ \
+ type v; \
+ int r; \
+ \
+ assert(p); \
+ \
+ r = sd_bus_message_read(message, bus_type, &v); \
+ if (r < 0) \
+ return r; \
+ \
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \
+ *p = (cast_type) v; \
+ unit_write_settingf(u, flags, name, \
+ "%s=" fmt, name, v); \
+ } \
+ \
+ return 1; \
+ }
+
+#define BUS_DEFINE_SET_TRANSIENT_IS_VALID(function, bus_type, type, cast_type, fmt, check) \
+ int bus_set_transient_##function( \
+ Unit *u, \
+ const char *name, \
+ cast_type *p, \
+ sd_bus_message *message, \
+ UnitWriteFlags flags, \
+ sd_bus_error *error) { \
+ \
+ type v; \
+ int r; \
+ \
+ assert(p); \
+ \
+ r = sd_bus_message_read(message, bus_type, &v); \
+ if (r < 0) \
+ return r; \
+ \
+ if (!check(v)) \
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \
+ "Invalid %s setting: " fmt, name, v); \
+ \
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \
+ *p = (cast_type) v; \
+ unit_write_settingf(u, flags, name, \
+ "%s=" fmt, name, v); \
+ } \
+ \
+ return 1; \
+ }
+
+#define BUS_DEFINE_SET_TRANSIENT_TO_STRING(function, bus_type, type, cast_type, fmt, to_string) \
+ int bus_set_transient_##function( \
+ Unit *u, \
+ const char *name, \
+ cast_type *p, \
+ sd_bus_message *message, \
+ UnitWriteFlags flags, \
+ sd_bus_error *error) { \
+ \
+ const char *s; \
+ type v; \
+ int r; \
+ \
+ assert(p); \
+ \
+ r = sd_bus_message_read(message, bus_type, &v); \
+ if (r < 0) \
+ return r; \
+ \
+ s = to_string(v); \
+ if (!s) \
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \
+ "Invalid %s setting: " fmt, name, v); \
+ \
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \
+ *p = (cast_type) v; \
+ unit_write_settingf(u, flags, name, \
+ "%s=%s", name, s); \
+ } \
+ \
+ return 1; \
+ }
+
+#define BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(function, bus_type, type, cast_type, fmt, to_string) \
+ int bus_set_transient_##function( \
+ Unit *u, \
+ const char *name, \
+ cast_type *p, \
+ sd_bus_message *message, \
+ UnitWriteFlags flags, \
+ sd_bus_error *error) { \
+ \
+ _cleanup_free_ char *s = NULL; \
+ type v; \
+ int r; \
+ \
+ assert(p); \
+ \
+ r = sd_bus_message_read(message, bus_type, &v); \
+ if (r < 0) \
+ return r; \
+ \
+ r = to_string(v, &s); \
+ if (r == -EINVAL) \
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \
+ "Invalid %s setting: " fmt, name, v); \
+ if (r < 0) \
+ return r; \
+ \
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \
+ *p = (cast_type) v; \
+ unit_write_settingf(u, flags, name, \
+ "%s=%s", \
+ name, strempty(s)); \
+ } \
+ \
+ return 1; \
+ }
+
+#define BUS_DEFINE_SET_TRANSIENT_PARSE(function, type, parse) \
+ int bus_set_transient_##function( \
+ Unit *u, \
+ const char *name, \
+ type *p, \
+ sd_bus_message *message, \
+ UnitWriteFlags flags, \
+ sd_bus_error *error) { \
+ \
+ const char *s; \
+ type v; \
+ int r; \
+ \
+ assert(p); \
+ \
+ r = sd_bus_message_read(message, "s", &s); \
+ if (r < 0) \
+ return r; \
+ \
+ v = parse(s); \
+ if (v < 0) \
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \
+ "Invalid %s setting: %s", name, s); \
+ \
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \
+ *p = v; \
+ unit_write_settingf(u, flags, name, \
+ "%s=%s", name, s); \
+ } \
+ \
+ return 1; \
+ }
+
+#define BUS_DEFINE_SET_TRANSIENT_PARSE_PTR(function, type, parse) \
+ int bus_set_transient_##function( \
+ Unit *u, \
+ const char *name, \
+ type *p, \
+ sd_bus_message *message, \
+ UnitWriteFlags flags, \
+ sd_bus_error *error) { \
+ \
+ const char *s; \
+ type v; \
+ int r; \
+ \
+ assert(p); \
+ \
+ r = sd_bus_message_read(message, "s", &s); \
+ if (r < 0) \
+ return r; \
+ \
+ r = parse(s, &v); \
+ if (r < 0) \
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \
+ "Invalid %s setting: %s", name, s); \
+ \
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \
+ *p = v; \
+ unit_write_settingf(u, flags, name, \
+ "%s=%s", name, strempty(s)); \
+ } \
+ \
+ return 1; \
+ }
+
+#define BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(function, check) \
+ int bus_set_transient_##function( \
+ Unit *u, \
+ const char *name, \
+ char **p, \
+ sd_bus_message *message, \
+ UnitWriteFlags flags, \
+ sd_bus_error *error) { \
+ \
+ const char *v; \
+ int r; \
+ \
+ assert(p); \
+ \
+ r = sd_bus_message_read(message, "s", &v); \
+ if (r < 0) \
+ return r; \
+ \
+ if (!isempty(v) && !check(v)) \
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \
+ "Invalid %s setting: %s", name, v); \
+ \
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \
+ r = free_and_strdup(p, empty_to_null(v)); \
+ if (r < 0) \
+ return r; \
+ \
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, \
+ "%s=%s", name, strempty(v)); \
+ } \
+ \
+ return 1; \
+ }
+
+int bus_set_transient_mode_t(Unit *u, const char *name, mode_t *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_set_transient_unsigned(Unit *u, const char *name, unsigned *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_set_transient_user_relaxed(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_set_transient_path(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_set_transient_string(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_set_transient_bool(Unit *u, const char *name, bool *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_set_transient_usec_internal(Unit *u, const char *name, usec_t *p, bool fix_0, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+static inline int bus_set_transient_usec(Unit *u, const char *name, usec_t *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error) {
+ return bus_set_transient_usec_internal(u, name, p, false, message, flags, error);
+}
+static inline int bus_set_transient_usec_fix_0(Unit *u, const char *name, usec_t *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error) {
+ return bus_set_transient_usec_internal(u, name, p, true, message, flags, error);
+}
+int bus_verify_manage_units_async_full(Unit *u, const char *verb, int capability, const char *polkit_message, bool interactive, sd_bus_message *call, sd_bus_error *error);
+
+int bus_read_mount_options(sd_bus_message *message, sd_bus_error *error, MountOptions **ret_options, char **ret_format_str, const char *separator);
+
+int bus_property_get_activation_details(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error);
diff --git a/src/core/dbus.c b/src/core/dbus.c
new file mode 100644
index 0000000..a21bfeb
--- /dev/null
+++ b/src/core/dbus.c
@@ -0,0 +1,1247 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <sys/epoll.h>
+#include <unistd.h>
+
+#include "sd-bus.h"
+
+#include "alloc-util.h"
+#include "bus-common-errors.h"
+#include "bus-error.h"
+#include "bus-internal.h"
+#include "bus-polkit.h"
+#include "bus-util.h"
+#include "dbus-automount.h"
+#include "dbus-cgroup.h"
+#include "dbus-device.h"
+#include "dbus-execute.h"
+#include "dbus-job.h"
+#include "dbus-kill.h"
+#include "dbus-manager.h"
+#include "dbus-mount.h"
+#include "dbus-path.h"
+#include "dbus-scope.h"
+#include "dbus-service.h"
+#include "dbus-slice.h"
+#include "dbus-socket.h"
+#include "dbus-swap.h"
+#include "dbus-target.h"
+#include "dbus-timer.h"
+#include "dbus-unit.h"
+#include "dbus.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "log.h"
+#include "mkdir-label.h"
+#include "process-util.h"
+#include "selinux-access.h"
+#include "serialize.h"
+#include "service.h"
+#include "special.h"
+#include "string-util.h"
+#include "strv.h"
+#include "strxcpyx.h"
+#include "umask-util.h"
+#include "user-util.h"
+
+#define CONNECTIONS_MAX 4096
+
+static void destroy_bus(Manager *m, sd_bus **bus);
+
+int bus_send_pending_reload_message(Manager *m) {
+ int r;
+
+ assert(m);
+
+ if (!m->pending_reload_message)
+ return 0;
+
+ /* If we cannot get rid of this message we won't dispatch any D-Bus messages, so that we won't end up wanting
+ * to queue another message. */
+
+ r = sd_bus_send(NULL, m->pending_reload_message, NULL);
+ if (r < 0)
+ log_warning_errno(r, "Failed to send queued message, ignoring: %m");
+
+ m->pending_reload_message = sd_bus_message_unref(m->pending_reload_message);
+
+ return 0;
+}
+
+int bus_forward_agent_released(Manager *m, const char *path) {
+ int r;
+
+ assert(m);
+ assert(path);
+
+ if (!MANAGER_IS_SYSTEM(m))
+ return 0;
+
+ if (!m->system_bus)
+ return 0;
+
+ /* If we are running a system instance we forward the agent message on the system bus, so that the user
+ * instances get notified about this, too */
+
+ r = sd_bus_emit_signal(m->system_bus,
+ "/org/freedesktop/systemd1/agent",
+ "org.freedesktop.systemd1.Agent",
+ "Released",
+ "s", path);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to propagate agent release message: %m");
+
+ return 1;
+}
+
+static int signal_agent_released(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ const char *cgroup;
+ uid_t sender_uid;
+ int r;
+
+ assert(message);
+
+ /* only accept org.freedesktop.systemd1.Agent from UID=0 */
+ r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID, &creds);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_creds_get_euid(creds, &sender_uid);
+ if (r < 0 || sender_uid != 0)
+ return 0;
+
+ /* parse 'cgroup-empty' notification */
+ r = sd_bus_message_read(message, "s", &cgroup);
+ if (r < 0) {
+ bus_log_parse_error(r);
+ return 0;
+ }
+
+ manager_notify_cgroup_empty(m, cgroup);
+ return 0;
+}
+
+static int signal_disconnected(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ sd_bus *bus;
+
+ assert(message);
+ assert_se(bus = sd_bus_message_get_bus(message));
+
+ if (bus == m->api_bus)
+ bus_done_api(m);
+ if (bus == m->system_bus)
+ bus_done_system(m);
+
+ if (set_remove(m->private_buses, bus)) {
+ log_debug("Got disconnect on private connection.");
+ destroy_bus(m, &bus);
+ }
+
+ return 0;
+}
+
+static int signal_activation_request(sd_bus_message *message, void *userdata, sd_bus_error *ret_error) {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ const char *name;
+ Unit *u;
+ int r;
+
+ assert(message);
+
+ r = sd_bus_message_read(message, "s", &name);
+ if (r < 0) {
+ bus_log_parse_error(r);
+ return 0;
+ }
+
+ if (manager_unit_inactive_or_pending(m, SPECIAL_DBUS_SERVICE) ||
+ manager_unit_inactive_or_pending(m, SPECIAL_DBUS_SOCKET)) {
+ r = sd_bus_error_set(&error, BUS_ERROR_SHUTTING_DOWN, "Refusing activation, D-Bus is shutting down.");
+ goto failed;
+ }
+
+ r = manager_load_unit(m, name, NULL, &error, &u);
+ if (r < 0)
+ goto failed;
+
+ if (u->refuse_manual_start) {
+ r = sd_bus_error_setf(&error, BUS_ERROR_ONLY_BY_DEPENDENCY, "Operation refused, %s may be requested by dependency only (it is configured to refuse manual start/stop).", u->id);
+ goto failed;
+ }
+
+ r = manager_add_job(m, JOB_START, u, JOB_REPLACE, NULL, &error, NULL);
+ if (r < 0)
+ goto failed;
+
+ /* Successfully queued, that's it for us */
+ return 0;
+
+failed:
+ if (!sd_bus_error_is_set(&error))
+ sd_bus_error_set_errno(&error, r);
+
+ log_debug("D-Bus activation failed for %s: %s", name, bus_error_message(&error, r));
+
+ r = sd_bus_message_new_signal(sd_bus_message_get_bus(message), &reply, "/org/freedesktop/systemd1", "org.freedesktop.systemd1.Activator", "ActivationFailure");
+ if (r < 0) {
+ bus_log_create_error(r);
+ return 0;
+ }
+
+ r = sd_bus_message_append(reply, "sss", name, error.name, error.message);
+ if (r < 0) {
+ bus_log_create_error(r);
+ return 0;
+ }
+
+ r = sd_bus_send_to(NULL, reply, "org.freedesktop.DBus", NULL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to respond with to bus activation request: %m");
+
+ return 0;
+}
+
+#if HAVE_SELINUX
+static int mac_selinux_filter(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = userdata;
+ const char *verb, *path;
+ Unit *u = NULL;
+ Job *j;
+ int r;
+
+ assert(message);
+
+ /* Our own method calls are all protected individually with
+ * selinux checks, but the built-in interfaces need to be
+ * protected too. */
+
+ if (sd_bus_message_is_method_call(message, "org.freedesktop.DBus.Properties", "Set"))
+ verb = "reload";
+ else if (sd_bus_message_is_method_call(message, "org.freedesktop.DBus.Introspectable", NULL) ||
+ sd_bus_message_is_method_call(message, "org.freedesktop.DBus.Properties", NULL) ||
+ sd_bus_message_is_method_call(message, "org.freedesktop.DBus.ObjectManager", NULL) ||
+ sd_bus_message_is_method_call(message, "org.freedesktop.DBus.Peer", NULL))
+ verb = "status";
+ else
+ return 0;
+
+ path = sd_bus_message_get_path(message);
+
+ if (object_path_startswith("/org/freedesktop/systemd1", path)) {
+ r = mac_selinux_access_check(message, verb, error);
+ if (r < 0)
+ return r;
+
+ return 0;
+ }
+
+ if (streq_ptr(path, "/org/freedesktop/systemd1/unit/self")) {
+ _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+ pid_t pid;
+
+ r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds);
+ if (r < 0)
+ return 0;
+
+ r = sd_bus_creds_get_pid(creds, &pid);
+ if (r < 0)
+ return 0;
+
+ u = manager_get_unit_by_pid(m, pid);
+ } else {
+ r = manager_get_job_from_dbus_path(m, path, &j);
+ if (r >= 0)
+ u = j->unit;
+ else
+ manager_load_unit_from_dbus_path(m, path, NULL, &u);
+ }
+ if (!u)
+ return 0;
+
+ r = mac_selinux_unit_access_check(u, message, verb, error);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+#endif
+
+static int find_unit(Manager *m, sd_bus *bus, const char *path, Unit **unit, sd_bus_error *error) {
+ Unit *u = NULL; /* just to appease gcc, initialization is not really necessary */
+ int r;
+
+ assert(m);
+ assert(bus);
+ assert(path);
+
+ if (streq_ptr(path, "/org/freedesktop/systemd1/unit/self")) {
+ _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+ sd_bus_message *message;
+ pid_t pid;
+
+ message = sd_bus_get_current_message(bus);
+ if (!message)
+ return 0;
+
+ r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_creds_get_pid(creds, &pid);
+ if (r < 0)
+ return r;
+
+ u = manager_get_unit_by_pid(m, pid);
+ if (!u)
+ return 0;
+ } else {
+ r = manager_load_unit_from_dbus_path(m, path, error, &u);
+ if (r < 0)
+ return 0;
+ assert(u);
+ }
+
+ *unit = u;
+ return 1;
+}
+
+static int bus_unit_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(path);
+ assert(interface);
+ assert(found);
+
+ return find_unit(m, bus, path, (Unit**) found, error);
+}
+
+static int bus_unit_interface_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ Unit *u;
+ int r;
+
+ assert(bus);
+ assert(path);
+ assert(interface);
+ assert(found);
+
+ r = find_unit(m, bus, path, &u, error);
+ if (r <= 0)
+ return r;
+
+ if (!streq_ptr(interface, unit_dbus_interface_from_type(u->type)))
+ return 0;
+
+ *found = u;
+ return 1;
+}
+
+static int bus_unit_cgroup_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ Unit *u;
+ int r;
+
+ assert(bus);
+ assert(path);
+ assert(interface);
+ assert(found);
+
+ r = find_unit(m, bus, path, &u, error);
+ if (r <= 0)
+ return r;
+
+ if (!streq_ptr(interface, unit_dbus_interface_from_type(u->type)))
+ return 0;
+
+ if (!UNIT_HAS_CGROUP_CONTEXT(u))
+ return 0;
+
+ *found = u;
+ return 1;
+}
+
+static int bus_cgroup_context_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ CGroupContext *c;
+ Unit *u;
+ int r;
+
+ assert(bus);
+ assert(path);
+ assert(interface);
+ assert(found);
+
+ r = find_unit(m, bus, path, &u, error);
+ if (r <= 0)
+ return r;
+
+ if (!streq_ptr(interface, unit_dbus_interface_from_type(u->type)))
+ return 0;
+
+ c = unit_get_cgroup_context(u);
+ if (!c)
+ return 0;
+
+ *found = c;
+ return 1;
+}
+
+static int bus_exec_context_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ ExecContext *c;
+ Unit *u;
+ int r;
+
+ assert(bus);
+ assert(path);
+ assert(interface);
+ assert(found);
+
+ r = find_unit(m, bus, path, &u, error);
+ if (r <= 0)
+ return r;
+
+ if (!streq_ptr(interface, unit_dbus_interface_from_type(u->type)))
+ return 0;
+
+ c = unit_get_exec_context(u);
+ if (!c)
+ return 0;
+
+ *found = c;
+ return 1;
+}
+
+static int bus_kill_context_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ KillContext *c;
+ Unit *u;
+ int r;
+
+ assert(bus);
+ assert(path);
+ assert(interface);
+ assert(found);
+
+ r = find_unit(m, bus, path, &u, error);
+ if (r <= 0)
+ return r;
+
+ if (!streq_ptr(interface, unit_dbus_interface_from_type(u->type)))
+ return 0;
+
+ c = unit_get_kill_context(u);
+ if (!c)
+ return 0;
+
+ *found = c;
+ return 1;
+}
+
+static int bus_unit_enumerate(sd_bus *bus, const char *path, void *userdata, char ***nodes, sd_bus_error *error) {
+ _cleanup_strv_free_ char **l = NULL;
+ Manager *m = userdata;
+ unsigned k = 0;
+ Unit *u;
+
+ l = new0(char*, hashmap_size(m->units)+1);
+ if (!l)
+ return -ENOMEM;
+
+ HASHMAP_FOREACH(u, m->units) {
+ l[k] = unit_dbus_path(u);
+ if (!l[k])
+ return -ENOMEM;
+
+ k++;
+ }
+
+ *nodes = TAKE_PTR(l);
+
+ return k;
+}
+
+static const BusObjectImplementation unit_object = {
+ "/org/freedesktop/systemd1/unit",
+ "org.freedesktop.systemd1.Unit",
+ .fallback_vtables = BUS_FALLBACK_VTABLES(
+ { bus_unit_vtable, bus_unit_find }),
+ .node_enumerator = bus_unit_enumerate,
+};
+
+static const BusObjectImplementation bus_automount_object = {
+ "/org/freedesktop/systemd1/unit",
+ "org.freedesktop.systemd1.Automount",
+ .fallback_vtables = BUS_FALLBACK_VTABLES(
+ { bus_automount_vtable, bus_unit_interface_find }),
+};
+
+static const BusObjectImplementation bus_device_object = {
+ "/org/freedesktop/systemd1/unit",
+ "org.freedesktop.systemd1.Device",
+ .fallback_vtables = BUS_FALLBACK_VTABLES(
+ { bus_device_vtable, bus_unit_interface_find }),
+};
+
+static const BusObjectImplementation bus_mount_object = {
+ "/org/freedesktop/systemd1/unit",
+ "org.freedesktop.systemd1.Mount",
+ .fallback_vtables = BUS_FALLBACK_VTABLES(
+ { bus_mount_vtable, bus_unit_interface_find },
+ { bus_unit_cgroup_vtable, bus_unit_cgroup_find },
+ { bus_cgroup_vtable, bus_cgroup_context_find },
+ { bus_exec_vtable, bus_exec_context_find },
+ { bus_kill_vtable, bus_kill_context_find }),
+};
+
+static const BusObjectImplementation bus_path_object = {
+ "/org/freedesktop/systemd1/unit",
+ "org.freedesktop.systemd1.Path",
+ .fallback_vtables = BUS_FALLBACK_VTABLES(
+ { bus_path_vtable, bus_unit_interface_find }),
+};
+
+static const BusObjectImplementation bus_scope_object = {
+ "/org/freedesktop/systemd1/unit",
+ "org.freedesktop.systemd1.Scope",
+ .fallback_vtables = BUS_FALLBACK_VTABLES(
+ { bus_scope_vtable, bus_unit_interface_find },
+ { bus_unit_cgroup_vtable, bus_unit_cgroup_find },
+ { bus_cgroup_vtable, bus_cgroup_context_find },
+ { bus_kill_vtable, bus_kill_context_find }),
+};
+
+static const BusObjectImplementation bus_service_object = {
+ "/org/freedesktop/systemd1/unit",
+ "org.freedesktop.systemd1.Service",
+ .fallback_vtables = BUS_FALLBACK_VTABLES(
+ { bus_service_vtable, bus_unit_interface_find },
+ { bus_unit_cgroup_vtable, bus_unit_cgroup_find },
+ { bus_cgroup_vtable, bus_cgroup_context_find },
+ { bus_exec_vtable, bus_exec_context_find },
+ { bus_kill_vtable, bus_kill_context_find }),
+};
+
+static const BusObjectImplementation bus_slice_object = {
+ "/org/freedesktop/systemd1/unit",
+ "org.freedesktop.systemd1.Slice",
+ .fallback_vtables = BUS_FALLBACK_VTABLES(
+ { bus_slice_vtable, bus_unit_interface_find },
+ { bus_unit_cgroup_vtable, bus_unit_cgroup_find },
+ { bus_cgroup_vtable, bus_cgroup_context_find }),
+};
+
+static const BusObjectImplementation bus_socket_object = {
+ "/org/freedesktop/systemd1/unit",
+ "org.freedesktop.systemd1.Socket",
+ .fallback_vtables = BUS_FALLBACK_VTABLES(
+ { bus_socket_vtable, bus_unit_interface_find },
+ { bus_unit_cgroup_vtable, bus_unit_cgroup_find },
+ { bus_cgroup_vtable, bus_cgroup_context_find },
+ { bus_exec_vtable, bus_exec_context_find },
+ { bus_kill_vtable, bus_kill_context_find }),
+};
+
+static const BusObjectImplementation bus_swap_object = {
+ "/org/freedesktop/systemd1/unit",
+ "org.freedesktop.systemd1.Swap",
+ .fallback_vtables = BUS_FALLBACK_VTABLES(
+ { bus_swap_vtable, bus_unit_interface_find },
+ { bus_unit_cgroup_vtable, bus_unit_cgroup_find },
+ { bus_cgroup_vtable, bus_cgroup_context_find },
+ { bus_exec_vtable, bus_exec_context_find },
+ { bus_kill_vtable, bus_kill_context_find }),
+};
+
+static const BusObjectImplementation bus_target_object = {
+ "/org/freedesktop/systemd1/unit",
+ "org.freedesktop.systemd1.Target",
+ .fallback_vtables = BUS_FALLBACK_VTABLES(
+ { bus_target_vtable, bus_unit_interface_find }),
+};
+
+static const BusObjectImplementation bus_timer_object = {
+ "/org/freedesktop/systemd1/unit",
+ "org.freedesktop.systemd1.Timer",
+ .fallback_vtables = BUS_FALLBACK_VTABLES(
+ { bus_timer_vtable, bus_unit_interface_find }),
+};
+
+static const BusObjectImplementation bus_manager_object = {
+ "/org/freedesktop/systemd1",
+ "org.freedesktop.systemd1.Manager",
+ .vtables = BUS_VTABLES(bus_manager_vtable),
+ .children = BUS_IMPLEMENTATIONS(
+ &job_object,
+ &unit_object,
+ &bus_automount_object,
+ &bus_device_object,
+ &bus_mount_object,
+ &bus_path_object,
+ &bus_scope_object,
+ &bus_service_object,
+ &bus_slice_object,
+ &bus_socket_object,
+ &bus_swap_object,
+ &bus_target_object,
+ &bus_timer_object),
+};
+
+static const BusObjectImplementation manager_log_control_object = {
+ "/org/freedesktop/LogControl1",
+ "org.freedesktop.LogControl1",
+ .vtables = BUS_VTABLES(bus_manager_log_control_vtable),
+};
+
+int bus_manager_introspect_implementations(FILE *out, const char *pattern) {
+ return bus_introspect_implementations(
+ out,
+ pattern,
+ BUS_IMPLEMENTATIONS(&bus_manager_object,
+ &manager_log_control_object));
+}
+
+static int bus_setup_api_vtables(Manager *m, sd_bus *bus) {
+ int r;
+
+ assert(m);
+ assert(bus);
+
+#if HAVE_SELINUX
+ r = sd_bus_add_filter(bus, NULL, mac_selinux_filter, m);
+ if (r < 0)
+ return log_error_errno(r, "Failed to add SELinux access filter: %m");
+#endif
+
+ r = bus_add_implementation(bus, &bus_manager_object, m);
+ if (r < 0)
+ return r;
+
+ return bus_add_implementation(bus, &manager_log_control_object, m);
+}
+
+static int bus_setup_disconnected_match(Manager *m, sd_bus *bus) {
+ int r;
+
+ assert(m);
+ assert(bus);
+
+ r = sd_bus_match_signal_async(
+ bus,
+ NULL,
+ "org.freedesktop.DBus.Local",
+ "/org/freedesktop/DBus/Local",
+ "org.freedesktop.DBus.Local",
+ "Disconnected",
+ signal_disconnected, NULL, m);
+ if (r < 0)
+ return log_error_errno(r, "Failed to request match for Disconnected message: %m");
+
+ return 0;
+}
+
+static int bus_on_connection(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
+ _cleanup_(sd_bus_close_unrefp) sd_bus *bus = NULL;
+ _cleanup_close_ int nfd = -1;
+ Manager *m = ASSERT_PTR(userdata);
+ sd_id128_t id;
+ int r;
+
+ assert(s);
+
+ nfd = accept4(fd, NULL, NULL, SOCK_NONBLOCK|SOCK_CLOEXEC);
+ if (nfd < 0) {
+ if (ERRNO_IS_ACCEPT_AGAIN(errno))
+ return 0;
+
+ log_warning_errno(errno, "Failed to accept private connection, ignoring: %m");
+ return 0;
+ }
+
+ if (set_size(m->private_buses) >= CONNECTIONS_MAX) {
+ log_warning("Too many concurrent connections, refusing");
+ return 0;
+ }
+
+ r = sd_bus_new(&bus);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to allocate new private connection bus: %m");
+ return 0;
+ }
+
+ (void) sd_bus_set_description(bus, "private-bus-connection");
+
+ r = sd_bus_set_fd(bus, nfd, nfd);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to set fd on new connection bus: %m");
+ return 0;
+ }
+
+ nfd = -1;
+
+ r = bus_check_peercred(bus);
+ if (r < 0) {
+ log_warning_errno(r, "Incoming private connection from unprivileged client, refusing: %m");
+ return 0;
+ }
+
+ assert_se(sd_id128_randomize(&id) >= 0);
+
+ r = sd_bus_set_server(bus, 1, id);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to enable server support for new connection bus: %m");
+ return 0;
+ }
+
+ r = sd_bus_negotiate_creds(bus, 1,
+ SD_BUS_CREDS_PID|SD_BUS_CREDS_UID|
+ SD_BUS_CREDS_EUID|SD_BUS_CREDS_EFFECTIVE_CAPS|
+ SD_BUS_CREDS_SELINUX_CONTEXT);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to enable credentials for new connection: %m");
+ return 0;
+ }
+
+ r = sd_bus_set_sender(bus, "org.freedesktop.systemd1");
+ if (r < 0) {
+ log_warning_errno(r, "Failed to set direct connection sender: %m");
+ return 0;
+ }
+
+ r = sd_bus_start(bus);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to start new connection bus: %m");
+ return 0;
+ }
+
+ r = sd_bus_attach_event(bus, m->event, SD_EVENT_PRIORITY_NORMAL);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to attach new connection bus to event loop: %m");
+ return 0;
+ }
+
+ r = bus_setup_disconnected_match(m, bus);
+ if (r < 0)
+ return 0;
+
+ r = bus_setup_api_vtables(m, bus);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to set up API vtables on new connection bus: %m");
+ return 0;
+ }
+
+ r = set_ensure_put(&m->private_buses, NULL, bus);
+ if (r == -ENOMEM) {
+ log_oom();
+ return 0;
+ }
+ if (r < 0) {
+ log_warning_errno(r, "Failed to add new connection bus to set: %m");
+ return 0;
+ }
+
+ TAKE_PTR(bus);
+
+ log_debug("Accepted new private connection.");
+
+ return 0;
+}
+
+static int bus_setup_api(Manager *m, sd_bus *bus) {
+ char *name;
+ Unit *u;
+ int r;
+
+ assert(m);
+ assert(bus);
+
+ /* Let's make sure we have enough credential bits so that we can make security and selinux decisions */
+ r = sd_bus_negotiate_creds(bus, 1,
+ SD_BUS_CREDS_PID|SD_BUS_CREDS_UID|
+ SD_BUS_CREDS_EUID|SD_BUS_CREDS_EFFECTIVE_CAPS|
+ SD_BUS_CREDS_SELINUX_CONTEXT);
+ if (r < 0)
+ log_warning_errno(r, "Failed to enable credential passing, ignoring: %m");
+
+ r = bus_setup_api_vtables(m, bus);
+ if (r < 0)
+ return r;
+
+ HASHMAP_FOREACH_KEY(u, name, m->watch_bus) {
+ r = unit_install_bus_match(u, bus, name);
+ if (r < 0)
+ log_error_errno(r, "Failed to subscribe to NameOwnerChanged signal for '%s': %m", name);
+ }
+
+ r = sd_bus_match_signal_async(
+ bus,
+ NULL,
+ "org.freedesktop.DBus",
+ "/org/freedesktop/DBus",
+ "org.freedesktop.systemd1.Activator",
+ "ActivationRequest",
+ signal_activation_request, NULL, m);
+ if (r < 0)
+ log_warning_errno(r, "Failed to subscribe to activation signal: %m");
+
+ /* Allow replacing of our name, to ease implementation of reexecution, where we keep the old connection open
+ * until after the new connection is set up and the name installed to allow clients to synchronously wait for
+ * reexecution to finish */
+ r = sd_bus_request_name_async(bus, NULL, "org.freedesktop.systemd1", SD_BUS_NAME_REPLACE_EXISTING|SD_BUS_NAME_ALLOW_REPLACEMENT, NULL, NULL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to request name: %m");
+
+ log_debug("Successfully connected to API bus.");
+
+ return 0;
+}
+
+int bus_init_api(Manager *m) {
+ _cleanup_(sd_bus_close_unrefp) sd_bus *bus = NULL;
+ int r;
+
+ if (m->api_bus)
+ return 0;
+
+ /* The API and system bus is the same if we are running in system mode */
+ if (MANAGER_IS_SYSTEM(m) && m->system_bus)
+ bus = sd_bus_ref(m->system_bus);
+ else {
+ if (MANAGER_IS_SYSTEM(m))
+ r = sd_bus_open_system_with_description(&bus, "bus-api-system");
+ else
+ r = sd_bus_open_user_with_description(&bus, "bus-api-user");
+ if (r < 0)
+ return log_error_errno(r, "Failed to connect to API bus: %m");
+
+ r = sd_bus_attach_event(bus, m->event, SD_EVENT_PRIORITY_NORMAL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to attach API bus to event loop: %m");
+
+ r = bus_setup_disconnected_match(m, bus);
+ if (r < 0)
+ return r;
+ }
+
+ r = bus_setup_api(m, bus);
+ if (r < 0)
+ return log_error_errno(r, "Failed to set up API bus: %m");
+
+ m->api_bus = TAKE_PTR(bus);
+
+ return 0;
+}
+
+static int bus_setup_system(Manager *m, sd_bus *bus) {
+ int r;
+
+ assert(m);
+ assert(bus);
+
+ /* if we are a user instance we get the Released message via the system bus */
+ if (MANAGER_IS_USER(m)) {
+ r = sd_bus_match_signal_async(
+ bus,
+ NULL,
+ NULL,
+ "/org/freedesktop/systemd1/agent",
+ "org.freedesktop.systemd1.Agent",
+ "Released",
+ signal_agent_released, NULL, m);
+ if (r < 0)
+ log_warning_errno(r, "Failed to request Released match on system bus: %m");
+ }
+
+ log_debug("Successfully connected to system bus.");
+ return 0;
+}
+
+int bus_init_system(Manager *m) {
+ _cleanup_(sd_bus_close_unrefp) sd_bus *bus = NULL;
+ int r;
+
+ if (m->system_bus)
+ return 0;
+
+ /* The API and system bus is the same if we are running in system mode */
+ if (MANAGER_IS_SYSTEM(m) && m->api_bus)
+ bus = sd_bus_ref(m->api_bus);
+ else {
+ r = sd_bus_open_system_with_description(&bus, "bus-system");
+ if (r < 0)
+ return log_error_errno(r, "Failed to connect to system bus: %m");
+
+ r = sd_bus_attach_event(bus, m->event, SD_EVENT_PRIORITY_NORMAL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to attach system bus to event loop: %m");
+
+ r = bus_setup_disconnected_match(m, bus);
+ if (r < 0)
+ return r;
+ }
+
+ r = bus_setup_system(m, bus);
+ if (r < 0)
+ return log_error_errno(r, "Failed to set up system bus: %m");
+
+ m->system_bus = TAKE_PTR(bus);
+
+ return 0;
+}
+
+int bus_init_private(Manager *m) {
+ _cleanup_close_ int fd = -1;
+ union sockaddr_union sa;
+ socklen_t sa_len;
+ sd_event_source *s;
+ int r;
+
+ assert(m);
+
+ if (m->private_listen_fd >= 0)
+ return 0;
+
+ if (MANAGER_IS_SYSTEM(m)) {
+
+ /* We want the private bus only when running as init */
+ if (getpid_cached() != 1)
+ return 0;
+
+ r = sockaddr_un_set_path(&sa.un, "/run/systemd/private");
+ } else {
+ _cleanup_free_ char *joined = NULL;
+ const char *e;
+
+ e = secure_getenv("XDG_RUNTIME_DIR");
+ if (!e)
+ return log_error_errno(SYNTHETIC_ERRNO(EHOSTDOWN),
+ "XDG_RUNTIME_DIR is not set, refusing.");
+
+ joined = path_join(e, "/systemd/private");
+ if (!joined)
+ return log_oom();
+
+ r = sockaddr_un_set_path(&sa.un, joined);
+ }
+ if (r < 0)
+ return log_error_errno(r, "Can't set path for AF_UNIX socket to bind to: %m");
+ sa_len = r;
+
+ (void) mkdir_parents_label(sa.un.sun_path, 0755);
+ (void) sockaddr_un_unlink(&sa.un);
+
+ fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
+ if (fd < 0)
+ return log_error_errno(errno, "Failed to allocate private socket: %m");
+
+ RUN_WITH_UMASK(0077)
+ r = bind(fd, &sa.sa, sa_len);
+ if (r < 0)
+ return log_error_errno(errno, "Failed to bind private socket: %m");
+
+ r = listen(fd, SOMAXCONN_DELUXE);
+ if (r < 0)
+ return log_error_errno(errno, "Failed to make private socket listening: %m");
+
+ /* Generate an inotify event in case somebody waits for this socket to appear using inotify() */
+ (void) touch(sa.un.sun_path);
+
+ r = sd_event_add_io(m->event, &s, fd, EPOLLIN, bus_on_connection, m);
+ if (r < 0)
+ return log_error_errno(r, "Failed to allocate event source: %m");
+
+ (void) sd_event_source_set_description(s, "bus-connection");
+
+ m->private_listen_fd = TAKE_FD(fd);
+ m->private_listen_event_source = s;
+
+ log_debug("Successfully created private D-Bus server.");
+
+ return 0;
+}
+
+static void destroy_bus(Manager *m, sd_bus **bus) {
+ Unit *u;
+ Job *j;
+
+ assert(m);
+ assert(bus);
+
+ if (!*bus)
+ return;
+
+ /* Make sure all bus slots watching names are released. */
+ HASHMAP_FOREACH(u, m->watch_bus) {
+ if (u->match_bus_slot && sd_bus_slot_get_bus(u->match_bus_slot) == *bus)
+ u->match_bus_slot = sd_bus_slot_unref(u->match_bus_slot);
+ if (u->get_name_owner_slot && sd_bus_slot_get_bus(u->get_name_owner_slot) == *bus)
+ u->get_name_owner_slot = sd_bus_slot_unref(u->get_name_owner_slot);
+ }
+
+ /* Get rid of tracked clients on this bus */
+ if (m->subscribed && sd_bus_track_get_bus(m->subscribed) == *bus)
+ m->subscribed = sd_bus_track_unref(m->subscribed);
+
+ HASHMAP_FOREACH(j, m->jobs)
+ if (j->bus_track && sd_bus_track_get_bus(j->bus_track) == *bus)
+ j->bus_track = sd_bus_track_unref(j->bus_track);
+
+ HASHMAP_FOREACH(u, m->units) {
+ if (u->bus_track && sd_bus_track_get_bus(u->bus_track) == *bus)
+ u->bus_track = sd_bus_track_unref(u->bus_track);
+
+ /* Get rid of pending freezer messages on this bus */
+ if (u->pending_freezer_message && sd_bus_message_get_bus(u->pending_freezer_message) == *bus)
+ u->pending_freezer_message = sd_bus_message_unref(u->pending_freezer_message);
+ }
+
+ /* Get rid of queued message on this bus */
+ if (m->pending_reload_message && sd_bus_message_get_bus(m->pending_reload_message) == *bus)
+ m->pending_reload_message = sd_bus_message_unref(m->pending_reload_message);
+
+ /* Possibly flush unwritten data, but only if we are
+ * unprivileged, since we don't want to sync here */
+ if (!MANAGER_IS_SYSTEM(m))
+ sd_bus_flush(*bus);
+
+ /* And destroy the object */
+ *bus = sd_bus_close_unref(*bus);
+}
+
+void bus_done_api(Manager *m) {
+ destroy_bus(m, &m->api_bus);
+}
+
+void bus_done_system(Manager *m) {
+ destroy_bus(m, &m->system_bus);
+}
+
+void bus_done_private(Manager *m) {
+ sd_bus *b;
+
+ assert(m);
+
+ while ((b = set_steal_first(m->private_buses)))
+ destroy_bus(m, &b);
+
+ m->private_buses = set_free(m->private_buses);
+
+ m->private_listen_event_source = sd_event_source_disable_unref(m->private_listen_event_source);
+ m->private_listen_fd = safe_close(m->private_listen_fd);
+}
+
+void bus_done(Manager *m) {
+ assert(m);
+
+ bus_done_api(m);
+ bus_done_system(m);
+ bus_done_private(m);
+
+ assert(!m->subscribed);
+
+ m->deserialized_subscribed = strv_free(m->deserialized_subscribed);
+ bus_verify_polkit_async_registry_free(m->polkit_registry);
+}
+
+int bus_fdset_add_all(Manager *m, FDSet *fds) {
+ sd_bus *b;
+ int fd;
+
+ assert(m);
+ assert(fds);
+
+ /* When we are about to reexecute we add all D-Bus fds to the
+ * set to pass over to the newly executed systemd. They won't
+ * be used there however, except thatt they are closed at the
+ * very end of deserialization, those making it possible for
+ * clients to synchronously wait for systemd to reexec by
+ * simply waiting for disconnection */
+
+ if (m->api_bus) {
+ fd = sd_bus_get_fd(m->api_bus);
+ if (fd >= 0) {
+ fd = fdset_put_dup(fds, fd);
+ if (fd < 0)
+ return fd;
+ }
+ }
+
+ SET_FOREACH(b, m->private_buses) {
+ fd = sd_bus_get_fd(b);
+ if (fd >= 0) {
+ fd = fdset_put_dup(fds, fd);
+ if (fd < 0)
+ return fd;
+ }
+ }
+
+ /* We don't offer any APIs on the system bus (well, unless it
+ * is the same as the API bus) hence we don't bother with it
+ * here */
+
+ return 0;
+}
+
+int bus_foreach_bus(
+ Manager *m,
+ sd_bus_track *subscribed2,
+ int (*send_message)(sd_bus *bus, void *userdata),
+ void *userdata) {
+
+ sd_bus *b;
+ int r, ret = 0;
+
+ /* Send to all direct buses, unconditionally */
+ SET_FOREACH(b, m->private_buses) {
+
+ /* Don't bother with enqueuing these messages to clients that haven't started yet */
+ if (sd_bus_is_ready(b) <= 0)
+ continue;
+
+ r = send_message(b, userdata);
+ if (r < 0)
+ ret = r;
+ }
+
+ /* Send to API bus, but only if somebody is subscribed */
+ if (m->api_bus &&
+ (sd_bus_track_count(m->subscribed) > 0 ||
+ sd_bus_track_count(subscribed2) > 0)) {
+ r = send_message(m->api_bus, userdata);
+ if (r < 0)
+ ret = r;
+ }
+
+ return ret;
+}
+
+void bus_track_serialize(sd_bus_track *t, FILE *f, const char *prefix) {
+ const char *n;
+
+ assert(f);
+ assert(prefix);
+
+ for (n = sd_bus_track_first(t); n; n = sd_bus_track_next(t)) {
+ int c, j;
+
+ c = sd_bus_track_count_name(t, n);
+ for (j = 0; j < c; j++)
+ (void) serialize_item(f, prefix, n);
+ }
+}
+
+int bus_track_coldplug(Manager *m, sd_bus_track **t, bool recursive, char **l) {
+ int r;
+
+ assert(m);
+ assert(t);
+
+ if (strv_isempty(l))
+ return 0;
+
+ if (!m->api_bus)
+ return 0;
+
+ if (!*t) {
+ r = sd_bus_track_new(m->api_bus, t, NULL, NULL);
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_bus_track_set_recursive(*t, recursive);
+ if (r < 0)
+ return r;
+
+ return bus_track_add_name_many(*t, l);
+}
+
+int bus_verify_manage_units_async(Manager *m, sd_bus_message *call, sd_bus_error *error) {
+ return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.manage-units", NULL, false, UID_INVALID, &m->polkit_registry, error);
+}
+
+int bus_verify_manage_unit_files_async(Manager *m, sd_bus_message *call, sd_bus_error *error) {
+ return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.manage-unit-files", NULL, false, UID_INVALID, &m->polkit_registry, error);
+}
+
+int bus_verify_reload_daemon_async(Manager *m, sd_bus_message *call, sd_bus_error *error) {
+ return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.reload-daemon", NULL, false, UID_INVALID, &m->polkit_registry, error);
+}
+
+int bus_verify_set_environment_async(Manager *m, sd_bus_message *call, sd_bus_error *error) {
+ return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.set-environment", NULL, false, UID_INVALID, &m->polkit_registry, error);
+}
+int bus_verify_bypass_dump_ratelimit_async(Manager *m, sd_bus_message *call, sd_bus_error *error) {
+ return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.bypass-dump-ratelimit", NULL, false, UID_INVALID, &m->polkit_registry, error);
+}
+
+uint64_t manager_bus_n_queued_write(Manager *m) {
+ uint64_t c = 0;
+ sd_bus *b;
+ int r;
+
+ /* Returns the total number of messages queued for writing on all our direct and API buses. */
+
+ SET_FOREACH(b, m->private_buses) {
+ uint64_t k;
+
+ r = sd_bus_get_n_queued_write(b, &k);
+ if (r < 0)
+ log_debug_errno(r, "Failed to query queued messages for private bus: %m");
+ else
+ c += k;
+ }
+
+ if (m->api_bus) {
+ uint64_t k;
+
+ r = sd_bus_get_n_queued_write(m->api_bus, &k);
+ if (r < 0)
+ log_debug_errno(r, "Failed to query queued messages for API bus: %m");
+ else
+ c += k;
+ }
+
+ return c;
+}
+
+static void vtable_dump_bus_properties(FILE *f, const sd_bus_vtable *table) {
+ const sd_bus_vtable *i;
+
+ for (i = table; i->type != _SD_BUS_VTABLE_END; i++) {
+ if (!IN_SET(i->type, _SD_BUS_VTABLE_PROPERTY, _SD_BUS_VTABLE_WRITABLE_PROPERTY) ||
+ (i->flags & (SD_BUS_VTABLE_DEPRECATED | SD_BUS_VTABLE_HIDDEN)) != 0)
+ continue;
+
+ fprintf(f, "%s\n", i->x.property.member);
+ }
+}
+
+void dump_bus_properties(FILE *f) {
+ assert(f);
+
+ vtable_dump_bus_properties(f, bus_automount_vtable);
+ vtable_dump_bus_properties(f, bus_cgroup_vtable);
+ vtable_dump_bus_properties(f, bus_device_vtable);
+ vtable_dump_bus_properties(f, bus_exec_vtable);
+ vtable_dump_bus_properties(f, bus_job_vtable);
+ vtable_dump_bus_properties(f, bus_kill_vtable);
+ vtable_dump_bus_properties(f, bus_manager_vtable);
+ vtable_dump_bus_properties(f, bus_mount_vtable);
+ vtable_dump_bus_properties(f, bus_path_vtable);
+ vtable_dump_bus_properties(f, bus_scope_vtable);
+ vtable_dump_bus_properties(f, bus_service_vtable);
+ vtable_dump_bus_properties(f, bus_slice_vtable);
+ vtable_dump_bus_properties(f, bus_socket_vtable);
+ vtable_dump_bus_properties(f, bus_swap_vtable);
+ vtable_dump_bus_properties(f, bus_target_vtable);
+ vtable_dump_bus_properties(f, bus_timer_vtable);
+ vtable_dump_bus_properties(f, bus_unit_vtable);
+ vtable_dump_bus_properties(f, bus_unit_cgroup_vtable);
+}
diff --git a/src/core/dbus.h b/src/core/dbus.h
new file mode 100644
index 0000000..50e7bb4
--- /dev/null
+++ b/src/core/dbus.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+
+#include "manager.h"
+
+int bus_send_pending_reload_message(Manager *m);
+
+int bus_init_private(Manager *m);
+int bus_init_api(Manager *m);
+int bus_init_system(Manager *m);
+
+void bus_done_private(Manager *m);
+void bus_done_api(Manager *m);
+void bus_done_system(Manager *m);
+void bus_done(Manager *m);
+
+int bus_fdset_add_all(Manager *m, FDSet *fds);
+
+void bus_track_serialize(sd_bus_track *t, FILE *f, const char *prefix);
+int bus_track_coldplug(Manager *m, sd_bus_track **t, bool recursive, char **l);
+
+int bus_foreach_bus(Manager *m, sd_bus_track *subscribed2, int (*send_message)(sd_bus *bus, void *userdata), void *userdata);
+
+int bus_verify_manage_units_async(Manager *m, sd_bus_message *call, sd_bus_error *error);
+int bus_verify_manage_unit_files_async(Manager *m, sd_bus_message *call, sd_bus_error *error);
+int bus_verify_reload_daemon_async(Manager *m, sd_bus_message *call, sd_bus_error *error);
+int bus_verify_set_environment_async(Manager *m, sd_bus_message *call, sd_bus_error *error);
+int bus_verify_bypass_dump_ratelimit_async(Manager *m, sd_bus_message *call, sd_bus_error *error);
+
+int bus_forward_agent_released(Manager *m, const char *path);
+
+uint64_t manager_bus_n_queued_write(Manager *m);
+
+void dump_bus_properties(FILE *f);
+int bus_manager_introspect_implementations(FILE *out, const char *pattern);
diff --git a/src/core/device.c b/src/core/device.c
new file mode 100644
index 0000000..8922f5d
--- /dev/null
+++ b/src/core/device.c
@@ -0,0 +1,1276 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <sys/epoll.h>
+
+#include "sd-messages.h"
+
+#include "alloc-util.h"
+#include "bus-common-errors.h"
+#include "dbus-device.h"
+#include "dbus-unit.h"
+#include "device-private.h"
+#include "device-util.h"
+#include "device.h"
+#include "log.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "ratelimit.h"
+#include "serialize.h"
+#include "stat-util.h"
+#include "string-util.h"
+#include "swap.h"
+#include "udev-util.h"
+#include "unit-name.h"
+#include "unit.h"
+
+static const UnitActiveState state_translation_table[_DEVICE_STATE_MAX] = {
+ [DEVICE_DEAD] = UNIT_INACTIVE,
+ [DEVICE_TENTATIVE] = UNIT_ACTIVATING,
+ [DEVICE_PLUGGED] = UNIT_ACTIVE,
+};
+
+static int device_dispatch_io(sd_device_monitor *monitor, sd_device *dev, void *userdata);
+
+static int device_by_path(Manager *m, const char *path, Unit **ret) {
+ _cleanup_free_ char *e = NULL;
+ Unit *u;
+ int r;
+
+ assert(m);
+ assert(path);
+
+ r = unit_name_from_path(path, ".device", &e);
+ if (r < 0)
+ return r;
+
+ u = manager_get_unit(m, e);
+ if (!u)
+ return -ENOENT;
+
+ if (ret)
+ *ret = u;
+ return 0;
+}
+
+static void device_unset_sysfs(Device *d) {
+ Hashmap *devices;
+ Device *first;
+
+ assert(d);
+
+ if (!d->sysfs)
+ return;
+
+ /* Remove this unit from the chain of devices which share the
+ * same sysfs path. */
+ devices = UNIT(d)->manager->devices_by_sysfs;
+ first = hashmap_get(devices, d->sysfs);
+ LIST_REMOVE(same_sysfs, first, d);
+
+ if (first)
+ hashmap_remove_and_replace(devices, d->sysfs, first->sysfs, first);
+ else
+ hashmap_remove(devices, d->sysfs);
+
+ d->sysfs = mfree(d->sysfs);
+}
+
+static int device_set_sysfs(Device *d, const char *sysfs) {
+ _cleanup_free_ char *copy = NULL;
+ Device *first;
+ int r;
+
+ assert(d);
+
+ if (streq_ptr(d->sysfs, sysfs))
+ return 0;
+
+ r = hashmap_ensure_allocated(&UNIT(d)->manager->devices_by_sysfs, &path_hash_ops);
+ if (r < 0)
+ return r;
+
+ copy = strdup(sysfs);
+ if (!copy)
+ return -ENOMEM;
+
+ device_unset_sysfs(d);
+
+ first = hashmap_get(UNIT(d)->manager->devices_by_sysfs, sysfs);
+ LIST_PREPEND(same_sysfs, first, d);
+
+ r = hashmap_replace(UNIT(d)->manager->devices_by_sysfs, copy, first);
+ if (r < 0) {
+ LIST_REMOVE(same_sysfs, first, d);
+ return r;
+ }
+
+ d->sysfs = TAKE_PTR(copy);
+ unit_add_to_dbus_queue(UNIT(d));
+
+ return 0;
+}
+
+static void device_init(Unit *u) {
+ Device *d = DEVICE(u);
+
+ assert(d);
+ assert(UNIT(d)->load_state == UNIT_STUB);
+
+ /* In contrast to all other unit types we timeout jobs waiting
+ * for devices by default. This is because they otherwise wait
+ * indefinitely for plugged in devices, something which cannot
+ * happen for the other units since their operations time out
+ * anyway. */
+ u->job_running_timeout = u->manager->default_device_timeout_usec;
+
+ u->ignore_on_isolate = true;
+
+ d->deserialized_state = _DEVICE_STATE_INVALID;
+}
+
+static void device_done(Unit *u) {
+ Device *d = DEVICE(u);
+
+ assert(d);
+
+ device_unset_sysfs(d);
+ d->deserialized_sysfs = mfree(d->deserialized_sysfs);
+ d->wants_property = strv_free(d->wants_property);
+ d->path = mfree(d->path);
+}
+
+static int device_load(Unit *u) {
+ int r;
+
+ r = unit_load_fragment_and_dropin(u, false);
+ if (r < 0)
+ return r;
+
+ if (!u->description) {
+ /* Generate a description based on the path, to be used until the device is initialized
+ properly */
+ r = unit_name_to_path(u->id, &u->description);
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to unescape name: %m");
+ }
+
+ return 0;
+}
+
+static void device_set_state(Device *d, DeviceState state) {
+ DeviceState old_state;
+
+ assert(d);
+
+ if (d->state != state)
+ bus_unit_send_pending_change_signal(UNIT(d), false);
+
+ old_state = d->state;
+ d->state = state;
+
+ if (state == DEVICE_DEAD)
+ device_unset_sysfs(d);
+
+ if (state != old_state)
+ log_unit_debug(UNIT(d), "Changed %s -> %s", device_state_to_string(old_state), device_state_to_string(state));
+
+ unit_notify(UNIT(d), state_translation_table[old_state], state_translation_table[state], 0);
+}
+
+static void device_found_changed(Device *d, DeviceFound previous, DeviceFound now) {
+ assert(d);
+
+ /* Didn't exist before, but does now? if so, generate a new invocation ID for it */
+ if (previous == DEVICE_NOT_FOUND && now != DEVICE_NOT_FOUND)
+ (void) unit_acquire_invocation_id(UNIT(d));
+
+ if (FLAGS_SET(now, DEVICE_FOUND_UDEV))
+ /* When the device is known to udev we consider it plugged. */
+ device_set_state(d, DEVICE_PLUGGED);
+ else if (now != DEVICE_NOT_FOUND && !FLAGS_SET(previous, DEVICE_FOUND_UDEV))
+ /* If the device has not been seen by udev yet, but is now referenced by the kernel, then we assume the
+ * kernel knows it now, and udev might soon too. */
+ device_set_state(d, DEVICE_TENTATIVE);
+ else
+ /* If nobody sees the device, or if the device was previously seen by udev and now is only referenced
+ * from the kernel, then we consider the device is gone, the kernel just hasn't noticed it yet. */
+ device_set_state(d, DEVICE_DEAD);
+}
+
+static void device_update_found_one(Device *d, DeviceFound found, DeviceFound mask) {
+ assert(d);
+
+ if (MANAGER_IS_RUNNING(UNIT(d)->manager)) {
+ DeviceFound n, previous;
+
+ /* When we are already running, then apply the new mask right-away, and trigger state changes
+ * right-away */
+
+ n = (d->found & ~mask) | (found & mask);
+ if (n == d->found)
+ return;
+
+ previous = d->found;
+ d->found = n;
+
+ device_found_changed(d, previous, n);
+ } else
+ /* We aren't running yet, let's apply the new mask to the shadow variable instead, which we'll apply as
+ * soon as we catch-up with the state. */
+ d->enumerated_found = (d->enumerated_found & ~mask) | (found & mask);
+}
+
+static void device_update_found_by_sysfs(Manager *m, const char *sysfs, DeviceFound found, DeviceFound mask) {
+ Device *l;
+
+ assert(m);
+ assert(sysfs);
+
+ if (mask == 0)
+ return;
+
+ l = hashmap_get(m->devices_by_sysfs, sysfs);
+ LIST_FOREACH(same_sysfs, d, l)
+ device_update_found_one(d, found, mask);
+}
+
+static void device_update_found_by_name(Manager *m, const char *path, DeviceFound found, DeviceFound mask) {
+ Unit *u;
+
+ assert(m);
+ assert(path);
+
+ if (mask == 0)
+ return;
+
+ if (device_by_path(m, path, &u) < 0)
+ return;
+
+ device_update_found_one(DEVICE(u), found, mask);
+}
+
+static int device_coldplug(Unit *u) {
+ Device *d = DEVICE(u);
+
+ assert(d);
+ assert(d->state == DEVICE_DEAD);
+
+ /* First, let's put the deserialized state and found mask into effect, if we have it. */
+ if (d->deserialized_state < 0)
+ return 0;
+
+ Manager *m = u->manager;
+ DeviceFound found = d->deserialized_found;
+ DeviceState state = d->deserialized_state;
+
+ /* On initial boot, switch-root, reload, reexecute, the following happen:
+ * 1. MANAGER_IS_RUNNING() == false
+ * 2. enumerate devices: manager_enumerate() -> device_enumerate()
+ * Device.enumerated_found is set.
+ * 3. deserialize devices: manager_deserialize() -> device_deserialize_item()
+ * Device.deserialize_state and Device.deserialized_found are set.
+ * 4. coldplug devices: manager_coldplug() -> device_coldplug()
+ * deserialized properties are copied to the main properties.
+ * 5. MANAGER_IS_RUNNING() == true: manager_ready()
+ * 6. catchup devices: manager_catchup() -> device_catchup()
+ * Device.enumerated_found is applied to Device.found, and state is updated based on that.
+ *
+ * Notes:
+ * - On initial boot, no udev database exists. Hence, no devices are enumerated in the step 2.
+ * Also, there is no deserialized device. Device units are (a) generated based on dependencies of
+ * other units, or (b) generated when uevents are received.
+ *
+ * - On switch-root, the udev database may be cleared, except for devices with sticky bit, i.e.
+ * OPTIONS="db_persist". Hence, almost no devices are enumerated in the step 2. However, in
+ * general, we have several serialized devices. So, DEVICE_FOUND_UDEV bit in the
+ * Device.deserialized_found must be ignored, as udev rules in initrd and the main system are often
+ * different. If the deserialized state is DEVICE_PLUGGED, we need to downgrade it to
+ * DEVICE_TENTATIVE. Unlike the other starting mode, MANAGER_IS_SWITCHING_ROOT() is true when
+ * device_coldplug() and device_catchup() are called. Hence, let's conditionalize the operations by
+ * using the flag. After switch-root, systemd-udevd will (re-)process all devices, and the
+ * Device.found and Device.state will be adjusted.
+ *
+ * - On reload or reexecute, we can trust Device.enumerated_found, Device.deserialized_found, and
+ * Device.deserialized_state. Of course, deserialized parameters may be outdated, but the unit
+ * state can be adjusted later by device_catchup() or uevents. */
+
+ if (MANAGER_IS_SWITCHING_ROOT(m) &&
+ !FLAGS_SET(d->enumerated_found, DEVICE_FOUND_UDEV)) {
+
+ /* The device has not been enumerated. On switching-root, such situation is natural. See the
+ * above comment. To prevent problematic state transition active → dead → active, let's
+ * drop the DEVICE_FOUND_UDEV flag and downgrade state to DEVICE_TENTATIVE(activating). See
+ * issue #12953 and #23208. */
+ found &= ~DEVICE_FOUND_UDEV;
+ if (state == DEVICE_PLUGGED)
+ state = DEVICE_TENTATIVE;
+
+ /* Also check the validity of the device syspath. Without this check, if the device was
+ * removed while switching root, it would never go to inactive state, as both Device.found
+ * and Device.enumerated_found do not have the DEVICE_FOUND_UDEV flag, so device_catchup() in
+ * device_update_found_one() does nothing in most cases. See issue #25106. Note that the
+ * syspath field is only serialized when systemd is sufficiently new and the device has been
+ * already processed by udevd. */
+ if (d->deserialized_sysfs) {
+ _cleanup_(sd_device_unrefp) sd_device *dev = NULL;
+
+ if (sd_device_new_from_syspath(&dev, d->deserialized_sysfs) < 0)
+ state = DEVICE_DEAD;
+ }
+ }
+
+ if (d->found == found && d->state == state)
+ return 0;
+
+ d->found = found;
+ device_set_state(d, state);
+ return 0;
+}
+
+static void device_catchup(Unit *u) {
+ Device *d = DEVICE(u);
+
+ assert(d);
+
+ /* Second, let's update the state with the enumerated state */
+ device_update_found_one(d, d->enumerated_found, DEVICE_FOUND_MASK);
+}
+
+static const struct {
+ DeviceFound flag;
+ const char *name;
+} device_found_map[] = {
+ { DEVICE_FOUND_UDEV, "found-udev" },
+ { DEVICE_FOUND_MOUNT, "found-mount" },
+ { DEVICE_FOUND_SWAP, "found-swap" },
+};
+
+static int device_found_to_string_many(DeviceFound flags, char **ret) {
+ _cleanup_free_ char *s = NULL;
+
+ assert(ret);
+
+ for (size_t i = 0; i < ELEMENTSOF(device_found_map); i++) {
+ if (!FLAGS_SET(flags, device_found_map[i].flag))
+ continue;
+
+ if (!strextend_with_separator(&s, ",", device_found_map[i].name))
+ return -ENOMEM;
+ }
+
+ *ret = TAKE_PTR(s);
+
+ return 0;
+}
+
+static int device_found_from_string_many(const char *name, DeviceFound *ret) {
+ DeviceFound flags = 0;
+ int r;
+
+ assert(ret);
+
+ for (;;) {
+ _cleanup_free_ char *word = NULL;
+ DeviceFound f = 0;
+ unsigned i;
+
+ r = extract_first_word(&name, &word, ",", 0);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ for (i = 0; i < ELEMENTSOF(device_found_map); i++)
+ if (streq(word, device_found_map[i].name)) {
+ f = device_found_map[i].flag;
+ break;
+ }
+
+ if (f == 0)
+ return -EINVAL;
+
+ flags |= f;
+ }
+
+ *ret = flags;
+ return 0;
+}
+
+static int device_serialize(Unit *u, FILE *f, FDSet *fds) {
+ _cleanup_free_ char *s = NULL;
+ Device *d = DEVICE(u);
+
+ assert(d);
+ assert(u);
+ assert(f);
+ assert(fds);
+
+ if (d->sysfs)
+ (void) serialize_item(f, "sysfs", d->sysfs);
+
+ if (d->path)
+ (void) serialize_item(f, "path", d->path);
+
+ (void) serialize_item(f, "state", device_state_to_string(d->state));
+
+ if (device_found_to_string_many(d->found, &s) >= 0)
+ (void) serialize_item(f, "found", s);
+
+ return 0;
+}
+
+static int device_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
+ Device *d = DEVICE(u);
+ int r;
+
+ assert(d);
+ assert(u);
+ assert(key);
+ assert(value);
+ assert(fds);
+
+ if (streq(key, "sysfs")) {
+ if (!d->deserialized_sysfs) {
+ d->deserialized_sysfs = strdup(value);
+ if (!d->deserialized_sysfs)
+ log_oom_debug();
+ }
+
+ } else if (streq(key, "path")) {
+ if (!d->path) {
+ d->path = strdup(value);
+ if (!d->path)
+ log_oom_debug();
+ }
+
+ } else if (streq(key, "state")) {
+ DeviceState state;
+
+ state = device_state_from_string(value);
+ if (state < 0)
+ log_unit_debug(u, "Failed to parse state value, ignoring: %s", value);
+ else
+ d->deserialized_state = state;
+
+ } else if (streq(key, "found")) {
+ r = device_found_from_string_many(value, &d->deserialized_found);
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to parse found value '%s', ignoring: %m", value);
+
+ } else
+ log_unit_debug(u, "Unknown serialization key: %s", key);
+
+ return 0;
+}
+
+static void device_dump(Unit *u, FILE *f, const char *prefix) {
+ Device *d = DEVICE(u);
+ _cleanup_free_ char *s = NULL;
+
+ assert(d);
+
+ (void) device_found_to_string_many(d->found, &s);
+
+ fprintf(f,
+ "%sDevice State: %s\n"
+ "%sDevice Path: %s\n"
+ "%sSysfs Path: %s\n"
+ "%sFound: %s\n",
+ prefix, device_state_to_string(d->state),
+ prefix, strna(d->path),
+ prefix, strna(d->sysfs),
+ prefix, strna(s));
+
+ STRV_FOREACH(i, d->wants_property)
+ fprintf(f, "%sudev SYSTEMD_WANTS: %s\n",
+ prefix, *i);
+}
+
+_pure_ static UnitActiveState device_active_state(Unit *u) {
+ assert(u);
+
+ return state_translation_table[DEVICE(u)->state];
+}
+
+_pure_ static const char *device_sub_state_to_string(Unit *u) {
+ assert(u);
+
+ return device_state_to_string(DEVICE(u)->state);
+}
+
+static int device_update_description(Unit *u, sd_device *dev, const char *path) {
+ _cleanup_free_ char *j = NULL;
+ const char *model, *label, *desc;
+ int r;
+
+ assert(u);
+ assert(path);
+
+ desc = path;
+
+ if (dev &&
+ (sd_device_get_property_value(dev, "ID_MODEL_FROM_DATABASE", &model) >= 0 ||
+ sd_device_get_property_value(dev, "ID_MODEL", &model) >= 0)) {
+ desc = model;
+
+ /* Try to concatenate the device model string with a label, if there is one */
+ if (sd_device_get_property_value(dev, "ID_FS_LABEL", &label) >= 0 ||
+ sd_device_get_property_value(dev, "ID_PART_ENTRY_NAME", &label) >= 0 ||
+ sd_device_get_property_value(dev, "ID_PART_ENTRY_NUMBER", &label) >= 0) {
+
+ desc = j = strjoin(model, " ", label);
+ if (!j)
+ return log_oom();
+ }
+ }
+
+ r = unit_set_description(u, desc);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to set device description: %m");
+
+ return 0;
+}
+
+static int device_add_udev_wants(Unit *u, sd_device *dev) {
+ _cleanup_strv_free_ char **added = NULL;
+ const char *wants, *property;
+ Device *d = DEVICE(u);
+ int r;
+
+ assert(d);
+ assert(dev);
+
+ property = MANAGER_IS_USER(u->manager) ? "SYSTEMD_USER_WANTS" : "SYSTEMD_WANTS";
+
+ r = sd_device_get_property_value(dev, property, &wants);
+ if (r < 0)
+ return 0;
+
+ for (;;) {
+ _cleanup_free_ char *word = NULL, *k = NULL;
+
+ r = extract_first_word(&wants, &word, NULL, EXTRACT_UNQUOTE);
+ if (r == 0)
+ break;
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to parse property %s with value %s: %m", property, wants);
+
+ if (unit_name_is_valid(word, UNIT_NAME_TEMPLATE) && d->sysfs) {
+ _cleanup_free_ char *escaped = NULL;
+
+ /* If the unit name is specified as template, then automatically fill in the sysfs path of the
+ * device as instance name, properly escaped. */
+
+ r = unit_name_path_escape(d->sysfs, &escaped);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to escape %s: %m", d->sysfs);
+
+ r = unit_name_replace_instance(word, escaped, &k);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to build %s instance of template %s: %m", escaped, word);
+ } else {
+ /* If this is not a template, then let's mangle it so, that it becomes a valid unit name. */
+
+ r = unit_name_mangle(word, UNIT_NAME_MANGLE_WARN, &k);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to mangle unit name \"%s\": %m", word);
+ }
+
+ r = unit_add_dependency_by_name(u, UNIT_WANTS, k, true, UNIT_DEPENDENCY_UDEV);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to add Wants= dependency: %m");
+
+ r = strv_consume(&added, TAKE_PTR(k));
+ if (r < 0)
+ return log_oom();
+ }
+
+ if (d->state != DEVICE_DEAD)
+ /* So here's a special hack, to compensate for the fact that the udev database's reload cycles are not
+ * synchronized with our own reload cycles: when we detect that the SYSTEMD_WANTS property of a device
+ * changes while the device unit is already up, let's skip to trigger units that were already listed
+ * and are active, and start units otherwise. This typically happens during the boot-time switch root
+ * transition, as udev devices will generally already be up in the initrd, but SYSTEMD_WANTS properties
+ * get then added through udev rules only available on the host system, and thus only when the initial
+ * udev coldplug trigger runs.
+ *
+ * We do this only if the device has been up already when we parse this, as otherwise the usual
+ * dependency logic that is run from the dead → plugged transition will trigger these deps. */
+ STRV_FOREACH(i, added) {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+
+ if (strv_contains(d->wants_property, *i)) {
+ Unit *v;
+
+ v = manager_get_unit(u->manager, *i);
+ if (v && UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(v)))
+ continue; /* The unit was already listed and is running. */
+ }
+
+ r = manager_add_job_by_name(u->manager, JOB_START, *i, JOB_FAIL, NULL, &error, NULL);
+ if (r < 0)
+ log_unit_full_errno(u, sd_bus_error_has_name(&error, BUS_ERROR_NO_SUCH_UNIT) ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to enqueue %s job, ignoring: %s", property, bus_error_message(&error, r));
+ }
+
+ return strv_free_and_replace(d->wants_property, added);
+}
+
+static bool device_is_bound_by_mounts(Device *d, sd_device *dev) {
+ int r;
+
+ assert(d);
+ assert(dev);
+
+ r = device_get_property_bool(dev, "SYSTEMD_MOUNT_DEVICE_BOUND");
+ if (r < 0 && r != -ENOENT)
+ log_device_warning_errno(dev, r, "Failed to parse SYSTEMD_MOUNT_DEVICE_BOUND= udev property, ignoring: %m");
+
+ d->bind_mounts = r > 0;
+
+ return d->bind_mounts;
+}
+
+static void device_upgrade_mount_deps(Unit *u) {
+ Unit *other;
+ void *v;
+ int r;
+
+ /* Let's upgrade Requires= to BindsTo= on us. (Used when SYSTEMD_MOUNT_DEVICE_BOUND is set) */
+
+ HASHMAP_FOREACH_KEY(v, other, unit_get_dependencies(u, UNIT_REQUIRED_BY)) {
+ if (other->type != UNIT_MOUNT)
+ continue;
+
+ r = unit_add_dependency(other, UNIT_BINDS_TO, u, true, UNIT_DEPENDENCY_UDEV);
+ if (r < 0)
+ log_unit_warning_errno(u, r, "Failed to add BindsTo= dependency between device and mount unit, ignoring: %m");
+ }
+}
+
+static int device_setup_unit(Manager *m, sd_device *dev, const char *path, bool main, Set **units) {
+ _cleanup_(unit_freep) Unit *new_unit = NULL;
+ _cleanup_free_ char *e = NULL;
+ const char *sysfs = NULL;
+ Unit *u;
+ int r;
+
+ assert(m);
+ assert(path);
+
+ if (dev) {
+ r = sd_device_get_syspath(dev, &sysfs);
+ if (r < 0)
+ return log_device_debug_errno(dev, r, "Couldn't get syspath from device, ignoring: %m");
+ }
+
+ r = unit_name_from_path(path, ".device", &e);
+ if (r < 0)
+ return log_struct_errno(
+ LOG_WARNING, r,
+ "MESSAGE_ID=" SD_MESSAGE_DEVICE_PATH_NOT_SUITABLE_STR,
+ "DEVICE=%s", path,
+ LOG_MESSAGE("Failed to generate valid unit name from device path '%s', ignoring device: %m",
+ path));
+
+ u = manager_get_unit(m, e);
+ if (u) {
+ /* The device unit can still be present even if the device was unplugged: a mount unit can reference it
+ * hence preventing the GC to have garbaged it. That's desired since the device unit may have a
+ * dependency on the mount unit which was added during the loading of the later. When the device is
+ * plugged the sysfs might not be initialized yet, as we serialize the device's state but do not
+ * serialize the sysfs path across reloads/reexecs. Hence, when coming back from a reload/restart we
+ * might have the state valid, but not the sysfs path. Also, there is another possibility; when multiple
+ * devices have the same devlink (e.g. /dev/disk/by-uuid/xxxx), adding/updating/removing one of the
+ * device causes syspath change. Hence, let's always update sysfs path.*/
+
+ /* Let's remove all dependencies generated due to udev properties. We'll re-add whatever is configured
+ * now below. */
+ unit_remove_dependencies(u, UNIT_DEPENDENCY_UDEV);
+
+ } else {
+ r = unit_new_for_name(m, sizeof(Device), e, &new_unit);
+ if (r < 0)
+ return log_device_error_errno(dev, r, "Failed to allocate device unit %s: %m", e);
+
+ u = new_unit;
+
+ unit_add_to_load_queue(u);
+ }
+
+ if (!DEVICE(u)->path) {
+ DEVICE(u)->path = strdup(path);
+ if (!DEVICE(u)->path)
+ return log_oom();
+ }
+
+ /* If this was created via some dependency and has not actually been seen yet ->sysfs will not be
+ * initialized. Hence initialize it if necessary. */
+ if (sysfs) {
+ r = device_set_sysfs(DEVICE(u), sysfs);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to set sysfs path %s: %m", sysfs);
+
+ /* The additional systemd udev properties we only interpret for the main object */
+ if (main)
+ (void) device_add_udev_wants(u, dev);
+ }
+
+ (void) device_update_description(u, dev, path);
+
+ /* So the user wants the mount units to be bound to the device but a mount unit might has been seen
+ * by systemd before the device appears on its radar. In this case the device unit is partially
+ * initialized and includes the deps on the mount unit but at that time the "bind mounts" flag wasn't
+ * present. Fix this up now. */
+ if (dev && device_is_bound_by_mounts(DEVICE(u), dev))
+ device_upgrade_mount_deps(u);
+
+ if (units) {
+ r = set_ensure_put(units, NULL, DEVICE(u));
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to store unit: %m");
+ }
+
+ TAKE_PTR(new_unit);
+ return 0;
+}
+
+static bool device_is_ready(sd_device *dev) {
+ int r;
+
+ assert(dev);
+
+ if (device_for_action(dev, SD_DEVICE_REMOVE))
+ return false;
+
+ r = device_is_renaming(dev);
+ if (r < 0)
+ log_device_warning_errno(dev, r, "Failed to check if device is renaming, assuming device is not renaming: %m");
+ if (r > 0) {
+ log_device_debug(dev, "Device busy: device is renaming");
+ return false;
+ }
+
+ /* Is it really tagged as 'systemd' right now? */
+ r = sd_device_has_current_tag(dev, "systemd");
+ if (r < 0)
+ log_device_warning_errno(dev, r, "Failed to check if device has \"systemd\" tag, assuming device is not tagged with \"systemd\": %m");
+ if (r == 0)
+ log_device_debug(dev, "Device busy: device is not tagged with \"systemd\"");
+ if (r <= 0)
+ return false;
+
+ r = device_get_property_bool(dev, "SYSTEMD_READY");
+ if (r < 0 && r != -ENOENT)
+ log_device_warning_errno(dev, r, "Failed to get device SYSTEMD_READY property, assuming device does not have \"SYSTEMD_READY\" property: %m");
+ if (r == 0)
+ log_device_debug(dev, "Device busy: SYSTEMD_READY property from device is false");
+
+ return r != 0;
+}
+
+static int device_setup_devlink_unit_one(Manager *m, const char *devlink, Set **ready_units, Set **not_ready_units) {
+ _cleanup_(sd_device_unrefp) sd_device *dev = NULL;
+ Unit *u;
+
+ assert(m);
+ assert(devlink);
+ assert(ready_units);
+ assert(not_ready_units);
+
+ if (sd_device_new_from_devname(&dev, devlink) >= 0 && device_is_ready(dev))
+ return device_setup_unit(m, dev, devlink, /* main = */ false, ready_units);
+
+ /* the devlink is already removed or not ready */
+ if (device_by_path(m, devlink, &u) < 0)
+ return 0; /* The corresponding .device unit not found. That's fine. */
+
+ return set_ensure_put(not_ready_units, NULL, DEVICE(u));
+}
+
+static int device_setup_extra_units(Manager *m, sd_device *dev, Set **ready_units, Set **not_ready_units) {
+ _cleanup_strv_free_ char **aliases = NULL;
+ const char *devlink, *syspath, *devname = NULL;
+ Device *l;
+ int r;
+
+ assert(m);
+ assert(dev);
+ assert(ready_units);
+ assert(not_ready_units);
+
+ r = sd_device_get_syspath(dev, &syspath);
+ if (r < 0)
+ return r;
+
+ (void) sd_device_get_devname(dev, &devname);
+
+ /* devlink units */
+ FOREACH_DEVICE_DEVLINK(dev, devlink) {
+ /* These are a kind of special devlink. They should be always unique, but neither persistent
+ * nor predictable. Hence, let's refuse them. See also the comments for alias units below. */
+ if (PATH_STARTSWITH_SET(devlink, "/dev/block/", "/dev/char/"))
+ continue;
+
+ (void) device_setup_devlink_unit_one(m, devlink, ready_units, not_ready_units);
+ }
+
+ if (device_is_ready(dev)) {
+ const char *s;
+
+ r = sd_device_get_property_value(dev, "SYSTEMD_ALIAS", &s);
+ if (r < 0 && r != -ENOENT)
+ log_device_warning_errno(dev, r, "Failed to get SYSTEMD_ALIAS property, ignoring: %m");
+ if (r >= 0) {
+ r = strv_split_full(&aliases, s, NULL, EXTRACT_UNQUOTE);
+ if (r < 0)
+ log_device_warning_errno(dev, r, "Failed to parse SYSTEMD_ALIAS property, ignoring: %m");
+ }
+ }
+
+ /* alias units */
+ STRV_FOREACH(alias, aliases) {
+ if (!path_is_absolute(*alias)) {
+ log_device_warning(dev, "The alias \"%s\" specified in SYSTEMD_ALIAS is not an absolute path, ignoring.", *alias);
+ continue;
+ }
+
+ if (!path_is_safe(*alias)) {
+ log_device_warning(dev, "The alias \"%s\" specified in SYSTEMD_ALIAS is not safe, ignoring.", *alias);
+ continue;
+ }
+
+ /* Note, even if the devlink is not persistent, LVM expects /dev/block/ symlink units exist.
+ * To achieve that, they set the path to SYSTEMD_ALIAS. Hence, we cannot refuse aliases start
+ * with /dev/, unfortunately. */
+
+ (void) device_setup_unit(m, dev, *alias, /* main = */ false, ready_units);
+ }
+
+ l = hashmap_get(m->devices_by_sysfs, syspath);
+ LIST_FOREACH(same_sysfs, d, l) {
+ if (!d->path)
+ continue;
+
+ if (path_equal(d->path, syspath))
+ continue; /* This is the main unit. */
+
+ if (devname && path_equal(d->path, devname))
+ continue; /* This is the real device node. */
+
+ if (device_has_devlink(dev, d->path))
+ continue; /* The devlink was already processed in the above loop. */
+
+ if (strv_contains(aliases, d->path))
+ continue; /* This is already processed in the above, and ready. */
+
+ if (path_startswith(d->path, "/dev/"))
+ /* This is a devlink unit. Check existence and update syspath. */
+ (void) device_setup_devlink_unit_one(m, d->path, ready_units, not_ready_units);
+ else
+ /* This is an alias unit of dropped or not ready device. */
+ (void) set_ensure_put(not_ready_units, NULL, d);
+ }
+
+ return 0;
+}
+
+static int device_setup_units(Manager *m, sd_device *dev, Set **ready_units, Set **not_ready_units) {
+ const char *syspath, *devname = NULL;
+ int r;
+
+ assert(m);
+ assert(dev);
+ assert(ready_units);
+ assert(not_ready_units);
+
+ r = sd_device_get_syspath(dev, &syspath);
+ if (r < 0)
+ return log_device_debug_errno(dev, r, "Couldn't get syspath from device, ignoring: %m");
+
+ /* First, process the main (that is, points to the syspath) and (real, not symlink) devnode units. */
+ if (device_for_action(dev, SD_DEVICE_REMOVE))
+ /* If the device is removed, the main and devnode units units will be removed by
+ * device_update_found_by_sysfs() in device_dispatch_io(). Hence, it is not necessary to
+ * store them to not_ready_units, and we have nothing to do here.
+ *
+ * Note, still we need to process devlink units below, as a devlink previously points to this
+ * device may still exist and now point to another device node. That is, do not forget to
+ * call device_setup_extra_units(). */
+ ;
+ else if (device_is_ready(dev)) {
+ /* Add the main unit named after the syspath. If this one fails, don't bother with the rest,
+ * as this one shall be the main device unit the others just follow. (Compare with how
+ * device_following() is implemented, see below, which looks for the sysfs device.) */
+ r = device_setup_unit(m, dev, syspath, /* main = */ true, ready_units);
+ if (r < 0)
+ return r;
+
+ /* Add an additional unit for the device node */
+ if (sd_device_get_devname(dev, &devname) >= 0)
+ (void) device_setup_unit(m, dev, devname, /* main = */ false, ready_units);
+
+ } else {
+ Unit *u;
+
+ /* If the device exists but not ready, then save the units and unset udev bits later. */
+
+ if (device_by_path(m, syspath, &u) >= 0) {
+ r = set_ensure_put(not_ready_units, NULL, DEVICE(u));
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to store unit, ignoring: %m");
+ }
+
+ if (sd_device_get_devname(dev, &devname) >= 0 &&
+ device_by_path(m, devname, &u) >= 0) {
+ r = set_ensure_put(not_ready_units, NULL, DEVICE(u));
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to store unit, ignoring: %m");
+ }
+ }
+
+ /* Next, add/update additional .device units point to aliases and symlinks. */
+ (void) device_setup_extra_units(m, dev, ready_units, not_ready_units);
+
+ /* Safety check: no unit should be in ready_units and not_ready_units simultaneously. */
+ Unit *u;
+ SET_FOREACH(u, *not_ready_units)
+ if (set_remove(*ready_units, u))
+ log_unit_error(u, "Cannot activate and deactivate the unit simultaneously. Deactivating.");
+
+ return 0;
+}
+
+static Unit *device_following(Unit *u) {
+ Device *d = DEVICE(u);
+ Device *first = NULL;
+
+ assert(d);
+
+ if (startswith(u->id, "sys-"))
+ return NULL;
+
+ /* Make everybody follow the unit that's named after the sysfs path */
+ LIST_FOREACH(same_sysfs, other, d->same_sysfs_next)
+ if (startswith(UNIT(other)->id, "sys-"))
+ return UNIT(other);
+
+ LIST_FOREACH_BACKWARDS(same_sysfs, other, d->same_sysfs_prev) {
+ if (startswith(UNIT(other)->id, "sys-"))
+ return UNIT(other);
+
+ first = other;
+ }
+
+ return UNIT(first);
+}
+
+static int device_following_set(Unit *u, Set **_set) {
+ Device *d = DEVICE(u);
+ _cleanup_set_free_ Set *set = NULL;
+ int r;
+
+ assert(d);
+ assert(_set);
+
+ if (LIST_JUST_US(same_sysfs, d)) {
+ *_set = NULL;
+ return 0;
+ }
+
+ set = set_new(NULL);
+ if (!set)
+ return -ENOMEM;
+
+ LIST_FOREACH(same_sysfs, other, d->same_sysfs_next) {
+ r = set_put(set, other);
+ if (r < 0)
+ return r;
+ }
+
+ LIST_FOREACH_BACKWARDS(same_sysfs, other, d->same_sysfs_prev) {
+ r = set_put(set, other);
+ if (r < 0)
+ return r;
+ }
+
+ *_set = TAKE_PTR(set);
+ return 1;
+}
+
+static void device_shutdown(Manager *m) {
+ assert(m);
+
+ m->device_monitor = sd_device_monitor_unref(m->device_monitor);
+ m->devices_by_sysfs = hashmap_free(m->devices_by_sysfs);
+}
+
+static void device_enumerate(Manager *m) {
+ _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL;
+ sd_device *dev;
+ int r;
+
+ assert(m);
+
+ if (!m->device_monitor) {
+ r = sd_device_monitor_new(&m->device_monitor);
+ if (r < 0) {
+ log_error_errno(r, "Failed to allocate device monitor: %m");
+ goto fail;
+ }
+
+ r = sd_device_monitor_filter_add_match_tag(m->device_monitor, "systemd");
+ if (r < 0) {
+ log_error_errno(r, "Failed to add udev tag match: %m");
+ goto fail;
+ }
+
+ r = sd_device_monitor_attach_event(m->device_monitor, m->event);
+ if (r < 0) {
+ log_error_errno(r, "Failed to attach event to device monitor: %m");
+ goto fail;
+ }
+
+ r = sd_device_monitor_start(m->device_monitor, device_dispatch_io, m);
+ if (r < 0) {
+ log_error_errno(r, "Failed to start device monitor: %m");
+ goto fail;
+ }
+ }
+
+ r = sd_device_enumerator_new(&e);
+ if (r < 0) {
+ log_error_errno(r, "Failed to allocate device enumerator: %m");
+ goto fail;
+ }
+
+ r = sd_device_enumerator_add_match_tag(e, "systemd");
+ if (r < 0) {
+ log_error_errno(r, "Failed to set tag for device enumeration: %m");
+ goto fail;
+ }
+
+ FOREACH_DEVICE(e, dev) {
+ _cleanup_set_free_ Set *ready_units = NULL, *not_ready_units = NULL;
+ Device *d;
+
+ if (device_setup_units(m, dev, &ready_units, &not_ready_units) < 0)
+ continue;
+
+ SET_FOREACH(d, ready_units)
+ device_update_found_one(d, DEVICE_FOUND_UDEV, DEVICE_FOUND_UDEV);
+ SET_FOREACH(d, not_ready_units)
+ device_update_found_one(d, DEVICE_NOT_FOUND, DEVICE_FOUND_UDEV);
+ }
+
+ return;
+
+fail:
+ device_shutdown(m);
+}
+
+static void device_propagate_reload(Manager *m, Device *d) {
+ int r;
+
+ assert(m);
+ assert(d);
+
+ if (d->state == DEVICE_DEAD)
+ return;
+
+ r = manager_propagate_reload(m, UNIT(d), JOB_REPLACE, NULL);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(d), r, "Failed to propagate reload, ignoring: %m");
+}
+
+static void device_remove_old_on_move(Manager *m, sd_device *dev) {
+ _cleanup_free_ char *syspath_old = NULL;
+ const char *devpath_old;
+ int r;
+
+ assert(m);
+ assert(dev);
+
+ r = sd_device_get_property_value(dev, "DEVPATH_OLD", &devpath_old);
+ if (r < 0)
+ return (void) log_device_debug_errno(dev, r, "Failed to get DEVPATH_OLD= property on 'move' uevent, ignoring: %m");
+
+ syspath_old = path_join("/sys", devpath_old);
+ if (!syspath_old)
+ return (void) log_oom();
+
+ device_update_found_by_sysfs(m, syspath_old, DEVICE_NOT_FOUND, DEVICE_FOUND_MASK);
+}
+
+static int device_dispatch_io(sd_device_monitor *monitor, sd_device *dev, void *userdata) {
+ _cleanup_set_free_ Set *ready_units = NULL, *not_ready_units = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ sd_device_action_t action;
+ const char *sysfs;
+ bool ready;
+ Device *d;
+ int r;
+
+ assert(dev);
+
+ log_device_uevent(dev, "Processing udev action");
+
+ r = sd_device_get_syspath(dev, &sysfs);
+ if (r < 0) {
+ log_device_error_errno(dev, r, "Failed to get device syspath, ignoring: %m");
+ return 0;
+ }
+
+ r = sd_device_get_action(dev, &action);
+ if (r < 0) {
+ log_device_error_errno(dev, r, "Failed to get udev action, ignoring: %m");
+ return 0;
+ }
+
+ if (action == SD_DEVICE_MOVE)
+ device_remove_old_on_move(m, dev);
+
+ /* A change event can signal that a device is becoming ready, in particular if the device is using
+ * the SYSTEMD_READY logic in udev so we need to reach the else block of the following if, even for
+ * change events */
+ ready = device_is_ready(dev);
+
+ (void) device_setup_units(m, dev, &ready_units, &not_ready_units);
+
+ if (action == SD_DEVICE_REMOVE) {
+ r = swap_process_device_remove(m, dev);
+ if (r < 0)
+ log_device_warning_errno(dev, r, "Failed to process swap device remove event, ignoring: %m");
+ } else if (ready) {
+ r = swap_process_device_new(m, dev);
+ if (r < 0)
+ log_device_warning_errno(dev, r, "Failed to process swap device new event, ignoring: %m");
+ }
+
+ if (!IN_SET(action, SD_DEVICE_ADD, SD_DEVICE_REMOVE, SD_DEVICE_MOVE))
+ SET_FOREACH(d, ready_units)
+ device_propagate_reload(m, d);
+
+ if (!set_isempty(ready_units))
+ manager_dispatch_load_queue(m);
+
+ if (action == SD_DEVICE_REMOVE)
+ /* If we get notified that a device was removed by udev, then it's completely gone, hence
+ * unset all found bits. Note this affects all .device units still point to the removed
+ * device. */
+ device_update_found_by_sysfs(m, sysfs, DEVICE_NOT_FOUND, DEVICE_FOUND_MASK);
+
+ /* These devices are found and ready now, set the udev found bit. Note, this is also necessary to do
+ * on remove uevent, as some devlinks may be updated and now point to other device nodes. */
+ SET_FOREACH(d, ready_units)
+ device_update_found_one(d, DEVICE_FOUND_UDEV, DEVICE_FOUND_UDEV);
+
+ /* These devices may be nominally around, but not ready for us. Hence unset the udev bit, but leave
+ * the rest around. This may be redundant for remove uevent, but should be harmless. */
+ SET_FOREACH(d, not_ready_units)
+ device_update_found_one(d, DEVICE_NOT_FOUND, DEVICE_FOUND_UDEV);
+
+ return 0;
+}
+
+void device_found_node(Manager *m, const char *node, DeviceFound found, DeviceFound mask) {
+ int r;
+
+ assert(m);
+ assert(node);
+ assert(!FLAGS_SET(mask, DEVICE_FOUND_UDEV));
+
+ if (!udev_available())
+ return;
+
+ if (mask == 0)
+ return;
+
+ /* This is called whenever we find a device referenced in /proc/swaps or /proc/self/mounts. Such a device might
+ * be mounted/enabled at a time where udev has not finished probing it yet, and we thus haven't learned about
+ * it yet. In this case we will set the device unit to "tentative" state.
+ *
+ * This takes a pair of DeviceFound flags parameters. The 'mask' parameter is a bit mask that indicates which
+ * bits of 'found' to copy into the per-device DeviceFound flags field. Thus, this function may be used to set
+ * and unset individual bits in a single call, while merging partially with previous state. */
+
+ if ((found & mask) != 0) {
+ _cleanup_(sd_device_unrefp) sd_device *dev = NULL;
+
+ /* If the device is known in the kernel and newly appeared, then we'll create a device unit for it,
+ * under the name referenced in /proc/swaps or /proc/self/mountinfo. But first, let's validate if
+ * everything is alright with the device node. Note that we're fine with missing device nodes,
+ * but not with badly set up ones. */
+
+ r = sd_device_new_from_devname(&dev, node);
+ if (r == -ENODEV)
+ log_debug("Could not find device for %s, continuing without device node", node);
+ else if (r < 0) {
+ /* Reduce log noise from nodes which are not device nodes by skipping EINVAL. */
+ if (r != -EINVAL)
+ log_error_errno(r, "Failed to open %s device, ignoring: %m", node);
+ return;
+ }
+
+ (void) device_setup_unit(m, dev, node, /* main = */ false, NULL); /* 'dev' may be NULL. */
+ }
+
+ /* Update the device unit's state, should it exist */
+ (void) device_update_found_by_name(m, node, found, mask);
+}
+
+bool device_shall_be_bound_by(Unit *device, Unit *u) {
+ assert(device);
+ assert(u);
+
+ if (u->type != UNIT_MOUNT)
+ return false;
+
+ return DEVICE(device)->bind_mounts;
+}
+
+const UnitVTable device_vtable = {
+ .object_size = sizeof(Device),
+ .sections =
+ "Unit\0"
+ "Device\0"
+ "Install\0",
+
+ .gc_jobs = true,
+
+ .init = device_init,
+ .done = device_done,
+ .load = device_load,
+
+ .coldplug = device_coldplug,
+ .catchup = device_catchup,
+
+ .serialize = device_serialize,
+ .deserialize_item = device_deserialize_item,
+
+ .dump = device_dump,
+
+ .active_state = device_active_state,
+ .sub_state_to_string = device_sub_state_to_string,
+
+ .following = device_following,
+ .following_set = device_following_set,
+
+ .enumerate = device_enumerate,
+ .shutdown = device_shutdown,
+ .supported = udev_available,
+
+ .status_message_formats = {
+ .starting_stopping = {
+ [0] = "Expecting device %s...",
+ [1] = "Waiting for device %s to disappear...",
+ },
+ .finished_start_job = {
+ [JOB_DONE] = "Found device %s.",
+ [JOB_TIMEOUT] = "Timed out waiting for device %s.",
+ },
+ },
+};
diff --git a/src/core/device.h b/src/core/device.h
new file mode 100644
index 0000000..9dd6fb5
--- /dev/null
+++ b/src/core/device.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "unit.h"
+
+typedef struct Device Device;
+
+/* A mask specifying where we have seen the device currently. This is a bitmask because the device might show up
+ * asynchronously from each other at various places. For example, in very common case a device might already be mounted
+ * before udev finished probing it (think: a script setting up a loopback block device, formatting it and mounting it
+ * in quick succession). Hence we need to track precisely where it is already visible and where not. */
+typedef enum DeviceFound {
+ DEVICE_NOT_FOUND = 0,
+ DEVICE_FOUND_UDEV = 1 << 0, /* The device has shown up in the udev database */
+ DEVICE_FOUND_MOUNT = 1 << 1, /* The device has shown up in /proc/self/mountinfo */
+ DEVICE_FOUND_SWAP = 1 << 2, /* The device has shown up in /proc/swaps */
+ DEVICE_FOUND_MASK = DEVICE_FOUND_UDEV|DEVICE_FOUND_MOUNT|DEVICE_FOUND_SWAP,
+} DeviceFound;
+
+struct Device {
+ Unit meta;
+
+ char *sysfs, *deserialized_sysfs;
+ char *path; /* syspath, device node, alias, or devlink */
+
+ /* In order to be able to distinguish dependencies on different device nodes we might end up creating multiple
+ * devices for the same sysfs path. We chain them up here. */
+ LIST_FIELDS(struct Device, same_sysfs);
+
+ DeviceState state, deserialized_state;
+ DeviceFound found, deserialized_found, enumerated_found;
+
+ bool bind_mounts;
+
+ /* The SYSTEMD_WANTS udev property for this device the last time we saw it */
+ char **wants_property;
+};
+
+extern const UnitVTable device_vtable;
+
+void device_found_node(Manager *m, const char *node, DeviceFound found, DeviceFound mask);
+bool device_shall_be_bound_by(Unit *device, Unit *u);
+
+DEFINE_CAST(DEVICE, Device);
diff --git a/src/core/dynamic-user.c b/src/core/dynamic-user.c
new file mode 100644
index 0000000..18b36e7
--- /dev/null
+++ b/src/core/dynamic-user.c
@@ -0,0 +1,826 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "clean-ipc.h"
+#include "dynamic-user.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "fs-util.h"
+#include "io-util.h"
+#include "nscd-flush.h"
+#include "parse-util.h"
+#include "random-util.h"
+#include "serialize.h"
+#include "socket-util.h"
+#include "stdio-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "uid-alloc-range.h"
+#include "user-util.h"
+
+/* Takes a value generated randomly or by hashing and turns it into a UID in the right range */
+#define UID_CLAMP_INTO_RANGE(rnd) (((uid_t) (rnd) % (DYNAMIC_UID_MAX - DYNAMIC_UID_MIN + 1)) + DYNAMIC_UID_MIN)
+
+DEFINE_PRIVATE_TRIVIAL_REF_FUNC(DynamicUser, dynamic_user);
+
+static DynamicUser* dynamic_user_free(DynamicUser *d) {
+ if (!d)
+ return NULL;
+
+ if (d->manager)
+ (void) hashmap_remove(d->manager->dynamic_users, d->name);
+
+ safe_close_pair(d->storage_socket);
+ return mfree(d);
+}
+
+static int dynamic_user_add(Manager *m, const char *name, int storage_socket[static 2], DynamicUser **ret) {
+ DynamicUser *d;
+ int r;
+
+ assert(m);
+ assert(name);
+ assert(storage_socket);
+
+ r = hashmap_ensure_allocated(&m->dynamic_users, &string_hash_ops);
+ if (r < 0)
+ return r;
+
+ d = malloc0(offsetof(DynamicUser, name) + strlen(name) + 1);
+ if (!d)
+ return -ENOMEM;
+
+ strcpy(d->name, name);
+
+ d->storage_socket[0] = storage_socket[0];
+ d->storage_socket[1] = storage_socket[1];
+
+ r = hashmap_put(m->dynamic_users, d->name, d);
+ if (r < 0) {
+ free(d);
+ return r;
+ }
+
+ d->manager = m;
+
+ if (ret)
+ *ret = d;
+
+ return 0;
+}
+
+static int dynamic_user_acquire(Manager *m, const char *name, DynamicUser** ret) {
+ _cleanup_close_pair_ int storage_socket[2] = { -1, -1 };
+ DynamicUser *d;
+ int r;
+
+ assert(m);
+ assert(name);
+
+ /* Return the DynamicUser structure for a specific user name. Note that this won't actually allocate a UID for
+ * it, but just prepare the data structure for it. The UID is allocated only on demand, when it's really
+ * needed, and in the child process we fork off, since allocation involves NSS checks which are not OK to do
+ * from PID 1. To allow the children and PID 1 share information about allocated UIDs we use an anonymous
+ * AF_UNIX/SOCK_DGRAM socket (called the "storage socket") that contains at most one datagram with the
+ * allocated UID number, plus an fd referencing the lock file for the UID
+ * (i.e. /run/systemd/dynamic-uid/$UID). Why involve the socket pair? So that PID 1 and all its children can
+ * share the same storage for the UID and lock fd, simply by inheriting the storage socket fds. The socket pair
+ * may exist in three different states:
+ *
+ * a) no datagram stored. This is the initial state. In this case the dynamic user was never realized.
+ *
+ * b) a datagram containing a UID stored, but no lock fd attached to it. In this case there was already a
+ * statically assigned UID by the same name, which we are reusing.
+ *
+ * c) a datagram containing a UID stored, and a lock fd is attached to it. In this case we allocated a dynamic
+ * UID and locked it in the file system, using the lock fd.
+ *
+ * As PID 1 and various children might access the socket pair simultaneously, and pop the datagram or push it
+ * back in any time, we also maintain a lock on the socket pair. Note one peculiarity regarding locking here:
+ * the UID lock on disk is protected via a BSD file lock (i.e. an fd-bound lock), so that the lock is kept in
+ * place as long as there's a reference to the fd open. The lock on the storage socket pair however is a POSIX
+ * file lock (i.e. a process-bound lock), as all users share the same fd of this (after all it is anonymous,
+ * nobody else could get any access to it except via our own fd) and we want to synchronize access between all
+ * processes that have access to it. */
+
+ d = hashmap_get(m->dynamic_users, name);
+ if (d) {
+ if (ret) {
+ /* We already have a structure for the dynamic user, let's increase the ref count and reuse it */
+ d->n_ref++;
+ *ret = d;
+ }
+ return 0;
+ }
+
+ if (!valid_user_group_name(name, VALID_USER_ALLOW_NUMERIC))
+ return -EINVAL;
+
+ if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, storage_socket) < 0)
+ return -errno;
+
+ r = dynamic_user_add(m, name, storage_socket, &d);
+ if (r < 0)
+ return r;
+
+ storage_socket[0] = storage_socket[1] = -1;
+
+ if (ret) {
+ d->n_ref++;
+ *ret = d;
+ }
+
+ return 1;
+}
+
+static int make_uid_symlinks(uid_t uid, const char *name, bool b) {
+
+ char path1[STRLEN("/run/systemd/dynamic-uid/direct:") + DECIMAL_STR_MAX(uid_t) + 1];
+ const char *path2;
+ int r = 0, k;
+
+ /* Add direct additional symlinks for direct lookups of dynamic UIDs and their names by userspace code. The
+ * only reason we have this is because dbus-daemon cannot use D-Bus for resolving users and groups (since it
+ * would be its own client then). We hence keep these world-readable symlinks in place, so that the
+ * unprivileged dbus user can read the mappings when it needs them via these symlinks instead of having to go
+ * via the bus. Ideally, we'd use the lock files we keep for this anyway, but we can't since we use BSD locks
+ * on them and as those may be taken by any user with read access we can't make them world-readable. */
+
+ xsprintf(path1, "/run/systemd/dynamic-uid/direct:" UID_FMT, uid);
+ if (unlink(path1) < 0 && errno != ENOENT)
+ r = -errno;
+
+ if (b && symlink(name, path1) < 0) {
+ k = log_warning_errno(errno, "Failed to symlink \"%s\": %m", path1);
+ if (r == 0)
+ r = k;
+ }
+
+ path2 = strjoina("/run/systemd/dynamic-uid/direct:", name);
+ if (unlink(path2) < 0 && errno != ENOENT) {
+ k = -errno;
+ if (r == 0)
+ r = k;
+ }
+
+ if (b && symlink(path1 + STRLEN("/run/systemd/dynamic-uid/direct:"), path2) < 0) {
+ k = log_warning_errno(errno, "Failed to symlink \"%s\": %m", path2);
+ if (r == 0)
+ r = k;
+ }
+
+ return r;
+}
+
+static int pick_uid(char **suggested_paths, const char *name, uid_t *ret_uid) {
+
+ /* Find a suitable free UID. We use the following strategy to find a suitable UID:
+ *
+ * 1. Initially, we try to read the UID of a number of specified paths. If any of these UIDs works, we use
+ * them. We use in order to increase the chance of UID reuse, if StateDirectory=, CacheDirectory= or
+ * LogsDirectory= are used, as reusing the UID these directories are owned by saves us from having to
+ * recursively chown() them to new users.
+ *
+ * 2. If that didn't yield a currently unused UID, we hash the user name, and try to use that. This should be
+ * pretty good, as the use ris by default derived from the unit name, and hence the same service and same
+ * user should usually get the same UID as long as our hashing doesn't clash.
+ *
+ * 3. Finally, if that didn't work, we randomly pick UIDs, until we find one that is empty.
+ *
+ * Since the dynamic UID space is relatively small we'll stop trying after 100 iterations, giving up. */
+
+ enum {
+ PHASE_SUGGESTED, /* the first phase, reusing directory ownership UIDs */
+ PHASE_HASHED, /* the second phase, deriving a UID from the username by hashing */
+ PHASE_RANDOM, /* the last phase, randomly picking UIDs */
+ } phase = PHASE_SUGGESTED;
+
+ static const uint8_t hash_key[] = {
+ 0x37, 0x53, 0x7e, 0x31, 0xcf, 0xce, 0x48, 0xf5,
+ 0x8a, 0xbb, 0x39, 0x57, 0x8d, 0xd9, 0xec, 0x59
+ };
+
+ unsigned n_tries = 100, current_suggested = 0;
+ int r;
+
+ (void) mkdir("/run/systemd/dynamic-uid", 0755);
+
+ for (;;) {
+ char lock_path[STRLEN("/run/systemd/dynamic-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
+ _cleanup_close_ int lock_fd = -1;
+ uid_t candidate;
+ ssize_t l;
+
+ if (--n_tries <= 0) /* Give up retrying eventually */
+ return -EBUSY;
+
+ switch (phase) {
+
+ case PHASE_SUGGESTED: {
+ struct stat st;
+
+ if (!suggested_paths || !suggested_paths[current_suggested]) {
+ /* We reached the end of the suggested paths list, let's try by hashing the name */
+ phase = PHASE_HASHED;
+ continue;
+ }
+
+ if (stat(suggested_paths[current_suggested++], &st) < 0)
+ continue; /* We can't read the UID of this path, but that doesn't matter, just try the next */
+
+ candidate = st.st_uid;
+ break;
+ }
+
+ case PHASE_HASHED:
+ /* A static user by this name does not exist yet. Let's find a free ID then, and use that. We
+ * start with a UID generated as hash from the user name. */
+ candidate = UID_CLAMP_INTO_RANGE(siphash24(name, strlen(name), hash_key));
+
+ /* If this one fails, we should proceed with random tries */
+ phase = PHASE_RANDOM;
+ break;
+
+ case PHASE_RANDOM:
+
+ /* Pick another random UID, and see if that works for us. */
+ random_bytes(&candidate, sizeof(candidate));
+ candidate = UID_CLAMP_INTO_RANGE(candidate);
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ /* Make sure whatever we picked here actually is in the right range */
+ if (!uid_is_dynamic(candidate))
+ continue;
+
+ xsprintf(lock_path, "/run/systemd/dynamic-uid/" UID_FMT, candidate);
+
+ for (;;) {
+ struct stat st;
+
+ lock_fd = open(lock_path, O_CREAT|O_RDWR|O_NOFOLLOW|O_CLOEXEC|O_NOCTTY, 0600);
+ if (lock_fd < 0)
+ return -errno;
+
+ r = flock(lock_fd, LOCK_EX|LOCK_NB); /* Try to get a BSD file lock on the UID lock file */
+ if (r < 0) {
+ if (IN_SET(errno, EBUSY, EAGAIN))
+ goto next; /* already in use */
+
+ return -errno;
+ }
+
+ if (fstat(lock_fd, &st) < 0)
+ return -errno;
+ if (st.st_nlink > 0)
+ break;
+
+ /* Oh, bummer, we got the lock, but the file was unlinked between the time we opened it and
+ * got the lock. Close it, and try again. */
+ lock_fd = safe_close(lock_fd);
+ }
+
+ /* Some superficial check whether this UID/GID might already be taken by some static user */
+ if (getpwuid(candidate) ||
+ getgrgid((gid_t) candidate) ||
+ search_ipc(candidate, (gid_t) candidate) != 0) {
+ (void) unlink(lock_path);
+ continue;
+ }
+
+ /* Let's store the user name in the lock file, so that we can use it for looking up the username for a UID */
+ l = pwritev(lock_fd,
+ (struct iovec[2]) {
+ IOVEC_INIT_STRING(name),
+ IOVEC_INIT((char[1]) { '\n' }, 1),
+ }, 2, 0);
+ if (l < 0) {
+ r = -errno;
+ (void) unlink(lock_path);
+ return r;
+ }
+
+ (void) ftruncate(lock_fd, l);
+ (void) make_uid_symlinks(candidate, name, true); /* also add direct lookup symlinks */
+
+ *ret_uid = candidate;
+ return TAKE_FD(lock_fd);
+
+ next:
+ ;
+ }
+}
+
+static int dynamic_user_pop(DynamicUser *d, uid_t *ret_uid, int *ret_lock_fd) {
+ uid_t uid = UID_INVALID;
+ struct iovec iov = IOVEC_INIT(&uid, sizeof(uid));
+ int lock_fd;
+ ssize_t k;
+
+ assert(d);
+ assert(ret_uid);
+ assert(ret_lock_fd);
+
+ /* Read the UID and lock fd that is stored in the storage AF_UNIX socket. This should be called with
+ * the lock on the socket taken. */
+
+ k = receive_one_fd_iov(d->storage_socket[0], &iov, 1, MSG_DONTWAIT, &lock_fd);
+ if (k < 0)
+ return (int) k;
+
+ *ret_uid = uid;
+ *ret_lock_fd = lock_fd;
+
+ return 0;
+}
+
+static int dynamic_user_push(DynamicUser *d, uid_t uid, int lock_fd) {
+ struct iovec iov = IOVEC_INIT(&uid, sizeof(uid));
+
+ assert(d);
+
+ /* Store the UID and lock_fd in the storage socket. This should be called with the socket pair lock taken. */
+ return send_one_fd_iov(d->storage_socket[1], lock_fd, &iov, 1, MSG_DONTWAIT);
+}
+
+static void unlink_uid_lock(int lock_fd, uid_t uid, const char *name) {
+ char lock_path[STRLEN("/run/systemd/dynamic-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
+
+ if (lock_fd < 0)
+ return;
+
+ xsprintf(lock_path, "/run/systemd/dynamic-uid/" UID_FMT, uid);
+ (void) unlink(lock_path);
+
+ (void) make_uid_symlinks(uid, name, false); /* remove direct lookup symlinks */
+}
+
+static int lockfp(int fd, int *fd_lock) {
+ if (lockf(fd, F_LOCK, 0) < 0)
+ return -errno;
+ *fd_lock = fd;
+ return 0;
+}
+
+static void unlockfp(int *fd_lock) {
+ if (*fd_lock < 0)
+ return;
+ lockf(*fd_lock, F_ULOCK, 0);
+ *fd_lock = -1;
+}
+
+static int dynamic_user_realize(
+ DynamicUser *d,
+ char **suggested_dirs,
+ uid_t *ret_uid, gid_t *ret_gid,
+ bool is_user) {
+
+ _cleanup_(unlockfp) int storage_socket0_lock = -1;
+ _cleanup_close_ int uid_lock_fd = -1;
+ _cleanup_close_ int etc_passwd_lock_fd = -1;
+ uid_t num = UID_INVALID; /* a uid if is_user, and a gid otherwise */
+ gid_t gid = GID_INVALID; /* a gid if is_user, ignored otherwise */
+ bool flush_cache = false;
+ int r;
+
+ assert(d);
+ assert(is_user == !!ret_uid);
+ assert(ret_gid);
+
+ /* Acquire a UID for the user name. This will allocate a UID for the user name if the user doesn't exist
+ * yet. If it already exists its existing UID/GID will be reused. */
+
+ r = lockfp(d->storage_socket[0], &storage_socket0_lock);
+ if (r < 0)
+ return r;
+
+ r = dynamic_user_pop(d, &num, &uid_lock_fd);
+ if (r < 0) {
+ int new_uid_lock_fd;
+ uid_t new_uid;
+
+ if (r != -EAGAIN)
+ return r;
+
+ /* OK, nothing stored yet, let's try to find something useful. While we are working on this release the
+ * lock however, so that nobody else blocks on our NSS lookups. */
+ unlockfp(&storage_socket0_lock);
+
+ /* Let's see if a proper, static user or group by this name exists. Try to take the lock on
+ * /etc/passwd, if that fails with EROFS then /etc is read-only. In that case it's fine if we don't
+ * take the lock, given that users can't be added there anyway in this case. */
+ etc_passwd_lock_fd = take_etc_passwd_lock(NULL);
+ if (etc_passwd_lock_fd < 0 && etc_passwd_lock_fd != -EROFS)
+ return etc_passwd_lock_fd;
+
+ /* First, let's parse this as numeric UID */
+ r = parse_uid(d->name, &num);
+ if (r < 0) {
+ struct passwd *p;
+ struct group *g;
+
+ if (is_user) {
+ /* OK, this is not a numeric UID. Let's see if there's a user by this name */
+ p = getpwnam(d->name);
+ if (p) {
+ num = p->pw_uid;
+ gid = p->pw_gid;
+ } else {
+ /* if the user does not exist but the group with the same name exists, refuse operation */
+ g = getgrnam(d->name);
+ if (g)
+ return -EILSEQ;
+ }
+ } else {
+ /* Let's see if there's a group by this name */
+ g = getgrnam(d->name);
+ if (g)
+ num = (uid_t) g->gr_gid;
+ else {
+ /* if the group does not exist but the user with the same name exists, refuse operation */
+ p = getpwnam(d->name);
+ if (p)
+ return -EILSEQ;
+ }
+ }
+ }
+
+ if (num == UID_INVALID) {
+ /* No static UID assigned yet, excellent. Let's pick a new dynamic one, and lock it. */
+
+ uid_lock_fd = pick_uid(suggested_dirs, d->name, &num);
+ if (uid_lock_fd < 0)
+ return uid_lock_fd;
+ }
+
+ /* So, we found a working UID/lock combination. Let's see if we actually still need it. */
+ r = lockfp(d->storage_socket[0], &storage_socket0_lock);
+ if (r < 0) {
+ unlink_uid_lock(uid_lock_fd, num, d->name);
+ return r;
+ }
+
+ r = dynamic_user_pop(d, &new_uid, &new_uid_lock_fd);
+ if (r < 0) {
+ if (r != -EAGAIN) {
+ /* OK, something bad happened, let's get rid of the bits we acquired. */
+ unlink_uid_lock(uid_lock_fd, num, d->name);
+ return r;
+ }
+
+ /* Great! Nothing is stored here, still. Store our newly acquired data. */
+ flush_cache = true;
+ } else {
+ /* Hmm, so as it appears there's now something stored in the storage socket. Throw away what we
+ * acquired, and use what's stored now. */
+
+ unlink_uid_lock(uid_lock_fd, num, d->name);
+ safe_close(uid_lock_fd);
+
+ num = new_uid;
+ uid_lock_fd = new_uid_lock_fd;
+ }
+ } else if (is_user && !uid_is_dynamic(num)) {
+ struct passwd *p;
+
+ /* Statically allocated user may have different uid and gid. So, let's obtain the gid. */
+ errno = 0;
+ p = getpwuid(num);
+ if (!p)
+ return errno_or_else(ESRCH);
+
+ gid = p->pw_gid;
+ }
+
+ /* If the UID/GID was already allocated dynamically, push the data we popped out back in. If it was already
+ * allocated statically, push the UID back too, but do not push the lock fd in. If we allocated the UID
+ * dynamically right here, push that in along with the lock fd for it. */
+ r = dynamic_user_push(d, num, uid_lock_fd);
+ if (r < 0)
+ return r;
+
+ if (flush_cache) {
+ /* If we allocated a new dynamic UID, refresh nscd, so that it forgets about potentially cached
+ * negative entries. But let's do so after we release the /etc/passwd lock, so that there's no
+ * potential for nscd wanting to lock that for completing the invalidation. */
+ etc_passwd_lock_fd = safe_close(etc_passwd_lock_fd);
+ (void) nscd_flush_cache(STRV_MAKE("passwd", "group"));
+ }
+
+ if (is_user) {
+ *ret_uid = num;
+ *ret_gid = gid != GID_INVALID ? gid : num;
+ } else
+ *ret_gid = num;
+
+ return 0;
+}
+
+int dynamic_user_current(DynamicUser *d, uid_t *ret) {
+ _cleanup_(unlockfp) int storage_socket0_lock = -1;
+ _cleanup_close_ int lock_fd = -1;
+ uid_t uid;
+ int r;
+
+ assert(d);
+
+ /* Get the currently assigned UID for the user, if there's any. This simply pops the data from the
+ * storage socket, and pushes it back in right-away. */
+
+ r = lockfp(d->storage_socket[0], &storage_socket0_lock);
+ if (r < 0)
+ return r;
+
+ r = dynamic_user_pop(d, &uid, &lock_fd);
+ if (r < 0)
+ return r;
+
+ r = dynamic_user_push(d, uid, lock_fd);
+ if (r < 0)
+ return r;
+
+ if (ret)
+ *ret = uid;
+
+ return 0;
+}
+
+static DynamicUser* dynamic_user_unref(DynamicUser *d) {
+ if (!d)
+ return NULL;
+
+ /* Note that this doesn't actually release any resources itself. If a dynamic user should be fully
+ * destroyed and its UID released, use dynamic_user_destroy() instead. NB: the dynamic user table may
+ * contain entries with no references, which is commonly the case right before a daemon reload. */
+
+ assert(d->n_ref > 0);
+ d->n_ref--;
+
+ return NULL;
+}
+
+static int dynamic_user_close(DynamicUser *d) {
+ _cleanup_(unlockfp) int storage_socket0_lock = -1;
+ _cleanup_close_ int lock_fd = -1;
+ uid_t uid;
+ int r;
+
+ /* Release the user ID, by releasing the lock on it, and emptying the storage socket. After this the
+ * user is unrealized again, much like it was after it the DynamicUser object was first allocated. */
+
+ r = lockfp(d->storage_socket[0], &storage_socket0_lock);
+ if (r < 0)
+ return r;
+
+ r = dynamic_user_pop(d, &uid, &lock_fd);
+ if (r == -EAGAIN)
+ /* User wasn't realized yet, nothing to do. */
+ return 0;
+ if (r < 0)
+ return r;
+
+ /* This dynamic user was realized and dynamically allocated. In this case, let's remove the lock file. */
+ unlink_uid_lock(lock_fd, uid, d->name);
+
+ (void) nscd_flush_cache(STRV_MAKE("passwd", "group"));
+ return 1;
+}
+
+static DynamicUser* dynamic_user_destroy(DynamicUser *d) {
+ if (!d)
+ return NULL;
+
+ /* Drop a reference to a DynamicUser object, and destroy the user completely if this was the last
+ * reference. This is called whenever a service is shut down and wants its dynamic UID gone. Note that
+ * dynamic_user_unref() is what is called whenever a service is simply freed, for example during a reload
+ * cycle, where the dynamic users should not be destroyed, but our datastructures should. */
+
+ dynamic_user_unref(d);
+
+ if (d->n_ref > 0)
+ return NULL;
+
+ (void) dynamic_user_close(d);
+ return dynamic_user_free(d);
+}
+
+int dynamic_user_serialize(Manager *m, FILE *f, FDSet *fds) {
+ DynamicUser *d;
+
+ assert(m);
+ assert(f);
+ assert(fds);
+
+ /* Dump the dynamic user database into the manager serialization, to deal with daemon reloads. */
+
+ HASHMAP_FOREACH(d, m->dynamic_users) {
+ int copy0, copy1;
+
+ copy0 = fdset_put_dup(fds, d->storage_socket[0]);
+ if (copy0 < 0)
+ return log_error_errno(copy0, "Failed to add dynamic user storage fd to serialization: %m");
+
+ copy1 = fdset_put_dup(fds, d->storage_socket[1]);
+ if (copy1 < 0)
+ return log_error_errno(copy1, "Failed to add dynamic user storage fd to serialization: %m");
+
+ (void) serialize_item_format(f, "dynamic-user", "%s %i %i", d->name, copy0, copy1);
+ }
+
+ return 0;
+}
+
+void dynamic_user_deserialize_one(Manager *m, const char *value, FDSet *fds) {
+ _cleanup_free_ char *name = NULL, *s0 = NULL, *s1 = NULL;
+ int r, fd0, fd1;
+
+ assert(m);
+ assert(value);
+ assert(fds);
+
+ /* Parse the serialization again, after a daemon reload */
+
+ r = extract_many_words(&value, NULL, 0, &name, &s0, &s1, NULL);
+ if (r != 3 || !isempty(value)) {
+ log_debug("Unable to parse dynamic user line.");
+ return;
+ }
+
+ if (safe_atoi(s0, &fd0) < 0 || !fdset_contains(fds, fd0)) {
+ log_debug("Unable to process dynamic user fd specification.");
+ return;
+ }
+
+ if (safe_atoi(s1, &fd1) < 0 || !fdset_contains(fds, fd1)) {
+ log_debug("Unable to process dynamic user fd specification.");
+ return;
+ }
+
+ r = dynamic_user_add(m, name, (int[]) { fd0, fd1 }, NULL);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add dynamic user: %m");
+ return;
+ }
+
+ (void) fdset_remove(fds, fd0);
+ (void) fdset_remove(fds, fd1);
+}
+
+void dynamic_user_vacuum(Manager *m, bool close_user) {
+ DynamicUser *d;
+
+ assert(m);
+
+ /* Empty the dynamic user database, optionally cleaning up orphaned dynamic users, i.e. destroy and free users
+ * to which no reference exist. This is called after a daemon reload finished, in order to destroy users which
+ * might not be referenced anymore. */
+
+ HASHMAP_FOREACH(d, m->dynamic_users) {
+ if (d->n_ref > 0)
+ continue;
+
+ if (close_user) {
+ log_debug("Removing orphaned dynamic user %s", d->name);
+ (void) dynamic_user_close(d);
+ }
+
+ dynamic_user_free(d);
+ }
+}
+
+int dynamic_user_lookup_uid(Manager *m, uid_t uid, char **ret) {
+ char lock_path[STRLEN("/run/systemd/dynamic-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
+ _cleanup_free_ char *user = NULL;
+ uid_t check_uid;
+ int r;
+
+ assert(m);
+ assert(ret);
+
+ /* A friendly way to translate a dynamic user's UID into a name. */
+ if (!uid_is_dynamic(uid))
+ return -ESRCH;
+
+ xsprintf(lock_path, "/run/systemd/dynamic-uid/" UID_FMT, uid);
+ r = read_one_line_file(lock_path, &user);
+ if (IN_SET(r, -ENOENT, 0))
+ return -ESRCH;
+ if (r < 0)
+ return r;
+
+ /* The lock file might be stale, hence let's verify the data before we return it */
+ r = dynamic_user_lookup_name(m, user, &check_uid);
+ if (r < 0)
+ return r;
+ if (check_uid != uid) /* lock file doesn't match our own idea */
+ return -ESRCH;
+
+ *ret = TAKE_PTR(user);
+
+ return 0;
+}
+
+int dynamic_user_lookup_name(Manager *m, const char *name, uid_t *ret) {
+ DynamicUser *d;
+ int r;
+
+ assert(m);
+ assert(name);
+
+ /* A friendly call for translating a dynamic user's name into its UID */
+
+ d = hashmap_get(m->dynamic_users, name);
+ if (!d)
+ return -ESRCH;
+
+ r = dynamic_user_current(d, ret);
+ if (r == -EAGAIN) /* not realized yet? */
+ return -ESRCH;
+
+ return r;
+}
+
+int dynamic_creds_acquire(DynamicCreds *creds, Manager *m, const char *user, const char *group) {
+ bool acquired = false;
+ int r;
+
+ assert(creds);
+ assert(m);
+
+ /* A DynamicUser object encapsulates an allocation of both a UID and a GID for a specific name. However, some
+ * services use different user and groups. For cases like that there's DynamicCreds containing a pair of user
+ * and group. This call allocates a pair. */
+
+ if (!creds->user && user) {
+ r = dynamic_user_acquire(m, user, &creds->user);
+ if (r < 0)
+ return r;
+
+ acquired = true;
+ }
+
+ if (!creds->group) {
+
+ if (creds->user && (!group || streq_ptr(user, group)))
+ creds->group = dynamic_user_ref(creds->user);
+ else if (group) {
+ r = dynamic_user_acquire(m, group, &creds->group);
+ if (r < 0) {
+ if (acquired)
+ creds->user = dynamic_user_unref(creds->user);
+ return r;
+ }
+ }
+ }
+
+ return 0;
+}
+
+int dynamic_creds_realize(DynamicCreds *creds, char **suggested_paths, uid_t *uid, gid_t *gid) {
+ uid_t u = UID_INVALID;
+ gid_t g = GID_INVALID;
+ int r;
+
+ assert(creds);
+ assert(uid);
+ assert(gid);
+
+ /* Realize both the referenced user and group */
+
+ if (creds->user) {
+ r = dynamic_user_realize(creds->user, suggested_paths, &u, &g, true);
+ if (r < 0)
+ return r;
+ }
+
+ if (creds->group && creds->group != creds->user) {
+ r = dynamic_user_realize(creds->group, suggested_paths, NULL, &g, false);
+ if (r < 0)
+ return r;
+ }
+
+ *uid = u;
+ *gid = g;
+ return 0;
+}
+
+void dynamic_creds_unref(DynamicCreds *creds) {
+ assert(creds);
+
+ creds->user = dynamic_user_unref(creds->user);
+ creds->group = dynamic_user_unref(creds->group);
+}
+
+void dynamic_creds_destroy(DynamicCreds *creds) {
+ assert(creds);
+
+ creds->user = dynamic_user_destroy(creds->user);
+ creds->group = dynamic_user_destroy(creds->group);
+}
diff --git a/src/core/dynamic-user.h b/src/core/dynamic-user.h
new file mode 100644
index 0000000..847ef47
--- /dev/null
+++ b/src/core/dynamic-user.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct DynamicUser DynamicUser;
+
+typedef struct DynamicCreds {
+ /* A combination of a dynamic user and group */
+ DynamicUser *user;
+ DynamicUser *group;
+} DynamicCreds;
+
+#include "manager.h"
+
+/* Note that this object always allocates a pair of user and group under the same name, even if one of them isn't
+ * used. This means, if you want to allocate a group and user pair, and they might have two different names, then you
+ * need to allocated two of these objects. DynamicCreds below makes that easy. */
+struct DynamicUser {
+ Manager *manager;
+ unsigned n_ref;
+
+ /* An AF_UNIX socket pair that contains a datagram containing both the numeric ID assigned, as well as a lock
+ * file fd locking the user ID we picked. */
+ int storage_socket[2];
+
+ char name[];
+};
+
+int dynamic_user_serialize(Manager *m, FILE *f, FDSet *fds);
+void dynamic_user_deserialize_one(Manager *m, const char *value, FDSet *fds);
+void dynamic_user_vacuum(Manager *m, bool close_user);
+
+int dynamic_user_current(DynamicUser *d, uid_t *ret);
+int dynamic_user_lookup_uid(Manager *m, uid_t uid, char **ret);
+int dynamic_user_lookup_name(Manager *m, const char *name, uid_t *ret);
+
+int dynamic_creds_acquire(DynamicCreds *creds, Manager *m, const char *user, const char *group);
+int dynamic_creds_realize(DynamicCreds *creds, char **suggested_paths, uid_t *uid, gid_t *gid);
+
+void dynamic_creds_unref(DynamicCreds *creds);
+void dynamic_creds_destroy(DynamicCreds *creds);
diff --git a/src/core/efi-random.c b/src/core/efi-random.c
new file mode 100644
index 0000000..4086b12
--- /dev/null
+++ b/src/core/efi-random.c
@@ -0,0 +1,90 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fcntl.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "chattr-util.h"
+#include "efi-random.h"
+#include "efivars.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "random-util.h"
+#include "strv.h"
+
+/* If a random seed was passed by the boot loader in the LoaderRandomSeed EFI variable, let's credit it to
+ * the kernel's random pool, but only once per boot. If this is run very early during initialization we can
+ * instantly boot up with a filled random pool.
+ *
+ * This makes no judgement on the entropy passed, it's the job of the boot loader to only pass us a seed that
+ * is suitably validated. */
+
+static void lock_down_efi_variables(void) {
+ int r;
+
+ /* Paranoia: let's restrict access modes of these a bit, so that unprivileged users can't use them to
+ * identify the system or gain too much insight into what we might have credited to the entropy
+ * pool. */
+ FOREACH_STRING(path,
+ EFIVAR_PATH(EFI_LOADER_VARIABLE(LoaderRandomSeed)),
+ EFIVAR_PATH(EFI_LOADER_VARIABLE(LoaderSystemToken))) {
+
+ r = chattr_path(path, 0, FS_IMMUTABLE_FL, NULL);
+ if (r == -ENOENT)
+ continue;
+ if (r < 0)
+ log_warning_errno(r, "Failed to drop FS_IMMUTABLE_FL from %s, ignoring: %m", path);
+
+ if (chmod(path, 0600) < 0)
+ log_warning_errno(errno, "Failed to reduce access mode of %s, ignoring: %m", path);
+ }
+}
+
+int efi_take_random_seed(void) {
+ _cleanup_free_ void *value = NULL;
+ size_t size;
+ int r;
+
+ /* Paranoia comes first. */
+ lock_down_efi_variables();
+
+ if (access("/run/systemd/efi-random-seed-taken", F_OK) < 0) {
+ if (errno != ENOENT) {
+ log_warning_errno(errno, "Failed to determine whether we already used the random seed token, not using it.");
+ return 0;
+ }
+
+ /* ENOENT means we haven't used it yet. */
+ } else {
+ log_debug("EFI random seed already used, not using again.");
+ return 0;
+ }
+
+ r = efi_get_variable(EFI_LOADER_VARIABLE(LoaderRandomSeed), NULL, &value, &size);
+ if (r == -EOPNOTSUPP) {
+ log_debug_errno(r, "System lacks EFI support, not initializing random seed from EFI variable.");
+ return 0;
+ }
+ if (r == -ENOENT) {
+ log_debug_errno(r, "Boot loader did not pass LoaderRandomSeed EFI variable, not crediting any entropy.");
+ return 0;
+ }
+ if (r < 0)
+ return log_warning_errno(r, "Failed to read LoaderRandomSeed EFI variable, ignoring: %m");
+
+ if (size == 0)
+ return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), "Random seed passed from boot loader has zero size? Ignoring.");
+
+ /* Before we use the seed, let's mark it as used, so that we never credit it twice. Also, it's a nice
+ * way to let users known that we successfully acquired entropy from the boot loader. */
+ r = touch("/run/systemd/efi-random-seed-taken");
+ if (r < 0)
+ return log_warning_errno(r, "Unable to mark EFI random seed as used, not using it: %m");
+
+ r = random_write_entropy(-1, value, size, true);
+ if (r < 0)
+ return log_warning_errno(errno, "Failed to credit entropy, ignoring: %m");
+
+ log_info("Successfully credited entropy passed from boot loader.");
+ return 1;
+}
diff --git a/src/core/efi-random.h b/src/core/efi-random.h
new file mode 100644
index 0000000..7d20fff
--- /dev/null
+++ b/src/core/efi-random.h
@@ -0,0 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+int efi_take_random_seed(void);
diff --git a/src/core/emergency-action.c b/src/core/emergency-action.c
new file mode 100644
index 0000000..0234772
--- /dev/null
+++ b/src/core/emergency-action.c
@@ -0,0 +1,171 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/reboot.h>
+
+#include "bus-error.h"
+#include "bus-util.h"
+#include "emergency-action.h"
+#include "raw-reboot.h"
+#include "reboot-util.h"
+#include "special.h"
+#include "string-table.h"
+#include "terminal-util.h"
+#include "virt.h"
+
+static const char* const emergency_action_table[_EMERGENCY_ACTION_MAX] = {
+ [EMERGENCY_ACTION_NONE] = "none",
+ [EMERGENCY_ACTION_REBOOT] = "reboot",
+ [EMERGENCY_ACTION_REBOOT_FORCE] = "reboot-force",
+ [EMERGENCY_ACTION_REBOOT_IMMEDIATE] = "reboot-immediate",
+ [EMERGENCY_ACTION_POWEROFF] = "poweroff",
+ [EMERGENCY_ACTION_POWEROFF_FORCE] = "poweroff-force",
+ [EMERGENCY_ACTION_POWEROFF_IMMEDIATE] = "poweroff-immediate",
+ [EMERGENCY_ACTION_EXIT] = "exit",
+ [EMERGENCY_ACTION_EXIT_FORCE] = "exit-force",
+};
+
+static void log_and_status(Manager *m, bool warn, const char *message, const char *reason) {
+ log_full(warn ? LOG_WARNING : LOG_DEBUG, "%s: %s", message, reason);
+ if (warn)
+ manager_status_printf(m, STATUS_TYPE_EMERGENCY,
+ ANSI_HIGHLIGHT_RED " !! " ANSI_NORMAL,
+ "%s: %s", message, reason);
+}
+
+void emergency_action(
+ Manager *m,
+ EmergencyAction action,
+ EmergencyActionFlags options,
+ const char *reboot_arg,
+ int exit_status,
+ const char *reason) {
+
+ Unit *u;
+
+ assert(m);
+ assert(action >= 0);
+ assert(action < _EMERGENCY_ACTION_MAX);
+
+ /* Is the special shutdown target active or queued? If so, we are in shutdown state */
+ if (IN_SET(action, EMERGENCY_ACTION_REBOOT, EMERGENCY_ACTION_POWEROFF, EMERGENCY_ACTION_EXIT)) {
+ u = manager_get_unit(m, SPECIAL_SHUTDOWN_TARGET);
+ if (u && unit_active_or_pending(u)) {
+ log_notice("Shutdown is already active. Skipping emergency action request %s.",
+ emergency_action_table[action]);
+ return;
+ }
+ }
+
+ if (action == EMERGENCY_ACTION_NONE)
+ return;
+
+ if (FLAGS_SET(options, EMERGENCY_ACTION_IS_WATCHDOG) && !m->service_watchdogs) {
+ log_warning("Watchdog disabled! Not acting on: %s", reason);
+ return;
+ }
+
+ bool warn = FLAGS_SET(options, EMERGENCY_ACTION_WARN);
+
+ switch (action) {
+
+ case EMERGENCY_ACTION_REBOOT:
+ log_and_status(m, warn, "Rebooting", reason);
+
+ (void) update_reboot_parameter_and_warn(reboot_arg, true);
+ (void) manager_add_job_by_name_and_warn(m, JOB_START, SPECIAL_REBOOT_TARGET, JOB_REPLACE_IRREVERSIBLY, NULL, NULL);
+ break;
+
+ case EMERGENCY_ACTION_REBOOT_FORCE:
+ log_and_status(m, warn, "Forcibly rebooting", reason);
+
+ (void) update_reboot_parameter_and_warn(reboot_arg, true);
+ m->objective = MANAGER_REBOOT;
+
+ break;
+
+ case EMERGENCY_ACTION_REBOOT_IMMEDIATE:
+ log_and_status(m, warn, "Rebooting immediately", reason);
+
+ sync();
+
+ if (!isempty(reboot_arg)) {
+ log_info("Rebooting with argument '%s'.", reboot_arg);
+ (void) raw_reboot(LINUX_REBOOT_CMD_RESTART2, reboot_arg);
+ log_warning_errno(errno, "Failed to reboot with parameter, retrying without: %m");
+ }
+
+ log_info("Rebooting.");
+ (void) reboot(RB_AUTOBOOT);
+ break;
+
+ case EMERGENCY_ACTION_EXIT:
+
+ if (exit_status >= 0)
+ m->return_value = exit_status;
+
+ if (MANAGER_IS_USER(m) || detect_container() > 0) {
+ log_and_status(m, warn, "Exiting", reason);
+ (void) manager_add_job_by_name_and_warn(m, JOB_START, SPECIAL_EXIT_TARGET, JOB_REPLACE_IRREVERSIBLY, NULL, NULL);
+ break;
+ }
+
+ log_notice("Doing \"poweroff\" action instead of an \"exit\" emergency action.");
+ _fallthrough_;
+
+ case EMERGENCY_ACTION_POWEROFF:
+ log_and_status(m, warn, "Powering off", reason);
+ (void) manager_add_job_by_name_and_warn(m, JOB_START, SPECIAL_POWEROFF_TARGET, JOB_REPLACE_IRREVERSIBLY, NULL, NULL);
+ break;
+
+ case EMERGENCY_ACTION_EXIT_FORCE:
+
+ if (exit_status >= 0)
+ m->return_value = exit_status;
+
+ if (MANAGER_IS_USER(m) || detect_container() > 0) {
+ log_and_status(m, warn, "Exiting immediately", reason);
+ m->objective = MANAGER_EXIT;
+ break;
+ }
+
+ log_notice("Doing \"poweroff-force\" action instead of an \"exit-force\" emergency action.");
+ _fallthrough_;
+
+ case EMERGENCY_ACTION_POWEROFF_FORCE:
+ log_and_status(m, warn, "Forcibly powering off", reason);
+ m->objective = MANAGER_POWEROFF;
+ break;
+
+ case EMERGENCY_ACTION_POWEROFF_IMMEDIATE:
+ log_and_status(m, warn, "Powering off immediately", reason);
+
+ sync();
+
+ log_info("Powering off.");
+ (void) reboot(RB_POWER_OFF);
+ break;
+
+ default:
+ assert_not_reached();
+ }
+}
+
+DEFINE_STRING_TABLE_LOOKUP(emergency_action, EmergencyAction);
+
+int parse_emergency_action(
+ const char *value,
+ bool system,
+ EmergencyAction *ret) {
+
+ EmergencyAction x;
+
+ x = emergency_action_from_string(value);
+ if (x < 0)
+ return -EINVAL;
+
+ if (!system && x != EMERGENCY_ACTION_NONE && x < _EMERGENCY_ACTION_FIRST_USER_ACTION)
+ return -EOPNOTSUPP;
+
+ *ret = x;
+ return 0;
+}
diff --git a/src/core/emergency-action.h b/src/core/emergency-action.h
new file mode 100644
index 0000000..4d7e755
--- /dev/null
+++ b/src/core/emergency-action.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <errno.h>
+
+typedef enum EmergencyAction {
+ EMERGENCY_ACTION_NONE,
+ EMERGENCY_ACTION_REBOOT,
+ EMERGENCY_ACTION_REBOOT_FORCE,
+ EMERGENCY_ACTION_REBOOT_IMMEDIATE,
+ EMERGENCY_ACTION_POWEROFF,
+ EMERGENCY_ACTION_POWEROFF_FORCE,
+ EMERGENCY_ACTION_POWEROFF_IMMEDIATE,
+ EMERGENCY_ACTION_EXIT,
+ _EMERGENCY_ACTION_FIRST_USER_ACTION = EMERGENCY_ACTION_EXIT,
+ EMERGENCY_ACTION_EXIT_FORCE,
+ _EMERGENCY_ACTION_MAX,
+ _EMERGENCY_ACTION_INVALID = -EINVAL,
+} EmergencyAction;
+
+typedef enum EmergencyActionFlags {
+ EMERGENCY_ACTION_IS_WATCHDOG = 1 << 0,
+ EMERGENCY_ACTION_WARN = 1 << 1,
+} EmergencyActionFlags;
+
+#include "macro.h"
+#include "manager.h"
+
+void emergency_action(Manager *m,
+ EmergencyAction action, EmergencyActionFlags options,
+ const char *reboot_arg, int exit_status, const char *reason);
+
+const char* emergency_action_to_string(EmergencyAction i) _const_;
+EmergencyAction emergency_action_from_string(const char *s) _pure_;
+
+int parse_emergency_action(const char *value, bool system, EmergencyAction *ret);
diff --git a/src/core/execute.c b/src/core/execute.c
new file mode 100644
index 0000000..9715d02
--- /dev/null
+++ b/src/core/execute.c
@@ -0,0 +1,7278 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <sys/eventfd.h>
+#include <sys/file.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <sys/personality.h>
+#include <sys/prctl.h>
+#include <sys/shm.h>
+#include <sys/types.h>
+#include <sys/un.h>
+#include <unistd.h>
+#include <utmpx.h>
+
+#if HAVE_PAM
+#include <security/pam_appl.h>
+#endif
+
+#if HAVE_SELINUX
+#include <selinux/selinux.h>
+#endif
+
+#if HAVE_SECCOMP
+#include <seccomp.h>
+#endif
+
+#if HAVE_APPARMOR
+#include <sys/apparmor.h>
+#endif
+
+#include "sd-messages.h"
+
+#include "acl-util.h"
+#include "af-list.h"
+#include "alloc-util.h"
+#if HAVE_APPARMOR
+#include "apparmor-util.h"
+#endif
+#include "async.h"
+#include "barrier.h"
+#include "bpf-lsm.h"
+#include "cap-list.h"
+#include "capability-util.h"
+#include "cgroup-setup.h"
+#include "chase-symlinks.h"
+#include "chown-recursive.h"
+#include "cpu-set-util.h"
+#include "creds-util.h"
+#include "data-fd-util.h"
+#include "def.h"
+#include "env-file.h"
+#include "env-util.h"
+#include "errno-list.h"
+#include "escape.h"
+#include "execute.h"
+#include "exit-status.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "glob-util.h"
+#include "hexdecoct.h"
+#include "io-util.h"
+#include "ioprio-util.h"
+#include "label.h"
+#include "log.h"
+#include "macro.h"
+#include "manager.h"
+#include "manager-dump.h"
+#include "memory-util.h"
+#include "missing_fs.h"
+#include "missing_ioprio.h"
+#include "mkdir-label.h"
+#include "mount-util.h"
+#include "mountpoint-util.h"
+#include "namespace.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "random-util.h"
+#include "recurse-dir.h"
+#include "rlimit-util.h"
+#include "rm-rf.h"
+#if HAVE_SECCOMP
+#include "seccomp-util.h"
+#endif
+#include "securebits-util.h"
+#include "selinux-util.h"
+#include "signal-util.h"
+#include "smack-util.h"
+#include "socket-util.h"
+#include "sort-util.h"
+#include "special.h"
+#include "stat-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "syslog-util.h"
+#include "terminal-util.h"
+#include "tmpfile-util.h"
+#include "umask-util.h"
+#include "unit-serialize.h"
+#include "user-util.h"
+#include "utmp-wtmp.h"
+
+#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
+#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
+
+#define SNDBUF_SIZE (8*1024*1024)
+
+static int shift_fds(int fds[], size_t n_fds) {
+ if (n_fds <= 0)
+ return 0;
+
+ /* Modifies the fds array! (sorts it) */
+
+ assert(fds);
+
+ for (int start = 0;;) {
+ int restart_from = -1;
+
+ for (int i = start; i < (int) n_fds; i++) {
+ int nfd;
+
+ /* Already at right index? */
+ if (fds[i] == i+3)
+ continue;
+
+ nfd = fcntl(fds[i], F_DUPFD, i + 3);
+ if (nfd < 0)
+ return -errno;
+
+ safe_close(fds[i]);
+ fds[i] = nfd;
+
+ /* Hmm, the fd we wanted isn't free? Then
+ * let's remember that and try again from here */
+ if (nfd != i+3 && restart_from < 0)
+ restart_from = i;
+ }
+
+ if (restart_from < 0)
+ break;
+
+ start = restart_from;
+ }
+
+ return 0;
+}
+
+static int flag_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
+ size_t n_fds;
+ int r;
+
+ n_fds = n_socket_fds + n_storage_fds;
+ if (n_fds <= 0)
+ return 0;
+
+ assert(fds);
+ assert(fds || n_fds == 0);
+
+ /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
+ * O_NONBLOCK only applies to socket activation though. */
+
+ for (size_t i = 0; i < n_fds; i++) {
+
+ if (i < n_socket_fds) {
+ r = fd_nonblock(fds[i], nonblock);
+ if (r < 0)
+ return r;
+ }
+
+ /* We unconditionally drop FD_CLOEXEC from the fds,
+ * since after all we want to pass these fds to our
+ * children */
+
+ r = fd_cloexec(fds[i], false);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+static const char *exec_context_tty_path(const ExecContext *context) {
+ assert(context);
+
+ if (context->stdio_as_fds)
+ return NULL;
+
+ if (context->tty_path)
+ return context->tty_path;
+
+ return "/dev/console";
+}
+
+static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
+ const char *path;
+
+ assert(context);
+
+ path = exec_context_tty_path(context);
+
+ if (context->tty_vhangup) {
+ if (p && p->stdin_fd >= 0)
+ (void) terminal_vhangup_fd(p->stdin_fd);
+ else if (path)
+ (void) terminal_vhangup(path);
+ }
+
+ if (context->tty_reset) {
+ if (p && p->stdin_fd >= 0)
+ (void) reset_terminal_fd(p->stdin_fd, true);
+ else if (path)
+ (void) reset_terminal(path);
+ }
+
+ if (p && p->stdin_fd >= 0)
+ (void) terminal_set_size_fd(p->stdin_fd, path, context->tty_rows, context->tty_cols);
+
+ if (context->tty_vt_disallocate && path)
+ (void) vt_disallocate(path);
+}
+
+static bool is_terminal_input(ExecInput i) {
+ return IN_SET(i,
+ EXEC_INPUT_TTY,
+ EXEC_INPUT_TTY_FORCE,
+ EXEC_INPUT_TTY_FAIL);
+}
+
+static bool is_terminal_output(ExecOutput o) {
+ return IN_SET(o,
+ EXEC_OUTPUT_TTY,
+ EXEC_OUTPUT_KMSG_AND_CONSOLE,
+ EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
+}
+
+static bool is_kmsg_output(ExecOutput o) {
+ return IN_SET(o,
+ EXEC_OUTPUT_KMSG,
+ EXEC_OUTPUT_KMSG_AND_CONSOLE);
+}
+
+static bool exec_context_needs_term(const ExecContext *c) {
+ assert(c);
+
+ /* Return true if the execution context suggests we should set $TERM to something useful. */
+
+ if (is_terminal_input(c->std_input))
+ return true;
+
+ if (is_terminal_output(c->std_output))
+ return true;
+
+ if (is_terminal_output(c->std_error))
+ return true;
+
+ return !!c->tty_path;
+}
+
+static int open_null_as(int flags, int nfd) {
+ int fd;
+
+ assert(nfd >= 0);
+
+ fd = open("/dev/null", flags|O_NOCTTY);
+ if (fd < 0)
+ return -errno;
+
+ return move_fd(fd, nfd, false);
+}
+
+static int connect_journal_socket(
+ int fd,
+ const char *log_namespace,
+ uid_t uid,
+ gid_t gid) {
+
+ uid_t olduid = UID_INVALID;
+ gid_t oldgid = GID_INVALID;
+ const char *j;
+ int r;
+
+ j = log_namespace ?
+ strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
+ "/run/systemd/journal/stdout";
+
+ if (gid_is_valid(gid)) {
+ oldgid = getgid();
+
+ if (setegid(gid) < 0)
+ return -errno;
+ }
+
+ if (uid_is_valid(uid)) {
+ olduid = getuid();
+
+ if (seteuid(uid) < 0) {
+ r = -errno;
+ goto restore_gid;
+ }
+ }
+
+ r = connect_unix_path(fd, AT_FDCWD, j);
+
+ /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
+ an LSM interferes. */
+
+ if (uid_is_valid(uid))
+ (void) seteuid(olduid);
+
+ restore_gid:
+ if (gid_is_valid(gid))
+ (void) setegid(oldgid);
+
+ return r;
+}
+
+static int connect_logger_as(
+ const Unit *unit,
+ const ExecContext *context,
+ const ExecParameters *params,
+ ExecOutput output,
+ const char *ident,
+ int nfd,
+ uid_t uid,
+ gid_t gid) {
+
+ _cleanup_close_ int fd = -1;
+ int r;
+
+ assert(context);
+ assert(params);
+ assert(output < _EXEC_OUTPUT_MAX);
+ assert(ident);
+ assert(nfd >= 0);
+
+ fd = socket(AF_UNIX, SOCK_STREAM, 0);
+ if (fd < 0)
+ return -errno;
+
+ r = connect_journal_socket(fd, context->log_namespace, uid, gid);
+ if (r < 0)
+ return r;
+
+ if (shutdown(fd, SHUT_RD) < 0)
+ return -errno;
+
+ (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
+
+ if (dprintf(fd,
+ "%s\n"
+ "%s\n"
+ "%i\n"
+ "%i\n"
+ "%i\n"
+ "%i\n"
+ "%i\n",
+ context->syslog_identifier ?: ident,
+ params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
+ context->syslog_priority,
+ !!context->syslog_level_prefix,
+ false,
+ is_kmsg_output(output),
+ is_terminal_output(output)) < 0)
+ return -errno;
+
+ return move_fd(TAKE_FD(fd), nfd, false);
+}
+
+static int open_terminal_as(const char *path, int flags, int nfd) {
+ int fd;
+
+ assert(path);
+ assert(nfd >= 0);
+
+ fd = open_terminal(path, flags | O_NOCTTY);
+ if (fd < 0)
+ return fd;
+
+ return move_fd(fd, nfd, false);
+}
+
+static int acquire_path(const char *path, int flags, mode_t mode) {
+ _cleanup_close_ int fd = -1;
+ int r;
+
+ assert(path);
+
+ if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
+ flags |= O_CREAT;
+
+ fd = open(path, flags|O_NOCTTY, mode);
+ if (fd >= 0)
+ return TAKE_FD(fd);
+
+ if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
+ return -errno;
+
+ /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
+
+ fd = socket(AF_UNIX, SOCK_STREAM, 0);
+ if (fd < 0)
+ return -errno;
+
+ r = connect_unix_path(fd, AT_FDCWD, path);
+ if (IN_SET(r, -ENOTSOCK, -EINVAL))
+ /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
+ * wasn't an AF_UNIX socket after all */
+ return -ENXIO;
+ if (r < 0)
+ return r;
+
+ if ((flags & O_ACCMODE) == O_RDONLY)
+ r = shutdown(fd, SHUT_WR);
+ else if ((flags & O_ACCMODE) == O_WRONLY)
+ r = shutdown(fd, SHUT_RD);
+ else
+ r = 0;
+ if (r < 0)
+ return -errno;
+
+ return TAKE_FD(fd);
+}
+
+static int fixup_input(
+ const ExecContext *context,
+ int socket_fd,
+ bool apply_tty_stdin) {
+
+ ExecInput std_input;
+
+ assert(context);
+
+ std_input = context->std_input;
+
+ if (is_terminal_input(std_input) && !apply_tty_stdin)
+ return EXEC_INPUT_NULL;
+
+ if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
+ return EXEC_INPUT_NULL;
+
+ if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
+ return EXEC_INPUT_NULL;
+
+ return std_input;
+}
+
+static int fixup_output(ExecOutput output, int socket_fd) {
+
+ if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
+ return EXEC_OUTPUT_INHERIT;
+
+ return output;
+}
+
+static int setup_input(
+ const ExecContext *context,
+ const ExecParameters *params,
+ int socket_fd,
+ const int named_iofds[static 3]) {
+
+ ExecInput i;
+ int r;
+
+ assert(context);
+ assert(params);
+ assert(named_iofds);
+
+ if (params->stdin_fd >= 0) {
+ if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
+ return -errno;
+
+ /* Try to make this the controlling tty, if it is a tty, and reset it */
+ if (isatty(STDIN_FILENO)) {
+ (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
+ (void) reset_terminal_fd(STDIN_FILENO, true);
+ (void) terminal_set_size_fd(STDIN_FILENO, NULL, context->tty_rows, context->tty_cols);
+ }
+
+ return STDIN_FILENO;
+ }
+
+ i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
+
+ switch (i) {
+
+ case EXEC_INPUT_NULL:
+ return open_null_as(O_RDONLY, STDIN_FILENO);
+
+ case EXEC_INPUT_TTY:
+ case EXEC_INPUT_TTY_FORCE:
+ case EXEC_INPUT_TTY_FAIL: {
+ int fd;
+
+ fd = acquire_terminal(exec_context_tty_path(context),
+ i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
+ i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
+ ACQUIRE_TERMINAL_WAIT,
+ USEC_INFINITY);
+ if (fd < 0)
+ return fd;
+
+ r = terminal_set_size_fd(fd, exec_context_tty_path(context), context->tty_rows, context->tty_cols);
+ if (r < 0)
+ return r;
+
+ return move_fd(fd, STDIN_FILENO, false);
+ }
+
+ case EXEC_INPUT_SOCKET:
+ assert(socket_fd >= 0);
+
+ return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
+
+ case EXEC_INPUT_NAMED_FD:
+ assert(named_iofds[STDIN_FILENO] >= 0);
+
+ (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
+ return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
+
+ case EXEC_INPUT_DATA: {
+ int fd;
+
+ fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
+ if (fd < 0)
+ return fd;
+
+ return move_fd(fd, STDIN_FILENO, false);
+ }
+
+ case EXEC_INPUT_FILE: {
+ bool rw;
+ int fd;
+
+ assert(context->stdio_file[STDIN_FILENO]);
+
+ rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
+ (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
+
+ fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
+ if (fd < 0)
+ return fd;
+
+ return move_fd(fd, STDIN_FILENO, false);
+ }
+
+ default:
+ assert_not_reached();
+ }
+}
+
+static bool can_inherit_stderr_from_stdout(
+ const ExecContext *context,
+ ExecOutput o,
+ ExecOutput e) {
+
+ assert(context);
+
+ /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
+ * stderr fd */
+
+ if (e == EXEC_OUTPUT_INHERIT)
+ return true;
+ if (e != o)
+ return false;
+
+ if (e == EXEC_OUTPUT_NAMED_FD)
+ return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
+
+ if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
+ return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
+
+ return true;
+}
+
+static int setup_output(
+ const Unit *unit,
+ const ExecContext *context,
+ const ExecParameters *params,
+ int fileno,
+ int socket_fd,
+ const int named_iofds[static 3],
+ const char *ident,
+ uid_t uid,
+ gid_t gid,
+ dev_t *journal_stream_dev,
+ ino_t *journal_stream_ino) {
+
+ ExecOutput o;
+ ExecInput i;
+ int r;
+
+ assert(unit);
+ assert(context);
+ assert(params);
+ assert(ident);
+ assert(journal_stream_dev);
+ assert(journal_stream_ino);
+
+ if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
+
+ if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
+ return -errno;
+
+ return STDOUT_FILENO;
+ }
+
+ if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
+ if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
+ return -errno;
+
+ return STDERR_FILENO;
+ }
+
+ i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
+ o = fixup_output(context->std_output, socket_fd);
+
+ if (fileno == STDERR_FILENO) {
+ ExecOutput e;
+ e = fixup_output(context->std_error, socket_fd);
+
+ /* This expects the input and output are already set up */
+
+ /* Don't change the stderr file descriptor if we inherit all
+ * the way and are not on a tty */
+ if (e == EXEC_OUTPUT_INHERIT &&
+ o == EXEC_OUTPUT_INHERIT &&
+ i == EXEC_INPUT_NULL &&
+ !is_terminal_input(context->std_input) &&
+ getppid() != 1)
+ return fileno;
+
+ /* Duplicate from stdout if possible */
+ if (can_inherit_stderr_from_stdout(context, o, e))
+ return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
+
+ o = e;
+
+ } else if (o == EXEC_OUTPUT_INHERIT) {
+ /* If input got downgraded, inherit the original value */
+ if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
+ return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
+
+ /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
+ if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
+ return RET_NERRNO(dup2(STDIN_FILENO, fileno));
+
+ /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
+ if (getppid() != 1)
+ return fileno;
+
+ /* We need to open /dev/null here anew, to get the right access mode. */
+ return open_null_as(O_WRONLY, fileno);
+ }
+
+ switch (o) {
+
+ case EXEC_OUTPUT_NULL:
+ return open_null_as(O_WRONLY, fileno);
+
+ case EXEC_OUTPUT_TTY:
+ if (is_terminal_input(i))
+ return RET_NERRNO(dup2(STDIN_FILENO, fileno));
+
+ /* We don't reset the terminal if this is just about output */
+ return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
+
+ case EXEC_OUTPUT_KMSG:
+ case EXEC_OUTPUT_KMSG_AND_CONSOLE:
+ case EXEC_OUTPUT_JOURNAL:
+ case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
+ r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
+ if (r < 0) {
+ log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
+ fileno == STDOUT_FILENO ? "stdout" : "stderr");
+ r = open_null_as(O_WRONLY, fileno);
+ } else {
+ struct stat st;
+
+ /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
+ * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
+ * services to detect whether they are connected to the journal or not.
+ *
+ * If both stdout and stderr are connected to a stream then let's make sure to store the data
+ * about STDERR as that's usually the best way to do logging. */
+
+ if (fstat(fileno, &st) >= 0 &&
+ (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
+ *journal_stream_dev = st.st_dev;
+ *journal_stream_ino = st.st_ino;
+ }
+ }
+ return r;
+
+ case EXEC_OUTPUT_SOCKET:
+ assert(socket_fd >= 0);
+
+ return RET_NERRNO(dup2(socket_fd, fileno));
+
+ case EXEC_OUTPUT_NAMED_FD:
+ assert(named_iofds[fileno] >= 0);
+
+ (void) fd_nonblock(named_iofds[fileno], false);
+ return RET_NERRNO(dup2(named_iofds[fileno], fileno));
+
+ case EXEC_OUTPUT_FILE:
+ case EXEC_OUTPUT_FILE_APPEND:
+ case EXEC_OUTPUT_FILE_TRUNCATE: {
+ bool rw;
+ int fd, flags;
+
+ assert(context->stdio_file[fileno]);
+
+ rw = context->std_input == EXEC_INPUT_FILE &&
+ streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
+
+ if (rw)
+ return RET_NERRNO(dup2(STDIN_FILENO, fileno));
+
+ flags = O_WRONLY;
+ if (o == EXEC_OUTPUT_FILE_APPEND)
+ flags |= O_APPEND;
+ else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
+ flags |= O_TRUNC;
+
+ fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
+ if (fd < 0)
+ return fd;
+
+ return move_fd(fd, fileno, 0);
+ }
+
+ default:
+ assert_not_reached();
+ }
+}
+
+static int chown_terminal(int fd, uid_t uid) {
+ int r;
+
+ assert(fd >= 0);
+
+ /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
+ if (isatty(fd) < 1) {
+ if (IN_SET(errno, EINVAL, ENOTTY))
+ return 0; /* not a tty */
+
+ return -errno;
+ }
+
+ /* This might fail. What matters are the results. */
+ r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
+ if (r < 0)
+ return r;
+
+ return 1;
+}
+
+static int setup_confirm_stdio(
+ const ExecContext *context,
+ const char *vc,
+ int *ret_saved_stdin,
+ int *ret_saved_stdout) {
+
+ _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
+ int r;
+
+ assert(ret_saved_stdin);
+ assert(ret_saved_stdout);
+
+ saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
+ if (saved_stdin < 0)
+ return -errno;
+
+ saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
+ if (saved_stdout < 0)
+ return -errno;
+
+ fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
+ if (fd < 0)
+ return fd;
+
+ r = chown_terminal(fd, getuid());
+ if (r < 0)
+ return r;
+
+ r = reset_terminal_fd(fd, true);
+ if (r < 0)
+ return r;
+
+ r = terminal_set_size_fd(fd, vc, context->tty_rows, context->tty_cols);
+ if (r < 0)
+ return r;
+
+ r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
+ TAKE_FD(fd);
+ if (r < 0)
+ return r;
+
+ *ret_saved_stdin = TAKE_FD(saved_stdin);
+ *ret_saved_stdout = TAKE_FD(saved_stdout);
+ return 0;
+}
+
+static void write_confirm_error_fd(int err, int fd, const Unit *u) {
+ assert(err < 0);
+
+ if (err == -ETIMEDOUT)
+ dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
+ else {
+ errno = -err;
+ dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
+ }
+}
+
+static void write_confirm_error(int err, const char *vc, const Unit *u) {
+ _cleanup_close_ int fd = -1;
+
+ assert(vc);
+
+ fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
+ if (fd < 0)
+ return;
+
+ write_confirm_error_fd(err, fd, u);
+}
+
+static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
+ int r = 0;
+
+ assert(saved_stdin);
+ assert(saved_stdout);
+
+ release_terminal();
+
+ if (*saved_stdin >= 0)
+ if (dup2(*saved_stdin, STDIN_FILENO) < 0)
+ r = -errno;
+
+ if (*saved_stdout >= 0)
+ if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
+ r = -errno;
+
+ *saved_stdin = safe_close(*saved_stdin);
+ *saved_stdout = safe_close(*saved_stdout);
+
+ return r;
+}
+
+enum {
+ CONFIRM_PRETEND_FAILURE = -1,
+ CONFIRM_PRETEND_SUCCESS = 0,
+ CONFIRM_EXECUTE = 1,
+};
+
+static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
+ int saved_stdout = -1, saved_stdin = -1, r;
+ _cleanup_free_ char *e = NULL;
+ char c;
+
+ /* For any internal errors, assume a positive response. */
+ r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
+ if (r < 0) {
+ write_confirm_error(r, vc, u);
+ return CONFIRM_EXECUTE;
+ }
+
+ /* confirm_spawn might have been disabled while we were sleeping. */
+ if (manager_is_confirm_spawn_disabled(u->manager)) {
+ r = 1;
+ goto restore_stdio;
+ }
+
+ e = ellipsize(cmdline, 60, 100);
+ if (!e) {
+ log_oom();
+ r = CONFIRM_EXECUTE;
+ goto restore_stdio;
+ }
+
+ for (;;) {
+ r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
+ if (r < 0) {
+ write_confirm_error_fd(r, STDOUT_FILENO, u);
+ r = CONFIRM_EXECUTE;
+ goto restore_stdio;
+ }
+
+ switch (c) {
+ case 'c':
+ printf("Resuming normal execution.\n");
+ manager_disable_confirm_spawn();
+ r = 1;
+ break;
+ case 'D':
+ unit_dump(u, stdout, " ");
+ continue; /* ask again */
+ case 'f':
+ printf("Failing execution.\n");
+ r = CONFIRM_PRETEND_FAILURE;
+ break;
+ case 'h':
+ printf(" c - continue, proceed without asking anymore\n"
+ " D - dump, show the state of the unit\n"
+ " f - fail, don't execute the command and pretend it failed\n"
+ " h - help\n"
+ " i - info, show a short summary of the unit\n"
+ " j - jobs, show jobs that are in progress\n"
+ " s - skip, don't execute the command and pretend it succeeded\n"
+ " y - yes, execute the command\n");
+ continue; /* ask again */
+ case 'i':
+ printf(" Description: %s\n"
+ " Unit: %s\n"
+ " Command: %s\n",
+ u->id, u->description, cmdline);
+ continue; /* ask again */
+ case 'j':
+ manager_dump_jobs(u->manager, stdout, /* patterns= */ NULL, " ");
+ continue; /* ask again */
+ case 'n':
+ /* 'n' was removed in favor of 'f'. */
+ printf("Didn't understand 'n', did you mean 'f'?\n");
+ continue; /* ask again */
+ case 's':
+ printf("Skipping execution.\n");
+ r = CONFIRM_PRETEND_SUCCESS;
+ break;
+ case 'y':
+ r = CONFIRM_EXECUTE;
+ break;
+ default:
+ assert_not_reached();
+ }
+ break;
+ }
+
+restore_stdio:
+ restore_confirm_stdio(&saved_stdin, &saved_stdout);
+ return r;
+}
+
+static int get_fixed_user(const ExecContext *c, const char **user,
+ uid_t *uid, gid_t *gid,
+ const char **home, const char **shell) {
+ int r;
+ const char *name;
+
+ assert(c);
+
+ if (!c->user)
+ return 0;
+
+ /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
+ * (i.e. are "/" or "/bin/nologin"). */
+
+ name = c->user;
+ r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
+ if (r < 0)
+ return r;
+
+ *user = name;
+ return 0;
+}
+
+static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
+ int r;
+ const char *name;
+
+ assert(c);
+
+ if (!c->group)
+ return 0;
+
+ name = c->group;
+ r = get_group_creds(&name, gid, 0);
+ if (r < 0)
+ return r;
+
+ *group = name;
+ return 0;
+}
+
+static int get_supplementary_groups(const ExecContext *c, const char *user,
+ const char *group, gid_t gid,
+ gid_t **supplementary_gids, int *ngids) {
+ int r, k = 0;
+ int ngroups_max;
+ bool keep_groups = false;
+ gid_t *groups = NULL;
+ _cleanup_free_ gid_t *l_gids = NULL;
+
+ assert(c);
+
+ /*
+ * If user is given, then lookup GID and supplementary groups list.
+ * We avoid NSS lookups for gid=0. Also we have to initialize groups
+ * here and as early as possible so we keep the list of supplementary
+ * groups of the caller.
+ */
+ if (user && gid_is_valid(gid) && gid != 0) {
+ /* First step, initialize groups from /etc/groups */
+ if (initgroups(user, gid) < 0)
+ return -errno;
+
+ keep_groups = true;
+ }
+
+ if (strv_isempty(c->supplementary_groups))
+ return 0;
+
+ /*
+ * If SupplementaryGroups= was passed then NGROUPS_MAX has to
+ * be positive, otherwise fail.
+ */
+ errno = 0;
+ ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
+ if (ngroups_max <= 0)
+ return errno_or_else(EOPNOTSUPP);
+
+ l_gids = new(gid_t, ngroups_max);
+ if (!l_gids)
+ return -ENOMEM;
+
+ if (keep_groups) {
+ /*
+ * Lookup the list of groups that the user belongs to, we
+ * avoid NSS lookups here too for gid=0.
+ */
+ k = ngroups_max;
+ if (getgrouplist(user, gid, l_gids, &k) < 0)
+ return -EINVAL;
+ } else
+ k = 0;
+
+ STRV_FOREACH(i, c->supplementary_groups) {
+ const char *g;
+
+ if (k >= ngroups_max)
+ return -E2BIG;
+
+ g = *i;
+ r = get_group_creds(&g, l_gids+k, 0);
+ if (r < 0)
+ return r;
+
+ k++;
+ }
+
+ /*
+ * Sets ngids to zero to drop all supplementary groups, happens
+ * when we are under root and SupplementaryGroups= is empty.
+ */
+ if (k == 0) {
+ *ngids = 0;
+ return 0;
+ }
+
+ /* Otherwise get the final list of supplementary groups */
+ groups = memdup(l_gids, sizeof(gid_t) * k);
+ if (!groups)
+ return -ENOMEM;
+
+ *supplementary_gids = groups;
+ *ngids = k;
+
+ groups = NULL;
+
+ return 0;
+}
+
+static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
+ int r;
+
+ /* Handle SupplementaryGroups= if it is not empty */
+ if (ngids > 0) {
+ r = maybe_setgroups(ngids, supplementary_gids);
+ if (r < 0)
+ return r;
+ }
+
+ if (gid_is_valid(gid)) {
+ /* Then set our gids */
+ if (setresgid(gid, gid, gid) < 0)
+ return -errno;
+ }
+
+ return 0;
+}
+
+static int set_securebits(int bits, int mask) {
+ int current, applied;
+ current = prctl(PR_GET_SECUREBITS);
+ if (current < 0)
+ return -errno;
+ /* Clear all securebits defined in mask and set bits */
+ applied = (current & ~mask) | bits;
+ if (current == applied)
+ return 0;
+ if (prctl(PR_SET_SECUREBITS, applied) < 0)
+ return -errno;
+ return 1;
+}
+
+static int enforce_user(const ExecContext *context, uid_t uid) {
+ assert(context);
+ int r;
+
+ if (!uid_is_valid(uid))
+ return 0;
+
+ /* Sets (but doesn't look up) the uid and make sure we keep the
+ * capabilities while doing so. For setting secure bits the capability CAP_SETPCAP is
+ * required, so we also need keep-caps in this case.
+ */
+
+ if (context->capability_ambient_set != 0 || context->secure_bits != 0) {
+
+ /* First step: If we need to keep capabilities but
+ * drop privileges we need to make sure we keep our
+ * caps, while we drop privileges. */
+ if (uid != 0) {
+ /* Add KEEP_CAPS to the securebits */
+ r = set_securebits(1<<SECURE_KEEP_CAPS, 0);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ /* Second step: actually set the uids */
+ if (setresuid(uid, uid, uid) < 0)
+ return -errno;
+
+ /* At this point we should have all necessary capabilities but
+ are otherwise a normal user. However, the caps might got
+ corrupted due to the setresuid() so we need clean them up
+ later. This is done outside of this call. */
+
+ return 0;
+}
+
+#if HAVE_PAM
+
+static int null_conv(
+ int num_msg,
+ const struct pam_message **msg,
+ struct pam_response **resp,
+ void *appdata_ptr) {
+
+ /* We don't support conversations */
+
+ return PAM_CONV_ERR;
+}
+
+#endif
+
+static int setup_pam(
+ const char *name,
+ const char *user,
+ uid_t uid,
+ gid_t gid,
+ const char *tty,
+ char ***env, /* updated on success */
+ const int fds[], size_t n_fds) {
+
+#if HAVE_PAM
+
+ static const struct pam_conv conv = {
+ .conv = null_conv,
+ .appdata_ptr = NULL
+ };
+
+ _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
+ _cleanup_strv_free_ char **e = NULL;
+ pam_handle_t *handle = NULL;
+ sigset_t old_ss;
+ int pam_code = PAM_SUCCESS, r;
+ bool close_session = false;
+ pid_t pam_pid = 0, parent_pid;
+ int flags = 0;
+
+ assert(name);
+ assert(user);
+ assert(env);
+
+ /* We set up PAM in the parent process, then fork. The child
+ * will then stay around until killed via PR_GET_PDEATHSIG or
+ * systemd via the cgroup logic. It will then remove the PAM
+ * session again. The parent process will exec() the actual
+ * daemon. We do things this way to ensure that the main PID
+ * of the daemon is the one we initially fork()ed. */
+
+ r = barrier_create(&barrier);
+ if (r < 0)
+ goto fail;
+
+ if (log_get_max_level() < LOG_DEBUG)
+ flags |= PAM_SILENT;
+
+ pam_code = pam_start(name, user, &conv, &handle);
+ if (pam_code != PAM_SUCCESS) {
+ handle = NULL;
+ goto fail;
+ }
+
+ if (!tty) {
+ _cleanup_free_ char *q = NULL;
+
+ /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
+ * out if that's the case, and read the TTY off it. */
+
+ if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
+ tty = strjoina("/dev/", q);
+ }
+
+ if (tty) {
+ pam_code = pam_set_item(handle, PAM_TTY, tty);
+ if (pam_code != PAM_SUCCESS)
+ goto fail;
+ }
+
+ STRV_FOREACH(nv, *env) {
+ pam_code = pam_putenv(handle, *nv);
+ if (pam_code != PAM_SUCCESS)
+ goto fail;
+ }
+
+ pam_code = pam_acct_mgmt(handle, flags);
+ if (pam_code != PAM_SUCCESS)
+ goto fail;
+
+ pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
+ if (pam_code != PAM_SUCCESS)
+ log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));
+
+ pam_code = pam_open_session(handle, flags);
+ if (pam_code != PAM_SUCCESS)
+ goto fail;
+
+ close_session = true;
+
+ e = pam_getenvlist(handle);
+ if (!e) {
+ pam_code = PAM_BUF_ERR;
+ goto fail;
+ }
+
+ /* Block SIGTERM, so that we know that it won't get lost in the child */
+
+ assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);
+
+ parent_pid = getpid_cached();
+
+ r = safe_fork("(sd-pam)", 0, &pam_pid);
+ if (r < 0)
+ goto fail;
+ if (r == 0) {
+ int sig, ret = EXIT_PAM;
+
+ /* The child's job is to reset the PAM session on termination */
+ barrier_set_role(&barrier, BARRIER_CHILD);
+
+ /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
+ * those fds are open here that have been opened by PAM. */
+ (void) close_many(fds, n_fds);
+
+ /* Drop privileges - we don't need any to pam_close_session and this will make
+ * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect sd-pam
+ * threads to fail to exit normally */
+
+ r = maybe_setgroups(0, NULL);
+ if (r < 0)
+ log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
+ if (setresgid(gid, gid, gid) < 0)
+ log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
+ if (setresuid(uid, uid, uid) < 0)
+ log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");
+
+ (void) ignore_signals(SIGPIPE);
+
+ /* Wait until our parent died. This will only work if the above setresuid() succeeds,
+ * otherwise the kernel will not allow unprivileged parents kill their privileged children
+ * this way. We rely on the control groups kill logic to do the rest for us. */
+ if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
+ goto child_finish;
+
+ /* Tell the parent that our setup is done. This is especially important regarding dropping
+ * privileges. Otherwise, unit setup might race against our setresuid(2) call.
+ *
+ * If the parent aborted, we'll detect this below, hence ignore return failure here. */
+ (void) barrier_place(&barrier);
+
+ /* Check if our parent process might already have died? */
+ if (getppid() == parent_pid) {
+ sigset_t ss;
+
+ assert_se(sigemptyset(&ss) >= 0);
+ assert_se(sigaddset(&ss, SIGTERM) >= 0);
+
+ for (;;) {
+ if (sigwait(&ss, &sig) < 0) {
+ if (errno == EINTR)
+ continue;
+
+ goto child_finish;
+ }
+
+ assert(sig == SIGTERM);
+ break;
+ }
+ }
+
+ pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
+ if (pam_code != PAM_SUCCESS)
+ goto child_finish;
+
+ /* If our parent died we'll end the session */
+ if (getppid() != parent_pid) {
+ pam_code = pam_close_session(handle, flags);
+ if (pam_code != PAM_SUCCESS)
+ goto child_finish;
+ }
+
+ ret = 0;
+
+ child_finish:
+ /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
+ * know about this. See pam_end(3) */
+ (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
+ _exit(ret);
+ }
+
+ barrier_set_role(&barrier, BARRIER_PARENT);
+
+ /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
+ * here. */
+ handle = NULL;
+
+ /* Unblock SIGTERM again in the parent */
+ assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
+
+ /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
+ * this fd around. */
+ closelog();
+
+ /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
+ * recover. However, warn loudly if it happens. */
+ if (!barrier_place_and_sync(&barrier))
+ log_error("PAM initialization failed");
+
+ return strv_free_and_replace(*env, e);
+
+fail:
+ if (pam_code != PAM_SUCCESS) {
+ log_error("PAM failed: %s", pam_strerror(handle, pam_code));
+ r = -EPERM; /* PAM errors do not map to errno */
+ } else
+ log_error_errno(r, "PAM failed: %m");
+
+ if (handle) {
+ if (close_session)
+ pam_code = pam_close_session(handle, flags);
+
+ (void) pam_end(handle, pam_code | flags);
+ }
+
+ closelog();
+ return r;
+#else
+ return 0;
+#endif
+}
+
+static void rename_process_from_path(const char *path) {
+ char process_name[11];
+ const char *p;
+ size_t l;
+
+ /* This resulting string must fit in 10 chars (i.e. the length
+ * of "/sbin/init") to look pretty in /bin/ps */
+
+ p = basename(path);
+ if (isempty(p)) {
+ rename_process("(...)");
+ return;
+ }
+
+ l = strlen(p);
+ if (l > 8) {
+ /* The end of the process name is usually more
+ * interesting, since the first bit might just be
+ * "systemd-" */
+ p = p + l - 8;
+ l = 8;
+ }
+
+ process_name[0] = '(';
+ memcpy(process_name+1, p, l);
+ process_name[1+l] = ')';
+ process_name[1+l+1] = 0;
+
+ rename_process(process_name);
+}
+
+static bool context_has_address_families(const ExecContext *c) {
+ assert(c);
+
+ return c->address_families_allow_list ||
+ !set_isempty(c->address_families);
+}
+
+static bool context_has_syscall_filters(const ExecContext *c) {
+ assert(c);
+
+ return c->syscall_allow_list ||
+ !hashmap_isempty(c->syscall_filter);
+}
+
+static bool context_has_syscall_logs(const ExecContext *c) {
+ assert(c);
+
+ return c->syscall_log_allow_list ||
+ !hashmap_isempty(c->syscall_log);
+}
+
+static bool context_has_no_new_privileges(const ExecContext *c) {
+ assert(c);
+
+ if (c->no_new_privileges)
+ return true;
+
+ if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
+ return false;
+
+ /* We need NNP if we have any form of seccomp and are unprivileged */
+ return c->lock_personality ||
+ c->memory_deny_write_execute ||
+ c->private_devices ||
+ c->protect_clock ||
+ c->protect_hostname ||
+ c->protect_kernel_tunables ||
+ c->protect_kernel_modules ||
+ c->protect_kernel_logs ||
+ context_has_address_families(c) ||
+ exec_context_restrict_namespaces_set(c) ||
+ c->restrict_realtime ||
+ c->restrict_suid_sgid ||
+ !set_isempty(c->syscall_archs) ||
+ context_has_syscall_filters(c) ||
+ context_has_syscall_logs(c);
+}
+
+static bool exec_context_has_credentials(const ExecContext *context) {
+
+ assert(context);
+
+ return !hashmap_isempty(context->set_credentials) ||
+ !hashmap_isempty(context->load_credentials);
+}
+
+#if HAVE_SECCOMP
+
+static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
+
+ if (is_seccomp_available())
+ return false;
+
+ log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
+ return true;
+}
+
+static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
+ uint32_t negative_action, default_action, action;
+ int r;
+
+ assert(u);
+ assert(c);
+
+ if (!context_has_syscall_filters(c))
+ return 0;
+
+ if (skip_seccomp_unavailable(u, "SystemCallFilter="))
+ return 0;
+
+ negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
+
+ if (c->syscall_allow_list) {
+ default_action = negative_action;
+ action = SCMP_ACT_ALLOW;
+ } else {
+ default_action = SCMP_ACT_ALLOW;
+ action = negative_action;
+ }
+
+ if (needs_ambient_hack) {
+ r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
+ if (r < 0)
+ return r;
+ }
+
+ return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
+}
+
+static int apply_syscall_log(const Unit* u, const ExecContext *c) {
+#ifdef SCMP_ACT_LOG
+ uint32_t default_action, action;
+#endif
+
+ assert(u);
+ assert(c);
+
+ if (!context_has_syscall_logs(c))
+ return 0;
+
+#ifdef SCMP_ACT_LOG
+ if (skip_seccomp_unavailable(u, "SystemCallLog="))
+ return 0;
+
+ if (c->syscall_log_allow_list) {
+ /* Log nothing but the ones listed */
+ default_action = SCMP_ACT_ALLOW;
+ action = SCMP_ACT_LOG;
+ } else {
+ /* Log everything but the ones listed */
+ default_action = SCMP_ACT_LOG;
+ action = SCMP_ACT_ALLOW;
+ }
+
+ return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
+#else
+ /* old libseccomp */
+ log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
+ return 0;
+#endif
+}
+
+static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
+ assert(u);
+ assert(c);
+
+ if (set_isempty(c->syscall_archs))
+ return 0;
+
+ if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
+ return 0;
+
+ return seccomp_restrict_archs(c->syscall_archs);
+}
+
+static int apply_address_families(const Unit* u, const ExecContext *c) {
+ assert(u);
+ assert(c);
+
+ if (!context_has_address_families(c))
+ return 0;
+
+ if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
+ return 0;
+
+ return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
+}
+
+static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
+ assert(u);
+ assert(c);
+
+ if (!c->memory_deny_write_execute)
+ return 0;
+
+ if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
+ return 0;
+
+ return seccomp_memory_deny_write_execute();
+}
+
+static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
+ assert(u);
+ assert(c);
+
+ if (!c->restrict_realtime)
+ return 0;
+
+ if (skip_seccomp_unavailable(u, "RestrictRealtime="))
+ return 0;
+
+ return seccomp_restrict_realtime();
+}
+
+static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
+ assert(u);
+ assert(c);
+
+ if (!c->restrict_suid_sgid)
+ return 0;
+
+ if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
+ return 0;
+
+ return seccomp_restrict_suid_sgid();
+}
+
+static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
+ assert(u);
+ assert(c);
+
+ /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
+ * let's protect even those systems where this is left on in the kernel. */
+
+ if (!c->protect_kernel_tunables)
+ return 0;
+
+ if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
+ return 0;
+
+ return seccomp_protect_sysctl();
+}
+
+static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
+ assert(u);
+ assert(c);
+
+ /* Turn off module syscalls on ProtectKernelModules=yes */
+
+ if (!c->protect_kernel_modules)
+ return 0;
+
+ if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
+ return 0;
+
+ return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
+}
+
+static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
+ assert(u);
+ assert(c);
+
+ if (!c->protect_kernel_logs)
+ return 0;
+
+ if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
+ return 0;
+
+ return seccomp_protect_syslog();
+}
+
+static int apply_protect_clock(const Unit *u, const ExecContext *c) {
+ assert(u);
+ assert(c);
+
+ if (!c->protect_clock)
+ return 0;
+
+ if (skip_seccomp_unavailable(u, "ProtectClock="))
+ return 0;
+
+ return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
+}
+
+static int apply_private_devices(const Unit *u, const ExecContext *c) {
+ assert(u);
+ assert(c);
+
+ /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
+
+ if (!c->private_devices)
+ return 0;
+
+ if (skip_seccomp_unavailable(u, "PrivateDevices="))
+ return 0;
+
+ return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
+}
+
+static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
+ assert(u);
+ assert(c);
+
+ if (!exec_context_restrict_namespaces_set(c))
+ return 0;
+
+ if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
+ return 0;
+
+ return seccomp_restrict_namespaces(c->restrict_namespaces);
+}
+
+static int apply_lock_personality(const Unit* u, const ExecContext *c) {
+ unsigned long personality;
+ int r;
+
+ assert(u);
+ assert(c);
+
+ if (!c->lock_personality)
+ return 0;
+
+ if (skip_seccomp_unavailable(u, "LockPersonality="))
+ return 0;
+
+ personality = c->personality;
+
+ /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
+ if (personality == PERSONALITY_INVALID) {
+
+ r = opinionated_personality(&personality);
+ if (r < 0)
+ return r;
+ }
+
+ return seccomp_lock_personality(personality);
+}
+
+#endif
+
+#if HAVE_LIBBPF
+static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
+ assert(u);
+ assert(c);
+
+ if (!exec_context_restrict_filesystems_set(c))
+ return 0;
+
+ if (!u->manager->restrict_fs) {
+ /* LSM BPF is unsupported or lsm_bpf_setup failed */
+ log_unit_debug(u, "LSM BPF not supported, skipping RestrictFileSystems=");
+ return 0;
+ }
+
+ return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
+}
+#endif
+
+static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
+ assert(u);
+ assert(c);
+
+ if (!c->protect_hostname)
+ return 0;
+
+ if (ns_type_supported(NAMESPACE_UTS)) {
+ if (unshare(CLONE_NEWUTS) < 0) {
+ if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
+ *ret_exit_status = EXIT_NAMESPACE;
+ return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
+ }
+
+ log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
+ }
+ } else
+ log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
+
+#if HAVE_SECCOMP
+ int r;
+
+ if (skip_seccomp_unavailable(u, "ProtectHostname="))
+ return 0;
+
+ r = seccomp_protect_hostname();
+ if (r < 0) {
+ *ret_exit_status = EXIT_SECCOMP;
+ return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
+ }
+#endif
+
+ return 0;
+}
+
+static void do_idle_pipe_dance(int idle_pipe[static 4]) {
+ assert(idle_pipe);
+
+ idle_pipe[1] = safe_close(idle_pipe[1]);
+ idle_pipe[2] = safe_close(idle_pipe[2]);
+
+ if (idle_pipe[0] >= 0) {
+ int r;
+
+ r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
+
+ if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
+ ssize_t n;
+
+ /* Signal systemd that we are bored and want to continue. */
+ n = write(idle_pipe[3], "x", 1);
+ if (n > 0)
+ /* Wait for systemd to react to the signal above. */
+ (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
+ }
+
+ idle_pipe[0] = safe_close(idle_pipe[0]);
+
+ }
+
+ idle_pipe[3] = safe_close(idle_pipe[3]);
+}
+
+static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
+
+static int build_environment(
+ const Unit *u,
+ const ExecContext *c,
+ const ExecParameters *p,
+ size_t n_fds,
+ const char *home,
+ const char *username,
+ const char *shell,
+ dev_t journal_stream_dev,
+ ino_t journal_stream_ino,
+ char ***ret) {
+
+ _cleanup_strv_free_ char **our_env = NULL;
+ size_t n_env = 0;
+ char *x;
+
+ assert(u);
+ assert(c);
+ assert(p);
+ assert(ret);
+
+#define N_ENV_VARS 17
+ our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
+ if (!our_env)
+ return -ENOMEM;
+
+ if (n_fds > 0) {
+ _cleanup_free_ char *joined = NULL;
+
+ if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
+ return -ENOMEM;
+ our_env[n_env++] = x;
+
+ if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
+ return -ENOMEM;
+ our_env[n_env++] = x;
+
+ joined = strv_join(p->fd_names, ":");
+ if (!joined)
+ return -ENOMEM;
+
+ x = strjoin("LISTEN_FDNAMES=", joined);
+ if (!x)
+ return -ENOMEM;
+ our_env[n_env++] = x;
+ }
+
+ if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
+ if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
+ return -ENOMEM;
+ our_env[n_env++] = x;
+
+ if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
+ return -ENOMEM;
+ our_env[n_env++] = x;
+ }
+
+ /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
+ * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
+ * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
+ if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
+ x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
+ if (!x)
+ return -ENOMEM;
+ our_env[n_env++] = x;
+ }
+
+ if (home) {
+ x = strjoin("HOME=", home);
+ if (!x)
+ return -ENOMEM;
+
+ path_simplify(x + 5);
+ our_env[n_env++] = x;
+ }
+
+ if (username) {
+ x = strjoin("LOGNAME=", username);
+ if (!x)
+ return -ENOMEM;
+ our_env[n_env++] = x;
+
+ x = strjoin("USER=", username);
+ if (!x)
+ return -ENOMEM;
+ our_env[n_env++] = x;
+ }
+
+ if (shell) {
+ x = strjoin("SHELL=", shell);
+ if (!x)
+ return -ENOMEM;
+
+ path_simplify(x + 6);
+ our_env[n_env++] = x;
+ }
+
+ if (!sd_id128_is_null(u->invocation_id)) {
+ if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
+ return -ENOMEM;
+
+ our_env[n_env++] = x;
+ }
+
+ if (exec_context_needs_term(c)) {
+ const char *tty_path, *term = NULL;
+
+ tty_path = exec_context_tty_path(c);
+
+ /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
+ * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
+ * container manager passes to PID 1 ends up all the way in the console login shown. */
+
+ if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
+ term = getenv("TERM");
+
+ if (!term)
+ term = default_term_for_tty(tty_path);
+
+ x = strjoin("TERM=", term);
+ if (!x)
+ return -ENOMEM;
+ our_env[n_env++] = x;
+ }
+
+ if (journal_stream_dev != 0 && journal_stream_ino != 0) {
+ if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
+ return -ENOMEM;
+
+ our_env[n_env++] = x;
+ }
+
+ if (c->log_namespace) {
+ x = strjoin("LOG_NAMESPACE=", c->log_namespace);
+ if (!x)
+ return -ENOMEM;
+
+ our_env[n_env++] = x;
+ }
+
+ for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
+ _cleanup_free_ char *joined = NULL;
+ const char *n;
+
+ if (!p->prefix[t])
+ continue;
+
+ if (c->directories[t].n_items == 0)
+ continue;
+
+ n = exec_directory_env_name_to_string(t);
+ if (!n)
+ continue;
+
+ for (size_t i = 0; i < c->directories[t].n_items; i++) {
+ _cleanup_free_ char *prefixed = NULL;
+
+ prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
+ if (!prefixed)
+ return -ENOMEM;
+
+ if (!strextend_with_separator(&joined, ":", prefixed))
+ return -ENOMEM;
+ }
+
+ x = strjoin(n, "=", joined);
+ if (!x)
+ return -ENOMEM;
+
+ our_env[n_env++] = x;
+ }
+
+ if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
+ x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
+ if (!x)
+ return -ENOMEM;
+
+ our_env[n_env++] = x;
+ }
+
+ if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
+ return -ENOMEM;
+
+ our_env[n_env++] = x;
+
+ our_env[n_env++] = NULL;
+ assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
+#undef N_ENV_VARS
+
+ *ret = TAKE_PTR(our_env);
+
+ return 0;
+}
+
+static int build_pass_environment(const ExecContext *c, char ***ret) {
+ _cleanup_strv_free_ char **pass_env = NULL;
+ size_t n_env = 0;
+
+ STRV_FOREACH(i, c->pass_environment) {
+ _cleanup_free_ char *x = NULL;
+ char *v;
+
+ v = getenv(*i);
+ if (!v)
+ continue;
+ x = strjoin(*i, "=", v);
+ if (!x)
+ return -ENOMEM;
+
+ if (!GREEDY_REALLOC(pass_env, n_env + 2))
+ return -ENOMEM;
+
+ pass_env[n_env++] = TAKE_PTR(x);
+ pass_env[n_env] = NULL;
+ }
+
+ *ret = TAKE_PTR(pass_env);
+
+ return 0;
+}
+
+bool exec_needs_mount_namespace(
+ const ExecContext *context,
+ const ExecParameters *params,
+ const ExecRuntime *runtime) {
+
+ assert(context);
+
+ if (context->root_image)
+ return true;
+
+ if (!strv_isempty(context->read_write_paths) ||
+ !strv_isempty(context->read_only_paths) ||
+ !strv_isempty(context->inaccessible_paths) ||
+ !strv_isempty(context->exec_paths) ||
+ !strv_isempty(context->no_exec_paths))
+ return true;
+
+ if (context->n_bind_mounts > 0)
+ return true;
+
+ if (context->n_temporary_filesystems > 0)
+ return true;
+
+ if (context->n_mount_images > 0)
+ return true;
+
+ if (context->n_extension_images > 0)
+ return true;
+
+ if (!strv_isempty(context->extension_directories))
+ return true;
+
+ if (!IN_SET(context->mount_flags, 0, MS_SHARED))
+ return true;
+
+ if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
+ return true;
+
+ if (context->private_devices ||
+ context->private_mounts ||
+ context->protect_system != PROTECT_SYSTEM_NO ||
+ context->protect_home != PROTECT_HOME_NO ||
+ context->protect_kernel_tunables ||
+ context->protect_kernel_modules ||
+ context->protect_kernel_logs ||
+ context->protect_control_groups ||
+ context->protect_proc != PROTECT_PROC_DEFAULT ||
+ context->proc_subset != PROC_SUBSET_ALL ||
+ context->private_ipc ||
+ context->ipc_namespace_path)
+ return true;
+
+ if (context->root_directory) {
+ if (exec_context_get_effective_mount_apivfs(context))
+ return true;
+
+ for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
+ if (params && !params->prefix[t])
+ continue;
+
+ if (context->directories[t].n_items > 0)
+ return true;
+ }
+ }
+
+ if (context->dynamic_user &&
+ (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
+ context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
+ context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
+ return true;
+
+ if (context->log_namespace)
+ return true;
+
+ return false;
+}
+
+static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
+ _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
+ _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
+ _cleanup_close_ int unshare_ready_fd = -1;
+ _cleanup_(sigkill_waitp) pid_t pid = 0;
+ uint64_t c = 1;
+ ssize_t n;
+ int r;
+
+ /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
+ * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
+ * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
+ * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
+ * which waits for the parent to create the new user namespace while staying in the original namespace. The
+ * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
+ * continues execution normally.
+ * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
+ * does not need CAP_SETUID to write the single line mapping to itself. */
+
+ /* Can only set up multiple mappings with CAP_SETUID. */
+ if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
+ r = asprintf(&uid_map,
+ UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
+ UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
+ ouid, ouid, uid, uid);
+ else
+ r = asprintf(&uid_map,
+ UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
+ ouid, ouid);
+
+ if (r < 0)
+ return -ENOMEM;
+
+ /* Can only set up multiple mappings with CAP_SETGID. */
+ if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
+ r = asprintf(&gid_map,
+ GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
+ GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
+ ogid, ogid, gid, gid);
+ else
+ r = asprintf(&gid_map,
+ GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
+ ogid, ogid);
+
+ if (r < 0)
+ return -ENOMEM;
+
+ /* Create a communication channel so that the parent can tell the child when it finished creating the user
+ * namespace. */
+ unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
+ if (unshare_ready_fd < 0)
+ return -errno;
+
+ /* Create a communication channel so that the child can tell the parent a proper error code in case it
+ * failed. */
+ if (pipe2(errno_pipe, O_CLOEXEC) < 0)
+ return -errno;
+
+ r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ _cleanup_close_ int fd = -1;
+ const char *a;
+ pid_t ppid;
+
+ /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
+ * here, after the parent opened its own user namespace. */
+
+ ppid = getppid();
+ errno_pipe[0] = safe_close(errno_pipe[0]);
+
+ /* Wait until the parent unshared the user namespace */
+ if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
+ r = -errno;
+ goto child_fail;
+ }
+
+ /* Disable the setgroups() system call in the child user namespace, for good. */
+ a = procfs_file_alloca(ppid, "setgroups");
+ fd = open(a, O_WRONLY|O_CLOEXEC);
+ if (fd < 0) {
+ if (errno != ENOENT) {
+ r = -errno;
+ goto child_fail;
+ }
+
+ /* If the file is missing the kernel is too old, let's continue anyway. */
+ } else {
+ if (write(fd, "deny\n", 5) < 0) {
+ r = -errno;
+ goto child_fail;
+ }
+
+ fd = safe_close(fd);
+ }
+
+ /* First write the GID map */
+ a = procfs_file_alloca(ppid, "gid_map");
+ fd = open(a, O_WRONLY|O_CLOEXEC);
+ if (fd < 0) {
+ r = -errno;
+ goto child_fail;
+ }
+ if (write(fd, gid_map, strlen(gid_map)) < 0) {
+ r = -errno;
+ goto child_fail;
+ }
+ fd = safe_close(fd);
+
+ /* The write the UID map */
+ a = procfs_file_alloca(ppid, "uid_map");
+ fd = open(a, O_WRONLY|O_CLOEXEC);
+ if (fd < 0) {
+ r = -errno;
+ goto child_fail;
+ }
+ if (write(fd, uid_map, strlen(uid_map)) < 0) {
+ r = -errno;
+ goto child_fail;
+ }
+
+ _exit(EXIT_SUCCESS);
+
+ child_fail:
+ (void) write(errno_pipe[1], &r, sizeof(r));
+ _exit(EXIT_FAILURE);
+ }
+
+ errno_pipe[1] = safe_close(errno_pipe[1]);
+
+ if (unshare(CLONE_NEWUSER) < 0)
+ return -errno;
+
+ /* Let the child know that the namespace is ready now */
+ if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
+ return -errno;
+
+ /* Try to read an error code from the child */
+ n = read(errno_pipe[0], &r, sizeof(r));
+ if (n < 0)
+ return -errno;
+ if (n == sizeof(r)) { /* an error code was sent to us */
+ if (r < 0)
+ return r;
+ return -EIO;
+ }
+ if (n != 0) /* on success we should have read 0 bytes */
+ return -EIO;
+
+ r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
+ if (r < 0)
+ return r;
+ if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
+ return -EIO;
+
+ return 0;
+}
+
+static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
+ if (!context->dynamic_user)
+ return false;
+
+ if (type == EXEC_DIRECTORY_CONFIGURATION)
+ return false;
+
+ if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
+ return false;
+
+ return true;
+}
+
+static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
+ _cleanup_free_ char *src_abs = NULL;
+ int r;
+
+ assert(source);
+
+ src_abs = path_join(root, source);
+ if (!src_abs)
+ return -ENOMEM;
+
+ STRV_FOREACH(dst, symlinks) {
+ _cleanup_free_ char *dst_abs = NULL;
+
+ dst_abs = path_join(root, *dst);
+ if (!dst_abs)
+ return -ENOMEM;
+
+ r = mkdir_parents_label(dst_abs, 0755);
+ if (r < 0)
+ return r;
+
+ r = symlink_idempotent(src_abs, dst_abs, true);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+static int setup_exec_directory(
+ const ExecContext *context,
+ const ExecParameters *params,
+ uid_t uid,
+ gid_t gid,
+ ExecDirectoryType type,
+ bool needs_mount_namespace,
+ int *exit_status) {
+
+ static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
+ [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
+ [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
+ [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
+ [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
+ [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
+ };
+ int r;
+
+ assert(context);
+ assert(params);
+ assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
+ assert(exit_status);
+
+ if (!params->prefix[type])
+ return 0;
+
+ if (params->flags & EXEC_CHOWN_DIRECTORIES) {
+ if (!uid_is_valid(uid))
+ uid = 0;
+ if (!gid_is_valid(gid))
+ gid = 0;
+ }
+
+ for (size_t i = 0; i < context->directories[type].n_items; i++) {
+ _cleanup_free_ char *p = NULL, *pp = NULL;
+
+ p = path_join(params->prefix[type], context->directories[type].items[i].path);
+ if (!p) {
+ r = -ENOMEM;
+ goto fail;
+ }
+
+ r = mkdir_parents_label(p, 0755);
+ if (r < 0)
+ goto fail;
+
+ if (exec_directory_is_private(context, type)) {
+ /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
+ * case we want to avoid leaving a directory around fully accessible that is owned by
+ * a dynamic user whose UID is later on reused. To lock this down we use the same
+ * trick used by container managers to prohibit host users to get access to files of
+ * the same UID in containers: we place everything inside a directory that has an
+ * access mode of 0700 and is owned root:root, so that it acts as security boundary
+ * for unprivileged host code. We then use fs namespacing to make this directory
+ * permeable for the service itself.
+ *
+ * Specifically: for a service which wants a special directory "foo/" we first create
+ * a directory "private/" with access mode 0700 owned by root:root. Then we place
+ * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
+ * "private/foo". This way, privileged host users can access "foo/" as usual, but
+ * unprivileged host users can't look into it. Inside of the namespace of the unit
+ * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
+ * "private/foo/" is mounted under the same name, thus disabling the access boundary
+ * for the service and making sure it only gets access to the dirs it needs but no
+ * others. Tricky? Yes, absolutely, but it works!
+ *
+ * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
+ * to be owned by the service itself.
+ *
+ * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
+ * for sharing files or sockets with other services. */
+
+ pp = path_join(params->prefix[type], "private");
+ if (!pp) {
+ r = -ENOMEM;
+ goto fail;
+ }
+
+ /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
+ r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
+ if (r < 0)
+ goto fail;
+
+ if (!path_extend(&pp, context->directories[type].items[i].path)) {
+ r = -ENOMEM;
+ goto fail;
+ }
+
+ /* Create all directories between the configured directory and this private root, and mark them 0755 */
+ r = mkdir_parents_label(pp, 0755);
+ if (r < 0)
+ goto fail;
+
+ if (is_dir(p, false) > 0 &&
+ (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
+
+ /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
+ * it over. Most likely the service has been upgraded from one that didn't use
+ * DynamicUser=1, to one that does. */
+
+ log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
+ "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
+ exec_directory_type_to_string(type), p, pp);
+
+ if (rename(p, pp) < 0) {
+ r = -errno;
+ goto fail;
+ }
+ } else {
+ /* Otherwise, create the actual directory for the service */
+
+ r = mkdir_label(pp, context->directories[type].mode);
+ if (r < 0 && r != -EEXIST)
+ goto fail;
+ }
+
+ if (!context->directories[type].items[i].only_create) {
+ /* And link it up from the original place.
+ * Notes
+ * 1) If a mount namespace is going to be used, then this symlink remains on
+ * the host, and a new one for the child namespace will be created later.
+ * 2) It is not necessary to create this symlink when one of its parent
+ * directories is specified and already created. E.g.
+ * StateDirectory=foo foo/bar
+ * In that case, the inode points to pp and p for "foo/bar" are the same:
+ * pp = "/var/lib/private/foo/bar"
+ * p = "/var/lib/foo/bar"
+ * and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
+ * we do not need to create the symlink, but we cannot create the symlink.
+ * See issue #24783. */
+ r = symlink_idempotent(pp, p, true);
+ if (r < 0)
+ goto fail;
+ }
+
+ } else {
+ _cleanup_free_ char *target = NULL;
+
+ if (type != EXEC_DIRECTORY_CONFIGURATION &&
+ readlink_and_make_absolute(p, &target) >= 0) {
+ _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
+
+ /* This already exists and is a symlink? Interesting. Maybe it's one created
+ * by DynamicUser=1 (see above)?
+ *
+ * We do this for all directory types except for ConfigurationDirectory=,
+ * since they all support the private/ symlink logic at least in some
+ * configurations, see above. */
+
+ r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
+ if (r < 0)
+ goto fail;
+
+ q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
+ if (!q) {
+ r = -ENOMEM;
+ goto fail;
+ }
+
+ /* /var/lib or friends may be symlinks. So, let's chase them also. */
+ r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
+ if (r < 0)
+ goto fail;
+
+ if (path_equal(q_resolved, target_resolved)) {
+
+ /* Hmm, apparently DynamicUser= was once turned on for this service,
+ * but is no longer. Let's move the directory back up. */
+
+ log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
+ "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
+ exec_directory_type_to_string(type), q, p);
+
+ if (unlink(p) < 0) {
+ r = -errno;
+ goto fail;
+ }
+
+ if (rename(q, p) < 0) {
+ r = -errno;
+ goto fail;
+ }
+ }
+ }
+
+ r = mkdir_label(p, context->directories[type].mode);
+ if (r < 0) {
+ if (r != -EEXIST)
+ goto fail;
+
+ if (type == EXEC_DIRECTORY_CONFIGURATION) {
+ struct stat st;
+
+ /* Don't change the owner/access mode of the configuration directory,
+ * as in the common case it is not written to by a service, and shall
+ * not be writable. */
+
+ if (stat(p, &st) < 0) {
+ r = -errno;
+ goto fail;
+ }
+
+ /* Still complain if the access mode doesn't match */
+ if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
+ log_warning("%s \'%s\' already exists but the mode is different. "
+ "(File system: %o %sMode: %o)",
+ exec_directory_type_to_string(type), context->directories[type].items[i].path,
+ st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
+
+ continue;
+ }
+ }
+ }
+
+ /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
+ * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
+ * current UID/GID ownership.) */
+ r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
+ if (r < 0)
+ goto fail;
+
+ /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
+ * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
+ * assignments to exist. */
+ r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
+ if (r < 0)
+ goto fail;
+ }
+
+ /* If we are not going to run in a namespace, set up the symlinks - otherwise
+ * they are set up later, to allow configuring empty var/run/etc. */
+ if (!needs_mount_namespace)
+ for (size_t i = 0; i < context->directories[type].n_items; i++) {
+ r = create_many_symlinks(params->prefix[type],
+ context->directories[type].items[i].path,
+ context->directories[type].items[i].symlinks);
+ if (r < 0)
+ goto fail;
+ }
+
+ return 0;
+
+fail:
+ *exit_status = exit_status_table[type];
+ return r;
+}
+
+static int write_credential(
+ int dfd,
+ const char *id,
+ const void *data,
+ size_t size,
+ uid_t uid,
+ bool ownership_ok) {
+
+ _cleanup_(unlink_and_freep) char *tmp = NULL;
+ _cleanup_close_ int fd = -1;
+ int r;
+
+ r = tempfn_random_child("", "cred", &tmp);
+ if (r < 0)
+ return r;
+
+ fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
+ if (fd < 0) {
+ tmp = mfree(tmp);
+ return -errno;
+ }
+
+ r = loop_write(fd, data, size, /* do_poll = */ false);
+ if (r < 0)
+ return r;
+
+ if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
+ return -errno;
+
+ if (uid_is_valid(uid) && uid != getuid()) {
+ r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
+ if (r < 0) {
+ if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
+ return r;
+
+ if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
+ * to express: that the user gets read access and nothing
+ * else. But if the backing fs can't support that (e.g. ramfs)
+ * then we can use file ownership instead. But that's only safe if
+ * we can then re-mount the whole thing read-only, so that the
+ * user can no longer chmod() the file to gain write access. */
+ return r;
+
+ if (fchown(fd, uid, GID_INVALID) < 0)
+ return -errno;
+ }
+ }
+
+ if (renameat(dfd, tmp, dfd, id) < 0)
+ return -errno;
+
+ tmp = mfree(tmp);
+ return 0;
+}
+
+static char **credential_search_path(
+ const ExecParameters *params,
+ bool encrypted) {
+
+ _cleanup_strv_free_ char **l = NULL;
+
+ assert(params);
+
+ /* Assemble a search path to find credentials in. We'll look in /etc/credstore/ (and similar
+ * directories in /usr/lib/ + /run/) for all types of credentials. If we are looking for encrypted
+ * credentials, also look in /etc/credstore.encrypted/ (and similar dirs). */
+
+ if (encrypted) {
+ if (strv_extend(&l, params->received_encrypted_credentials_directory) < 0)
+ return NULL;
+
+ if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore.encrypted"), /* filter_duplicates= */ true) < 0)
+ return NULL;
+ }
+
+ if (params->received_credentials_directory)
+ if (strv_extend(&l, params->received_credentials_directory) < 0)
+ return NULL;
+
+ if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0)
+ return NULL;
+
+ if (DEBUG_LOGGING) {
+ _cleanup_free_ char *t = strv_join(l, ":");
+
+ log_debug("Credential search path is: %s", strempty(t));
+ }
+
+ return TAKE_PTR(l);
+}
+
+static int load_credential(
+ const ExecContext *context,
+ const ExecParameters *params,
+ const char *id,
+ const char *path,
+ bool encrypted,
+ const char *unit,
+ int read_dfd,
+ int write_dfd,
+ uid_t uid,
+ bool ownership_ok,
+ uint64_t *left) {
+
+ ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
+ _cleanup_strv_free_ char **search_path = NULL;
+ _cleanup_(erase_and_freep) char *data = NULL;
+ _cleanup_free_ char *bindname = NULL;
+ const char *source = NULL;
+ bool missing_ok = true;
+ size_t size, add, maxsz;
+ int r;
+
+ assert(context);
+ assert(params);
+ assert(id);
+ assert(path);
+ assert(unit);
+ assert(read_dfd >= 0 || read_dfd == AT_FDCWD);
+ assert(write_dfd >= 0);
+ assert(left);
+
+ if (read_dfd >= 0) {
+ /* If a directory fd is specified, then read the file directly from that dir. In this case we
+ * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX
+ * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to
+ * open it. */
+
+ if (!filename_is_valid(path)) /* safety check */
+ return -EINVAL;
+
+ missing_ok = true;
+ source = path;
+
+ } else if (path_is_absolute(path)) {
+ /* If this is an absolute path, read the data directly from it, and support AF_UNIX
+ * sockets */
+
+ if (!path_is_valid(path)) /* safety check */
+ return -EINVAL;
+
+ flags |= READ_FULL_FILE_CONNECT_SOCKET;
+
+ /* Pass some minimal info about the unit and the credential name we are looking to acquire
+ * via the source socket address in case we read off an AF_UNIX socket. */
+ if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, id) < 0)
+ return -ENOMEM;
+
+ missing_ok = false;
+ source = path;
+
+ } else if (credential_name_valid(path)) {
+ /* If this is a relative path, take it as credential name relative to the credentials
+ * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we
+ * are operating on a credential store, i.e. this is guaranteed to be regular files. */
+
+ search_path = credential_search_path(params, encrypted);
+ if (!search_path)
+ return -ENOMEM;
+
+ missing_ok = true;
+ } else
+ source = NULL;
+
+ if (encrypted)
+ flags |= READ_FULL_FILE_UNBASE64;
+
+ maxsz = encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX;
+
+ if (search_path) {
+ STRV_FOREACH(d, search_path) {
+ _cleanup_free_ char *j = NULL;
+
+ j = path_join(*d, path);
+ if (!j)
+ return -ENOMEM;
+
+ r = read_full_file_full(
+ AT_FDCWD, j, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
+ UINT64_MAX,
+ maxsz,
+ flags,
+ NULL,
+ &data, &size);
+ if (r != -ENOENT)
+ break;
+ }
+ } else if (source)
+ r = read_full_file_full(
+ read_dfd, source,
+ UINT64_MAX,
+ maxsz,
+ flags,
+ bindname,
+ &data, &size);
+ else
+ r = -ENOENT;
+
+ if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, id))) {
+ /* Make a missing inherited credential non-fatal, let's just continue. After all apps
+ * will get clear errors if we don't pass such a missing credential on as they
+ * themselves will get ENOENT when trying to read them, which should not be much
+ * worse than when we handle the error here and make it fatal.
+ *
+ * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
+ * we are fine, too. */
+ log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", path);
+ return 0;
+ }
+ if (r < 0)
+ return log_debug_errno(r, "Failed to read credential '%s': %m", path);
+
+ if (encrypted) {
+ _cleanup_free_ void *plaintext = NULL;
+ size_t plaintext_size = 0;
+
+ r = decrypt_credential_and_warn(id, now(CLOCK_REALTIME), NULL, NULL, data, size, &plaintext, &plaintext_size);
+ if (r < 0)
+ return r;
+
+ free_and_replace(data, plaintext);
+ size = plaintext_size;
+ }
+
+ add = strlen(id) + size;
+ if (add > *left)
+ return -E2BIG;
+
+ r = write_credential(write_dfd, id, data, size, uid, ownership_ok);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to write credential '%s': %m", id);
+
+ *left -= add;
+ return 0;
+}
+
+struct load_cred_args {
+ const ExecContext *context;
+ const ExecParameters *params;
+ bool encrypted;
+ const char *unit;
+ int dfd;
+ uid_t uid;
+ bool ownership_ok;
+ uint64_t *left;
+};
+
+static int load_cred_recurse_dir_cb(
+ RecurseDirEvent event,
+ const char *path,
+ int dir_fd,
+ int inode_fd,
+ const struct dirent *de,
+ const struct statx *sx,
+ void *userdata) {
+
+ struct load_cred_args *args = ASSERT_PTR(userdata);
+ _cleanup_free_ char *sub_id = NULL;
+ int r;
+
+ if (event != RECURSE_DIR_ENTRY)
+ return RECURSE_DIR_CONTINUE;
+
+ if (!IN_SET(de->d_type, DT_REG, DT_SOCK))
+ return RECURSE_DIR_CONTINUE;
+
+ sub_id = strreplace(path, "/", "_");
+ if (!sub_id)
+ return -ENOMEM;
+
+ if (!credential_name_valid(sub_id))
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Credential would get ID %s, which is not valid, refusing", sub_id);
+
+ if (faccessat(args->dfd, sub_id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) {
+ log_debug("Skipping credential with duplicated ID %s at %s", sub_id, path);
+ return RECURSE_DIR_CONTINUE;
+ }
+ if (errno != ENOENT)
+ return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sub_id);
+
+ r = load_credential(
+ args->context,
+ args->params,
+ sub_id,
+ de->d_name,
+ args->encrypted,
+ args->unit,
+ dir_fd,
+ args->dfd,
+ args->uid,
+ args->ownership_ok,
+ args->left);
+ if (r < 0)
+ return r;
+
+ return RECURSE_DIR_CONTINUE;
+}
+
+static int acquire_credentials(
+ const ExecContext *context,
+ const ExecParameters *params,
+ const char *unit,
+ const char *p,
+ uid_t uid,
+ bool ownership_ok) {
+
+ uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
+ _cleanup_close_ int dfd = -1;
+ ExecLoadCredential *lc;
+ ExecSetCredential *sc;
+ int r;
+
+ assert(context);
+ assert(p);
+
+ dfd = open(p, O_DIRECTORY|O_CLOEXEC);
+ if (dfd < 0)
+ return -errno;
+
+ r = fd_acl_make_writable(dfd); /* Add the "w" bit, if we are reusing an already set up credentials dir where it was unset */
+ if (r < 0)
+ return r;
+
+ /* First, load credentials off disk (or acquire via AF_UNIX socket) */
+ HASHMAP_FOREACH(lc, context->load_credentials) {
+ _cleanup_close_ int sub_fd = -1;
+
+ /* If this is an absolute path, then try to open it as a directory. If that works, then we'll
+ * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as
+ * a regular file. Finally, if it's a relative path we will use it as a credential name to
+ * propagate a credential passed to us from further up. */
+
+ if (path_is_absolute(lc->path)) {
+ sub_fd = open(lc->path, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
+ if (sub_fd < 0 && !IN_SET(errno,
+ ENOTDIR, /* Not a directory */
+ ENOENT)) /* Doesn't exist? */
+ return log_debug_errno(errno, "Failed to open '%s': %m", lc->path);
+ }
+
+ if (sub_fd < 0)
+ /* Regular file (incl. a credential passed in from higher up) */
+ r = load_credential(
+ context,
+ params,
+ lc->id,
+ lc->path,
+ lc->encrypted,
+ unit,
+ AT_FDCWD,
+ dfd,
+ uid,
+ ownership_ok,
+ &left);
+ else
+ /* Directory */
+ r = recurse_dir(
+ sub_fd,
+ /* path= */ lc->id, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
+ /* statx_mask= */ 0,
+ /* n_depth_max= */ UINT_MAX,
+ RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE,
+ load_cred_recurse_dir_cb,
+ &(struct load_cred_args) {
+ .context = context,
+ .params = params,
+ .encrypted = lc->encrypted,
+ .unit = unit,
+ .dfd = dfd,
+ .uid = uid,
+ .ownership_ok = ownership_ok,
+ .left = &left,
+ });
+ if (r < 0)
+ return r;
+ }
+
+ /* Second, we add in literally specified credentials. If the credentials already exist, we'll not add
+ * them, so that they can act as a "default" if the same credential is specified multiple times. */
+ HASHMAP_FOREACH(sc, context->set_credentials) {
+ _cleanup_(erase_and_freep) void *plaintext = NULL;
+ const char *data;
+ size_t size, add;
+
+ /* Note that we check ahead of time here instead of relying on O_EXCL|O_CREAT later to return
+ * EEXIST if the credential already exists. That's because the TPM2-based decryption is kinda
+ * slow and involved, hence it's nice to be able to skip that if the credential already
+ * exists anyway. */
+ if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
+ continue;
+ if (errno != ENOENT)
+ return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
+
+ if (sc->encrypted) {
+ r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, NULL, sc->data, sc->size, &plaintext, &size);
+ if (r < 0)
+ return r;
+
+ data = plaintext;
+ } else {
+ data = sc->data;
+ size = sc->size;
+ }
+
+ add = strlen(sc->id) + size;
+ if (add > left)
+ return -E2BIG;
+
+ r = write_credential(dfd, sc->id, data, size, uid, ownership_ok);
+ if (r < 0)
+ return r;
+
+ left -= add;
+ }
+
+ r = fd_acl_make_read_only(dfd); /* Now take away the "w" bit */
+ if (r < 0)
+ return r;
+
+ /* After we created all keys with the right perms, also make sure the credential store as a whole is
+ * accessible */
+
+ if (uid_is_valid(uid) && uid != getuid()) {
+ r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
+ if (r < 0) {
+ if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
+ return r;
+
+ if (!ownership_ok)
+ return r;
+
+ if (fchown(dfd, uid, GID_INVALID) < 0)
+ return -errno;
+ }
+ }
+
+ return 0;
+}
+
+static int setup_credentials_internal(
+ const ExecContext *context,
+ const ExecParameters *params,
+ const char *unit,
+ const char *final, /* This is where the credential store shall eventually end up at */
+ const char *workspace, /* This is where we can prepare it before moving it to the final place */
+ bool reuse_workspace, /* Whether to reuse any existing workspace mount if it already is a mount */
+ bool must_mount, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
+ uid_t uid) {
+
+ int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
+ * if we mounted something; false if we definitely can't mount anything */
+ bool final_mounted;
+ const char *where;
+
+ assert(context);
+ assert(final);
+ assert(workspace);
+
+ if (reuse_workspace) {
+ r = path_is_mount_point(workspace, NULL, 0);
+ if (r < 0)
+ return r;
+ if (r > 0)
+ workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
+ else
+ workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
+ } else
+ workspace_mounted = -1; /* ditto */
+
+ r = path_is_mount_point(final, NULL, 0);
+ if (r < 0)
+ return r;
+ if (r > 0) {
+ /* If the final place already has something mounted, we use that. If the workspace also has
+ * something mounted we assume it's actually the same mount (but with MS_RDONLY
+ * different). */
+ final_mounted = true;
+
+ if (workspace_mounted < 0) {
+ /* If the final place is mounted, but the workspace we isn't, then let's bind mount
+ * the final version to the workspace, and make it writable, so that we can make
+ * changes */
+
+ r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
+ if (r < 0)
+ return r;
+
+ r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
+ if (r < 0)
+ return r;
+
+ workspace_mounted = true;
+ }
+ } else
+ final_mounted = false;
+
+ if (workspace_mounted < 0) {
+ /* Nothing is mounted on the workspace yet, let's try to mount something now */
+ for (int try = 0;; try++) {
+
+ if (try == 0) {
+ /* Try "ramfs" first, since it's not swap backed */
+ r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
+ if (r >= 0) {
+ workspace_mounted = true;
+ break;
+ }
+
+ } else if (try == 1) {
+ _cleanup_free_ char *opts = NULL;
+
+ if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%zu", (size_t) CREDENTIALS_TOTAL_SIZE_MAX) < 0)
+ return -ENOMEM;
+
+ /* Fall back to "tmpfs" otherwise */
+ r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts);
+ if (r >= 0) {
+ workspace_mounted = true;
+ break;
+ }
+
+ } else {
+ /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
+ r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
+ if (r < 0) {
+ if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
+ return r;
+
+ if (must_mount) /* If we it's not OK to use the plain directory
+ * fallback, propagate all errors too */
+ return r;
+
+ /* If we lack privileges to bind mount stuff, then let's gracefully
+ * proceed for compat with container envs, and just use the final dir
+ * as is. */
+
+ workspace_mounted = false;
+ break;
+ }
+
+ /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
+ r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
+ if (r < 0)
+ return r;
+
+ workspace_mounted = true;
+ break;
+ }
+ }
+ }
+
+ assert(!must_mount || workspace_mounted > 0);
+ where = workspace_mounted ? workspace : final;
+
+ (void) label_fix_full(AT_FDCWD, where, final, 0);
+
+ r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
+ if (r < 0)
+ return r;
+
+ if (workspace_mounted) {
+ /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
+ r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
+ if (r < 0)
+ return r;
+
+ /* And mount it to the final place, read-only */
+ if (final_mounted)
+ r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
+ else
+ r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
+ if (r < 0)
+ return r;
+ } else {
+ _cleanup_free_ char *parent = NULL;
+
+ /* If we do not have our own mount put used the plain directory fallback, then we need to
+ * open access to the top-level credential directory and the per-service directory now */
+
+ r = path_extract_directory(final, &parent);
+ if (r < 0)
+ return r;
+ if (chmod(parent, 0755) < 0)
+ return -errno;
+ }
+
+ return 0;
+}
+
+static int setup_credentials(
+ const ExecContext *context,
+ const ExecParameters *params,
+ const char *unit,
+ uid_t uid) {
+
+ _cleanup_free_ char *p = NULL, *q = NULL;
+ int r;
+
+ assert(context);
+ assert(params);
+
+ if (!exec_context_has_credentials(context))
+ return 0;
+
+ if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
+ return -EINVAL;
+
+ /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
+ * and the subdir we mount over with a read-only file system readable by the service's user */
+ q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
+ if (!q)
+ return -ENOMEM;
+
+ r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
+ if (r < 0 && r != -EEXIST)
+ return r;
+
+ p = path_join(q, unit);
+ if (!p)
+ return -ENOMEM;
+
+ r = mkdir_label(p, 0700); /* per-unit dir: private to user */
+ if (r < 0 && r != -EEXIST)
+ return r;
+
+ r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
+ if (r < 0) {
+ _cleanup_free_ char *t = NULL, *u = NULL;
+
+ /* If this is not a privilege or support issue then propagate the error */
+ if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
+ return r;
+
+ /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
+ * it into place, so that users can't access half-initialized credential stores. */
+ t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
+ if (!t)
+ return -ENOMEM;
+
+ /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
+ * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
+ * after it is fully set up */
+ u = path_join(t, unit);
+ if (!u)
+ return -ENOMEM;
+
+ FOREACH_STRING(i, t, u) {
+ r = mkdir_label(i, 0700);
+ if (r < 0 && r != -EEXIST)
+ return r;
+ }
+
+ r = setup_credentials_internal(
+ context,
+ params,
+ unit,
+ p, /* final mount point */
+ u, /* temporary workspace to overmount */
+ true, /* reuse the workspace if it is already a mount */
+ false, /* it's OK to fall back to a plain directory if we can't mount anything */
+ uid);
+
+ (void) rmdir(u); /* remove the workspace again if we can. */
+
+ if (r < 0)
+ return r;
+
+ } else if (r == 0) {
+
+ /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
+ * we can use the same directory for all cases, after turning off propagation. Question
+ * though is: where do we turn off propagation exactly, and where do we place the workspace
+ * directory? We need some place that is guaranteed to be a mount point in the host, and
+ * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
+ * since we ultimately want to move the resulting file system there, i.e. we need propagation
+ * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
+ * would be visible in the host mount table all the time, which we want to avoid. Hence, what
+ * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
+ * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
+ * propagation on the former, and then overmount the latter.
+ *
+ * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
+ * for this purpose, but there are few other candidates that work equally well for us, and
+ * given that the we do this in a privately namespaced short-lived single-threaded process
+ * that no one else sees this should be OK to do. */
+
+ r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
+ if (r < 0)
+ goto child_fail;
+
+ r = setup_credentials_internal(
+ context,
+ params,
+ unit,
+ p, /* final mount point */
+ "/dev/shm", /* temporary workspace to overmount */
+ false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
+ true, /* insist that something is mounted, do not allow fallback to plain directory */
+ uid);
+ if (r < 0)
+ goto child_fail;
+
+ _exit(EXIT_SUCCESS);
+
+ child_fail:
+ _exit(EXIT_FAILURE);
+ }
+
+ return 0;
+}
+
+#if ENABLE_SMACK
+static int setup_smack(
+ const Manager *manager,
+ const ExecContext *context,
+ int executable_fd) {
+ int r;
+
+ assert(context);
+ assert(executable_fd >= 0);
+
+ if (context->smack_process_label) {
+ r = mac_smack_apply_pid(0, context->smack_process_label);
+ if (r < 0)
+ return r;
+ } else if (manager->default_smack_process_label) {
+ _cleanup_free_ char *exec_label = NULL;
+
+ r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
+ if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
+ return r;
+
+ r = mac_smack_apply_pid(0, exec_label ? : manager->default_smack_process_label);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+#endif
+
+static int compile_bind_mounts(
+ const ExecContext *context,
+ const ExecParameters *params,
+ BindMount **ret_bind_mounts,
+ size_t *ret_n_bind_mounts,
+ char ***ret_empty_directories) {
+
+ _cleanup_strv_free_ char **empty_directories = NULL;
+ BindMount *bind_mounts;
+ size_t n, h = 0;
+ int r;
+
+ assert(context);
+ assert(params);
+ assert(ret_bind_mounts);
+ assert(ret_n_bind_mounts);
+ assert(ret_empty_directories);
+
+ n = context->n_bind_mounts;
+ for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
+ if (!params->prefix[t])
+ continue;
+
+ for (size_t i = 0; i < context->directories[t].n_items; i++)
+ n += !context->directories[t].items[i].only_create;
+ }
+
+ if (n <= 0) {
+ *ret_bind_mounts = NULL;
+ *ret_n_bind_mounts = 0;
+ *ret_empty_directories = NULL;
+ return 0;
+ }
+
+ bind_mounts = new(BindMount, n);
+ if (!bind_mounts)
+ return -ENOMEM;
+
+ for (size_t i = 0; i < context->n_bind_mounts; i++) {
+ BindMount *item = context->bind_mounts + i;
+ char *s, *d;
+
+ s = strdup(item->source);
+ if (!s) {
+ r = -ENOMEM;
+ goto finish;
+ }
+
+ d = strdup(item->destination);
+ if (!d) {
+ free(s);
+ r = -ENOMEM;
+ goto finish;
+ }
+
+ bind_mounts[h++] = (BindMount) {
+ .source = s,
+ .destination = d,
+ .read_only = item->read_only,
+ .recursive = item->recursive,
+ .ignore_enoent = item->ignore_enoent,
+ };
+ }
+
+ for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
+ if (!params->prefix[t])
+ continue;
+
+ if (context->directories[t].n_items == 0)
+ continue;
+
+ if (exec_directory_is_private(context, t) &&
+ !exec_context_with_rootfs(context)) {
+ char *private_root;
+
+ /* So this is for a dynamic user, and we need to make sure the process can access its own
+ * directory. For that we overmount the usually inaccessible "private" subdirectory with a
+ * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
+
+ private_root = path_join(params->prefix[t], "private");
+ if (!private_root) {
+ r = -ENOMEM;
+ goto finish;
+ }
+
+ r = strv_consume(&empty_directories, private_root);
+ if (r < 0)
+ goto finish;
+ }
+
+ for (size_t i = 0; i < context->directories[t].n_items; i++) {
+ char *s, *d;
+
+ /* When one of the parent directories is in the list, we cannot create the symlink
+ * for the child directory. See also the comments in setup_exec_directory(). */
+ if (context->directories[t].items[i].only_create)
+ continue;
+
+ if (exec_directory_is_private(context, t))
+ s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
+ else
+ s = path_join(params->prefix[t], context->directories[t].items[i].path);
+ if (!s) {
+ r = -ENOMEM;
+ goto finish;
+ }
+
+ if (exec_directory_is_private(context, t) &&
+ exec_context_with_rootfs(context))
+ /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
+ * directory is not created on the root directory. So, let's bind-mount the directory
+ * on the 'non-private' place. */
+ d = path_join(params->prefix[t], context->directories[t].items[i].path);
+ else
+ d = strdup(s);
+ if (!d) {
+ free(s);
+ r = -ENOMEM;
+ goto finish;
+ }
+
+ bind_mounts[h++] = (BindMount) {
+ .source = s,
+ .destination = d,
+ .read_only = false,
+ .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
+ .recursive = true,
+ .ignore_enoent = false,
+ };
+ }
+ }
+
+ assert(h == n);
+
+ *ret_bind_mounts = bind_mounts;
+ *ret_n_bind_mounts = n;
+ *ret_empty_directories = TAKE_PTR(empty_directories);
+
+ return (int) n;
+
+finish:
+ bind_mount_free_many(bind_mounts, h);
+ return r;
+}
+
+/* ret_symlinks will contain a list of pairs src:dest that describes
+ * the symlinks to create later on. For example, the symlinks needed
+ * to safely give private directories to DynamicUser=1 users. */
+static int compile_symlinks(
+ const ExecContext *context,
+ const ExecParameters *params,
+ char ***ret_symlinks) {
+
+ _cleanup_strv_free_ char **symlinks = NULL;
+ int r;
+
+ assert(context);
+ assert(params);
+ assert(ret_symlinks);
+
+ for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
+ for (size_t i = 0; i < context->directories[dt].n_items; i++) {
+ _cleanup_free_ char *private_path = NULL, *path = NULL;
+
+ STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
+ _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
+
+ src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
+ dst_abs = path_join(params->prefix[dt], *symlink);
+ if (!src_abs || !dst_abs)
+ return -ENOMEM;
+
+ r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
+ if (r < 0)
+ return r;
+ }
+
+ if (!exec_directory_is_private(context, dt) ||
+ exec_context_with_rootfs(context) ||
+ context->directories[dt].items[i].only_create)
+ continue;
+
+ private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
+ if (!private_path)
+ return -ENOMEM;
+
+ path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
+ if (!path)
+ return -ENOMEM;
+
+ r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
+ if (r < 0)
+ return r;
+ }
+ }
+
+ *ret_symlinks = TAKE_PTR(symlinks);
+
+ return 0;
+}
+
+static bool insist_on_sandboxing(
+ const ExecContext *context,
+ const char *root_dir,
+ const char *root_image,
+ const BindMount *bind_mounts,
+ size_t n_bind_mounts) {
+
+ assert(context);
+ assert(n_bind_mounts == 0 || bind_mounts);
+
+ /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
+ * would alter the view on the file system beyond making things read-only or invisible, i.e. would
+ * rearrange stuff in a way we cannot ignore gracefully. */
+
+ if (context->n_temporary_filesystems > 0)
+ return true;
+
+ if (root_dir || root_image)
+ return true;
+
+ if (context->n_mount_images > 0)
+ return true;
+
+ if (context->dynamic_user)
+ return true;
+
+ if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
+ return true;
+
+ /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
+ * essential. */
+ for (size_t i = 0; i < n_bind_mounts; i++)
+ if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
+ return true;
+
+ if (context->log_namespace)
+ return true;
+
+ return false;
+}
+
+static int apply_mount_namespace(
+ const Unit *u,
+ ExecCommandFlags command_flags,
+ const ExecContext *context,
+ const ExecParameters *params,
+ const ExecRuntime *runtime,
+ char **error_path) {
+
+ _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL;
+ const char *tmp_dir = NULL, *var_tmp_dir = NULL;
+ const char *root_dir = NULL, *root_image = NULL;
+ _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
+ *extension_dir = NULL;
+ NamespaceInfo ns_info;
+ bool needs_sandboxing;
+ BindMount *bind_mounts = NULL;
+ size_t n_bind_mounts = 0;
+ int r;
+
+ assert(context);
+
+ if (params->flags & EXEC_APPLY_CHROOT) {
+ root_image = context->root_image;
+
+ if (!root_image)
+ root_dir = context->root_directory;
+ }
+
+ r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
+ if (r < 0)
+ return r;
+
+ /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
+ r = compile_symlinks(context, params, &symlinks);
+ if (r < 0)
+ goto finalize;
+
+ needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
+ if (needs_sandboxing) {
+ /* The runtime struct only contains the parent of the private /tmp,
+ * which is non-accessible to world users. Inside of it there's a /tmp
+ * that is sticky, and that's the one we want to use here.
+ * This does not apply when we are using /run/systemd/empty as fallback. */
+
+ if (context->private_tmp && runtime) {
+ if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY))
+ tmp_dir = runtime->tmp_dir;
+ else if (runtime->tmp_dir)
+ tmp_dir = strjoina(runtime->tmp_dir, "/tmp");
+
+ if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY))
+ var_tmp_dir = runtime->var_tmp_dir;
+ else if (runtime->var_tmp_dir)
+ var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp");
+ }
+
+ ns_info = (NamespaceInfo) {
+ .ignore_protect_paths = false,
+ .private_dev = context->private_devices,
+ .protect_control_groups = context->protect_control_groups,
+ .protect_kernel_tunables = context->protect_kernel_tunables,
+ .protect_kernel_modules = context->protect_kernel_modules,
+ .protect_kernel_logs = context->protect_kernel_logs,
+ .protect_hostname = context->protect_hostname,
+ .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
+ .private_mounts = context->private_mounts,
+ .protect_home = context->protect_home,
+ .protect_system = context->protect_system,
+ .protect_proc = context->protect_proc,
+ .proc_subset = context->proc_subset,
+ .private_ipc = context->private_ipc || context->ipc_namespace_path,
+ /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
+ .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
+ };
+ } else if (!context->dynamic_user && root_dir)
+ /*
+ * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
+ * sandbox info, otherwise enforce it, don't ignore protected paths and
+ * fail if we are enable to apply the sandbox inside the mount namespace.
+ */
+ ns_info = (NamespaceInfo) {
+ .ignore_protect_paths = true,
+ };
+ else
+ ns_info = (NamespaceInfo) {};
+
+ if (context->mount_flags == MS_SHARED)
+ log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
+
+ if (exec_context_has_credentials(context) &&
+ params->prefix[EXEC_DIRECTORY_RUNTIME] &&
+ FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
+ creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
+ if (!creds_path) {
+ r = -ENOMEM;
+ goto finalize;
+ }
+ }
+
+ if (MANAGER_IS_SYSTEM(u->manager)) {
+ propagate_dir = path_join("/run/systemd/propagate/", u->id);
+ if (!propagate_dir) {
+ r = -ENOMEM;
+ goto finalize;
+ }
+
+ incoming_dir = strdup("/run/systemd/incoming");
+ if (!incoming_dir) {
+ r = -ENOMEM;
+ goto finalize;
+ }
+
+ extension_dir = strdup("/run/systemd/unit-extensions");
+ if (!extension_dir) {
+ r = -ENOMEM;
+ goto finalize;
+ }
+ } else
+ if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0) {
+ r = -ENOMEM;
+ goto finalize;
+ }
+
+ r = setup_namespace(root_dir, root_image, context->root_image_options,
+ &ns_info, context->read_write_paths,
+ needs_sandboxing ? context->read_only_paths : NULL,
+ needs_sandboxing ? context->inaccessible_paths : NULL,
+ needs_sandboxing ? context->exec_paths : NULL,
+ needs_sandboxing ? context->no_exec_paths : NULL,
+ empty_directories,
+ symlinks,
+ bind_mounts,
+ n_bind_mounts,
+ context->temporary_filesystems,
+ context->n_temporary_filesystems,
+ context->mount_images,
+ context->n_mount_images,
+ tmp_dir,
+ var_tmp_dir,
+ creds_path,
+ context->log_namespace,
+ context->mount_flags,
+ context->root_hash, context->root_hash_size, context->root_hash_path,
+ context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
+ context->root_verity,
+ context->extension_images,
+ context->n_extension_images,
+ context->extension_directories,
+ propagate_dir,
+ incoming_dir,
+ extension_dir,
+ root_dir || root_image ? params->notify_socket : NULL,
+ error_path);
+
+ /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
+ * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
+ * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
+ * completely different execution environment. */
+ if (r == -ENOANO) {
+ if (insist_on_sandboxing(
+ context,
+ root_dir, root_image,
+ bind_mounts,
+ n_bind_mounts)) {
+ log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
+ "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
+ n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
+
+ r = -EOPNOTSUPP;
+ } else {
+ log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
+ r = 0;
+ }
+ }
+
+finalize:
+ bind_mount_free_many(bind_mounts, n_bind_mounts);
+ return r;
+}
+
+static int apply_working_directory(
+ const ExecContext *context,
+ const ExecParameters *params,
+ const char *home,
+ int *exit_status) {
+
+ const char *d, *wd;
+
+ assert(context);
+ assert(exit_status);
+
+ if (context->working_directory_home) {
+
+ if (!home) {
+ *exit_status = EXIT_CHDIR;
+ return -ENXIO;
+ }
+
+ wd = home;
+
+ } else
+ wd = empty_to_root(context->working_directory);
+
+ if (params->flags & EXEC_APPLY_CHROOT)
+ d = wd;
+ else
+ d = prefix_roota(context->root_directory, wd);
+
+ if (chdir(d) < 0 && !context->working_directory_missing_ok) {
+ *exit_status = EXIT_CHDIR;
+ return -errno;
+ }
+
+ return 0;
+}
+
+static int apply_root_directory(
+ const ExecContext *context,
+ const ExecParameters *params,
+ const bool needs_mount_ns,
+ int *exit_status) {
+
+ assert(context);
+ assert(exit_status);
+
+ if (params->flags & EXEC_APPLY_CHROOT)
+ if (!needs_mount_ns && context->root_directory)
+ if (chroot(context->root_directory) < 0) {
+ *exit_status = EXIT_CHROOT;
+ return -errno;
+ }
+
+ return 0;
+}
+
+static int setup_keyring(
+ const Unit *u,
+ const ExecContext *context,
+ const ExecParameters *p,
+ uid_t uid, gid_t gid) {
+
+ key_serial_t keyring;
+ int r = 0;
+ uid_t saved_uid;
+ gid_t saved_gid;
+
+ assert(u);
+ assert(context);
+ assert(p);
+
+ /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
+ * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
+ * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
+ * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
+ * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
+ * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
+
+ if (context->keyring_mode == EXEC_KEYRING_INHERIT)
+ return 0;
+
+ /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
+ * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
+ * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
+ * & group is just as nasty as acquiring a reference to the user keyring. */
+
+ saved_uid = getuid();
+ saved_gid = getgid();
+
+ if (gid_is_valid(gid) && gid != saved_gid) {
+ if (setregid(gid, -1) < 0)
+ return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
+ }
+
+ if (uid_is_valid(uid) && uid != saved_uid) {
+ if (setreuid(uid, -1) < 0) {
+ r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
+ goto out;
+ }
+ }
+
+ keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
+ if (keyring == -1) {
+ if (errno == ENOSYS)
+ log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
+ else if (ERRNO_IS_PRIVILEGE(errno))
+ log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
+ else if (errno == EDQUOT)
+ log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
+ else
+ r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
+
+ goto out;
+ }
+
+ /* When requested link the user keyring into the session keyring. */
+ if (context->keyring_mode == EXEC_KEYRING_SHARED) {
+
+ if (keyctl(KEYCTL_LINK,
+ KEY_SPEC_USER_KEYRING,
+ KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
+ r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
+ goto out;
+ }
+ }
+
+ /* Restore uid/gid back */
+ if (uid_is_valid(uid) && uid != saved_uid) {
+ if (setreuid(saved_uid, -1) < 0) {
+ r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
+ goto out;
+ }
+ }
+
+ if (gid_is_valid(gid) && gid != saved_gid) {
+ if (setregid(saved_gid, -1) < 0)
+ return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
+ }
+
+ /* Populate they keyring with the invocation ID by default, as original saved_uid. */
+ if (!sd_id128_is_null(u->invocation_id)) {
+ key_serial_t key;
+
+ key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
+ if (key == -1)
+ log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
+ else {
+ if (keyctl(KEYCTL_SETPERM, key,
+ KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
+ KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
+ r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
+ }
+ }
+
+out:
+ /* Revert back uid & gid for the last time, and exit */
+ /* no extra logging, as only the first already reported error matters */
+ if (getuid() != saved_uid)
+ (void) setreuid(saved_uid, -1);
+
+ if (getgid() != saved_gid)
+ (void) setregid(saved_gid, -1);
+
+ return r;
+}
+
+static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
+ assert(array);
+ assert(n);
+ assert(pair);
+
+ if (pair[0] >= 0)
+ array[(*n)++] = pair[0];
+ if (pair[1] >= 0)
+ array[(*n)++] = pair[1];
+}
+
+static int close_remaining_fds(
+ const ExecParameters *params,
+ const ExecRuntime *runtime,
+ const DynamicCreds *dcreds,
+ int user_lookup_fd,
+ int socket_fd,
+ const int *fds, size_t n_fds) {
+
+ size_t n_dont_close = 0;
+ int dont_close[n_fds + 12];
+
+ assert(params);
+
+ if (params->stdin_fd >= 0)
+ dont_close[n_dont_close++] = params->stdin_fd;
+ if (params->stdout_fd >= 0)
+ dont_close[n_dont_close++] = params->stdout_fd;
+ if (params->stderr_fd >= 0)
+ dont_close[n_dont_close++] = params->stderr_fd;
+
+ if (socket_fd >= 0)
+ dont_close[n_dont_close++] = socket_fd;
+ if (n_fds > 0) {
+ memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
+ n_dont_close += n_fds;
+ }
+
+ if (runtime) {
+ append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
+ append_socket_pair(dont_close, &n_dont_close, runtime->ipcns_storage_socket);
+ }
+
+ if (dcreds) {
+ if (dcreds->user)
+ append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
+ if (dcreds->group)
+ append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
+ }
+
+ if (user_lookup_fd >= 0)
+ dont_close[n_dont_close++] = user_lookup_fd;
+
+ return close_all_fds(dont_close, n_dont_close);
+}
+
+static int send_user_lookup(
+ Unit *unit,
+ int user_lookup_fd,
+ uid_t uid,
+ gid_t gid) {
+
+ assert(unit);
+
+ /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
+ * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
+ * specified. */
+
+ if (user_lookup_fd < 0)
+ return 0;
+
+ if (!uid_is_valid(uid) && !gid_is_valid(gid))
+ return 0;
+
+ if (writev(user_lookup_fd,
+ (struct iovec[]) {
+ IOVEC_INIT(&uid, sizeof(uid)),
+ IOVEC_INIT(&gid, sizeof(gid)),
+ IOVEC_INIT_STRING(unit->id) }, 3) < 0)
+ return -errno;
+
+ return 0;
+}
+
+static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
+ int r;
+
+ assert(c);
+ assert(home);
+ assert(buf);
+
+ /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
+
+ if (*home)
+ return 0;
+
+ if (!c->working_directory_home)
+ return 0;
+
+ r = get_home_dir(buf);
+ if (r < 0)
+ return r;
+
+ *home = *buf;
+ return 1;
+}
+
+static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
+ _cleanup_strv_free_ char ** list = NULL;
+ int r;
+
+ assert(c);
+ assert(p);
+ assert(ret);
+
+ assert(c->dynamic_user);
+
+ /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
+ * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
+ * directories. */
+
+ for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
+ if (t == EXEC_DIRECTORY_CONFIGURATION)
+ continue;
+
+ if (!p->prefix[t])
+ continue;
+
+ for (size_t i = 0; i < c->directories[t].n_items; i++) {
+ char *e;
+
+ if (exec_directory_is_private(c, t))
+ e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
+ else
+ e = path_join(p->prefix[t], c->directories[t].items[i].path);
+ if (!e)
+ return -ENOMEM;
+
+ r = strv_consume(&list, e);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ *ret = TAKE_PTR(list);
+
+ return 0;
+}
+
+static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
+ bool using_subcgroup;
+ char *p;
+
+ assert(params);
+ assert(ret);
+
+ if (!params->cgroup_path)
+ return -EINVAL;
+
+ /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
+ * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
+ * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
+ * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
+ * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
+ * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
+ * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
+ * flag, which is only passed for the former statements, not for the latter. */
+
+ using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
+ if (using_subcgroup)
+ p = path_join(params->cgroup_path, ".control");
+ else
+ p = strdup(params->cgroup_path);
+ if (!p)
+ return -ENOMEM;
+
+ *ret = p;
+ return using_subcgroup;
+}
+
+static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
+ _cleanup_(cpu_set_reset) CPUSet s = {};
+ int r;
+
+ assert(c);
+ assert(ret);
+
+ if (!c->numa_policy.nodes.set) {
+ log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
+ return 0;
+ }
+
+ r = numa_to_cpu_set(&c->numa_policy, &s);
+ if (r < 0)
+ return r;
+
+ cpu_set_reset(ret);
+
+ return cpu_set_add_all(ret, &s);
+}
+
+bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
+ assert(c);
+
+ return c->cpu_affinity_from_numa;
+}
+
+static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
+ int r;
+
+ assert(fds);
+ assert(n_fds);
+ assert(*n_fds < fds_size);
+ assert(ret_fd);
+
+ if (fd < 0) {
+ *ret_fd = -1;
+ return 0;
+ }
+
+ if (fd < 3 + (int) *n_fds) {
+ /* Let's move the fd up, so that it's outside of the fd range we will use to store
+ * the fds we pass to the process (or which are closed only during execve). */
+
+ r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
+ if (r < 0)
+ return -errno;
+
+ close_and_replace(fd, r);
+ }
+
+ *ret_fd = fds[*n_fds] = fd;
+ (*n_fds) ++;
+ return 1;
+}
+
+static int exec_child(
+ Unit *unit,
+ const ExecCommand *command,
+ const ExecContext *context,
+ const ExecParameters *params,
+ ExecRuntime *runtime,
+ DynamicCreds *dcreds,
+ int socket_fd,
+ const int named_iofds[static 3],
+ int *fds,
+ size_t n_socket_fds,
+ size_t n_storage_fds,
+ char **files_env,
+ int user_lookup_fd,
+ int *exit_status) {
+
+ _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
+ int r, ngids = 0, exec_fd;
+ _cleanup_free_ gid_t *supplementary_gids = NULL;
+ const char *username = NULL, *groupname = NULL;
+ _cleanup_free_ char *home_buffer = NULL;
+ const char *home = NULL, *shell = NULL;
+ char **final_argv = NULL;
+ dev_t journal_stream_dev = 0;
+ ino_t journal_stream_ino = 0;
+ bool userns_set_up = false;
+ bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
+ needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
+ needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
+ needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
+#if HAVE_SELINUX
+ _cleanup_free_ char *mac_selinux_context_net = NULL;
+ bool use_selinux = false;
+#endif
+#if ENABLE_SMACK
+ bool use_smack = false;
+#endif
+#if HAVE_APPARMOR
+ bool use_apparmor = false;
+#endif
+ uid_t saved_uid = getuid();
+ gid_t saved_gid = getgid();
+ uid_t uid = UID_INVALID;
+ gid_t gid = GID_INVALID;
+ size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
+ n_keep_fds; /* total number of fds not to close */
+ int secure_bits;
+ _cleanup_free_ gid_t *gids_after_pam = NULL;
+ int ngids_after_pam = 0;
+
+ assert(unit);
+ assert(command);
+ assert(context);
+ assert(params);
+ assert(exit_status);
+
+ /* Explicitly test for CVE-2021-4034 inspired invocations */
+ assert(command->path);
+ assert(!strv_isempty(command->argv));
+
+ rename_process_from_path(command->path);
+
+ /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
+ * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
+ * both of which will be demoted to SIG_DFL. */
+ (void) default_signals(SIGNALS_CRASH_HANDLER,
+ SIGNALS_IGNORE);
+
+ if (context->ignore_sigpipe)
+ (void) ignore_signals(SIGPIPE);
+
+ r = reset_signal_mask();
+ if (r < 0) {
+ *exit_status = EXIT_SIGNAL_MASK;
+ return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
+ }
+
+ if (params->idle_pipe)
+ do_idle_pipe_dance(params->idle_pipe);
+
+ /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
+ * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
+ * any fds open we don't really want open during the transition. In order to make logging work, we switch the
+ * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
+
+ log_forget_fds();
+ log_set_open_when_needed(true);
+ log_settle_target();
+ if (context->log_level_max >= 0)
+ log_set_max_level(context->log_level_max);
+
+ /* In case anything used libc syslog(), close this here, too */
+ closelog();
+
+ int keep_fds[n_fds + 3];
+ memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
+ n_keep_fds = n_fds;
+
+ r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
+ if (r < 0) {
+ *exit_status = EXIT_FDS;
+ return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
+ }
+
+#if HAVE_LIBBPF
+ if (unit->manager->restrict_fs) {
+ int bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
+ if (bpf_map_fd < 0) {
+ *exit_status = EXIT_FDS;
+ return log_unit_error_errno(unit, bpf_map_fd, "Failed to get restrict filesystems BPF map fd: %m");
+ }
+
+ r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
+ if (r < 0) {
+ *exit_status = EXIT_FDS;
+ return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
+ }
+ }
+#endif
+
+ r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
+ if (r < 0) {
+ *exit_status = EXIT_FDS;
+ return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
+ }
+
+ if (!context->same_pgrp &&
+ setsid() < 0) {
+ *exit_status = EXIT_SETSID;
+ return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
+ }
+
+ exec_context_tty_reset(context, params);
+
+ if (unit_shall_confirm_spawn(unit)) {
+ _cleanup_free_ char *cmdline = NULL;
+
+ cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
+ if (!cmdline) {
+ *exit_status = EXIT_MEMORY;
+ return log_oom();
+ }
+
+ r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
+ if (r != CONFIRM_EXECUTE) {
+ if (r == CONFIRM_PRETEND_SUCCESS) {
+ *exit_status = EXIT_SUCCESS;
+ return 0;
+ }
+
+ *exit_status = EXIT_CONFIRM;
+ return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
+ "Execution cancelled by the user");
+ }
+ }
+
+ /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
+ * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
+ * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
+ * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
+ * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
+ if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
+ setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
+ *exit_status = EXIT_MEMORY;
+ return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
+ }
+
+ if (context->dynamic_user && dcreds) {
+ _cleanup_strv_free_ char **suggested_paths = NULL;
+
+ /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
+ * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
+ if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
+ *exit_status = EXIT_USER;
+ return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
+ }
+
+ r = compile_suggested_paths(context, params, &suggested_paths);
+ if (r < 0) {
+ *exit_status = EXIT_MEMORY;
+ return log_oom();
+ }
+
+ r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
+ if (r < 0) {
+ *exit_status = EXIT_USER;
+ if (r == -EILSEQ)
+ return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "Failed to update dynamic user credentials: User or group with specified name already exists.");
+ return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
+ }
+
+ if (!uid_is_valid(uid)) {
+ *exit_status = EXIT_USER;
+ return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
+ }
+
+ if (!gid_is_valid(gid)) {
+ *exit_status = EXIT_USER;
+ return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
+ }
+
+ if (dcreds->user)
+ username = dcreds->user->name;
+
+ } else {
+ r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
+ if (r < 0) {
+ *exit_status = EXIT_USER;
+ return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
+ }
+
+ r = get_fixed_group(context, &groupname, &gid);
+ if (r < 0) {
+ *exit_status = EXIT_GROUP;
+ return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
+ }
+ }
+
+ /* Initialize user supplementary groups and get SupplementaryGroups= ones */
+ r = get_supplementary_groups(context, username, groupname, gid,
+ &supplementary_gids, &ngids);
+ if (r < 0) {
+ *exit_status = EXIT_GROUP;
+ return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
+ }
+
+ r = send_user_lookup(unit, user_lookup_fd, uid, gid);
+ if (r < 0) {
+ *exit_status = EXIT_USER;
+ return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
+ }
+
+ user_lookup_fd = safe_close(user_lookup_fd);
+
+ r = acquire_home(context, uid, &home, &home_buffer);
+ if (r < 0) {
+ *exit_status = EXIT_CHDIR;
+ return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
+ }
+
+ /* If a socket is connected to STDIN/STDOUT/STDERR, we
+ * must sure to drop O_NONBLOCK */
+ if (socket_fd >= 0)
+ (void) fd_nonblock(socket_fd, false);
+
+ /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
+ * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
+ if (params->cgroup_path) {
+ _cleanup_free_ char *p = NULL;
+
+ r = exec_parameters_get_cgroup_path(params, &p);
+ if (r < 0) {
+ *exit_status = EXIT_CGROUP;
+ return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
+ }
+
+ r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
+ if (r == -EUCLEAN) {
+ *exit_status = EXIT_CGROUP;
+ return log_unit_error_errno(unit, r, "Failed to attach process to cgroup %s "
+ "because the cgroup or one of its parents or "
+ "siblings is in the threaded mode: %m", p);
+ }
+ if (r < 0) {
+ *exit_status = EXIT_CGROUP;
+ return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
+ }
+ }
+
+ if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
+ r = open_shareable_ns_path(runtime->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
+ if (r < 0) {
+ *exit_status = EXIT_NETWORK;
+ return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
+ }
+ }
+
+ if (context->ipc_namespace_path && runtime && runtime->ipcns_storage_socket[0] >= 0) {
+ r = open_shareable_ns_path(runtime->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
+ if (r < 0) {
+ *exit_status = EXIT_NAMESPACE;
+ return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
+ }
+ }
+
+ r = setup_input(context, params, socket_fd, named_iofds);
+ if (r < 0) {
+ *exit_status = EXIT_STDIN;
+ return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
+ }
+
+ r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
+ if (r < 0) {
+ *exit_status = EXIT_STDOUT;
+ return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
+ }
+
+ r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
+ if (r < 0) {
+ *exit_status = EXIT_STDERR;
+ return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
+ }
+
+ if (context->oom_score_adjust_set) {
+ /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
+ * prohibit write access to this file, and we shouldn't trip up over that. */
+ r = set_oom_score_adjust(context->oom_score_adjust);
+ if (ERRNO_IS_PRIVILEGE(r))
+ log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
+ else if (r < 0) {
+ *exit_status = EXIT_OOM_ADJUST;
+ return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
+ }
+ }
+
+ if (context->coredump_filter_set) {
+ r = set_coredump_filter(context->coredump_filter);
+ if (ERRNO_IS_PRIVILEGE(r))
+ log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
+ else if (r < 0) {
+ *exit_status = EXIT_LIMITS;
+ return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
+ }
+ }
+
+ if (context->nice_set) {
+ r = setpriority_closest(context->nice);
+ if (r < 0) {
+ *exit_status = EXIT_NICE;
+ return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
+ }
+ }
+
+ if (context->cpu_sched_set) {
+ struct sched_param param = {
+ .sched_priority = context->cpu_sched_priority,
+ };
+
+ r = sched_setscheduler(0,
+ context->cpu_sched_policy |
+ (context->cpu_sched_reset_on_fork ?
+ SCHED_RESET_ON_FORK : 0),
+ &param);
+ if (r < 0) {
+ *exit_status = EXIT_SETSCHEDULER;
+ return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
+ }
+ }
+
+ if (context->cpu_affinity_from_numa || context->cpu_set.set) {
+ _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
+ const CPUSet *cpu_set;
+
+ if (context->cpu_affinity_from_numa) {
+ r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
+ if (r < 0) {
+ *exit_status = EXIT_CPUAFFINITY;
+ return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
+ }
+
+ cpu_set = &converted_cpu_set;
+ } else
+ cpu_set = &context->cpu_set;
+
+ if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
+ *exit_status = EXIT_CPUAFFINITY;
+ return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
+ }
+ }
+
+ if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
+ r = apply_numa_policy(&context->numa_policy);
+ if (r == -EOPNOTSUPP)
+ log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
+ else if (r < 0) {
+ *exit_status = EXIT_NUMA_POLICY;
+ return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
+ }
+ }
+
+ if (context->ioprio_set)
+ if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
+ *exit_status = EXIT_IOPRIO;
+ return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
+ }
+
+ if (context->timer_slack_nsec != NSEC_INFINITY)
+ if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
+ *exit_status = EXIT_TIMERSLACK;
+ return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
+ }
+
+ if (context->personality != PERSONALITY_INVALID) {
+ r = safe_personality(context->personality);
+ if (r < 0) {
+ *exit_status = EXIT_PERSONALITY;
+ return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
+ }
+ }
+
+ if (context->utmp_id) {
+ const char *line = context->tty_path ?
+ (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
+ NULL;
+ utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
+ line,
+ context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
+ context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
+ USER_PROCESS,
+ username);
+ }
+
+ if (uid_is_valid(uid)) {
+ r = chown_terminal(STDIN_FILENO, uid);
+ if (r < 0) {
+ *exit_status = EXIT_STDIN;
+ return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
+ }
+ }
+
+ /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
+ * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
+ * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
+ * touch a single hierarchy too. */
+ if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
+ r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
+ if (r < 0) {
+ *exit_status = EXIT_CGROUP;
+ return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
+ }
+ }
+
+ needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
+
+ for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
+ r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
+ if (r < 0)
+ return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
+ }
+
+ if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
+ r = setup_credentials(context, params, unit->id, uid);
+ if (r < 0) {
+ *exit_status = EXIT_CREDENTIALS;
+ return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
+ }
+ }
+
+ r = build_environment(
+ unit,
+ context,
+ params,
+ n_fds,
+ home,
+ username,
+ shell,
+ journal_stream_dev,
+ journal_stream_ino,
+ &our_env);
+ if (r < 0) {
+ *exit_status = EXIT_MEMORY;
+ return log_oom();
+ }
+
+ r = build_pass_environment(context, &pass_env);
+ if (r < 0) {
+ *exit_status = EXIT_MEMORY;
+ return log_oom();
+ }
+
+ /* The $PATH variable is set to the default path in params->environment. However, this is overridden
+ * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
+ * not specify PATH but the unit has ExecSearchPath. */
+ if (!strv_isempty(context->exec_search_path)) {
+ _cleanup_free_ char *joined = NULL;
+
+ joined = strv_join(context->exec_search_path, ":");
+ if (!joined) {
+ *exit_status = EXIT_MEMORY;
+ return log_oom();
+ }
+
+ r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
+ if (r < 0) {
+ *exit_status = EXIT_MEMORY;
+ return log_oom();
+ }
+ }
+
+ accum_env = strv_env_merge(params->environment,
+ our_env,
+ joined_exec_search_path,
+ pass_env,
+ context->environment,
+ files_env);
+ if (!accum_env) {
+ *exit_status = EXIT_MEMORY;
+ return log_oom();
+ }
+ accum_env = strv_env_clean(accum_env);
+
+ (void) umask(context->umask);
+
+ r = setup_keyring(unit, context, params, uid, gid);
+ if (r < 0) {
+ *exit_status = EXIT_KEYRING;
+ return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
+ }
+
+ /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
+ * from it. */
+ needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
+
+ /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
+ * for it, and the kernel doesn't actually support ambient caps. */
+ needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
+
+ /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
+ * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
+ * desired. */
+ if (needs_ambient_hack)
+ needs_setuid = false;
+ else
+ needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
+
+ if (needs_sandboxing) {
+ /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
+ * /sys being present. The actual MAC context application will happen later, as late as
+ * possible, to avoid impacting our own code paths. */
+
+#if HAVE_SELINUX
+ use_selinux = mac_selinux_use();
+#endif
+#if ENABLE_SMACK
+ use_smack = mac_smack_use();
+#endif
+#if HAVE_APPARMOR
+ use_apparmor = mac_apparmor_use();
+#endif
+ }
+
+ if (needs_sandboxing) {
+ int which_failed;
+
+ /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
+ * is set here. (See below.) */
+
+ r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
+ if (r < 0) {
+ *exit_status = EXIT_LIMITS;
+ return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
+ }
+ }
+
+ if (needs_setuid && context->pam_name && username) {
+ /* Let's call into PAM after we set up our own idea of resource limits to that pam_limits
+ * wins here. (See above.) */
+
+ /* All fds passed in the fds array will be closed in the pam child process. */
+ r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
+ if (r < 0) {
+ *exit_status = EXIT_PAM;
+ return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
+ }
+
+ ngids_after_pam = getgroups_alloc(&gids_after_pam);
+ if (ngids_after_pam < 0) {
+ *exit_status = EXIT_MEMORY;
+ return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
+ }
+ }
+
+ if (needs_sandboxing && context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
+ /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
+ * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
+ * set up the all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
+
+ userns_set_up = true;
+ r = setup_private_users(saved_uid, saved_gid, uid, gid);
+ if (r < 0) {
+ *exit_status = EXIT_USER;
+ return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
+ }
+ }
+
+ if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
+
+ if (ns_type_supported(NAMESPACE_NET)) {
+ r = setup_shareable_ns(runtime->netns_storage_socket, CLONE_NEWNET);
+ if (r == -EPERM)
+ log_unit_warning_errno(unit, r,
+ "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
+ else if (r < 0) {
+ *exit_status = EXIT_NETWORK;
+ return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
+ }
+ } else if (context->network_namespace_path) {
+ *exit_status = EXIT_NETWORK;
+ return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "NetworkNamespacePath= is not supported, refusing.");
+ } else
+ log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
+ }
+
+ if ((context->private_ipc || context->ipc_namespace_path) && runtime && runtime->ipcns_storage_socket[0] >= 0) {
+
+ if (ns_type_supported(NAMESPACE_IPC)) {
+ r = setup_shareable_ns(runtime->ipcns_storage_socket, CLONE_NEWIPC);
+ if (r == -EPERM)
+ log_unit_warning_errno(unit, r,
+ "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
+ else if (r < 0) {
+ *exit_status = EXIT_NAMESPACE;
+ return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
+ }
+ } else if (context->ipc_namespace_path) {
+ *exit_status = EXIT_NAMESPACE;
+ return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "IPCNamespacePath= is not supported, refusing.");
+ } else
+ log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
+ }
+
+ if (needs_mount_namespace) {
+ _cleanup_free_ char *error_path = NULL;
+
+ r = apply_mount_namespace(unit, command->flags, context, params, runtime, &error_path);
+ if (r < 0) {
+ *exit_status = EXIT_NAMESPACE;
+ return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
+ error_path ? ": " : "", strempty(error_path));
+ }
+ }
+
+ if (needs_sandboxing) {
+ r = apply_protect_hostname(unit, context, exit_status);
+ if (r < 0)
+ return r;
+ }
+
+ /* Drop groups as early as possible.
+ * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
+ * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
+ if (needs_setuid) {
+ _cleanup_free_ gid_t *gids_to_enforce = NULL;
+ int ngids_to_enforce = 0;
+
+ ngids_to_enforce = merge_gid_lists(supplementary_gids,
+ ngids,
+ gids_after_pam,
+ ngids_after_pam,
+ &gids_to_enforce);
+ if (ngids_to_enforce < 0) {
+ *exit_status = EXIT_MEMORY;
+ return log_unit_error_errno(unit,
+ ngids_to_enforce,
+ "Failed to merge group lists. Group membership might be incorrect: %m");
+ }
+
+ r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
+ if (r < 0) {
+ *exit_status = EXIT_GROUP;
+ return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
+ }
+ }
+
+ /* If the user namespace was not set up above, try to do it now.
+ * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
+ * restricted by rules pertaining to combining user namspaces with other namespaces (e.g. in the
+ * case of mount namespaces being less privileged when the mount point list is copied from a
+ * different user namespace). */
+
+ if (needs_sandboxing && context->private_users && !userns_set_up) {
+ r = setup_private_users(saved_uid, saved_gid, uid, gid);
+ if (r < 0) {
+ *exit_status = EXIT_USER;
+ return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
+ }
+ }
+
+ /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
+ * shall execute. */
+
+ _cleanup_free_ char *executable = NULL;
+ _cleanup_close_ int executable_fd = -1;
+ r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
+ if (r < 0) {
+ if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
+ log_unit_struct_errno(unit, LOG_INFO, r,
+ "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
+ LOG_UNIT_INVOCATION_ID(unit),
+ LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
+ command->path),
+ "EXECUTABLE=%s", command->path);
+ *exit_status = EXIT_SUCCESS;
+ return 0;
+ }
+
+ *exit_status = EXIT_EXEC;
+ return log_unit_struct_errno(unit, LOG_INFO, r,
+ "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
+ LOG_UNIT_INVOCATION_ID(unit),
+ LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
+ command->path),
+ "EXECUTABLE=%s", command->path);
+ }
+
+ r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
+ if (r < 0) {
+ *exit_status = EXIT_FDS;
+ return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
+ }
+
+#if HAVE_SELINUX
+ if (needs_sandboxing && use_selinux && params->selinux_context_net) {
+ int fd = -1;
+
+ if (socket_fd >= 0)
+ fd = socket_fd;
+ else if (params->n_socket_fds == 1)
+ /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
+ * use context from that fd to compute the label. */
+ fd = params->fds[0];
+
+ if (fd >= 0) {
+ r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
+ if (r < 0) {
+ if (!context->selinux_context_ignore) {
+ *exit_status = EXIT_SELINUX_CONTEXT;
+ return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
+ }
+ log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
+ }
+ }
+ }
+#endif
+
+ /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
+ * more aggressive this time since socket_fd and the netns and ipcns fds we don't need anymore. We do keep the exec_fd
+ * however if we have it as we want to keep it open until the final execve(). */
+
+ r = close_all_fds(keep_fds, n_keep_fds);
+ if (r >= 0)
+ r = shift_fds(fds, n_fds);
+ if (r >= 0)
+ r = flag_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
+ if (r < 0) {
+ *exit_status = EXIT_FDS;
+ return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
+ }
+
+ /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
+ * and at the right fd numbers. The are no other fds open, with one exception: the exec_fd if it is defined,
+ * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
+ * came this far. */
+
+ secure_bits = context->secure_bits;
+
+ if (needs_sandboxing) {
+ uint64_t bset;
+
+ /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
+ * requested. (Note this is placed after the general resource limit initialization, see
+ * above, in order to take precedence.) */
+ if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
+ if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
+ *exit_status = EXIT_LIMITS;
+ return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
+ }
+ }
+
+#if ENABLE_SMACK
+ /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
+ * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
+ if (use_smack) {
+ r = setup_smack(unit->manager, context, executable_fd);
+ if (r < 0 && !context->smack_process_label_ignore) {
+ *exit_status = EXIT_SMACK_PROCESS_LABEL;
+ return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
+ }
+ }
+#endif
+
+ bset = context->capability_bounding_set;
+ /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
+ * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
+ * instead of us doing that */
+ if (needs_ambient_hack)
+ bset |= (UINT64_C(1) << CAP_SETPCAP) |
+ (UINT64_C(1) << CAP_SETUID) |
+ (UINT64_C(1) << CAP_SETGID);
+
+ if (!cap_test_all(bset)) {
+ r = capability_bounding_set_drop(bset, false);
+ if (r < 0) {
+ *exit_status = EXIT_CAPABILITIES;
+ return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
+ }
+ }
+
+ /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
+ * keep-caps set.
+ * To be able to raise the ambient capabilities after setresuid() they have to be
+ * added to the inherited set and keep caps has to be set (done in enforce_user()).
+ * After setresuid() the ambient capabilities can be raised as they are present in
+ * the permitted and inhertiable set. However it is possible that someone wants to
+ * set ambient capabilities without changing the user, so we also set the ambient
+ * capabilities here.
+ * The requested ambient capabilities are raised in the inheritable set if the
+ * second argument is true. */
+ if (!needs_ambient_hack) {
+ r = capability_ambient_set_apply(context->capability_ambient_set, true);
+ if (r < 0) {
+ *exit_status = EXIT_CAPABILITIES;
+ return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
+ }
+ }
+ }
+
+ /* chroot to root directory first, before we lose the ability to chroot */
+ r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
+ if (r < 0)
+ return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
+
+ if (needs_setuid) {
+ if (uid_is_valid(uid)) {
+ r = enforce_user(context, uid);
+ if (r < 0) {
+ *exit_status = EXIT_USER;
+ return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
+ }
+
+ if (!needs_ambient_hack &&
+ context->capability_ambient_set != 0) {
+
+ /* Raise the ambient capabilities after user change. */
+ r = capability_ambient_set_apply(context->capability_ambient_set, false);
+ if (r < 0) {
+ *exit_status = EXIT_CAPABILITIES;
+ return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
+ }
+ }
+ }
+ }
+
+ /* Apply working directory here, because the working directory might be on NFS and only the user running
+ * this service might have the correct privilege to change to the working directory */
+ r = apply_working_directory(context, params, home, exit_status);
+ if (r < 0)
+ return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
+
+ if (needs_sandboxing) {
+ /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
+ * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
+ * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
+ * are restricted. */
+
+#if HAVE_SELINUX
+ if (use_selinux) {
+ char *exec_context = mac_selinux_context_net ?: context->selinux_context;
+
+ if (exec_context) {
+ r = setexeccon(exec_context);
+ if (r < 0) {
+ if (!context->selinux_context_ignore) {
+ *exit_status = EXIT_SELINUX_CONTEXT;
+ return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
+ }
+ log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
+ }
+ }
+ }
+#endif
+
+#if HAVE_APPARMOR
+ if (use_apparmor && context->apparmor_profile) {
+ r = aa_change_onexec(context->apparmor_profile);
+ if (r < 0 && !context->apparmor_profile_ignore) {
+ *exit_status = EXIT_APPARMOR_PROFILE;
+ return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
+ }
+ }
+#endif
+
+ /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
+ * we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits requires
+ * CAP_SETPCAP. */
+ if (prctl(PR_GET_SECUREBITS) != secure_bits) {
+ /* CAP_SETPCAP is required to set securebits. This capability is raised into the
+ * effective set here.
+ * The effective set is overwritten during execve with the following values:
+ * - ambient set (for non-root processes)
+ * - (inheritable | bounding) set for root processes)
+ *
+ * Hence there is no security impact to raise it in the effective set before execve
+ */
+ r = capability_gain_cap_setpcap(NULL);
+ if (r < 0) {
+ *exit_status = EXIT_CAPABILITIES;
+ return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
+ }
+ if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
+ *exit_status = EXIT_SECUREBITS;
+ return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
+ }
+ }
+
+ if (context_has_no_new_privileges(context))
+ if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
+ *exit_status = EXIT_NO_NEW_PRIVILEGES;
+ return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
+ }
+
+#if HAVE_SECCOMP
+ r = apply_address_families(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_ADDRESS_FAMILIES;
+ return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
+ }
+
+ r = apply_memory_deny_write_execute(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
+ }
+
+ r = apply_restrict_realtime(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
+ }
+
+ r = apply_restrict_suid_sgid(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
+ }
+
+ r = apply_restrict_namespaces(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
+ }
+
+ r = apply_protect_sysctl(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
+ }
+
+ r = apply_protect_kernel_modules(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
+ }
+
+ r = apply_protect_kernel_logs(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
+ }
+
+ r = apply_protect_clock(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
+ }
+
+ r = apply_private_devices(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
+ }
+
+ r = apply_syscall_archs(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
+ }
+
+ r = apply_lock_personality(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
+ }
+
+ r = apply_syscall_log(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
+ }
+
+ /* This really should remain the last step before the execve(), to make sure our own code is unaffected
+ * by the filter as little as possible. */
+ r = apply_syscall_filter(unit, context, needs_ambient_hack);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
+ }
+#endif
+
+#if HAVE_LIBBPF
+ r = apply_restrict_filesystems(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_BPF;
+ return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
+ }
+#endif
+
+ }
+
+ if (!strv_isempty(context->unset_environment)) {
+ char **ee = NULL;
+
+ ee = strv_env_delete(accum_env, 1, context->unset_environment);
+ if (!ee) {
+ *exit_status = EXIT_MEMORY;
+ return log_oom();
+ }
+
+ strv_free_and_replace(accum_env, ee);
+ }
+
+ if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
+ replaced_argv = replace_env_argv(command->argv, accum_env);
+ if (!replaced_argv) {
+ *exit_status = EXIT_MEMORY;
+ return log_oom();
+ }
+ final_argv = replaced_argv;
+ } else
+ final_argv = command->argv;
+
+ if (DEBUG_LOGGING) {
+ _cleanup_free_ char *line = NULL;
+
+ line = quote_command_line(final_argv, SHELL_ESCAPE_EMPTY);
+ if (!line) {
+ *exit_status = EXIT_MEMORY;
+ return log_oom();
+ }
+
+ log_unit_struct(unit, LOG_DEBUG,
+ "EXECUTABLE=%s", executable,
+ LOG_UNIT_MESSAGE(unit, "Executing: %s", line));
+ }
+
+ if (exec_fd >= 0) {
+ uint8_t hot = 1;
+
+ /* We have finished with all our initializations. Let's now let the manager know that. From this point
+ * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
+
+ if (write(exec_fd, &hot, sizeof(hot)) < 0) {
+ *exit_status = EXIT_EXEC;
+ return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
+ }
+ }
+
+ r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
+
+ if (exec_fd >= 0) {
+ uint8_t hot = 0;
+
+ /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
+ * that POLLHUP on it no longer means execve() succeeded. */
+
+ if (write(exec_fd, &hot, sizeof(hot)) < 0) {
+ *exit_status = EXIT_EXEC;
+ return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
+ }
+ }
+
+ *exit_status = EXIT_EXEC;
+ return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
+}
+
+static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
+static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
+
+int exec_spawn(Unit *unit,
+ ExecCommand *command,
+ const ExecContext *context,
+ const ExecParameters *params,
+ ExecRuntime *runtime,
+ DynamicCreds *dcreds,
+ pid_t *ret) {
+
+ int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
+ _cleanup_free_ char *subcgroup_path = NULL;
+ _cleanup_strv_free_ char **files_env = NULL;
+ size_t n_storage_fds = 0, n_socket_fds = 0;
+ _cleanup_free_ char *line = NULL;
+ pid_t pid;
+
+ assert(unit);
+ assert(command);
+ assert(context);
+ assert(ret);
+ assert(params);
+ assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
+
+ if (context->std_input == EXEC_INPUT_SOCKET ||
+ context->std_output == EXEC_OUTPUT_SOCKET ||
+ context->std_error == EXEC_OUTPUT_SOCKET) {
+
+ if (params->n_socket_fds > 1)
+ return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
+
+ if (params->n_socket_fds == 0)
+ return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
+
+ socket_fd = params->fds[0];
+ } else {
+ socket_fd = -1;
+ fds = params->fds;
+ n_socket_fds = params->n_socket_fds;
+ n_storage_fds = params->n_storage_fds;
+ }
+
+ r = exec_context_named_iofds(context, params, named_iofds);
+ if (r < 0)
+ return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
+
+ r = exec_context_load_environment(unit, context, &files_env);
+ if (r < 0)
+ return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
+
+ line = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
+ if (!line)
+ return log_oom();
+
+ /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
+ and, until the next SELinux policy changes, we save further reloads in future children. */
+ mac_selinux_maybe_reload();
+
+ log_unit_struct(unit, LOG_DEBUG,
+ LOG_UNIT_MESSAGE(unit, "About to execute %s", line),
+ "EXECUTABLE=%s", command->path, /* We won't know the real executable path until we create
+ the mount namespace in the child, but we want to log
+ from the parent, so we need to use the (possibly
+ inaccurate) path here. */
+ LOG_UNIT_INVOCATION_ID(unit));
+
+ if (params->cgroup_path) {
+ r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
+ if (r < 0)
+ return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
+ if (r > 0) { /* We are using a child cgroup */
+ r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
+ if (r < 0)
+ return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
+
+ /* Normally we would not propagate the oomd xattrs to children but since we created this
+ * sub-cgroup internally we should do it. */
+ cgroup_oomd_xattr_apply(unit, subcgroup_path);
+ }
+ }
+
+ pid = fork();
+ if (pid < 0)
+ return log_unit_error_errno(unit, errno, "Failed to fork: %m");
+
+ if (pid == 0) {
+ int exit_status;
+
+ r = exec_child(unit,
+ command,
+ context,
+ params,
+ runtime,
+ dcreds,
+ socket_fd,
+ named_iofds,
+ fds,
+ n_socket_fds,
+ n_storage_fds,
+ files_env,
+ unit->manager->user_lookup_fds[1],
+ &exit_status);
+
+ if (r < 0) {
+ const char *status = ASSERT_PTR(
+ exit_status_to_string(exit_status, EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD));
+
+ log_unit_struct_errno(unit, LOG_ERR, r,
+ "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
+ LOG_UNIT_INVOCATION_ID(unit),
+ LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
+ status, command->path),
+ "EXECUTABLE=%s", command->path);
+ } else
+ assert(exit_status == EXIT_SUCCESS);
+
+ _exit(exit_status);
+ }
+
+ log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
+
+ /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
+ * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
+ * process will be killed too). */
+ if (subcgroup_path)
+ (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
+
+ exec_status_start(&command->exec_status, pid);
+
+ *ret = pid;
+ return 0;
+}
+
+void exec_context_init(ExecContext *c) {
+ assert(c);
+
+ c->umask = 0022;
+ c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
+ c->cpu_sched_policy = SCHED_OTHER;
+ c->syslog_priority = LOG_DAEMON|LOG_INFO;
+ c->syslog_level_prefix = true;
+ c->ignore_sigpipe = true;
+ c->timer_slack_nsec = NSEC_INFINITY;
+ c->personality = PERSONALITY_INVALID;
+ for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
+ c->directories[t].mode = 0755;
+ c->timeout_clean_usec = USEC_INFINITY;
+ c->capability_bounding_set = CAP_ALL;
+ assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
+ c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
+ c->log_level_max = -1;
+#if HAVE_SECCOMP
+ c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
+#endif
+ c->tty_rows = UINT_MAX;
+ c->tty_cols = UINT_MAX;
+ numa_policy_reset(&c->numa_policy);
+}
+
+void exec_context_done(ExecContext *c) {
+ assert(c);
+
+ c->environment = strv_free(c->environment);
+ c->environment_files = strv_free(c->environment_files);
+ c->pass_environment = strv_free(c->pass_environment);
+ c->unset_environment = strv_free(c->unset_environment);
+
+ rlimit_free_all(c->rlimit);
+
+ for (size_t l = 0; l < 3; l++) {
+ c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
+ c->stdio_file[l] = mfree(c->stdio_file[l]);
+ }
+
+ c->working_directory = mfree(c->working_directory);
+ c->root_directory = mfree(c->root_directory);
+ c->root_image = mfree(c->root_image);
+ c->root_image_options = mount_options_free_all(c->root_image_options);
+ c->root_hash = mfree(c->root_hash);
+ c->root_hash_size = 0;
+ c->root_hash_path = mfree(c->root_hash_path);
+ c->root_hash_sig = mfree(c->root_hash_sig);
+ c->root_hash_sig_size = 0;
+ c->root_hash_sig_path = mfree(c->root_hash_sig_path);
+ c->root_verity = mfree(c->root_verity);
+ c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
+ c->extension_directories = strv_free(c->extension_directories);
+ c->tty_path = mfree(c->tty_path);
+ c->syslog_identifier = mfree(c->syslog_identifier);
+ c->user = mfree(c->user);
+ c->group = mfree(c->group);
+
+ c->supplementary_groups = strv_free(c->supplementary_groups);
+
+ c->pam_name = mfree(c->pam_name);
+
+ c->read_only_paths = strv_free(c->read_only_paths);
+ c->read_write_paths = strv_free(c->read_write_paths);
+ c->inaccessible_paths = strv_free(c->inaccessible_paths);
+ c->exec_paths = strv_free(c->exec_paths);
+ c->no_exec_paths = strv_free(c->no_exec_paths);
+ c->exec_search_path = strv_free(c->exec_search_path);
+
+ bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
+ c->bind_mounts = NULL;
+ c->n_bind_mounts = 0;
+ temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
+ c->temporary_filesystems = NULL;
+ c->n_temporary_filesystems = 0;
+ c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
+
+ cpu_set_reset(&c->cpu_set);
+ numa_policy_reset(&c->numa_policy);
+
+ c->utmp_id = mfree(c->utmp_id);
+ c->selinux_context = mfree(c->selinux_context);
+ c->apparmor_profile = mfree(c->apparmor_profile);
+ c->smack_process_label = mfree(c->smack_process_label);
+
+ c->restrict_filesystems = set_free_free(c->restrict_filesystems);
+
+ c->syscall_filter = hashmap_free(c->syscall_filter);
+ c->syscall_archs = set_free(c->syscall_archs);
+ c->address_families = set_free(c->address_families);
+
+ for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
+ exec_directory_done(&c->directories[t]);
+
+ c->log_level_max = -1;
+
+ exec_context_free_log_extra_fields(c);
+
+ c->log_ratelimit_interval_usec = 0;
+ c->log_ratelimit_burst = 0;
+
+ c->stdin_data = mfree(c->stdin_data);
+ c->stdin_data_size = 0;
+
+ c->network_namespace_path = mfree(c->network_namespace_path);
+ c->ipc_namespace_path = mfree(c->ipc_namespace_path);
+
+ c->log_namespace = mfree(c->log_namespace);
+
+ c->load_credentials = hashmap_free(c->load_credentials);
+ c->set_credentials = hashmap_free(c->set_credentials);
+}
+
+int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
+ assert(c);
+
+ if (!runtime_prefix)
+ return 0;
+
+ for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
+ _cleanup_free_ char *p = NULL;
+
+ if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
+ p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
+ else
+ p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
+ if (!p)
+ return -ENOMEM;
+
+ /* We execute this synchronously, since we need to be sure this is gone when we start the
+ * service next. */
+ (void) rm_rf(p, REMOVE_ROOT);
+
+ STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
+ _cleanup_free_ char *symlink_abs = NULL;
+
+ if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
+ symlink_abs = path_join(runtime_prefix, "private", *symlink);
+ else
+ symlink_abs = path_join(runtime_prefix, *symlink);
+ if (!symlink_abs)
+ return -ENOMEM;
+
+ (void) unlink(symlink_abs);
+ }
+
+ }
+
+ return 0;
+}
+
+int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
+ _cleanup_free_ char *p = NULL;
+
+ assert(c);
+
+ if (!runtime_prefix || !unit)
+ return 0;
+
+ p = path_join(runtime_prefix, "credentials", unit);
+ if (!p)
+ return -ENOMEM;
+
+ /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
+ * unmount it, and afterwards remove the mount point */
+ (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
+ (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
+
+ return 0;
+}
+
+int exec_context_destroy_mount_ns_dir(Unit *u) {
+ _cleanup_free_ char *p = NULL;
+
+ if (!u || !MANAGER_IS_SYSTEM(u->manager))
+ return 0;
+
+ p = path_join("/run/systemd/propagate/", u->id);
+ if (!p)
+ return -ENOMEM;
+
+ /* This is only filled transiently (see mount_in_namespace()), should be empty or even non-existent*/
+ if (rmdir(p) < 0 && errno != ENOENT)
+ log_unit_debug_errno(u, errno, "Unable to remove propagation dir '%s', ignoring: %m", p);
+
+ return 0;
+}
+
+static void exec_command_done(ExecCommand *c) {
+ assert(c);
+
+ c->path = mfree(c->path);
+ c->argv = strv_free(c->argv);
+}
+
+void exec_command_done_array(ExecCommand *c, size_t n) {
+ for (size_t i = 0; i < n; i++)
+ exec_command_done(c+i);
+}
+
+ExecCommand* exec_command_free_list(ExecCommand *c) {
+ ExecCommand *i;
+
+ while ((i = c)) {
+ LIST_REMOVE(command, c, i);
+ exec_command_done(i);
+ free(i);
+ }
+
+ return NULL;
+}
+
+void exec_command_free_array(ExecCommand **c, size_t n) {
+ for (size_t i = 0; i < n; i++)
+ c[i] = exec_command_free_list(c[i]);
+}
+
+void exec_command_reset_status_array(ExecCommand *c, size_t n) {
+ for (size_t i = 0; i < n; i++)
+ exec_status_reset(&c[i].exec_status);
+}
+
+void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
+ for (size_t i = 0; i < n; i++)
+ LIST_FOREACH(command, z, c[i])
+ exec_status_reset(&z->exec_status);
+}
+
+typedef struct InvalidEnvInfo {
+ const Unit *unit;
+ const char *path;
+} InvalidEnvInfo;
+
+static void invalid_env(const char *p, void *userdata) {
+ InvalidEnvInfo *info = userdata;
+
+ log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
+}
+
+const char* exec_context_fdname(const ExecContext *c, int fd_index) {
+ assert(c);
+
+ switch (fd_index) {
+
+ case STDIN_FILENO:
+ if (c->std_input != EXEC_INPUT_NAMED_FD)
+ return NULL;
+
+ return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
+
+ case STDOUT_FILENO:
+ if (c->std_output != EXEC_OUTPUT_NAMED_FD)
+ return NULL;
+
+ return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
+
+ case STDERR_FILENO:
+ if (c->std_error != EXEC_OUTPUT_NAMED_FD)
+ return NULL;
+
+ return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
+
+ default:
+ return NULL;
+ }
+}
+
+static int exec_context_named_iofds(
+ const ExecContext *c,
+ const ExecParameters *p,
+ int named_iofds[static 3]) {
+
+ size_t targets;
+ const char* stdio_fdname[3];
+ size_t n_fds;
+
+ assert(c);
+ assert(p);
+ assert(named_iofds);
+
+ targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
+ (c->std_output == EXEC_OUTPUT_NAMED_FD) +
+ (c->std_error == EXEC_OUTPUT_NAMED_FD);
+
+ for (size_t i = 0; i < 3; i++)
+ stdio_fdname[i] = exec_context_fdname(c, i);
+
+ n_fds = p->n_storage_fds + p->n_socket_fds;
+
+ for (size_t i = 0; i < n_fds && targets > 0; i++)
+ if (named_iofds[STDIN_FILENO] < 0 &&
+ c->std_input == EXEC_INPUT_NAMED_FD &&
+ stdio_fdname[STDIN_FILENO] &&
+ streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
+
+ named_iofds[STDIN_FILENO] = p->fds[i];
+ targets--;
+
+ } else if (named_iofds[STDOUT_FILENO] < 0 &&
+ c->std_output == EXEC_OUTPUT_NAMED_FD &&
+ stdio_fdname[STDOUT_FILENO] &&
+ streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
+
+ named_iofds[STDOUT_FILENO] = p->fds[i];
+ targets--;
+
+ } else if (named_iofds[STDERR_FILENO] < 0 &&
+ c->std_error == EXEC_OUTPUT_NAMED_FD &&
+ stdio_fdname[STDERR_FILENO] &&
+ streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
+
+ named_iofds[STDERR_FILENO] = p->fds[i];
+ targets--;
+ }
+
+ return targets == 0 ? 0 : -ENOENT;
+}
+
+static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
+ _cleanup_strv_free_ char **v = NULL;
+ int r;
+
+ assert(c);
+ assert(ret);
+
+ STRV_FOREACH(i, c->environment_files) {
+ _cleanup_globfree_ glob_t pglob = {};
+ bool ignore = false;
+ char *fn = *i;
+
+ if (fn[0] == '-') {
+ ignore = true;
+ fn++;
+ }
+
+ if (!path_is_absolute(fn)) {
+ if (ignore)
+ continue;
+ return -EINVAL;
+ }
+
+ /* Filename supports globbing, take all matching files */
+ r = safe_glob(fn, 0, &pglob);
+ if (r < 0) {
+ if (ignore)
+ continue;
+ return r;
+ }
+
+ /* When we don't match anything, -ENOENT should be returned */
+ assert(pglob.gl_pathc > 0);
+
+ for (unsigned n = 0; n < pglob.gl_pathc; n++) {
+ _cleanup_strv_free_ char **p = NULL;
+
+ r = load_env_file(NULL, pglob.gl_pathv[n], &p);
+ if (r < 0) {
+ if (ignore)
+ continue;
+ return r;
+ }
+
+ /* Log invalid environment variables with filename */
+ if (p) {
+ InvalidEnvInfo info = {
+ .unit = unit,
+ .path = pglob.gl_pathv[n]
+ };
+
+ p = strv_env_clean_with_callback(p, invalid_env, &info);
+ }
+
+ if (!v)
+ v = TAKE_PTR(p);
+ else {
+ char **m = strv_env_merge(v, p);
+ if (!m)
+ return -ENOMEM;
+
+ strv_free_and_replace(v, m);
+ }
+ }
+ }
+
+ *ret = TAKE_PTR(v);
+
+ return 0;
+}
+
+static bool tty_may_match_dev_console(const char *tty) {
+ _cleanup_free_ char *resolved = NULL;
+
+ if (!tty)
+ return true;
+
+ tty = skip_dev_prefix(tty);
+
+ /* trivial identity? */
+ if (streq(tty, "console"))
+ return true;
+
+ if (resolve_dev_console(&resolved) < 0)
+ return true; /* if we could not resolve, assume it may */
+
+ /* "tty0" means the active VC, so it may be the same sometimes */
+ return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
+}
+
+static bool exec_context_may_touch_tty(const ExecContext *ec) {
+ assert(ec);
+
+ return ec->tty_reset ||
+ ec->tty_vhangup ||
+ ec->tty_vt_disallocate ||
+ is_terminal_input(ec->std_input) ||
+ is_terminal_output(ec->std_output) ||
+ is_terminal_output(ec->std_error);
+}
+
+bool exec_context_may_touch_console(const ExecContext *ec) {
+
+ return exec_context_may_touch_tty(ec) &&
+ tty_may_match_dev_console(exec_context_tty_path(ec));
+}
+
+static void strv_fprintf(FILE *f, char **l) {
+ assert(f);
+
+ STRV_FOREACH(g, l)
+ fprintf(f, " %s", *g);
+}
+
+static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
+ assert(f);
+ assert(prefix);
+ assert(name);
+
+ if (!strv_isempty(strv)) {
+ fprintf(f, "%s%s:", prefix, name);
+ strv_fprintf(f, strv);
+ fputs("\n", f);
+ }
+}
+
+void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
+ int r;
+
+ assert(c);
+ assert(f);
+
+ prefix = strempty(prefix);
+
+ fprintf(f,
+ "%sUMask: %04o\n"
+ "%sWorkingDirectory: %s\n"
+ "%sRootDirectory: %s\n"
+ "%sNonBlocking: %s\n"
+ "%sPrivateTmp: %s\n"
+ "%sPrivateDevices: %s\n"
+ "%sProtectKernelTunables: %s\n"
+ "%sProtectKernelModules: %s\n"
+ "%sProtectKernelLogs: %s\n"
+ "%sProtectClock: %s\n"
+ "%sProtectControlGroups: %s\n"
+ "%sPrivateNetwork: %s\n"
+ "%sPrivateUsers: %s\n"
+ "%sProtectHome: %s\n"
+ "%sProtectSystem: %s\n"
+ "%sMountAPIVFS: %s\n"
+ "%sIgnoreSIGPIPE: %s\n"
+ "%sMemoryDenyWriteExecute: %s\n"
+ "%sRestrictRealtime: %s\n"
+ "%sRestrictSUIDSGID: %s\n"
+ "%sKeyringMode: %s\n"
+ "%sProtectHostname: %s\n"
+ "%sProtectProc: %s\n"
+ "%sProcSubset: %s\n",
+ prefix, c->umask,
+ prefix, empty_to_root(c->working_directory),
+ prefix, empty_to_root(c->root_directory),
+ prefix, yes_no(c->non_blocking),
+ prefix, yes_no(c->private_tmp),
+ prefix, yes_no(c->private_devices),
+ prefix, yes_no(c->protect_kernel_tunables),
+ prefix, yes_no(c->protect_kernel_modules),
+ prefix, yes_no(c->protect_kernel_logs),
+ prefix, yes_no(c->protect_clock),
+ prefix, yes_no(c->protect_control_groups),
+ prefix, yes_no(c->private_network),
+ prefix, yes_no(c->private_users),
+ prefix, protect_home_to_string(c->protect_home),
+ prefix, protect_system_to_string(c->protect_system),
+ prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
+ prefix, yes_no(c->ignore_sigpipe),
+ prefix, yes_no(c->memory_deny_write_execute),
+ prefix, yes_no(c->restrict_realtime),
+ prefix, yes_no(c->restrict_suid_sgid),
+ prefix, exec_keyring_mode_to_string(c->keyring_mode),
+ prefix, yes_no(c->protect_hostname),
+ prefix, protect_proc_to_string(c->protect_proc),
+ prefix, proc_subset_to_string(c->proc_subset));
+
+ if (c->root_image)
+ fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
+
+ if (c->root_image_options) {
+ fprintf(f, "%sRootImageOptions:", prefix);
+ LIST_FOREACH(mount_options, o, c->root_image_options)
+ if (!isempty(o->options))
+ fprintf(f, " %s:%s",
+ partition_designator_to_string(o->partition_designator),
+ o->options);
+ fprintf(f, "\n");
+ }
+
+ if (c->root_hash) {
+ _cleanup_free_ char *encoded = NULL;
+ encoded = hexmem(c->root_hash, c->root_hash_size);
+ if (encoded)
+ fprintf(f, "%sRootHash: %s\n", prefix, encoded);
+ }
+
+ if (c->root_hash_path)
+ fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
+
+ if (c->root_hash_sig) {
+ _cleanup_free_ char *encoded = NULL;
+ ssize_t len;
+ len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
+ if (len)
+ fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
+ }
+
+ if (c->root_hash_sig_path)
+ fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
+
+ if (c->root_verity)
+ fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
+
+ STRV_FOREACH(e, c->environment)
+ fprintf(f, "%sEnvironment: %s\n", prefix, *e);
+
+ STRV_FOREACH(e, c->environment_files)
+ fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
+
+ STRV_FOREACH(e, c->pass_environment)
+ fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
+
+ STRV_FOREACH(e, c->unset_environment)
+ fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
+
+ fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
+
+ for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
+ fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
+
+ for (size_t i = 0; i < c->directories[dt].n_items; i++) {
+ fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
+
+ STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
+ fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
+ }
+ }
+
+ fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
+
+ if (c->nice_set)
+ fprintf(f, "%sNice: %i\n", prefix, c->nice);
+
+ if (c->oom_score_adjust_set)
+ fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
+
+ if (c->coredump_filter_set)
+ fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
+
+ for (unsigned i = 0; i < RLIM_NLIMITS; i++)
+ if (c->rlimit[i]) {
+ fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
+ prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
+ fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
+ prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
+ }
+
+ if (c->ioprio_set) {
+ _cleanup_free_ char *class_str = NULL;
+
+ r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
+ if (r >= 0)
+ fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
+
+ fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
+ }
+
+ if (c->cpu_sched_set) {
+ _cleanup_free_ char *policy_str = NULL;
+
+ r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
+ if (r >= 0)
+ fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
+
+ fprintf(f,
+ "%sCPUSchedulingPriority: %i\n"
+ "%sCPUSchedulingResetOnFork: %s\n",
+ prefix, c->cpu_sched_priority,
+ prefix, yes_no(c->cpu_sched_reset_on_fork));
+ }
+
+ if (c->cpu_set.set) {
+ _cleanup_free_ char *affinity = NULL;
+
+ affinity = cpu_set_to_range_string(&c->cpu_set);
+ fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
+ }
+
+ if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
+ _cleanup_free_ char *nodes = NULL;
+
+ nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
+ fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
+ fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
+ }
+
+ if (c->timer_slack_nsec != NSEC_INFINITY)
+ fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
+
+ fprintf(f,
+ "%sStandardInput: %s\n"
+ "%sStandardOutput: %s\n"
+ "%sStandardError: %s\n",
+ prefix, exec_input_to_string(c->std_input),
+ prefix, exec_output_to_string(c->std_output),
+ prefix, exec_output_to_string(c->std_error));
+
+ if (c->std_input == EXEC_INPUT_NAMED_FD)
+ fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
+ if (c->std_output == EXEC_OUTPUT_NAMED_FD)
+ fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
+ if (c->std_error == EXEC_OUTPUT_NAMED_FD)
+ fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
+
+ if (c->std_input == EXEC_INPUT_FILE)
+ fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
+ if (c->std_output == EXEC_OUTPUT_FILE)
+ fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
+ if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
+ fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
+ if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
+ fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
+ if (c->std_error == EXEC_OUTPUT_FILE)
+ fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
+ if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
+ fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
+ if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
+ fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
+
+ if (c->tty_path)
+ fprintf(f,
+ "%sTTYPath: %s\n"
+ "%sTTYReset: %s\n"
+ "%sTTYVHangup: %s\n"
+ "%sTTYVTDisallocate: %s\n"
+ "%sTTYRows: %u\n"
+ "%sTTYColumns: %u\n",
+ prefix, c->tty_path,
+ prefix, yes_no(c->tty_reset),
+ prefix, yes_no(c->tty_vhangup),
+ prefix, yes_no(c->tty_vt_disallocate),
+ prefix, c->tty_rows,
+ prefix, c->tty_cols);
+
+ if (IN_SET(c->std_output,
+ EXEC_OUTPUT_KMSG,
+ EXEC_OUTPUT_JOURNAL,
+ EXEC_OUTPUT_KMSG_AND_CONSOLE,
+ EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
+ IN_SET(c->std_error,
+ EXEC_OUTPUT_KMSG,
+ EXEC_OUTPUT_JOURNAL,
+ EXEC_OUTPUT_KMSG_AND_CONSOLE,
+ EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
+
+ _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
+
+ r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
+ if (r >= 0)
+ fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
+
+ r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
+ if (r >= 0)
+ fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
+ }
+
+ if (c->log_level_max >= 0) {
+ _cleanup_free_ char *t = NULL;
+
+ (void) log_level_to_string_alloc(c->log_level_max, &t);
+
+ fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
+ }
+
+ if (c->log_ratelimit_interval_usec > 0)
+ fprintf(f,
+ "%sLogRateLimitIntervalSec: %s\n",
+ prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
+
+ if (c->log_ratelimit_burst > 0)
+ fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
+
+ for (size_t j = 0; j < c->n_log_extra_fields; j++) {
+ fprintf(f, "%sLogExtraFields: ", prefix);
+ fwrite(c->log_extra_fields[j].iov_base,
+ 1, c->log_extra_fields[j].iov_len,
+ f);
+ fputc('\n', f);
+ }
+
+ if (c->log_namespace)
+ fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
+
+ if (c->secure_bits) {
+ _cleanup_free_ char *str = NULL;
+
+ r = secure_bits_to_string_alloc(c->secure_bits, &str);
+ if (r >= 0)
+ fprintf(f, "%sSecure Bits: %s\n", prefix, str);
+ }
+
+ if (c->capability_bounding_set != CAP_ALL) {
+ _cleanup_free_ char *str = NULL;
+
+ r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
+ if (r >= 0)
+ fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
+ }
+
+ if (c->capability_ambient_set != 0) {
+ _cleanup_free_ char *str = NULL;
+
+ r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
+ if (r >= 0)
+ fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
+ }
+
+ if (c->user)
+ fprintf(f, "%sUser: %s\n", prefix, c->user);
+ if (c->group)
+ fprintf(f, "%sGroup: %s\n", prefix, c->group);
+
+ fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
+
+ strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
+
+ if (c->pam_name)
+ fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
+
+ strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
+ strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
+ strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
+ strv_dump(f, prefix, "ExecPaths", c->exec_paths);
+ strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
+ strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
+
+ for (size_t i = 0; i < c->n_bind_mounts; i++)
+ fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
+ c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
+ c->bind_mounts[i].ignore_enoent ? "-": "",
+ c->bind_mounts[i].source,
+ c->bind_mounts[i].destination,
+ c->bind_mounts[i].recursive ? "rbind" : "norbind");
+
+ for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
+ const TemporaryFileSystem *t = c->temporary_filesystems + i;
+
+ fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
+ t->path,
+ isempty(t->options) ? "" : ":",
+ strempty(t->options));
+ }
+
+ if (c->utmp_id)
+ fprintf(f,
+ "%sUtmpIdentifier: %s\n",
+ prefix, c->utmp_id);
+
+ if (c->selinux_context)
+ fprintf(f,
+ "%sSELinuxContext: %s%s\n",
+ prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
+
+ if (c->apparmor_profile)
+ fprintf(f,
+ "%sAppArmorProfile: %s%s\n",
+ prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
+
+ if (c->smack_process_label)
+ fprintf(f,
+ "%sSmackProcessLabel: %s%s\n",
+ prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
+
+ if (c->personality != PERSONALITY_INVALID)
+ fprintf(f,
+ "%sPersonality: %s\n",
+ prefix, strna(personality_to_string(c->personality)));
+
+ fprintf(f,
+ "%sLockPersonality: %s\n",
+ prefix, yes_no(c->lock_personality));
+
+ if (c->syscall_filter) {
+ fprintf(f,
+ "%sSystemCallFilter: ",
+ prefix);
+
+ if (!c->syscall_allow_list)
+ fputc('~', f);
+
+#if HAVE_SECCOMP
+ void *id, *val;
+ bool first = true;
+ HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
+ _cleanup_free_ char *name = NULL;
+ const char *errno_name = NULL;
+ int num = PTR_TO_INT(val);
+
+ if (first)
+ first = false;
+ else
+ fputc(' ', f);
+
+ name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
+ fputs(strna(name), f);
+
+ if (num >= 0) {
+ errno_name = seccomp_errno_or_action_to_string(num);
+ if (errno_name)
+ fprintf(f, ":%s", errno_name);
+ else
+ fprintf(f, ":%d", num);
+ }
+ }
+#endif
+
+ fputc('\n', f);
+ }
+
+ if (c->syscall_archs) {
+ fprintf(f,
+ "%sSystemCallArchitectures:",
+ prefix);
+
+#if HAVE_SECCOMP
+ void *id;
+ SET_FOREACH(id, c->syscall_archs)
+ fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
+#endif
+ fputc('\n', f);
+ }
+
+ if (exec_context_restrict_namespaces_set(c)) {
+ _cleanup_free_ char *s = NULL;
+
+ r = namespace_flags_to_string(c->restrict_namespaces, &s);
+ if (r >= 0)
+ fprintf(f, "%sRestrictNamespaces: %s\n",
+ prefix, strna(s));
+ }
+
+#if HAVE_LIBBPF
+ if (exec_context_restrict_filesystems_set(c)) {
+ char *fs;
+ SET_FOREACH(fs, c->restrict_filesystems)
+ fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
+ }
+#endif
+
+ if (c->network_namespace_path)
+ fprintf(f,
+ "%sNetworkNamespacePath: %s\n",
+ prefix, c->network_namespace_path);
+
+ if (c->syscall_errno > 0) {
+ fprintf(f, "%sSystemCallErrorNumber: ", prefix);
+
+#if HAVE_SECCOMP
+ const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
+ if (errno_name)
+ fputs(errno_name, f);
+ else
+ fprintf(f, "%d", c->syscall_errno);
+#endif
+ fputc('\n', f);
+ }
+
+ for (size_t i = 0; i < c->n_mount_images; i++) {
+ fprintf(f, "%sMountImages: %s%s:%s", prefix,
+ c->mount_images[i].ignore_enoent ? "-": "",
+ c->mount_images[i].source,
+ c->mount_images[i].destination);
+ LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
+ fprintf(f, ":%s:%s",
+ partition_designator_to_string(o->partition_designator),
+ strempty(o->options));
+ fprintf(f, "\n");
+ }
+
+ for (size_t i = 0; i < c->n_extension_images; i++) {
+ fprintf(f, "%sExtensionImages: %s%s", prefix,
+ c->extension_images[i].ignore_enoent ? "-": "",
+ c->extension_images[i].source);
+ LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
+ fprintf(f, ":%s:%s",
+ partition_designator_to_string(o->partition_designator),
+ strempty(o->options));
+ fprintf(f, "\n");
+ }
+
+ strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
+}
+
+bool exec_context_maintains_privileges(const ExecContext *c) {
+ assert(c);
+
+ /* Returns true if the process forked off would run under
+ * an unchanged UID or as root. */
+
+ if (!c->user)
+ return true;
+
+ if (streq(c->user, "root") || streq(c->user, "0"))
+ return true;
+
+ return false;
+}
+
+int exec_context_get_effective_ioprio(const ExecContext *c) {
+ int p;
+
+ assert(c);
+
+ if (c->ioprio_set)
+ return c->ioprio;
+
+ p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
+ if (p < 0)
+ return IOPRIO_DEFAULT_CLASS_AND_PRIO;
+
+ return ioprio_normalize(p);
+}
+
+bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
+ assert(c);
+
+ /* Explicit setting wins */
+ if (c->mount_apivfs_set)
+ return c->mount_apivfs;
+
+ /* Default to "yes" if root directory or image are specified */
+ if (exec_context_with_rootfs(c))
+ return true;
+
+ return false;
+}
+
+void exec_context_free_log_extra_fields(ExecContext *c) {
+ assert(c);
+
+ for (size_t l = 0; l < c->n_log_extra_fields; l++)
+ free(c->log_extra_fields[l].iov_base);
+ c->log_extra_fields = mfree(c->log_extra_fields);
+ c->n_log_extra_fields = 0;
+}
+
+void exec_context_revert_tty(ExecContext *c) {
+ _cleanup_close_ int fd = -1;
+ const char *path;
+ struct stat st;
+ int r;
+
+ assert(c);
+
+ /* First, reset the TTY (possibly kicking everybody else from the TTY) */
+ exec_context_tty_reset(c, NULL);
+
+ /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
+ * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
+ * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
+ if (!exec_context_may_touch_tty(c))
+ return;
+
+ path = exec_context_tty_path(c);
+ if (!path)
+ return;
+
+ fd = open(path, O_PATH|O_CLOEXEC); /* Pin the inode */
+ if (fd < 0)
+ return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
+ "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
+ path);
+
+ if (fstat(fd, &st) < 0)
+ return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
+
+ /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
+ * if things are a character device, since a proper check either means we'd have to open the TTY and
+ * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
+ * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
+ * with this at all? → https://github.com/systemd/systemd/issues/19213 */
+ if (!S_ISCHR(st.st_mode))
+ return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
+
+ r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
+ if (r < 0)
+ log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s to " UID_FMT ":" GID_FMT ", ignoring: %m", path, (uid_t) 0, (gid_t) TTY_GID);
+}
+
+int exec_context_get_clean_directories(
+ ExecContext *c,
+ char **prefix,
+ ExecCleanMask mask,
+ char ***ret) {
+
+ _cleanup_strv_free_ char **l = NULL;
+ int r;
+
+ assert(c);
+ assert(prefix);
+ assert(ret);
+
+ for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
+ if (!FLAGS_SET(mask, 1U << t))
+ continue;
+
+ if (!prefix[t])
+ continue;
+
+ for (size_t i = 0; i < c->directories[t].n_items; i++) {
+ char *j;
+
+ j = path_join(prefix[t], c->directories[t].items[i].path);
+ if (!j)
+ return -ENOMEM;
+
+ r = strv_consume(&l, j);
+ if (r < 0)
+ return r;
+
+ /* Also remove private directories unconditionally. */
+ if (t != EXEC_DIRECTORY_CONFIGURATION) {
+ j = path_join(prefix[t], "private", c->directories[t].items[i].path);
+ if (!j)
+ return -ENOMEM;
+
+ r = strv_consume(&l, j);
+ if (r < 0)
+ return r;
+ }
+
+ STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
+ j = path_join(prefix[t], *symlink);
+ if (!j)
+ return -ENOMEM;
+
+ r = strv_consume(&l, j);
+ if (r < 0)
+ return r;
+ }
+ }
+ }
+
+ *ret = TAKE_PTR(l);
+ return 0;
+}
+
+int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
+ ExecCleanMask mask = 0;
+
+ assert(c);
+ assert(ret);
+
+ for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
+ if (c->directories[t].n_items > 0)
+ mask |= 1U << t;
+
+ *ret = mask;
+ return 0;
+}
+
+void exec_status_start(ExecStatus *s, pid_t pid) {
+ assert(s);
+
+ *s = (ExecStatus) {
+ .pid = pid,
+ };
+
+ dual_timestamp_get(&s->start_timestamp);
+}
+
+void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
+ assert(s);
+
+ if (s->pid != pid)
+ *s = (ExecStatus) {
+ .pid = pid,
+ };
+
+ dual_timestamp_get(&s->exit_timestamp);
+
+ s->code = code;
+ s->status = status;
+
+ if (context && context->utmp_id)
+ (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
+}
+
+void exec_status_reset(ExecStatus *s) {
+ assert(s);
+
+ *s = (ExecStatus) {};
+}
+
+void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
+ assert(s);
+ assert(f);
+
+ if (s->pid <= 0)
+ return;
+
+ prefix = strempty(prefix);
+
+ fprintf(f,
+ "%sPID: "PID_FMT"\n",
+ prefix, s->pid);
+
+ if (dual_timestamp_is_set(&s->start_timestamp))
+ fprintf(f,
+ "%sStart Timestamp: %s\n",
+ prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
+
+ if (dual_timestamp_is_set(&s->exit_timestamp))
+ fprintf(f,
+ "%sExit Timestamp: %s\n"
+ "%sExit Code: %s\n"
+ "%sExit Status: %i\n",
+ prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
+ prefix, sigchld_code_to_string(s->code),
+ prefix, s->status);
+}
+
+static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
+ _cleanup_free_ char *cmd = NULL;
+ const char *prefix2;
+
+ assert(c);
+ assert(f);
+
+ prefix = strempty(prefix);
+ prefix2 = strjoina(prefix, "\t");
+
+ cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
+
+ fprintf(f,
+ "%sCommand Line: %s\n",
+ prefix, strnull(cmd));
+
+ exec_status_dump(&c->exec_status, f, prefix2);
+}
+
+void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
+ assert(f);
+
+ prefix = strempty(prefix);
+
+ LIST_FOREACH(command, i, c)
+ exec_command_dump(i, f, prefix);
+}
+
+void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
+ ExecCommand *end;
+
+ assert(l);
+ assert(e);
+
+ if (*l) {
+ /* It's kind of important, that we keep the order here */
+ LIST_FIND_TAIL(command, *l, end);
+ LIST_INSERT_AFTER(command, *l, end, e);
+ } else
+ *l = e;
+}
+
+int exec_command_set(ExecCommand *c, const char *path, ...) {
+ va_list ap;
+ char **l, *p;
+
+ assert(c);
+ assert(path);
+
+ va_start(ap, path);
+ l = strv_new_ap(path, ap);
+ va_end(ap);
+
+ if (!l)
+ return -ENOMEM;
+
+ p = strdup(path);
+ if (!p) {
+ strv_free(l);
+ return -ENOMEM;
+ }
+
+ free_and_replace(c->path, p);
+
+ return strv_free_and_replace(c->argv, l);
+}
+
+int exec_command_append(ExecCommand *c, const char *path, ...) {
+ _cleanup_strv_free_ char **l = NULL;
+ va_list ap;
+ int r;
+
+ assert(c);
+ assert(path);
+
+ va_start(ap, path);
+ l = strv_new_ap(path, ap);
+ va_end(ap);
+
+ if (!l)
+ return -ENOMEM;
+
+ r = strv_extend_strv(&c->argv, l, false);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+static void *remove_tmpdir_thread(void *p) {
+ _cleanup_free_ char *path = p;
+
+ (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
+ return NULL;
+}
+
+static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
+ int r;
+
+ if (!rt)
+ return NULL;
+
+ if (rt->manager)
+ (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
+
+ /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
+
+ if (destroy && rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
+ log_debug("Spawning thread to nuke %s", rt->tmp_dir);
+
+ r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
+ if (r < 0)
+ log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
+ else
+ rt->tmp_dir = NULL;
+ }
+
+ if (destroy && rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
+ log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
+
+ r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
+ if (r < 0)
+ log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
+ else
+ rt->var_tmp_dir = NULL;
+ }
+
+ rt->id = mfree(rt->id);
+ rt->tmp_dir = mfree(rt->tmp_dir);
+ rt->var_tmp_dir = mfree(rt->var_tmp_dir);
+ safe_close_pair(rt->netns_storage_socket);
+ safe_close_pair(rt->ipcns_storage_socket);
+ return mfree(rt);
+}
+
+static void exec_runtime_freep(ExecRuntime **rt) {
+ (void) exec_runtime_free(*rt, false);
+}
+
+static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
+ _cleanup_free_ char *id_copy = NULL;
+ ExecRuntime *n;
+
+ assert(ret);
+
+ id_copy = strdup(id);
+ if (!id_copy)
+ return -ENOMEM;
+
+ n = new(ExecRuntime, 1);
+ if (!n)
+ return -ENOMEM;
+
+ *n = (ExecRuntime) {
+ .id = TAKE_PTR(id_copy),
+ .netns_storage_socket = { -1, -1 },
+ .ipcns_storage_socket = { -1, -1 },
+ };
+
+ *ret = n;
+ return 0;
+}
+
+static int exec_runtime_add(
+ Manager *m,
+ const char *id,
+ char **tmp_dir,
+ char **var_tmp_dir,
+ int netns_storage_socket[2],
+ int ipcns_storage_socket[2],
+ ExecRuntime **ret) {
+
+ _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
+ int r;
+
+ assert(m);
+ assert(id);
+
+ /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
+
+ r = exec_runtime_allocate(&rt, id);
+ if (r < 0)
+ return r;
+
+ r = hashmap_ensure_put(&m->exec_runtime_by_id, &string_hash_ops, rt->id, rt);
+ if (r < 0)
+ return r;
+
+ assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
+ rt->tmp_dir = TAKE_PTR(*tmp_dir);
+ rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
+
+ if (netns_storage_socket) {
+ rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
+ rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
+ }
+
+ if (ipcns_storage_socket) {
+ rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
+ rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
+ }
+
+ rt->manager = m;
+
+ if (ret)
+ *ret = rt;
+ /* do not remove created ExecRuntime object when the operation succeeds. */
+ TAKE_PTR(rt);
+ return 0;
+}
+
+static int exec_runtime_make(
+ Manager *m,
+ const ExecContext *c,
+ const char *id,
+ ExecRuntime **ret) {
+
+ _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
+ _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 }, ipcns_storage_socket[2] = { -1, -1 };
+ int r;
+
+ assert(m);
+ assert(c);
+ assert(id);
+
+ /* It is not necessary to create ExecRuntime object. */
+ if (!c->private_network && !c->private_ipc && !c->private_tmp && !c->network_namespace_path) {
+ *ret = NULL;
+ return 0;
+ }
+
+ if (c->private_tmp &&
+ !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
+ (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
+ prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
+ r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->private_network || c->network_namespace_path) {
+ if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
+ return -errno;
+ }
+
+ if (c->private_ipc || c->ipc_namespace_path) {
+ if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
+ return -errno;
+ }
+
+ r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
+ if (r < 0)
+ return r;
+
+ return 1;
+}
+
+int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
+ ExecRuntime *rt;
+ int r;
+
+ assert(m);
+ assert(id);
+ assert(ret);
+
+ rt = hashmap_get(m->exec_runtime_by_id, id);
+ if (rt)
+ /* We already have an ExecRuntime object, let's increase the ref count and reuse it */
+ goto ref;
+
+ if (!create) {
+ *ret = NULL;
+ return 0;
+ }
+
+ /* If not found, then create a new object. */
+ r = exec_runtime_make(m, c, id, &rt);
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ /* When r == 0, it is not necessary to create ExecRuntime object. */
+ *ret = NULL;
+ return 0;
+ }
+
+ref:
+ /* increment reference counter. */
+ rt->n_ref++;
+ *ret = rt;
+ return 1;
+}
+
+ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
+ if (!rt)
+ return NULL;
+
+ assert(rt->n_ref > 0);
+
+ rt->n_ref--;
+ if (rt->n_ref > 0)
+ return NULL;
+
+ return exec_runtime_free(rt, destroy);
+}
+
+int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
+ ExecRuntime *rt;
+
+ assert(m);
+ assert(f);
+ assert(fds);
+
+ HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
+ fprintf(f, "exec-runtime=%s", rt->id);
+
+ if (rt->tmp_dir)
+ fprintf(f, " tmp-dir=%s", rt->tmp_dir);
+
+ if (rt->var_tmp_dir)
+ fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
+
+ if (rt->netns_storage_socket[0] >= 0) {
+ int copy;
+
+ copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
+ if (copy < 0)
+ return copy;
+
+ fprintf(f, " netns-socket-0=%i", copy);
+ }
+
+ if (rt->netns_storage_socket[1] >= 0) {
+ int copy;
+
+ copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
+ if (copy < 0)
+ return copy;
+
+ fprintf(f, " netns-socket-1=%i", copy);
+ }
+
+ if (rt->ipcns_storage_socket[0] >= 0) {
+ int copy;
+
+ copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
+ if (copy < 0)
+ return copy;
+
+ fprintf(f, " ipcns-socket-0=%i", copy);
+ }
+
+ if (rt->ipcns_storage_socket[1] >= 0) {
+ int copy;
+
+ copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
+ if (copy < 0)
+ return copy;
+
+ fprintf(f, " ipcns-socket-1=%i", copy);
+ }
+
+ fputc('\n', f);
+ }
+
+ return 0;
+}
+
+int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
+ _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
+ ExecRuntime *rt;
+ int r;
+
+ /* This is for the migration from old (v237 or earlier) deserialization text.
+ * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
+ * Even if the ExecRuntime object originally created by the other unit, we cannot judge
+ * so or not from the serialized text, then we always creates a new object owned by this. */
+
+ assert(u);
+ assert(key);
+ assert(value);
+
+ /* Manager manages ExecRuntime objects by the unit id.
+ * So, we omit the serialized text when the unit does not have id (yet?)... */
+ if (isempty(u->id)) {
+ log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
+ return 0;
+ }
+
+ if (hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops) < 0)
+ return log_oom();
+
+ rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
+ if (!rt) {
+ if (exec_runtime_allocate(&rt_create, u->id) < 0)
+ return log_oom();
+
+ rt = rt_create;
+ }
+
+ if (streq(key, "tmp-dir")) {
+ if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
+ return -ENOMEM;
+
+ } else if (streq(key, "var-tmp-dir")) {
+ if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
+ return -ENOMEM;
+
+ } else if (streq(key, "netns-socket-0")) {
+ int fd;
+
+ if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
+ log_unit_debug(u, "Failed to parse netns socket value: %s", value);
+ return 0;
+ }
+
+ safe_close(rt->netns_storage_socket[0]);
+ rt->netns_storage_socket[0] = fdset_remove(fds, fd);
+
+ } else if (streq(key, "netns-socket-1")) {
+ int fd;
+
+ if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
+ log_unit_debug(u, "Failed to parse netns socket value: %s", value);
+ return 0;
+ }
+
+ safe_close(rt->netns_storage_socket[1]);
+ rt->netns_storage_socket[1] = fdset_remove(fds, fd);
+
+ } else
+ return 0;
+
+ /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
+ if (rt_create) {
+ r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
+ if (r < 0) {
+ log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
+ return 0;
+ }
+
+ rt_create->manager = u->manager;
+
+ /* Avoid cleanup */
+ TAKE_PTR(rt_create);
+ }
+
+ return 1;
+}
+
+int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
+ _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
+ char *id = NULL;
+ int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
+ const char *p, *v = ASSERT_PTR(value);
+ size_t n;
+
+ assert(m);
+ assert(fds);
+
+ n = strcspn(v, " ");
+ id = strndupa_safe(v, n);
+ if (v[n] != ' ')
+ goto finalize;
+ p = v + n + 1;
+
+ v = startswith(p, "tmp-dir=");
+ if (v) {
+ n = strcspn(v, " ");
+ tmp_dir = strndup(v, n);
+ if (!tmp_dir)
+ return log_oom();
+ if (v[n] != ' ')
+ goto finalize;
+ p = v + n + 1;
+ }
+
+ v = startswith(p, "var-tmp-dir=");
+ if (v) {
+ n = strcspn(v, " ");
+ var_tmp_dir = strndup(v, n);
+ if (!var_tmp_dir)
+ return log_oom();
+ if (v[n] != ' ')
+ goto finalize;
+ p = v + n + 1;
+ }
+
+ v = startswith(p, "netns-socket-0=");
+ if (v) {
+ char *buf;
+
+ n = strcspn(v, " ");
+ buf = strndupa_safe(v, n);
+
+ r = safe_atoi(buf, &netns_fdpair[0]);
+ if (r < 0)
+ return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
+ if (!fdset_contains(fds, netns_fdpair[0]))
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
+ "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
+ netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
+ if (v[n] != ' ')
+ goto finalize;
+ p = v + n + 1;
+ }
+
+ v = startswith(p, "netns-socket-1=");
+ if (v) {
+ char *buf;
+
+ n = strcspn(v, " ");
+ buf = strndupa_safe(v, n);
+
+ r = safe_atoi(buf, &netns_fdpair[1]);
+ if (r < 0)
+ return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
+ if (!fdset_contains(fds, netns_fdpair[1]))
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
+ "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
+ netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
+ if (v[n] != ' ')
+ goto finalize;
+ p = v + n + 1;
+ }
+
+ v = startswith(p, "ipcns-socket-0=");
+ if (v) {
+ char *buf;
+
+ n = strcspn(v, " ");
+ buf = strndupa_safe(v, n);
+
+ r = safe_atoi(buf, &ipcns_fdpair[0]);
+ if (r < 0)
+ return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
+ if (!fdset_contains(fds, ipcns_fdpair[0]))
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
+ "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
+ ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
+ if (v[n] != ' ')
+ goto finalize;
+ p = v + n + 1;
+ }
+
+ v = startswith(p, "ipcns-socket-1=");
+ if (v) {
+ char *buf;
+
+ n = strcspn(v, " ");
+ buf = strndupa_safe(v, n);
+
+ r = safe_atoi(buf, &ipcns_fdpair[1]);
+ if (r < 0)
+ return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
+ if (!fdset_contains(fds, ipcns_fdpair[1]))
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
+ "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
+ ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
+ }
+
+finalize:
+ r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to add exec-runtime: %m");
+ return 0;
+}
+
+void exec_runtime_vacuum(Manager *m) {
+ ExecRuntime *rt;
+
+ assert(m);
+
+ /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
+
+ HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
+ if (rt->n_ref > 0)
+ continue;
+
+ (void) exec_runtime_free(rt, false);
+ }
+}
+
+void exec_params_clear(ExecParameters *p) {
+ if (!p)
+ return;
+
+ p->environment = strv_free(p->environment);
+ p->fd_names = strv_free(p->fd_names);
+ p->fds = mfree(p->fds);
+ p->exec_fd = safe_close(p->exec_fd);
+}
+
+ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
+ if (!sc)
+ return NULL;
+
+ free(sc->id);
+ free(sc->data);
+ return mfree(sc);
+}
+
+ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
+ if (!lc)
+ return NULL;
+
+ free(lc->id);
+ free(lc->path);
+ return mfree(lc);
+}
+
+void exec_directory_done(ExecDirectory *d) {
+ if (!d)
+ return;
+
+ for (size_t i = 0; i < d->n_items; i++) {
+ free(d->items[i].path);
+ strv_free(d->items[i].symlinks);
+ }
+
+ d->items = mfree(d->items);
+ d->n_items = 0;
+ d->mode = 0755;
+}
+
+static ExecDirectoryItem *exec_directory_find(ExecDirectory *d, const char *path) {
+ assert(d);
+ assert(path);
+
+ for (size_t i = 0; i < d->n_items; i++)
+ if (path_equal(d->items[i].path, path))
+ return &d->items[i];
+
+ return NULL;
+}
+
+int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink) {
+ _cleanup_strv_free_ char **s = NULL;
+ _cleanup_free_ char *p = NULL;
+ ExecDirectoryItem *existing;
+ int r;
+
+ assert(d);
+ assert(path);
+
+ existing = exec_directory_find(d, path);
+ if (existing) {
+ r = strv_extend(&existing->symlinks, symlink);
+ if (r < 0)
+ return r;
+
+ return 0; /* existing item is updated */
+ }
+
+ p = strdup(path);
+ if (!p)
+ return -ENOMEM;
+
+ if (symlink) {
+ s = strv_new(symlink);
+ if (!s)
+ return -ENOMEM;
+ }
+
+ if (!GREEDY_REALLOC(d->items, d->n_items + 1))
+ return -ENOMEM;
+
+ d->items[d->n_items++] = (ExecDirectoryItem) {
+ .path = TAKE_PTR(p),
+ .symlinks = TAKE_PTR(s),
+ };
+
+ return 1; /* new item is added */
+}
+
+static int exec_directory_item_compare_func(const ExecDirectoryItem *a, const ExecDirectoryItem *b) {
+ assert(a);
+ assert(b);
+
+ return path_compare(a->path, b->path);
+}
+
+void exec_directory_sort(ExecDirectory *d) {
+ assert(d);
+
+ /* Sort the exec directories to make always parent directories processed at first in
+ * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo at first,
+ * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the
+ * list. See also comments in setup_exec_directory() and issue #24783. */
+
+ if (d->n_items <= 1)
+ return;
+
+ typesafe_qsort(d->items, d->n_items, exec_directory_item_compare_func);
+
+ for (size_t i = 1; i < d->n_items; i++)
+ for (size_t j = 0; j < i; j++)
+ if (path_startswith(d->items[i].path, d->items[j].path)) {
+ d->items[i].only_create = true;
+ break;
+ }
+}
+
+DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
+DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops, char, string_hash_func, string_compare_func, ExecLoadCredential, exec_load_credential_free);
+
+static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
+ [EXEC_INPUT_NULL] = "null",
+ [EXEC_INPUT_TTY] = "tty",
+ [EXEC_INPUT_TTY_FORCE] = "tty-force",
+ [EXEC_INPUT_TTY_FAIL] = "tty-fail",
+ [EXEC_INPUT_SOCKET] = "socket",
+ [EXEC_INPUT_NAMED_FD] = "fd",
+ [EXEC_INPUT_DATA] = "data",
+ [EXEC_INPUT_FILE] = "file",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
+
+static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
+ [EXEC_OUTPUT_INHERIT] = "inherit",
+ [EXEC_OUTPUT_NULL] = "null",
+ [EXEC_OUTPUT_TTY] = "tty",
+ [EXEC_OUTPUT_KMSG] = "kmsg",
+ [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
+ [EXEC_OUTPUT_JOURNAL] = "journal",
+ [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
+ [EXEC_OUTPUT_SOCKET] = "socket",
+ [EXEC_OUTPUT_NAMED_FD] = "fd",
+ [EXEC_OUTPUT_FILE] = "file",
+ [EXEC_OUTPUT_FILE_APPEND] = "append",
+ [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
+
+static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
+ [EXEC_UTMP_INIT] = "init",
+ [EXEC_UTMP_LOGIN] = "login",
+ [EXEC_UTMP_USER] = "user",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
+
+static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
+ [EXEC_PRESERVE_NO] = "no",
+ [EXEC_PRESERVE_YES] = "yes",
+ [EXEC_PRESERVE_RESTART] = "restart",
+};
+
+DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
+
+/* This table maps ExecDirectoryType to the setting it is configured with in the unit */
+static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
+ [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
+ [EXEC_DIRECTORY_STATE] = "StateDirectory",
+ [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
+ [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
+ [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
+
+/* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
+static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
+ [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectorySymlink",
+ [EXEC_DIRECTORY_STATE] = "StateDirectorySymlink",
+ [EXEC_DIRECTORY_CACHE] = "CacheDirectorySymlink",
+ [EXEC_DIRECTORY_LOGS] = "LogsDirectorySymlink",
+ [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
+
+/* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
+ * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
+ * directories, specifically .timer units with their timestamp touch file. */
+static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
+ [EXEC_DIRECTORY_RUNTIME] = "runtime",
+ [EXEC_DIRECTORY_STATE] = "state",
+ [EXEC_DIRECTORY_CACHE] = "cache",
+ [EXEC_DIRECTORY_LOGS] = "logs",
+ [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
+
+/* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
+ * the service payload in. */
+static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
+ [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
+ [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
+ [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
+ [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
+ [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
+};
+
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
+
+static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
+ [EXEC_KEYRING_INHERIT] = "inherit",
+ [EXEC_KEYRING_PRIVATE] = "private",
+ [EXEC_KEYRING_SHARED] = "shared",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);
diff --git a/src/core/execute.h b/src/core/execute.h
new file mode 100644
index 0000000..4c54422
--- /dev/null
+++ b/src/core/execute.h
@@ -0,0 +1,527 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct ExecStatus ExecStatus;
+typedef struct ExecCommand ExecCommand;
+typedef struct ExecContext ExecContext;
+typedef struct ExecRuntime ExecRuntime;
+typedef struct ExecParameters ExecParameters;
+typedef struct Manager Manager;
+
+#include <sched.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <sys/capability.h>
+
+#include "cgroup-util.h"
+#include "coredump-util.h"
+#include "cpu-set-util.h"
+#include "exec-util.h"
+#include "fdset.h"
+#include "list.h"
+#include "missing_resource.h"
+#include "namespace.h"
+#include "nsflags.h"
+#include "numa-util.h"
+#include "path-util.h"
+#include "time-util.h"
+
+#define EXEC_STDIN_DATA_MAX (64U*1024U*1024U)
+
+typedef enum ExecUtmpMode {
+ EXEC_UTMP_INIT,
+ EXEC_UTMP_LOGIN,
+ EXEC_UTMP_USER,
+ _EXEC_UTMP_MODE_MAX,
+ _EXEC_UTMP_MODE_INVALID = -EINVAL,
+} ExecUtmpMode;
+
+typedef enum ExecInput {
+ EXEC_INPUT_NULL,
+ EXEC_INPUT_TTY,
+ EXEC_INPUT_TTY_FORCE,
+ EXEC_INPUT_TTY_FAIL,
+ EXEC_INPUT_SOCKET,
+ EXEC_INPUT_NAMED_FD,
+ EXEC_INPUT_DATA,
+ EXEC_INPUT_FILE,
+ _EXEC_INPUT_MAX,
+ _EXEC_INPUT_INVALID = -EINVAL,
+} ExecInput;
+
+typedef enum ExecOutput {
+ EXEC_OUTPUT_INHERIT,
+ EXEC_OUTPUT_NULL,
+ EXEC_OUTPUT_TTY,
+ EXEC_OUTPUT_KMSG,
+ EXEC_OUTPUT_KMSG_AND_CONSOLE,
+ EXEC_OUTPUT_JOURNAL,
+ EXEC_OUTPUT_JOURNAL_AND_CONSOLE,
+ EXEC_OUTPUT_SOCKET,
+ EXEC_OUTPUT_NAMED_FD,
+ EXEC_OUTPUT_FILE,
+ EXEC_OUTPUT_FILE_APPEND,
+ EXEC_OUTPUT_FILE_TRUNCATE,
+ _EXEC_OUTPUT_MAX,
+ _EXEC_OUTPUT_INVALID = -EINVAL,
+} ExecOutput;
+
+typedef enum ExecPreserveMode {
+ EXEC_PRESERVE_NO,
+ EXEC_PRESERVE_YES,
+ EXEC_PRESERVE_RESTART,
+ _EXEC_PRESERVE_MODE_MAX,
+ _EXEC_PRESERVE_MODE_INVALID = -EINVAL,
+} ExecPreserveMode;
+
+typedef enum ExecKeyringMode {
+ EXEC_KEYRING_INHERIT,
+ EXEC_KEYRING_PRIVATE,
+ EXEC_KEYRING_SHARED,
+ _EXEC_KEYRING_MODE_MAX,
+ _EXEC_KEYRING_MODE_INVALID = -EINVAL,
+} ExecKeyringMode;
+
+/* Contains start and exit information about an executed command. */
+struct ExecStatus {
+ dual_timestamp start_timestamp;
+ dual_timestamp exit_timestamp;
+ pid_t pid;
+ int code; /* as in siginfo_t::si_code */
+ int status; /* as in siginfo_t::si_status */
+};
+
+/* Stores information about commands we execute. Covers both configuration settings as well as runtime data. */
+struct ExecCommand {
+ char *path;
+ char **argv;
+ ExecStatus exec_status;
+ ExecCommandFlags flags;
+ LIST_FIELDS(ExecCommand, command); /* useful for chaining commands */
+};
+
+/* Encapsulates certain aspects of the runtime environment that is to be shared between multiple otherwise separate
+ * invocations of commands. Specifically, this allows sharing of /tmp and /var/tmp data as well as network namespaces
+ * between invocations of commands. This is a reference counted object, with one reference taken by each currently
+ * active command invocation that wants to share this runtime. */
+struct ExecRuntime {
+ unsigned n_ref;
+
+ Manager *manager;
+
+ char *id; /* Unit id of the owner */
+
+ char *tmp_dir;
+ char *var_tmp_dir;
+
+ /* An AF_UNIX socket pair, that contains a datagram containing a file descriptor referring to the network
+ * namespace. */
+ int netns_storage_socket[2];
+
+ /* Like netns_storage_socket, but the file descriptor is referring to the IPC namespace. */
+ int ipcns_storage_socket[2];
+};
+
+typedef enum ExecDirectoryType {
+ EXEC_DIRECTORY_RUNTIME = 0,
+ EXEC_DIRECTORY_STATE,
+ EXEC_DIRECTORY_CACHE,
+ EXEC_DIRECTORY_LOGS,
+ EXEC_DIRECTORY_CONFIGURATION,
+ _EXEC_DIRECTORY_TYPE_MAX,
+ _EXEC_DIRECTORY_TYPE_INVALID = -EINVAL,
+} ExecDirectoryType;
+
+typedef struct ExecDirectoryItem {
+ char *path;
+ char **symlinks;
+ bool only_create;
+} ExecDirectoryItem;
+
+typedef struct ExecDirectory {
+ mode_t mode;
+ size_t n_items;
+ ExecDirectoryItem *items;
+} ExecDirectory;
+
+typedef enum ExecCleanMask {
+ /* In case you wonder why the bitmask below doesn't use "directory" in its name: we want to keep this
+ * generic so that .timer timestamp files can nicely be covered by this too, and similar. */
+ EXEC_CLEAN_RUNTIME = 1U << EXEC_DIRECTORY_RUNTIME,
+ EXEC_CLEAN_STATE = 1U << EXEC_DIRECTORY_STATE,
+ EXEC_CLEAN_CACHE = 1U << EXEC_DIRECTORY_CACHE,
+ EXEC_CLEAN_LOGS = 1U << EXEC_DIRECTORY_LOGS,
+ EXEC_CLEAN_CONFIGURATION = 1U << EXEC_DIRECTORY_CONFIGURATION,
+ EXEC_CLEAN_NONE = 0,
+ EXEC_CLEAN_ALL = (1U << _EXEC_DIRECTORY_TYPE_MAX) - 1,
+ _EXEC_CLEAN_MASK_INVALID = -EINVAL,
+} ExecCleanMask;
+
+/* A credential configured with LoadCredential= */
+typedef struct ExecLoadCredential {
+ char *id, *path;
+ bool encrypted;
+} ExecLoadCredential;
+
+/* A credential configured with SetCredential= */
+typedef struct ExecSetCredential {
+ char *id;
+ bool encrypted;
+ void *data;
+ size_t size;
+} ExecSetCredential;
+
+/* Encodes configuration parameters applied to invoked commands. Does not carry runtime data, but only configuration
+ * changes sourced from unit files and suchlike. ExecContext objects are usually embedded into Unit objects, and do not
+ * change after being loaded. */
+struct ExecContext {
+ char **environment;
+ char **environment_files;
+ char **pass_environment;
+ char **unset_environment;
+
+ struct rlimit *rlimit[_RLIMIT_MAX];
+ char *working_directory, *root_directory, *root_image, *root_verity, *root_hash_path, *root_hash_sig_path;
+ void *root_hash, *root_hash_sig;
+ size_t root_hash_size, root_hash_sig_size;
+ LIST_HEAD(MountOptions, root_image_options);
+ bool working_directory_missing_ok:1;
+ bool working_directory_home:1;
+
+ bool oom_score_adjust_set:1;
+ bool coredump_filter_set:1;
+ bool nice_set:1;
+ bool ioprio_set:1;
+ bool cpu_sched_set:1;
+ bool mount_apivfs_set:1;
+
+ /* This is not exposed to the user but available internally. We need it to make sure that whenever we
+ * spawn /usr/bin/mount it is run in the same process group as us so that the autofs logic detects
+ * that it belongs to us and we don't enter a trigger loop. */
+ bool same_pgrp;
+
+ bool cpu_sched_reset_on_fork;
+ bool non_blocking;
+
+ mode_t umask;
+ int oom_score_adjust;
+ int nice;
+ int ioprio;
+ int cpu_sched_policy;
+ int cpu_sched_priority;
+ uint64_t coredump_filter;
+
+ CPUSet cpu_set;
+ NUMAPolicy numa_policy;
+ bool cpu_affinity_from_numa;
+
+ ExecInput std_input;
+ ExecOutput std_output;
+ ExecOutput std_error;
+ bool stdio_as_fds;
+ char *stdio_fdname[3];
+ char *stdio_file[3];
+
+ void *stdin_data;
+ size_t stdin_data_size;
+
+ nsec_t timer_slack_nsec;
+
+ char *tty_path;
+
+ bool tty_reset;
+ bool tty_vhangup;
+ bool tty_vt_disallocate;
+
+ unsigned tty_rows;
+ unsigned tty_cols;
+
+ bool ignore_sigpipe;
+
+ ExecKeyringMode keyring_mode;
+
+ /* Since resolving these names might involve socket
+ * connections and we don't want to deadlock ourselves these
+ * names are resolved on execution only and in the child
+ * process. */
+ char *user;
+ char *group;
+ char **supplementary_groups;
+
+ char *pam_name;
+
+ char *utmp_id;
+ ExecUtmpMode utmp_mode;
+
+ bool no_new_privileges;
+
+ bool selinux_context_ignore;
+ bool apparmor_profile_ignore;
+ bool smack_process_label_ignore;
+
+ char *selinux_context;
+ char *apparmor_profile;
+ char *smack_process_label;
+
+ char **read_write_paths, **read_only_paths, **inaccessible_paths, **exec_paths, **no_exec_paths;
+ char **exec_search_path;
+ unsigned long mount_flags;
+ BindMount *bind_mounts;
+ size_t n_bind_mounts;
+ TemporaryFileSystem *temporary_filesystems;
+ size_t n_temporary_filesystems;
+ MountImage *mount_images;
+ size_t n_mount_images;
+ MountImage *extension_images;
+ size_t n_extension_images;
+ char **extension_directories;
+
+ uint64_t capability_bounding_set;
+ uint64_t capability_ambient_set;
+ int secure_bits;
+
+ int syslog_priority;
+ bool syslog_level_prefix;
+ char *syslog_identifier;
+
+ struct iovec* log_extra_fields;
+ size_t n_log_extra_fields;
+
+ usec_t log_ratelimit_interval_usec;
+ unsigned log_ratelimit_burst;
+
+ int log_level_max;
+
+ char *log_namespace;
+
+ ProtectProc protect_proc; /* hidepid= */
+ ProcSubset proc_subset; /* subset= */
+
+ bool private_tmp;
+ bool private_network;
+ bool private_devices;
+ bool private_users;
+ bool private_mounts;
+ bool private_ipc;
+ bool protect_kernel_tunables;
+ bool protect_kernel_modules;
+ bool protect_kernel_logs;
+ bool protect_clock;
+ bool protect_control_groups;
+ ProtectSystem protect_system;
+ ProtectHome protect_home;
+ bool protect_hostname;
+ bool mount_apivfs;
+
+ bool dynamic_user;
+ bool remove_ipc;
+
+ bool memory_deny_write_execute;
+ bool restrict_realtime;
+ bool restrict_suid_sgid;
+
+ bool lock_personality;
+ unsigned long personality;
+
+ unsigned long restrict_namespaces; /* The CLONE_NEWxyz flags permitted to the unit's processes */
+
+ Set *restrict_filesystems;
+ bool restrict_filesystems_allow_list:1;
+
+ Hashmap *syscall_filter;
+ Set *syscall_archs;
+ int syscall_errno;
+ bool syscall_allow_list:1;
+
+ Hashmap *syscall_log;
+ bool syscall_log_allow_list:1; /* Log listed system calls */
+
+ bool address_families_allow_list:1;
+ Set *address_families;
+
+ char *network_namespace_path;
+ char *ipc_namespace_path;
+
+ ExecDirectory directories[_EXEC_DIRECTORY_TYPE_MAX];
+ ExecPreserveMode runtime_directory_preserve_mode;
+ usec_t timeout_clean_usec;
+
+ Hashmap *set_credentials; /* output id → ExecSetCredential */
+ Hashmap *load_credentials; /* output id → ExecLoadCredential */
+};
+
+static inline bool exec_context_restrict_namespaces_set(const ExecContext *c) {
+ assert(c);
+
+ return (c->restrict_namespaces & NAMESPACE_FLAGS_ALL) != NAMESPACE_FLAGS_ALL;
+}
+
+static inline bool exec_context_restrict_filesystems_set(const ExecContext *c) {
+ assert(c);
+
+ return c->restrict_filesystems_allow_list ||
+ !set_isempty(c->restrict_filesystems);
+}
+
+static inline bool exec_context_with_rootfs(const ExecContext *c) {
+ assert(c);
+
+ /* Checks if RootDirectory= or RootImage= are used */
+
+ return !empty_or_root(c->root_directory) || c->root_image;
+}
+
+typedef enum ExecFlags {
+ EXEC_APPLY_SANDBOXING = 1 << 0,
+ EXEC_APPLY_CHROOT = 1 << 1,
+ EXEC_APPLY_TTY_STDIN = 1 << 2,
+ EXEC_PASS_LOG_UNIT = 1 << 3, /* Whether to pass the unit name to the service's journal stream connection */
+ EXEC_CHOWN_DIRECTORIES = 1 << 4, /* chown() the runtime/state/cache/log directories to the user we run as, under all conditions */
+ EXEC_NSS_DYNAMIC_BYPASS = 1 << 5, /* Set the SYSTEMD_NSS_DYNAMIC_BYPASS environment variable, to disable nss-systemd blocking on PID 1, for use by dbus-daemon */
+ EXEC_CGROUP_DELEGATE = 1 << 6,
+ EXEC_IS_CONTROL = 1 << 7,
+ EXEC_CONTROL_CGROUP = 1 << 8, /* Place the process not in the indicated cgroup but in a subcgroup '/.control', but only EXEC_CGROUP_DELEGATE and EXEC_IS_CONTROL is set, too */
+ EXEC_WRITE_CREDENTIALS = 1 << 9, /* Set up the credential store logic */
+
+ /* The following are not used by execute.c, but by consumers internally */
+ EXEC_PASS_FDS = 1 << 10,
+ EXEC_SETENV_RESULT = 1 << 11,
+ EXEC_SET_WATCHDOG = 1 << 12,
+ EXEC_SETENV_MONITOR_RESULT = 1 << 13, /* Pass exit status to OnFailure= and OnSuccess= dependencies. */
+} ExecFlags;
+
+/* Parameters for a specific invocation of a command. This structure is put together right before a command is
+ * executed. */
+struct ExecParameters {
+ char **environment;
+
+ int *fds;
+ char **fd_names;
+ size_t n_socket_fds;
+ size_t n_storage_fds;
+
+ ExecFlags flags;
+ bool selinux_context_net:1;
+
+ CGroupMask cgroup_supported;
+ const char *cgroup_path;
+
+ char **prefix;
+ const char *received_credentials_directory;
+ const char *received_encrypted_credentials_directory;
+
+ const char *confirm_spawn;
+
+ usec_t watchdog_usec;
+
+ int *idle_pipe;
+
+ int stdin_fd;
+ int stdout_fd;
+ int stderr_fd;
+
+ /* An fd that is closed by the execve(), and thus will result in EOF when the execve() is done */
+ int exec_fd;
+
+ const char *notify_socket;
+};
+
+#include "unit.h"
+#include "dynamic-user.h"
+
+int exec_spawn(Unit *unit,
+ ExecCommand *command,
+ const ExecContext *context,
+ const ExecParameters *exec_params,
+ ExecRuntime *runtime,
+ DynamicCreds *dynamic_creds,
+ pid_t *ret);
+
+void exec_command_done_array(ExecCommand *c, size_t n);
+ExecCommand* exec_command_free_list(ExecCommand *c);
+void exec_command_free_array(ExecCommand **c, size_t n);
+void exec_command_reset_status_array(ExecCommand *c, size_t n);
+void exec_command_reset_status_list_array(ExecCommand **c, size_t n);
+void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix);
+void exec_command_append_list(ExecCommand **l, ExecCommand *e);
+int exec_command_set(ExecCommand *c, const char *path, ...) _sentinel_;
+int exec_command_append(ExecCommand *c, const char *path, ...) _sentinel_;
+
+void exec_context_init(ExecContext *c);
+void exec_context_done(ExecContext *c);
+void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix);
+
+int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_root);
+int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_root, const char *unit);
+int exec_context_destroy_mount_ns_dir(Unit *u);
+
+const char* exec_context_fdname(const ExecContext *c, int fd_index);
+
+bool exec_context_may_touch_console(const ExecContext *c);
+bool exec_context_maintains_privileges(const ExecContext *c);
+
+int exec_context_get_effective_ioprio(const ExecContext *c);
+bool exec_context_get_effective_mount_apivfs(const ExecContext *c);
+
+void exec_context_free_log_extra_fields(ExecContext *c);
+
+void exec_context_revert_tty(ExecContext *c);
+
+int exec_context_get_clean_directories(ExecContext *c, char **prefix, ExecCleanMask mask, char ***ret);
+int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret);
+
+void exec_status_start(ExecStatus *s, pid_t pid);
+void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status);
+void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix);
+void exec_status_reset(ExecStatus *s);
+
+int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *name, bool create, ExecRuntime **ret);
+ExecRuntime *exec_runtime_unref(ExecRuntime *r, bool destroy);
+
+int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds);
+int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds);
+int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds);
+void exec_runtime_vacuum(Manager *m);
+
+void exec_params_clear(ExecParameters *p);
+
+bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c);
+
+ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc);
+DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSetCredential*, exec_set_credential_free);
+
+ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc);
+DEFINE_TRIVIAL_CLEANUP_FUNC(ExecLoadCredential*, exec_load_credential_free);
+
+void exec_directory_done(ExecDirectory *d);
+int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink);
+void exec_directory_sort(ExecDirectory *d);
+
+extern const struct hash_ops exec_set_credential_hash_ops;
+extern const struct hash_ops exec_load_credential_hash_ops;
+
+const char* exec_output_to_string(ExecOutput i) _const_;
+ExecOutput exec_output_from_string(const char *s) _pure_;
+
+const char* exec_input_to_string(ExecInput i) _const_;
+ExecInput exec_input_from_string(const char *s) _pure_;
+
+const char* exec_utmp_mode_to_string(ExecUtmpMode i) _const_;
+ExecUtmpMode exec_utmp_mode_from_string(const char *s) _pure_;
+
+const char* exec_preserve_mode_to_string(ExecPreserveMode i) _const_;
+ExecPreserveMode exec_preserve_mode_from_string(const char *s) _pure_;
+
+const char* exec_keyring_mode_to_string(ExecKeyringMode i) _const_;
+ExecKeyringMode exec_keyring_mode_from_string(const char *s) _pure_;
+
+const char* exec_directory_type_to_string(ExecDirectoryType i) _const_;
+ExecDirectoryType exec_directory_type_from_string(const char *s) _pure_;
+
+const char* exec_directory_type_symlink_to_string(ExecDirectoryType i) _const_;
+ExecDirectoryType exec_directory_type_symlink_from_string(const char *s) _pure_;
+
+const char* exec_resource_type_to_string(ExecDirectoryType i) _const_;
+ExecDirectoryType exec_resource_type_from_string(const char *s) _pure_;
+
+bool exec_needs_mount_namespace(const ExecContext *context, const ExecParameters *params, const ExecRuntime *runtime);
diff --git a/src/core/fuzz-unit-file.c b/src/core/fuzz-unit-file.c
new file mode 100644
index 0000000..7b39338
--- /dev/null
+++ b/src/core/fuzz-unit-file.c
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "conf-parser.h"
+#include "fd-util.h"
+#include "fuzz.h"
+#include "install.h"
+#include "load-fragment.h"
+#include "manager-dump.h"
+#include "string-util.h"
+#include "unit-serialize.h"
+#include "utf8.h"
+
+int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+ _cleanup_free_ char *out = NULL; /* out should be freed after g */
+ size_t out_size;
+ _cleanup_fclose_ FILE *f = NULL, *g = NULL;
+ _cleanup_free_ char *p = NULL;
+ UnitType t;
+ _cleanup_(manager_freep) Manager *m = NULL;
+ Unit *u;
+ const char *name;
+ long offset;
+
+ if (outside_size_range(size, 0, 65536))
+ return 0;
+
+ f = data_to_file(data, size);
+
+ assert_se(f);
+
+ if (read_line(f, LINE_MAX, &p) < 0)
+ return 0;
+
+ t = unit_type_from_string(p);
+ if (t < 0)
+ return 0;
+
+ if (!unit_vtable[t]->load)
+ return 0;
+
+ offset = ftell(f);
+ assert_se(offset >= 0);
+
+ for (;;) {
+ _cleanup_free_ char *l = NULL;
+ const char *ll;
+
+ if (read_line(f, LONG_LINE_MAX, &l) <= 0)
+ break;
+
+ ll = startswith(l, UTF8_BYTE_ORDER_MARK) ?: l;
+ ll = ll + strspn(ll, WHITESPACE);
+
+ if (HAS_FEATURE_MEMORY_SANITIZER && startswith(ll, "ListenNetlink")) {
+ /* ListenNetlink causes a false positive in msan,
+ * let's skip this for now. */
+ log_notice("Skipping test because ListenNetlink= is present");
+ return 0;
+ }
+ }
+
+ assert_se(fseek(f, offset, SEEK_SET) == 0);
+
+ /* We don't want to fill the logs with messages about parse errors.
+ * Disable most logging if not running standalone */
+ if (!getenv("SYSTEMD_LOG_LEVEL"))
+ log_set_max_level(LOG_CRIT);
+
+ assert_se(manager_new(LOOKUP_SCOPE_SYSTEM, MANAGER_TEST_RUN_MINIMAL, &m) >= 0);
+
+ name = strjoina("a.", unit_type_to_string(t));
+ assert_se(unit_new_for_name(m, unit_vtable[t]->object_size, name, &u) >= 0);
+
+ (void) config_parse(
+ name, name, f,
+ UNIT_VTABLE(u)->sections,
+ config_item_perf_lookup, load_fragment_gperf_lookup,
+ 0,
+ u,
+ NULL);
+
+ g = open_memstream_unlocked(&out, &out_size);
+ assert_se(g);
+
+ unit_dump(u, g, "");
+ manager_dump(m, g, /* patterns= */ NULL, ">>>");
+
+ return 0;
+}
diff --git a/src/core/fuzz-unit-file.options b/src/core/fuzz-unit-file.options
new file mode 100644
index 0000000..678d526
--- /dev/null
+++ b/src/core/fuzz-unit-file.options
@@ -0,0 +1,2 @@
+[libfuzzer]
+max_len = 65536
diff --git a/src/core/generator-setup.c b/src/core/generator-setup.c
new file mode 100644
index 0000000..00d6ad6
--- /dev/null
+++ b/src/core/generator-setup.c
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <unistd.h>
+
+#include "generator-setup.h"
+#include "macro.h"
+#include "mkdir-label.h"
+#include "rm-rf.h"
+
+int lookup_paths_mkdir_generator(LookupPaths *p) {
+ int r, q;
+
+ assert(p);
+
+ if (!p->generator || !p->generator_early || !p->generator_late)
+ return -EINVAL;
+
+ r = mkdir_p_label(p->generator, 0755);
+
+ q = mkdir_p_label(p->generator_early, 0755);
+ if (q < 0 && r >= 0)
+ r = q;
+
+ q = mkdir_p_label(p->generator_late, 0755);
+ if (q < 0 && r >= 0)
+ r = q;
+
+ return r;
+}
+
+void lookup_paths_trim_generator(LookupPaths *p) {
+ assert(p);
+
+ /* Trim empty dirs */
+
+ if (p->generator)
+ (void) rmdir(p->generator);
+ if (p->generator_early)
+ (void) rmdir(p->generator_early);
+ if (p->generator_late)
+ (void) rmdir(p->generator_late);
+}
+
+void lookup_paths_flush_generator(LookupPaths *p) {
+ assert(p);
+
+ /* Flush the generated unit files in full */
+
+ if (p->generator)
+ (void) rm_rf(p->generator, REMOVE_ROOT|REMOVE_PHYSICAL);
+ if (p->generator_early)
+ (void) rm_rf(p->generator_early, REMOVE_ROOT|REMOVE_PHYSICAL);
+ if (p->generator_late)
+ (void) rm_rf(p->generator_late, REMOVE_ROOT|REMOVE_PHYSICAL);
+
+ if (p->temporary_dir)
+ (void) rm_rf(p->temporary_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
+}
diff --git a/src/core/generator-setup.h b/src/core/generator-setup.h
new file mode 100644
index 0000000..1cc816b
--- /dev/null
+++ b/src/core/generator-setup.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "path-lookup.h"
+
+int lookup_paths_mkdir_generator(LookupPaths *p);
+void lookup_paths_trim_generator(LookupPaths *p);
+void lookup_paths_flush_generator(LookupPaths *p);
diff --git a/src/core/ima-setup.c b/src/core/ima-setup.c
new file mode 100644
index 0000000..7f517a0
--- /dev/null
+++ b/src/core/ima-setup.c
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+/***
+ Copyright © 2012 Roberto Sassu - Politecnico di Torino, Italy
+ TORSEC group — http://security.polito.it
+***/
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "ima-setup.h"
+#include "log.h"
+
+#define IMA_SECFS_DIR "/sys/kernel/security/ima"
+#define IMA_SECFS_POLICY IMA_SECFS_DIR "/policy"
+#define IMA_POLICY_PATH "/etc/ima/ima-policy"
+
+int ima_setup(void) {
+#if ENABLE_IMA
+ _cleanup_fclose_ FILE *input = NULL;
+ _cleanup_close_ int imafd = -1;
+ unsigned lineno = 0;
+ int r;
+
+ if (access(IMA_SECFS_DIR, F_OK) < 0) {
+ log_debug_errno(errno, "IMA support is disabled in the kernel, ignoring: %m");
+ return 0;
+ }
+
+ if (access(IMA_SECFS_POLICY, W_OK) < 0) {
+ log_warning_errno(errno, "Another IMA custom policy has already been loaded, ignoring: %m");
+ return 0;
+ }
+
+ if (access(IMA_POLICY_PATH, F_OK) < 0) {
+ log_debug_errno(errno, "No IMA custom policy file "IMA_POLICY_PATH", ignoring: %m");
+ return 0;
+ }
+
+ imafd = open(IMA_SECFS_POLICY, O_WRONLY|O_CLOEXEC);
+ if (imafd < 0) {
+ log_error_errno(errno, "Failed to open the IMA kernel interface "IMA_SECFS_POLICY", ignoring: %m");
+ return 0;
+ }
+
+ /* attempt to write the name of the policy file into sysfs file */
+ if (write(imafd, IMA_POLICY_PATH, STRLEN(IMA_POLICY_PATH)) > 0)
+ goto done;
+
+ /* fall back to copying the policy line-by-line */
+ input = fopen(IMA_POLICY_PATH, "re");
+ if (!input) {
+ log_warning_errno(errno, "Failed to open the IMA custom policy file "IMA_POLICY_PATH", ignoring: %m");
+ return 0;
+ }
+
+ safe_close(imafd);
+
+ imafd = open(IMA_SECFS_POLICY, O_WRONLY|O_CLOEXEC);
+ if (imafd < 0) {
+ log_error_errno(errno, "Failed to open the IMA kernel interface "IMA_SECFS_POLICY", ignoring: %m");
+ return 0;
+ }
+
+ for (;;) {
+ _cleanup_free_ char *line = NULL;
+ size_t len;
+
+ r = read_line(input, LONG_LINE_MAX, &line);
+ if (r < 0)
+ return log_error_errno(r, "Failed to read the IMA custom policy file "IMA_POLICY_PATH": %m");
+ if (r == 0)
+ break;
+
+ len = strlen(line);
+ lineno++;
+
+ if (len > 0 && write(imafd, line, len) < 0)
+ return log_error_errno(errno, "Failed to load the IMA custom policy file "IMA_POLICY_PATH"%u: %m",
+ lineno);
+ }
+
+done:
+ log_info("Successfully loaded the IMA custom policy "IMA_POLICY_PATH".");
+#endif /* ENABLE_IMA */
+ return 0;
+}
diff --git a/src/core/ima-setup.h b/src/core/ima-setup.h
new file mode 100644
index 0000000..f964c7b
--- /dev/null
+++ b/src/core/ima-setup.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+/***
+ Copyright © 2012 Roberto Sassu - Politecnico di Torino, Italy
+ TORSEC group — http://security.polito.it
+***/
+
+int ima_setup(void);
diff --git a/src/core/import-creds.c b/src/core/import-creds.c
new file mode 100644
index 0000000..dab7d36
--- /dev/null
+++ b/src/core/import-creds.c
@@ -0,0 +1,716 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/mount.h>
+
+#include "copy.h"
+#include "creds-util.h"
+#include "escape.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "fs-util.h"
+#include "hexdecoct.h"
+#include "import-creds.h"
+#include "io-util.h"
+#include "mkdir-label.h"
+#include "mount-util.h"
+#include "mountpoint-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "proc-cmdline.h"
+#include "recurse-dir.h"
+#include "strv.h"
+#include "virt.h"
+
+/* This imports credentials passed in from environments higher up (VM manager, boot loader, …) and rearranges
+ * them so that later code can access them using our regular credential protocol
+ * (i.e. $CREDENTIALS_DIRECTORY). It's supposed to be minimal glue to unify behaviour how PID 1 (and
+ * generators invoked by it) can acquire credentials from outside, to mimic how we support it for containers,
+ * but on VM/physical environments.
+ *
+ * This does four things:
+ *
+ * 1. It imports credentials picked up by sd-boot (and placed in the /.extra/credentials/ dir in the initrd)
+ * and puts them in /run/credentials/@encrypted/. Note that during the initrd→host transition the initrd root
+ * file system is cleaned out, thus it is essential we pick up these files before they are deleted. Note
+ * that these credentials originate from an untrusted source, i.e. the ESP and are not
+ * pre-authenticated. They still have to be authenticated before use.
+ *
+ * 2. It imports credentials from /proc/cmdline and puts them in /run/credentials/@system/. These come from a
+ * trusted environment (i.e. the boot loader), and are typically authenticated (if authentication is done
+ * at all). However, they are world-readable, which might be less than ideal. Hence only use this for data
+ * that doesn't require trust.
+ *
+ * 3. It imports credentials passed in through qemu's fw_cfg logic. Specifically, credential data passed in
+ * /sys/firmware/qemu_fw_cfg/by_name/opt/io.systemd.credentials/ is picked up and also placed in
+ * /run/credentials/@system/.
+ *
+ * 4. It imports credentials passed in via the DMI/SMBIOS OEM string tables, quite similar to fw_cfg. It
+ * looks for strings starting with "io.systemd.credential:" and "io.systemd.credential.binary:". Both
+ * expect a key=value assignment, but in the latter case the value is Base64 decoded, allowing binary
+ * credentials to be passed in.
+ *
+ * If it picked up any credentials it will set the $CREDENTIALS_DIRECTORY and
+ * $ENCRYPTED_CREDENTIALS_DIRECTORY environment variables to point to these directories, so that processes
+ * can find them there later on. If "ramfs" is available $CREDENTIALS_DIRECTORY will be backed by it (but
+ * $ENCRYPTED_CREDENTIALS_DIRECTORY is just a regular tmpfs).
+ *
+ * Net result: the service manager can pick up trusted credentials from $CREDENTIALS_DIRECTORY afterwards,
+ * and untrusted ones from $ENCRYPTED_CREDENTIALS_DIRECTORY. */
+
+typedef struct ImportCredentialContext {
+ int target_dir_fd;
+ size_t size_sum;
+ unsigned n_credentials;
+} ImportCredentialContext;
+
+static void import_credentials_context_free(ImportCredentialContext *c) {
+ assert(c);
+
+ c->target_dir_fd = safe_close(c->target_dir_fd);
+}
+
+static int acquire_encrypted_credential_directory(ImportCredentialContext *c) {
+ int r;
+
+ assert(c);
+
+ if (c->target_dir_fd >= 0)
+ return c->target_dir_fd;
+
+ r = mkdir_safe_label(ENCRYPTED_SYSTEM_CREDENTIALS_DIRECTORY, 0700, 0, 0, MKDIR_WARN_MODE);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create " ENCRYPTED_SYSTEM_CREDENTIALS_DIRECTORY ": %m");
+
+ c->target_dir_fd = open(ENCRYPTED_SYSTEM_CREDENTIALS_DIRECTORY, O_RDONLY|O_DIRECTORY|O_CLOEXEC);
+ if (c->target_dir_fd < 0)
+ return log_error_errno(errno, "Failed to open " ENCRYPTED_SYSTEM_CREDENTIALS_DIRECTORY ": %m");
+
+ return c->target_dir_fd;
+}
+
+static int open_credential_file_for_write(int target_dir_fd, const char *dir_name, const char *n) {
+ int fd;
+
+ assert(target_dir_fd >= 0);
+ assert(dir_name);
+ assert(n);
+
+ fd = openat(target_dir_fd, n, O_WRONLY|O_CLOEXEC|O_CREAT|O_EXCL|O_NOFOLLOW, 0400);
+ if (fd < 0) {
+ if (errno == EEXIST) /* In case of EEXIST we'll only debug log! */
+ return log_debug_errno(errno, "Credential '%s' set twice, ignoring.", n);
+
+ return log_error_errno(errno, "Failed to create %s/%s: %m", dir_name, n);
+ }
+
+ return fd;
+}
+
+static bool credential_size_ok(ImportCredentialContext *c, const char *name, uint64_t size) {
+ assert(c);
+ assert(name);
+
+ if (size > CREDENTIAL_SIZE_MAX) {
+ log_warning("Credential '%s' is larger than allowed limit (%s > %s), skipping.", name, FORMAT_BYTES(size), FORMAT_BYTES(CREDENTIAL_SIZE_MAX));
+ return false;
+ }
+
+ if (size > CREDENTIALS_TOTAL_SIZE_MAX - c->size_sum) {
+ log_warning("Accumulated credential size would be above allowed limit (%s+%s > %s), skipping '%s'.",
+ FORMAT_BYTES(c->size_sum), FORMAT_BYTES(size), FORMAT_BYTES(CREDENTIALS_TOTAL_SIZE_MAX), name);
+ return false;
+ }
+
+ return true;
+}
+
+static int finalize_credentials_dir(const char *dir, const char *envvar) {
+ int r;
+
+ assert(dir);
+ assert(envvar);
+
+ /* Try to make the credentials directory read-only now */
+
+ r = make_mount_point(dir);
+ if (r < 0)
+ log_warning_errno(r, "Failed to make '%s' a mount point, ignoring: %m", dir);
+ else
+ (void) mount_nofollow_verbose(LOG_WARNING, NULL, dir, NULL, MS_BIND|MS_NODEV|MS_NOEXEC|MS_NOSUID|MS_RDONLY|MS_REMOUNT, NULL);
+
+ if (setenv(envvar, dir, /* overwrite= */ true) < 0)
+ return log_error_errno(errno, "Failed to set $%s environment variable: %m", envvar);
+
+ return 0;
+}
+
+static int import_credentials_boot(void) {
+ _cleanup_(import_credentials_context_free) ImportCredentialContext context = {
+ .target_dir_fd = -1,
+ };
+ int r;
+
+ /* systemd-stub will wrap sidecar *.cred files from the UEFI kernel image directory into initrd
+ * cpios, so that they unpack into /.extra/. We'll pick them up from there and copy them into /run/
+ * so that we can access them during the entire runtime (note that the initrd file system is erased
+ * during the initrd → host transition). Note that these credentials originate from an untrusted
+ * source (i.e. the ESP typically) and thus need to be authenticated later. We thus put them in a
+ * directory separate from the usual credentials which are from a trusted source. */
+
+ if (!in_initrd())
+ return 0;
+
+ FOREACH_STRING(p,
+ "/.extra/credentials/", /* specific to this boot menu */
+ "/.extra/global_credentials/") { /* boot partition wide */
+
+ _cleanup_free_ DirectoryEntries *de = NULL;
+ _cleanup_close_ int source_dir_fd = -1;
+
+ source_dir_fd = open(p, O_RDONLY|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW);
+ if (source_dir_fd < 0) {
+ if (errno == ENOENT) {
+ log_debug("No credentials passed via %s.", p);
+ continue;
+ }
+
+ log_warning_errno(errno, "Failed to open '%s', ignoring: %m", p);
+ continue;
+ }
+
+ r = readdir_all(source_dir_fd, RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT, &de);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to read '%s' contents, ignoring: %m", p);
+ continue;
+ }
+
+ for (size_t i = 0; i < de->n_entries; i++) {
+ const struct dirent *d = de->entries[i];
+ _cleanup_close_ int cfd = -1, nfd = -1;
+ _cleanup_free_ char *n = NULL;
+ const char *e;
+ struct stat st;
+
+ e = endswith(d->d_name, ".cred");
+ if (!e)
+ continue;
+
+ /* drop .cred suffix (which we want in the ESP sidecar dir, but not for our internal
+ * processing) */
+ n = strndup(d->d_name, e - d->d_name);
+ if (!n)
+ return log_oom();
+
+ if (!credential_name_valid(n)) {
+ log_warning("Credential '%s' has invalid name, ignoring.", d->d_name);
+ continue;
+ }
+
+ cfd = openat(source_dir_fd, d->d_name, O_RDONLY|O_CLOEXEC);
+ if (cfd < 0) {
+ log_warning_errno(errno, "Failed to open %s, ignoring: %m", d->d_name);
+ continue;
+ }
+
+ if (fstat(cfd, &st) < 0) {
+ log_warning_errno(errno, "Failed to stat %s, ignoring: %m", d->d_name);
+ continue;
+ }
+
+ r = stat_verify_regular(&st);
+ if (r < 0) {
+ log_warning_errno(r, "Credential file %s is not a regular file, ignoring: %m", d->d_name);
+ continue;
+ }
+
+ if (!credential_size_ok(&context, n, st.st_size))
+ continue;
+
+ r = acquire_encrypted_credential_directory(&context);
+ if (r < 0)
+ return r;
+
+ nfd = open_credential_file_for_write(context.target_dir_fd, ENCRYPTED_SYSTEM_CREDENTIALS_DIRECTORY, n);
+ if (nfd == -EEXIST)
+ continue;
+ if (nfd < 0)
+ return nfd;
+
+ r = copy_bytes(cfd, nfd, st.st_size, 0);
+ if (r < 0) {
+ (void) unlinkat(context.target_dir_fd, n, 0);
+ return log_error_errno(r, "Failed to create credential '%s': %m", n);
+ }
+
+ context.size_sum += st.st_size;
+ context.n_credentials++;
+
+ log_debug("Successfully copied boot credential '%s'.", n);
+ }
+ }
+
+ if (context.n_credentials > 0) {
+ log_debug("Imported %u credentials from boot loader.", context.n_credentials);
+
+ r = finalize_credentials_dir(ENCRYPTED_SYSTEM_CREDENTIALS_DIRECTORY, "ENCRYPTED_CREDENTIALS_DIRECTORY");
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+static int acquire_credential_directory(ImportCredentialContext *c) {
+ int r;
+
+ assert(c);
+
+ if (c->target_dir_fd >= 0)
+ return c->target_dir_fd;
+
+ r = path_is_mount_point(SYSTEM_CREDENTIALS_DIRECTORY, NULL, 0);
+ if (r < 0) {
+ if (r != -ENOENT)
+ return log_error_errno(r, "Failed to determine if " SYSTEM_CREDENTIALS_DIRECTORY " is a mount point: %m");
+
+ r = mkdir_safe_label(SYSTEM_CREDENTIALS_DIRECTORY, 0700, 0, 0, MKDIR_WARN_MODE);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create " SYSTEM_CREDENTIALS_DIRECTORY " mount point: %m");
+
+ r = 0; /* Now it exists and is not a mount point */
+ }
+ if (r == 0)
+ /* If not a mountpoint yet, try to mount a ramfs there (so that this stuff isn't swapped
+ * out), but if that doesn't work, let's just use the regular tmpfs it already is. */
+ (void) mount_nofollow_verbose(LOG_WARNING, "ramfs", SYSTEM_CREDENTIALS_DIRECTORY, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
+
+ c->target_dir_fd = open(SYSTEM_CREDENTIALS_DIRECTORY, O_RDONLY|O_DIRECTORY|O_CLOEXEC);
+ if (c->target_dir_fd < 0)
+ return log_error_errno(errno, "Failed to open " SYSTEM_CREDENTIALS_DIRECTORY ": %m");
+
+ return c->target_dir_fd;
+}
+
+static int proc_cmdline_callback(const char *key, const char *value, void *data) {
+ ImportCredentialContext *c = ASSERT_PTR(data);
+ _cleanup_free_ char *n = NULL;
+ _cleanup_close_ int nfd = -1;
+ const char *colon;
+ size_t l;
+ int r;
+
+ assert(key);
+
+ if (!proc_cmdline_key_streq(key, "systemd.set_credential"))
+ return 0;
+
+ colon = value ? strchr(value, ':') : NULL;
+ if (!colon) {
+ log_warning("Credential assignment through kernel command line lacks ':' character, ignoring: %s", value);
+ return 0;
+ }
+
+ n = strndup(value, colon - value);
+ if (!n)
+ return log_oom();
+
+ if (!credential_name_valid(n)) {
+ log_warning("Credential name '%s' is invalid, ignoring.", n);
+ return 0;
+ }
+
+ colon++;
+ l = strlen(colon);
+
+ if (!credential_size_ok(c, n, l))
+ return 0;
+
+ r = acquire_credential_directory(c);
+ if (r < 0)
+ return r;
+
+ nfd = open_credential_file_for_write(c->target_dir_fd, SYSTEM_CREDENTIALS_DIRECTORY, n);
+ if (nfd == -EEXIST)
+ return 0;
+ if (nfd < 0)
+ return nfd;
+
+ r = loop_write(nfd, colon, l, /* do_poll= */ false);
+ if (r < 0) {
+ (void) unlinkat(c->target_dir_fd, n, 0);
+ return log_error_errno(r, "Failed to write credential: %m");
+ }
+
+ c->size_sum += l;
+ c->n_credentials++;
+
+ log_debug("Successfully processed kernel command line credential '%s'.", n);
+
+ return 0;
+}
+
+static int import_credentials_proc_cmdline(ImportCredentialContext *c) {
+ int r;
+
+ assert(c);
+
+ r = proc_cmdline_parse(proc_cmdline_callback, c, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse /proc/cmdline: %m");
+
+ return 0;
+}
+
+#define QEMU_FWCFG_PATH "/sys/firmware/qemu_fw_cfg/by_name/opt/io.systemd.credentials"
+
+static int import_credentials_qemu(ImportCredentialContext *c) {
+ _cleanup_free_ DirectoryEntries *de = NULL;
+ _cleanup_close_ int source_dir_fd = -1;
+ int r;
+
+ assert(c);
+
+ if (detect_container() > 0) /* don't access /sys/ in a container */
+ return 0;
+
+ source_dir_fd = open(QEMU_FWCFG_PATH, O_RDONLY|O_DIRECTORY|O_CLOEXEC);
+ if (source_dir_fd < 0) {
+ if (errno == ENOENT) {
+ log_debug("No credentials passed via fw_cfg.");
+ return 0;
+ }
+
+ log_warning_errno(errno, "Failed to open '" QEMU_FWCFG_PATH "', ignoring: %m");
+ return 0;
+ }
+
+ r = readdir_all(source_dir_fd, RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT, &de);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to read '" QEMU_FWCFG_PATH "' contents, ignoring: %m");
+ return 0;
+ }
+
+ for (size_t i = 0; i < de->n_entries; i++) {
+ const struct dirent *d = de->entries[i];
+ _cleanup_close_ int vfd = -1, rfd = -1, nfd = -1;
+ _cleanup_free_ char *szs = NULL;
+ uint64_t sz;
+
+ if (!credential_name_valid(d->d_name)) {
+ log_warning("Credential '%s' has invalid name, ignoring.", d->d_name);
+ continue;
+ }
+
+ vfd = openat(source_dir_fd, d->d_name, O_RDONLY|O_DIRECTORY|O_CLOEXEC);
+ if (vfd < 0) {
+ log_warning_errno(errno, "Failed to open '" QEMU_FWCFG_PATH "'/%s/, ignoring: %m", d->d_name);
+ continue;
+ }
+
+ r = read_virtual_file_at(vfd, "size", LINE_MAX, &szs, NULL);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to read '" QEMU_FWCFG_PATH "'/%s/size, ignoring: %m", d->d_name);
+ continue;
+ }
+
+ r = safe_atou64(strstrip(szs), &sz);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to parse size of credential '%s', ignoring: %s", d->d_name, szs);
+ continue;
+ }
+
+ if (!credential_size_ok(c, d->d_name, sz))
+ continue;
+
+ /* Ideally we'd just symlink the data here. Alas the kernel driver exports the raw file as
+ * having size zero, and we'd rather not have applications support such credential
+ * files. Let's hence copy the files to make them regular. */
+
+ rfd = openat(vfd, "raw", O_RDONLY|O_CLOEXEC);
+ if (rfd < 0) {
+ log_warning_errno(errno, "Failed to open '" QEMU_FWCFG_PATH "'/%s/raw, ignoring: %m", d->d_name);
+ continue;
+ }
+
+ r = acquire_credential_directory(c);
+ if (r < 0)
+ return r;
+
+ nfd = open_credential_file_for_write(c->target_dir_fd, SYSTEM_CREDENTIALS_DIRECTORY, d->d_name);
+ if (nfd == -EEXIST)
+ continue;
+ if (nfd < 0)
+ return nfd;
+
+ r = copy_bytes(rfd, nfd, sz, 0);
+ if (r < 0) {
+ (void) unlinkat(c->target_dir_fd, d->d_name, 0);
+ return log_error_errno(r, "Failed to create credential '%s': %m", d->d_name);
+ }
+
+ c->size_sum += sz;
+ c->n_credentials++;
+
+ log_debug("Successfully copied qemu fw_cfg credential '%s'.", d->d_name);
+ }
+
+ return 0;
+}
+
+static int parse_smbios_strings(ImportCredentialContext *c, const char *data, size_t size) {
+ size_t left, skip;
+ const char *p;
+ int r;
+
+ assert(c);
+ assert(data || size == 0);
+
+ /* Unpacks a packed series of SMBIOS OEM vendor strings. These are a series of NUL terminated
+ * strings, one after the other. */
+
+ for (p = data, left = size; left > 0; p += skip, left -= skip) {
+ _cleanup_free_ void *buf = NULL;
+ _cleanup_free_ char *cn = NULL;
+ _cleanup_close_ int nfd = -1;
+ const char *nul, *n, *eq;
+ const void *cdata;
+ size_t buflen, cdata_len;
+ bool unbase64;
+
+ nul = memchr(p, 0, left);
+ if (nul)
+ skip = (nul - p) + 1;
+ else {
+ nul = p + left;
+ skip = left;
+ }
+
+ if (nul - p == 0) /* Skip empty strings */
+ continue;
+
+ /* Only care about strings starting with either of these two prefixes */
+ if ((n = memory_startswith(p, nul - p, "io.systemd.credential:")))
+ unbase64 = false;
+ else if ((n = memory_startswith(p, nul - p, "io.systemd.credential.binary:")))
+ unbase64 = true;
+ else {
+ _cleanup_free_ char *escaped = NULL;
+
+ escaped = cescape_length(p, nul - p);
+ log_debug("Ignoring OEM string: %s", strnull(escaped));
+ continue;
+ }
+
+ eq = memchr(n, '=', nul - n);
+ if (!eq) {
+ log_warning("SMBIOS OEM string lacks '=' character, ignoring.");
+ continue;
+ }
+
+ cn = memdup_suffix0(n, eq - n);
+ if (!cn)
+ return log_oom();
+
+ if (!credential_name_valid(cn)) {
+ log_warning("SMBIOS credential name '%s' is not valid, ignoring: %m", cn);
+ continue;
+ }
+
+ /* Optionally base64 decode the data, if requested, to allow binary credentials */
+ if (unbase64) {
+ r = unbase64mem(eq + 1, nul - (eq + 1), &buf, &buflen);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to base64 decode credential '%s', ignoring: %m", cn);
+ continue;
+ }
+
+ cdata = buf;
+ cdata_len = buflen;
+ } else {
+ cdata = eq + 1;
+ cdata_len = nul - (eq + 1);
+ }
+
+ if (!credential_size_ok(c, cn, cdata_len))
+ continue;
+
+ r = acquire_credential_directory(c);
+ if (r < 0)
+ return r;
+
+ nfd = open_credential_file_for_write(c->target_dir_fd, SYSTEM_CREDENTIALS_DIRECTORY, cn);
+ if (nfd == -EEXIST)
+ continue;
+ if (nfd < 0)
+ return nfd;
+
+ r = loop_write(nfd, cdata, cdata_len, /* do_poll= */ false);
+ if (r < 0) {
+ (void) unlinkat(c->target_dir_fd, cn, 0);
+ return log_error_errno(r, "Failed to write credential: %m");
+ }
+
+ c->size_sum += cdata_len;
+ c->n_credentials++;
+
+ log_debug("Successfully processed SMBIOS credential '%s'.", cn);
+ }
+
+ return 0;
+}
+
+static int import_credentials_smbios(ImportCredentialContext *c) {
+ int r;
+
+ /* Parses DMI OEM strings fields (SMBIOS type 11), as settable with qemu's -smbios type=11,value=… switch. */
+
+ if (detect_container() > 0) /* don't access /sys/ in a container */
+ return 0;
+
+ for (unsigned i = 0;; i++) {
+ struct dmi_field_header {
+ uint8_t type;
+ uint8_t length;
+ uint16_t handle;
+ uint8_t count;
+ char contents[];
+ } _packed_ *dmi_field_header;
+ _cleanup_free_ char *p = NULL;
+ _cleanup_free_ void *data = NULL;
+ size_t size;
+
+ assert_cc(offsetof(struct dmi_field_header, contents) == 5);
+
+ if (asprintf(&p, "/sys/firmware/dmi/entries/11-%u/raw", i) < 0)
+ return log_oom();
+
+ r = read_virtual_file(p, sizeof(dmi_field_header) + CREDENTIALS_TOTAL_SIZE_MAX, (char**) &data, &size);
+ if (r < 0) {
+ /* Once we reach ENOENT there are no more DMI Type 11 fields around. */
+ log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r, "Failed to open '%s', ignoring: %m", p);
+ break;
+ }
+
+ if (size < offsetof(struct dmi_field_header, contents))
+ return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "DMI field header of '%s' too short.", p);
+
+ dmi_field_header = data;
+ if (dmi_field_header->type != 11 ||
+ dmi_field_header->length != offsetof(struct dmi_field_header, contents))
+ return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Invalid DMI field header.");
+
+ r = parse_smbios_strings(c, dmi_field_header->contents, size - offsetof(struct dmi_field_header, contents));
+ if (r < 0)
+ return r;
+
+ if (i == UINT_MAX) /* Prevent overflow */
+ break;
+ }
+
+ return 0;
+}
+
+static int import_credentials_trusted(void) {
+ _cleanup_(import_credentials_context_free) ImportCredentialContext c = {
+ .target_dir_fd = -1,
+ };
+ int q, w, r;
+
+ r = import_credentials_qemu(&c);
+ w = import_credentials_smbios(&c);
+ q = import_credentials_proc_cmdline(&c);
+
+ if (c.n_credentials > 0) {
+ int z;
+
+ log_debug("Imported %u credentials from kernel command line/smbios/fw_cfg.", c.n_credentials);
+
+ z = finalize_credentials_dir(SYSTEM_CREDENTIALS_DIRECTORY, "CREDENTIALS_DIRECTORY");
+ if (z < 0)
+ return z;
+ }
+
+ return r < 0 ? r : w < 0 ? w : q;
+}
+
+static int symlink_credential_dir(const char *envvar, const char *path, const char *where) {
+ int r;
+
+ assert(envvar);
+ assert(path);
+ assert(where);
+
+ if (!path_is_valid(path) || !path_is_absolute(path))
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "String specified via $%s is not a valid absolute path, refusing: %s", envvar, path);
+
+ /* If the env var already points to where we intend to create the symlink, then most likely we
+ * already imported some creds earlier, and thus set the env var, and hence don't need to do
+ * anything. */
+ if (path_equal(path, where))
+ return 0;
+
+ r = symlink_idempotent(path, where, /* make_relative= */ true);
+ if (r < 0)
+ return log_error_errno(r, "Failed to link $%s to %s: %m", envvar, where);
+
+ return 0;
+}
+
+int import_credentials(void) {
+ const char *received_creds_dir = NULL, *received_encrypted_creds_dir = NULL;
+ bool envvar_set = false;
+ int r, q;
+
+ r = get_credentials_dir(&received_creds_dir);
+ if (r < 0 && r != -ENXIO) /* ENXIO → env var not set yet */
+ log_warning_errno(r, "Failed to determine credentials directory, ignoring: %m");
+
+ envvar_set = r >= 0;
+
+ r = get_encrypted_credentials_dir(&received_encrypted_creds_dir);
+ if (r < 0 && r != -ENXIO) /* ENXIO → env var not set yet */
+ log_warning_errno(r, "Failed to determine encrypted credentials directory, ignoring: %m");
+
+ envvar_set = envvar_set || r >= 0;
+
+ if (envvar_set) {
+ /* Maybe an earlier stage initrd already set this up? If so, don't try to import anything again. */
+ log_debug("Not importing credentials, $CREDENTIALS_DIRECTORY or $ENCRYPTED_CREDENTIALS_DIRECTORY already set.");
+
+ /* But, let's make sure the creds are available from our regular paths. */
+ if (received_creds_dir)
+ r = symlink_credential_dir("CREDENTIALS_DIRECTORY", received_creds_dir, SYSTEM_CREDENTIALS_DIRECTORY);
+ else
+ r = 0;
+
+ if (received_encrypted_creds_dir) {
+ q = symlink_credential_dir("ENCRYPTED_CREDENTIALS_DIRECTORY", received_encrypted_creds_dir, ENCRYPTED_SYSTEM_CREDENTIALS_DIRECTORY);
+ if (r >= 0)
+ r = q;
+ }
+
+ } else {
+ _cleanup_free_ char *v = NULL;
+
+ r = proc_cmdline_get_key("systemd.import_credentials", PROC_CMDLINE_STRIP_RD_PREFIX, &v);
+ if (r < 0)
+ log_debug_errno(r, "Failed to check if 'systemd.import_credentials=' kernel command line option is set, ignoring: %m");
+ else if (r > 0) {
+ r = parse_boolean(v);
+ if (r < 0)
+ log_debug_errno(r, "Failed to parse 'systemd.import_credentials=' parameter, ignoring: %m");
+ else if (r == 0) {
+ log_notice("systemd.import_credentials=no is set, skipping importing of credentials.");
+ return 0;
+ }
+ }
+
+ r = import_credentials_boot();
+
+ q = import_credentials_trusted();
+ if (r >= 0)
+ r = q;
+ }
+
+ return r;
+}
diff --git a/src/core/import-creds.h b/src/core/import-creds.h
new file mode 100644
index 0000000..a87865c
--- /dev/null
+++ b/src/core/import-creds.h
@@ -0,0 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+int import_credentials(void);
diff --git a/src/core/job.c b/src/core/job.c
new file mode 100644
index 0000000..44610f8
--- /dev/null
+++ b/src/core/job.c
@@ -0,0 +1,1679 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+
+#include "sd-id128.h"
+#include "sd-messages.h"
+
+#include "alloc-util.h"
+#include "async.h"
+#include "cgroup.h"
+#include "dbus-job.h"
+#include "dbus.h"
+#include "escape.h"
+#include "fileio.h"
+#include "job.h"
+#include "log.h"
+#include "macro.h"
+#include "parse-util.h"
+#include "serialize.h"
+#include "set.h"
+#include "sort-util.h"
+#include "special.h"
+#include "stdio-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "terminal-util.h"
+#include "unit.h"
+#include "virt.h"
+
+Job* job_new_raw(Unit *unit) {
+ Job *j;
+
+ /* used for deserialization */
+
+ assert(unit);
+
+ j = new(Job, 1);
+ if (!j)
+ return NULL;
+
+ *j = (Job) {
+ .manager = unit->manager,
+ .unit = unit,
+ .type = _JOB_TYPE_INVALID,
+ };
+
+ return j;
+}
+
+Job* job_new(Unit *unit, JobType type) {
+ Job *j;
+
+ assert(type < _JOB_TYPE_MAX);
+
+ j = job_new_raw(unit);
+ if (!j)
+ return NULL;
+
+ j->id = j->manager->current_job_id++;
+ j->type = type;
+
+ /* We don't link it here, that's what job_dependency() is for */
+
+ return j;
+}
+
+void job_unlink(Job *j) {
+ assert(j);
+ assert(!j->installed);
+ assert(!j->transaction_prev);
+ assert(!j->transaction_next);
+ assert(!j->subject_list);
+ assert(!j->object_list);
+
+ if (j->in_run_queue) {
+ prioq_remove(j->manager->run_queue, j, &j->run_queue_idx);
+ j->in_run_queue = false;
+ }
+
+ if (j->in_dbus_queue) {
+ LIST_REMOVE(dbus_queue, j->manager->dbus_job_queue, j);
+ j->in_dbus_queue = false;
+ }
+
+ if (j->in_gc_queue) {
+ LIST_REMOVE(gc_queue, j->manager->gc_job_queue, j);
+ j->in_gc_queue = false;
+ }
+
+ j->timer_event_source = sd_event_source_disable_unref(j->timer_event_source);
+}
+
+Job* job_free(Job *j) {
+ assert(j);
+ assert(!j->installed);
+ assert(!j->transaction_prev);
+ assert(!j->transaction_next);
+ assert(!j->subject_list);
+ assert(!j->object_list);
+
+ job_unlink(j);
+
+ sd_bus_track_unref(j->bus_track);
+ strv_free(j->deserialized_clients);
+
+ activation_details_unref(j->activation_details);
+
+ return mfree(j);
+}
+
+static void job_set_state(Job *j, JobState state) {
+ assert(j);
+ assert(state >= 0);
+ assert(state < _JOB_STATE_MAX);
+
+ if (j->state == state)
+ return;
+
+ j->state = state;
+
+ if (!j->installed)
+ return;
+
+ if (j->state == JOB_RUNNING)
+ j->unit->manager->n_running_jobs++;
+ else {
+ assert(j->state == JOB_WAITING);
+ assert(j->unit->manager->n_running_jobs > 0);
+
+ j->unit->manager->n_running_jobs--;
+
+ if (j->unit->manager->n_running_jobs <= 0)
+ j->unit->manager->jobs_in_progress_event_source = sd_event_source_disable_unref(j->unit->manager->jobs_in_progress_event_source);
+ }
+}
+
+void job_uninstall(Job *j) {
+ Job **pj;
+
+ assert(j->installed);
+
+ job_set_state(j, JOB_WAITING);
+
+ pj = j->type == JOB_NOP ? &j->unit->nop_job : &j->unit->job;
+ assert(*pj == j);
+
+ /* Detach from next 'bigger' objects */
+
+ /* daemon-reload should be transparent to job observers */
+ if (!MANAGER_IS_RELOADING(j->manager))
+ bus_job_send_removed_signal(j);
+
+ *pj = NULL;
+
+ unit_add_to_gc_queue(j->unit);
+
+ unit_add_to_dbus_queue(j->unit); /* The Job property of the unit has changed now */
+
+ hashmap_remove_value(j->manager->jobs, UINT32_TO_PTR(j->id), j);
+ j->installed = false;
+}
+
+static bool job_type_allows_late_merge(JobType t) {
+ /* Tells whether it is OK to merge a job of type 't' with an already
+ * running job.
+ * Reloads cannot be merged this way. Think of the sequence:
+ * 1. Reload of a daemon is in progress; the daemon has already loaded
+ * its config file, but hasn't completed the reload operation yet.
+ * 2. Edit foo's config file.
+ * 3. Trigger another reload to have the daemon use the new config.
+ * Should the second reload job be merged into the first one, the daemon
+ * would not know about the new config.
+ * JOB_RESTART jobs on the other hand can be merged, because they get
+ * patched into JOB_START after stopping the unit. So if we see a
+ * JOB_RESTART running, it means the unit hasn't stopped yet and at
+ * this time the merge is still allowed. */
+ return t != JOB_RELOAD;
+}
+
+static void job_merge_into_installed(Job *j, Job *other) {
+ assert(j->installed);
+ assert(j->unit == other->unit);
+
+ if (j->type != JOB_NOP) {
+ assert_se(job_type_merge_and_collapse(&j->type, other->type, j->unit) == 0);
+
+ /* Keep the oldest ActivationDetails, if any */
+ if (!j->activation_details)
+ j->activation_details = TAKE_PTR(other->activation_details);
+ } else
+ assert(other->type == JOB_NOP);
+
+ j->irreversible = j->irreversible || other->irreversible;
+ j->ignore_order = j->ignore_order || other->ignore_order;
+}
+
+Job* job_install(Job *j) {
+ Job **pj;
+ Job *uj;
+
+ assert(!j->installed);
+ assert(j->type < _JOB_TYPE_MAX_IN_TRANSACTION);
+ assert(j->state == JOB_WAITING);
+
+ pj = j->type == JOB_NOP ? &j->unit->nop_job : &j->unit->job;
+ uj = *pj;
+
+ if (uj) {
+ if (job_type_is_conflicting(uj->type, j->type))
+ job_finish_and_invalidate(uj, JOB_CANCELED, false, false);
+ else {
+ /* not conflicting, i.e. mergeable */
+
+ if (uj->state == JOB_WAITING ||
+ (job_type_allows_late_merge(j->type) && job_type_is_superset(uj->type, j->type))) {
+ job_merge_into_installed(uj, j);
+ log_unit_debug(uj->unit,
+ "Merged %s/%s into installed job %s/%s as %"PRIu32,
+ j->unit->id, job_type_to_string(j->type), uj->unit->id,
+ job_type_to_string(uj->type), uj->id);
+ return uj;
+ } else {
+ /* already running and not safe to merge into */
+ /* Patch uj to become a merged job and re-run it. */
+ /* XXX It should be safer to queue j to run after uj finishes, but it is
+ * not currently possible to have more than one installed job per unit. */
+ job_merge_into_installed(uj, j);
+ log_unit_debug(uj->unit,
+ "Merged into running job, re-running: %s/%s as %"PRIu32,
+ uj->unit->id, job_type_to_string(uj->type), uj->id);
+
+ job_set_state(uj, JOB_WAITING);
+ return uj;
+ }
+ }
+ }
+
+ /* Install the job */
+ *pj = j;
+ j->installed = true;
+
+ j->manager->n_installed_jobs++;
+ log_unit_debug(j->unit,
+ "Installed new job %s/%s as %u",
+ j->unit->id, job_type_to_string(j->type), (unsigned) j->id);
+
+ job_add_to_gc_queue(j);
+
+ job_add_to_dbus_queue(j); /* announce this job to clients */
+ unit_add_to_dbus_queue(j->unit); /* The Job property of the unit has changed now */
+
+ return j;
+}
+
+int job_install_deserialized(Job *j) {
+ Job **pj;
+ int r;
+
+ assert(!j->installed);
+
+ if (j->type < 0 || j->type >= _JOB_TYPE_MAX_IN_TRANSACTION)
+ return log_unit_debug_errno(j->unit, SYNTHETIC_ERRNO(EINVAL),
+ "Invalid job type %s in deserialization.",
+ strna(job_type_to_string(j->type)));
+
+ pj = j->type == JOB_NOP ? &j->unit->nop_job : &j->unit->job;
+ if (*pj)
+ return log_unit_debug_errno(j->unit, SYNTHETIC_ERRNO(EEXIST),
+ "Unit already has a job installed. Not installing deserialized job.");
+
+ r = hashmap_ensure_put(&j->manager->jobs, NULL, UINT32_TO_PTR(j->id), j);
+ if (r == -EEXIST)
+ return log_unit_debug_errno(j->unit, r, "Job ID %" PRIu32 " already used, cannot deserialize job.", j->id);
+ if (r < 0)
+ return log_unit_debug_errno(j->unit, r, "Failed to insert job into jobs hash table: %m");
+
+ *pj = j;
+ j->installed = true;
+
+ if (j->state == JOB_RUNNING)
+ j->unit->manager->n_running_jobs++;
+
+ log_unit_debug(j->unit,
+ "Reinstalled deserialized job %s/%s as %u",
+ j->unit->id, job_type_to_string(j->type), (unsigned) j->id);
+ return 0;
+}
+
+JobDependency* job_dependency_new(Job *subject, Job *object, bool matters, bool conflicts) {
+ JobDependency *l;
+
+ assert(object);
+
+ /* Adds a new job link, which encodes that the 'subject' job
+ * needs the 'object' job in some way. If 'subject' is NULL
+ * this means the 'anchor' job (i.e. the one the user
+ * explicitly asked for) is the requester. */
+
+ l = new0(JobDependency, 1);
+ if (!l)
+ return NULL;
+
+ l->subject = subject;
+ l->object = object;
+ l->matters = matters;
+ l->conflicts = conflicts;
+
+ if (subject)
+ LIST_PREPEND(subject, subject->subject_list, l);
+
+ LIST_PREPEND(object, object->object_list, l);
+
+ return l;
+}
+
+void job_dependency_free(JobDependency *l) {
+ assert(l);
+
+ if (l->subject)
+ LIST_REMOVE(subject, l->subject->subject_list, l);
+
+ LIST_REMOVE(object, l->object->object_list, l);
+
+ free(l);
+}
+
+void job_dump(Job *j, FILE *f, const char *prefix) {
+ assert(j);
+ assert(f);
+
+ prefix = strempty(prefix);
+
+ fprintf(f,
+ "%s-> Job %u:\n"
+ "%s\tAction: %s -> %s\n"
+ "%s\tState: %s\n"
+ "%s\tIrreversible: %s\n"
+ "%s\tMay GC: %s\n",
+ prefix, j->id,
+ prefix, j->unit->id, job_type_to_string(j->type),
+ prefix, job_state_to_string(j->state),
+ prefix, yes_no(j->irreversible),
+ prefix, yes_no(job_may_gc(j)));
+}
+
+/*
+ * Merging is commutative, so imagine the matrix as symmetric. We store only
+ * its lower triangle to avoid duplication. We don't store the main diagonal,
+ * because A merged with A is simply A.
+ *
+ * If the resulting type is collapsed immediately afterwards (to get rid of
+ * the JOB_RELOAD_OR_START, which lies outside the lookup function's domain),
+ * the following properties hold:
+ *
+ * Merging is associative! A merged with B, and then merged with C is the same
+ * as A merged with the result of B merged with C.
+ *
+ * Mergeability is transitive! If A can be merged with B and B with C then
+ * A also with C.
+ *
+ * Also, if A merged with B cannot be merged with C, then either A or B cannot
+ * be merged with C either.
+ */
+static const JobType job_merging_table[] = {
+/* What \ With * JOB_START JOB_VERIFY_ACTIVE JOB_STOP JOB_RELOAD */
+/*********************************************************************************/
+/*JOB_START */
+/*JOB_VERIFY_ACTIVE */ JOB_START,
+/*JOB_STOP */ -1, -1,
+/*JOB_RELOAD */ JOB_RELOAD_OR_START, JOB_RELOAD, -1,
+/*JOB_RESTART */ JOB_RESTART, JOB_RESTART, -1, JOB_RESTART,
+};
+
+JobType job_type_lookup_merge(JobType a, JobType b) {
+ assert_cc(ELEMENTSOF(job_merging_table) == _JOB_TYPE_MAX_MERGING * (_JOB_TYPE_MAX_MERGING - 1) / 2);
+ assert(a >= 0 && a < _JOB_TYPE_MAX_MERGING);
+ assert(b >= 0 && b < _JOB_TYPE_MAX_MERGING);
+
+ if (a == b)
+ return a;
+
+ if (a < b) {
+ JobType tmp = a;
+ a = b;
+ b = tmp;
+ }
+
+ return job_merging_table[(a - 1) * a / 2 + b];
+}
+
+bool job_type_is_redundant(JobType a, UnitActiveState b) {
+ switch (a) {
+
+ case JOB_START:
+ return IN_SET(b, UNIT_ACTIVE, UNIT_RELOADING);
+
+ case JOB_STOP:
+ return IN_SET(b, UNIT_INACTIVE, UNIT_FAILED);
+
+ case JOB_VERIFY_ACTIVE:
+ return IN_SET(b, UNIT_ACTIVE, UNIT_RELOADING);
+
+ case JOB_RELOAD:
+ return
+ b == UNIT_RELOADING;
+
+ case JOB_RESTART:
+ /* Restart jobs must always be kept.
+ *
+ * For ACTIVE/RELOADING units, this is obvious.
+ *
+ * For ACTIVATING units, it's more subtle:
+ *
+ * Generally, if a service Requires= another unit, restarts of
+ * the unit must be propagated to the service. If the service is
+ * ACTIVATING, it must still be restarted since it might have
+ * stale information regarding the other unit.
+ *
+ * For example, consider a service that Requires= a socket: if
+ * the socket is restarted, but the service is still ACTIVATING,
+ * it's necessary to restart the service so that it gets the new
+ * socket. */
+ return false;
+
+ case JOB_NOP:
+ return true;
+
+ default:
+ assert_not_reached();
+ }
+}
+
+JobType job_type_collapse(JobType t, Unit *u) {
+ UnitActiveState s;
+
+ switch (t) {
+
+ case JOB_TRY_RESTART:
+ /* Be sure to keep the restart job even if the unit is
+ * ACTIVATING.
+ *
+ * See the job_type_is_redundant(JOB_RESTART) for more info */
+ s = unit_active_state(u);
+ if (!UNIT_IS_ACTIVE_OR_ACTIVATING(s))
+ return JOB_NOP;
+
+ return JOB_RESTART;
+
+ case JOB_TRY_RELOAD:
+ s = unit_active_state(u);
+ if (!UNIT_IS_ACTIVE_OR_RELOADING(s))
+ return JOB_NOP;
+
+ return JOB_RELOAD;
+
+ case JOB_RELOAD_OR_START:
+ s = unit_active_state(u);
+ if (!UNIT_IS_ACTIVE_OR_RELOADING(s))
+ return JOB_START;
+
+ return JOB_RELOAD;
+
+ default:
+ return t;
+ }
+}
+
+int job_type_merge_and_collapse(JobType *a, JobType b, Unit *u) {
+ JobType t;
+
+ t = job_type_lookup_merge(*a, b);
+ if (t < 0)
+ return -EEXIST;
+
+ *a = job_type_collapse(t, u);
+ return 0;
+}
+
+static bool job_is_runnable(Job *j) {
+ Unit *other;
+
+ assert(j);
+ assert(j->installed);
+
+ /* Checks whether there is any job running for the units this
+ * job needs to be running after (in the case of a 'positive'
+ * job type) or before (in the case of a 'negative' job
+ * type. */
+
+ /* Note that unit types have a say in what is runnable,
+ * too. For example, if they return -EAGAIN from
+ * unit_start() they can indicate they are not
+ * runnable yet. */
+
+ /* First check if there is an override */
+ if (j->ignore_order)
+ return true;
+
+ if (j->type == JOB_NOP)
+ return true;
+
+ UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_AFTER)
+ if (other->job && job_compare(j, other->job, UNIT_ATOM_AFTER) > 0) {
+ log_unit_debug(j->unit,
+ "starting held back, waiting for: %s",
+ other->id);
+ return false;
+ }
+
+ UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_BEFORE)
+ if (other->job && job_compare(j, other->job, UNIT_ATOM_BEFORE) > 0) {
+ log_unit_debug(j->unit,
+ "stopping held back, waiting for: %s",
+ other->id);
+ return false;
+ }
+
+ return true;
+}
+
+static void job_change_type(Job *j, JobType newtype) {
+ assert(j);
+
+ log_unit_debug(j->unit,
+ "Converting job %s/%s -> %s/%s",
+ j->unit->id, job_type_to_string(j->type),
+ j->unit->id, job_type_to_string(newtype));
+
+ j->type = newtype;
+}
+
+static const char* job_start_message_format(Unit *u, JobType t) {
+ assert(u);
+ assert(IN_SET(t, JOB_START, JOB_STOP, JOB_RELOAD));
+
+ if (t == JOB_RELOAD)
+ return "Reloading %s...";
+ else if (t == JOB_START)
+ return UNIT_VTABLE(u)->status_message_formats.starting_stopping[0] ?: "Starting %s...";
+ else
+ return UNIT_VTABLE(u)->status_message_formats.starting_stopping[1] ?: "Stopping %s...";
+}
+
+static void job_emit_start_message(Unit *u, uint32_t job_id, JobType t) {
+ _cleanup_free_ char *free_ident = NULL;
+ const char *ident, *format;
+
+ assert(u);
+ assert(t >= 0);
+ assert(t < _JOB_TYPE_MAX);
+ assert(u->id); /* We better don't try to run a unit that doesn't even have an id. */
+
+ if (!IN_SET(t, JOB_START, JOB_STOP, JOB_RELOAD))
+ return;
+
+ if (!unit_log_level_test(u, LOG_INFO))
+ return;
+
+ format = job_start_message_format(u, t);
+ ident = unit_status_string(u, &free_ident);
+
+ bool do_console = t != JOB_RELOAD;
+ bool console_only = do_console && log_on_console(); /* Reload status messages have traditionally
+ * not been printed to the console. */
+
+ /* Print to the log first. */
+ if (!console_only) { /* Skip this if it would only go on the console anyway */
+
+ const char *mid =
+ t == JOB_START ? "MESSAGE_ID=" SD_MESSAGE_UNIT_STARTING_STR :
+ t == JOB_STOP ? "MESSAGE_ID=" SD_MESSAGE_UNIT_STOPPING_STR :
+ "MESSAGE_ID=" SD_MESSAGE_UNIT_RELOADING_STR;
+ const char *msg_fmt = strjoina("MESSAGE=", format);
+
+ /* Note that we deliberately use LOG_MESSAGE() instead of LOG_UNIT_MESSAGE() here, since this
+ * is supposed to mimic closely what is written to screen using the status output, which is
+ * supposed to be high level friendly output. */
+
+ DISABLE_WARNING_FORMAT_NONLITERAL;
+ log_unit_struct(u, LOG_INFO,
+ msg_fmt, ident,
+ "JOB_ID=%" PRIu32, job_id,
+ "JOB_TYPE=%s", job_type_to_string(t),
+ LOG_UNIT_INVOCATION_ID(u),
+ mid);
+ REENABLE_WARNING;
+ }
+
+ /* Log to the console second. */
+ if (do_console) {
+ DISABLE_WARNING_FORMAT_NONLITERAL;
+ unit_status_printf(u, STATUS_TYPE_NORMAL, "", format, ident);
+ REENABLE_WARNING;
+ }
+}
+
+static const char* job_done_message_format(Unit *u, JobType t, JobResult result) {
+ static const char* const generic_finished_start_job[_JOB_RESULT_MAX] = {
+ [JOB_DONE] = "Started %s.",
+ [JOB_TIMEOUT] = "Timed out starting %s.",
+ [JOB_FAILED] = "Failed to start %s.",
+ [JOB_DEPENDENCY] = "Dependency failed for %s.",
+ [JOB_ASSERT] = "Assertion failed for %s.",
+ [JOB_UNSUPPORTED] = "Starting of %s unsupported.",
+ [JOB_COLLECTED] = "Unnecessary job was removed for %s.",
+ [JOB_ONCE] = "Unit %s has been started before and cannot be started again.",
+ };
+ static const char* const generic_finished_stop_job[_JOB_RESULT_MAX] = {
+ [JOB_DONE] = "Stopped %s.",
+ [JOB_FAILED] = "Stopped %s with error.",
+ [JOB_TIMEOUT] = "Timed out stopping %s.",
+ };
+ static const char* const generic_finished_reload_job[_JOB_RESULT_MAX] = {
+ [JOB_DONE] = "Reloaded %s.",
+ [JOB_FAILED] = "Reload failed for %s.",
+ [JOB_TIMEOUT] = "Timed out reloading %s.",
+ };
+ /* When verify-active detects the unit is inactive, report it.
+ * Most likely a DEPEND warning from a requisiting unit will
+ * occur next and it's nice to see what was requisited. */
+ static const char* const generic_finished_verify_active_job[_JOB_RESULT_MAX] = {
+ [JOB_SKIPPED] = "%s is inactive.",
+ };
+ const char *format;
+
+ assert(u);
+ assert(t >= 0);
+ assert(t < _JOB_TYPE_MAX);
+
+ /* Show condition check message if the job did not actually do anything due to unmet condition. */
+ if (t == JOB_START && result == JOB_DONE && !u->condition_result)
+ return "Condition check resulted in %s being skipped.";
+
+ if (IN_SET(t, JOB_START, JOB_STOP, JOB_RESTART)) {
+ const UnitStatusMessageFormats *formats = &UNIT_VTABLE(u)->status_message_formats;
+ if (formats->finished_job) {
+ format = formats->finished_job(u, t, result);
+ if (format)
+ return format;
+ }
+
+ format = (t == JOB_START ? formats->finished_start_job : formats->finished_stop_job)[result];
+ if (format)
+ return format;
+ }
+
+ /* Return generic strings */
+ switch (t) {
+ case JOB_START:
+ return generic_finished_start_job[result];
+ case JOB_STOP:
+ case JOB_RESTART:
+ return generic_finished_stop_job[result];
+ case JOB_RELOAD:
+ return generic_finished_reload_job[result];
+ case JOB_VERIFY_ACTIVE:
+ return generic_finished_verify_active_job[result];
+ default:
+ return NULL;
+ }
+}
+
+static const struct {
+ int log_level;
+ const char *color, *word;
+} job_done_messages[_JOB_RESULT_MAX] = {
+ [JOB_DONE] = { LOG_INFO, ANSI_OK_COLOR, " OK " },
+ [JOB_CANCELED] = { LOG_INFO, },
+ [JOB_TIMEOUT] = { LOG_ERR, ANSI_HIGHLIGHT_RED, " TIME " },
+ [JOB_FAILED] = { LOG_ERR, ANSI_HIGHLIGHT_RED, "FAILED" },
+ [JOB_DEPENDENCY] = { LOG_WARNING, ANSI_HIGHLIGHT_YELLOW, "DEPEND" },
+ [JOB_SKIPPED] = { LOG_NOTICE, ANSI_HIGHLIGHT, " INFO " },
+ [JOB_INVALID] = { LOG_INFO, },
+ [JOB_ASSERT] = { LOG_WARNING, ANSI_HIGHLIGHT_YELLOW, "ASSERT" },
+ [JOB_UNSUPPORTED] = { LOG_WARNING, ANSI_HIGHLIGHT_YELLOW, "UNSUPP" },
+ [JOB_COLLECTED] = { LOG_INFO, },
+ [JOB_ONCE] = { LOG_ERR, ANSI_HIGHLIGHT_RED, " ONCE " },
+};
+
+static const char* job_done_mid(JobType type, JobResult result) {
+ switch (type) {
+ case JOB_START:
+ if (result == JOB_DONE)
+ return "MESSAGE_ID=" SD_MESSAGE_UNIT_STARTED_STR;
+ else
+ return "MESSAGE_ID=" SD_MESSAGE_UNIT_FAILED_STR;
+
+ case JOB_RELOAD:
+ return "MESSAGE_ID=" SD_MESSAGE_UNIT_RELOADED_STR;
+
+ case JOB_STOP:
+ case JOB_RESTART:
+ return "MESSAGE_ID=" SD_MESSAGE_UNIT_STOPPED_STR;
+
+ default:
+ return NULL;
+ }
+}
+
+static void job_emit_done_message(Unit *u, uint32_t job_id, JobType t, JobResult result) {
+ _cleanup_free_ char *free_ident = NULL;
+ const char *ident, *format;
+
+ assert(u);
+ assert(t >= 0);
+ assert(t < _JOB_TYPE_MAX);
+
+ if (!unit_log_level_test(u, job_done_messages[result].log_level))
+ return;
+
+ format = job_done_message_format(u, t, result);
+ if (!format)
+ return;
+
+ ident = unit_status_string(u, &free_ident);
+
+ const char *status = job_done_messages[result].word;
+ bool do_console = t != JOB_RELOAD && status;
+ bool console_only = do_console && log_on_console();
+
+ if (t == JOB_START && result == JOB_DONE && !u->condition_result) {
+ /* No message on the console if the job did not actually do anything due to unmet condition. */
+ if (console_only)
+ return;
+ else
+ do_console = false;
+ }
+
+ if (!console_only) { /* Skip printing if output goes to the console, and job_print_status_message()
+ * will actually print something to the console. */
+ Condition *c;
+ const char *mid = job_done_mid(t, result); /* mid may be NULL. log_unit_struct() will ignore it. */
+
+ c = t == JOB_START && result == JOB_DONE ? unit_find_failed_condition(u) : NULL;
+ if (c) {
+ /* Special case units that were skipped because of a unmet condition check so that
+ * we can add more information to the message. */
+ if (c->trigger)
+ log_unit_struct(
+ u,
+ job_done_messages[result].log_level,
+ LOG_MESSAGE("%s was skipped because no trigger condition checks were met.",
+ ident),
+ "JOB_ID=%" PRIu32, job_id,
+ "JOB_TYPE=%s", job_type_to_string(t),
+ "JOB_RESULT=%s", job_result_to_string(result),
+ LOG_UNIT_INVOCATION_ID(u),
+ mid);
+ else
+ log_unit_struct(
+ u,
+ job_done_messages[result].log_level,
+ LOG_MESSAGE("%s was skipped because of an unmet condition check (%s=%s%s).",
+ ident,
+ condition_type_to_string(c->type),
+ c->negate ? "!" : "",
+ c->parameter),
+ "JOB_ID=%" PRIu32, job_id,
+ "JOB_TYPE=%s", job_type_to_string(t),
+ "JOB_RESULT=%s", job_result_to_string(result),
+ LOG_UNIT_INVOCATION_ID(u),
+ mid);
+ } else {
+ const char *msg_fmt = strjoina("MESSAGE=", format);
+
+ DISABLE_WARNING_FORMAT_NONLITERAL;
+ log_unit_struct(u, job_done_messages[result].log_level,
+ msg_fmt, ident,
+ "JOB_ID=%" PRIu32, job_id,
+ "JOB_TYPE=%s", job_type_to_string(t),
+ "JOB_RESULT=%s", job_result_to_string(result),
+ LOG_UNIT_INVOCATION_ID(u),
+ mid);
+ REENABLE_WARNING;
+ }
+ }
+
+ if (do_console) {
+ if (log_get_show_color())
+ status = strjoina(job_done_messages[result].color,
+ status,
+ ANSI_NORMAL);
+
+ DISABLE_WARNING_FORMAT_NONLITERAL;
+ unit_status_printf(u,
+ result == JOB_DONE ? STATUS_TYPE_NORMAL : STATUS_TYPE_NOTICE,
+ status, format, ident);
+ REENABLE_WARNING;
+
+ if (t == JOB_START && result == JOB_FAILED) {
+ _cleanup_free_ char *quoted = NULL;
+
+ quoted = shell_maybe_quote(u->id, 0);
+ if (quoted)
+ manager_status_printf(u->manager, STATUS_TYPE_NORMAL, NULL,
+ "See 'systemctl status %s' for details.", quoted);
+ }
+ }
+}
+
+static int job_perform_on_unit(Job **j) {
+ ActivationDetails *a;
+ uint32_t id;
+ Manager *m;
+ JobType t;
+ Unit *u;
+ bool wait_only;
+ int r;
+
+ /* While we execute this operation the job might go away (for example: because it finishes immediately
+ * or is replaced by a new, conflicting job). To make sure we don't access a freed job later on we
+ * store the id here, so that we can verify the job is still valid. */
+
+ assert(j);
+ assert(*j);
+
+ m = (*j)->manager;
+ u = (*j)->unit;
+ t = (*j)->type;
+ id = (*j)->id;
+ a = (*j)->activation_details;
+
+ switch (t) {
+ case JOB_START:
+ r = unit_start(u, a);
+ wait_only = r == -EBADR; /* If the unit type does not support starting, then simply wait. */
+ break;
+
+ case JOB_RESTART:
+ t = JOB_STOP;
+ _fallthrough_;
+ case JOB_STOP:
+ r = unit_stop(u);
+ wait_only = r == -EBADR; /* If the unit type does not support stopping, then simply wait. */
+ break;
+
+ case JOB_RELOAD:
+ r = unit_reload(u);
+ wait_only = false; /* A clear error is generated if reload is not supported. */
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ /* Log if the job still exists and the start/stop/reload function actually did something or we're
+ * only waiting for unit status change (common for device units). The latter ensures that job start
+ * messages for device units are correctly shown. Note that if the job disappears too quickly, e.g.
+ * for units for which there's no 'activating' phase (i.e. because we transition directly from
+ * 'inactive' to 'active'), we'll possibly skip the "Starting..." message. */
+ *j = manager_get_job(m, id);
+ if (*j && (r > 0 || wait_only))
+ job_emit_start_message(u, id, t);
+
+ return wait_only ? 0 : r;
+}
+
+int job_run_and_invalidate(Job *j) {
+ int r;
+
+ assert(j);
+ assert(j->installed);
+ assert(j->type < _JOB_TYPE_MAX_IN_TRANSACTION);
+ assert(j->in_run_queue);
+
+ prioq_remove(j->manager->run_queue, j, &j->run_queue_idx);
+ j->in_run_queue = false;
+
+ if (j->state != JOB_WAITING)
+ return 0;
+
+ if (!job_is_runnable(j))
+ return -EAGAIN;
+
+ job_start_timer(j, true);
+ job_set_state(j, JOB_RUNNING);
+ job_add_to_dbus_queue(j);
+
+ switch (j->type) {
+
+ case JOB_VERIFY_ACTIVE: {
+ UnitActiveState t;
+
+ t = unit_active_state(j->unit);
+ if (UNIT_IS_ACTIVE_OR_RELOADING(t))
+ r = -EALREADY;
+ else if (t == UNIT_ACTIVATING)
+ r = -EAGAIN;
+ else
+ r = -EBADR;
+ break;
+ }
+
+ case JOB_START:
+ case JOB_STOP:
+ case JOB_RESTART:
+ case JOB_RELOAD:
+ r = job_perform_on_unit(&j);
+ break;
+
+ case JOB_NOP:
+ r = -EALREADY;
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ if (j) {
+ if (r == -EAGAIN)
+ job_set_state(j, JOB_WAITING); /* Hmm, not ready after all, let's return to JOB_WAITING state */
+ else if (r == -EALREADY) /* already being executed */
+ r = job_finish_and_invalidate(j, JOB_DONE, true, true);
+ else if (r == -ECOMM)
+ r = job_finish_and_invalidate(j, JOB_DONE, true, false);
+ else if (r == -EBADR)
+ r = job_finish_and_invalidate(j, JOB_SKIPPED, true, false);
+ else if (r == -ENOEXEC)
+ r = job_finish_and_invalidate(j, JOB_INVALID, true, false);
+ else if (r == -EPROTO)
+ r = job_finish_and_invalidate(j, JOB_ASSERT, true, false);
+ else if (r == -EOPNOTSUPP)
+ r = job_finish_and_invalidate(j, JOB_UNSUPPORTED, true, false);
+ else if (r == -ENOLINK)
+ r = job_finish_and_invalidate(j, JOB_DEPENDENCY, true, false);
+ else if (r == -ESTALE)
+ r = job_finish_and_invalidate(j, JOB_ONCE, true, false);
+ else if (r < 0)
+ r = job_finish_and_invalidate(j, JOB_FAILED, true, false);
+ }
+
+ return r;
+}
+
+static void job_fail_dependencies(Unit *u, UnitDependencyAtom match_atom) {
+ Unit *other;
+
+ assert(u);
+
+ UNIT_FOREACH_DEPENDENCY(other, u, match_atom) {
+ Job *j = other->job;
+
+ if (!j)
+ continue;
+ if (!IN_SET(j->type, JOB_START, JOB_VERIFY_ACTIVE))
+ continue;
+
+ job_finish_and_invalidate(j, JOB_DEPENDENCY, true, false);
+ }
+}
+
+int job_finish_and_invalidate(Job *j, JobResult result, bool recursive, bool already) {
+ Unit *u, *other;
+ JobType t;
+
+ assert(j);
+ assert(j->installed);
+ assert(j->type < _JOB_TYPE_MAX_IN_TRANSACTION);
+
+ u = j->unit;
+ t = j->type;
+
+ j->result = result;
+
+ log_unit_debug(u, "Job %" PRIu32 " %s/%s finished, result=%s",
+ j->id, u->id, job_type_to_string(t), job_result_to_string(result));
+
+ /* If this job did nothing to the respective unit we don't log the status message */
+ if (!already)
+ job_emit_done_message(u, j->id, t, result);
+
+ /* Patch restart jobs so that they become normal start jobs */
+ if (result == JOB_DONE && t == JOB_RESTART) {
+
+ job_change_type(j, JOB_START);
+ job_set_state(j, JOB_WAITING);
+
+ job_add_to_dbus_queue(j);
+ job_add_to_run_queue(j);
+ job_add_to_gc_queue(j);
+
+ goto finish;
+ }
+
+ if (IN_SET(result, JOB_FAILED, JOB_INVALID))
+ j->manager->n_failed_jobs++;
+
+ job_uninstall(j);
+ job_free(j);
+
+ /* Fail depending jobs on failure */
+ if (result != JOB_DONE && recursive) {
+ if (IN_SET(t, JOB_START, JOB_VERIFY_ACTIVE))
+ job_fail_dependencies(u, UNIT_ATOM_PROPAGATE_START_FAILURE);
+ else if (t == JOB_STOP)
+ job_fail_dependencies(u, UNIT_ATOM_PROPAGATE_STOP_FAILURE);
+ }
+
+ /* A special check to make sure we take down anything RequisiteOf= if we aren't active. This is when
+ * the verify-active job merges with a satisfying job type, and then loses its invalidation effect,
+ * as the result there is JOB_DONE for the start job we merged into, while we should be failing the
+ * depending job if the said unit isn't in fact active. Oneshots are an example of this, where going
+ * directly from activating to inactive is success.
+ *
+ * This happens when you use ConditionXYZ= in a unit too, since in that case the job completes with
+ * the JOB_DONE result, but the unit never really becomes active. Note that such a case still
+ * involves merging:
+ *
+ * A start job waits for something else, and a verify-active comes in and merges in the installed
+ * job. Then, later, when it becomes runnable, it finishes with JOB_DONE result as execution on
+ * conditions not being met is skipped, breaking our dependency semantics.
+ *
+ * Also, depending on if start job waits or not, the merging may or may not happen (the verify-active
+ * job may trigger after it finishes), so you get undeterministic results without this check.
+ */
+ if (result == JOB_DONE && recursive &&
+ IN_SET(t, JOB_START, JOB_RELOAD) &&
+ !UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u)))
+ job_fail_dependencies(u, UNIT_ATOM_PROPAGATE_INACTIVE_START_AS_FAILURE);
+
+ /* Trigger OnFailure= dependencies that are not generated by the unit itself. We don't treat
+ * JOB_CANCELED as failure in this context. And JOB_FAILURE is already handled by the unit itself. */
+ if (IN_SET(result, JOB_TIMEOUT, JOB_DEPENDENCY)) {
+ log_unit_struct(u, LOG_NOTICE,
+ "JOB_TYPE=%s", job_type_to_string(t),
+ "JOB_RESULT=%s", job_result_to_string(result),
+ LOG_UNIT_MESSAGE(u, "Job %s/%s failed with result '%s'.",
+ u->id,
+ job_type_to_string(t),
+ job_result_to_string(result)));
+
+ unit_start_on_failure(u, "OnFailure=", UNIT_ATOM_ON_FAILURE, u->on_failure_job_mode);
+ }
+
+ unit_trigger_notify(u);
+
+finish:
+ /* Try to start the next jobs that can be started */
+ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_AFTER)
+ if (other->job) {
+ job_add_to_run_queue(other->job);
+ job_add_to_gc_queue(other->job);
+ }
+ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_BEFORE)
+ if (other->job) {
+ job_add_to_run_queue(other->job);
+ job_add_to_gc_queue(other->job);
+ }
+
+ /* Ensure that when an upheld/unneeded/bound unit activation job fails we requeue it, if it still
+ * necessary. If there are no state changes in the triggerer, it would not be retried otherwise. */
+ unit_submit_to_start_when_upheld_queue(u);
+ unit_submit_to_stop_when_bound_queue(u);
+ unit_submit_to_stop_when_unneeded_queue(u);
+
+ manager_check_finished(u->manager);
+
+ return 0;
+}
+
+static int job_dispatch_timer(sd_event_source *s, uint64_t monotonic, void *userdata) {
+ Job *j = ASSERT_PTR(userdata);
+ Unit *u;
+
+ assert(s == j->timer_event_source);
+
+ log_unit_warning(j->unit, "Job %s/%s timed out.", j->unit->id, job_type_to_string(j->type));
+
+ u = j->unit;
+ job_finish_and_invalidate(j, JOB_TIMEOUT, true, false);
+
+ emergency_action(u->manager, u->job_timeout_action,
+ EMERGENCY_ACTION_IS_WATCHDOG|EMERGENCY_ACTION_WARN,
+ u->job_timeout_reboot_arg, -1, "job timed out");
+
+ return 0;
+}
+
+int job_start_timer(Job *j, bool job_running) {
+ int r;
+ usec_t timeout_time, old_timeout_time;
+
+ if (job_running) {
+ j->begin_running_usec = now(CLOCK_MONOTONIC);
+
+ if (j->unit->job_running_timeout == USEC_INFINITY)
+ return 0;
+
+ timeout_time = usec_add(j->begin_running_usec, j->unit->job_running_timeout);
+
+ if (j->timer_event_source) {
+ /* Update only if JobRunningTimeoutSec= results in earlier timeout */
+ r = sd_event_source_get_time(j->timer_event_source, &old_timeout_time);
+ if (r < 0)
+ return r;
+
+ if (old_timeout_time <= timeout_time)
+ return 0;
+
+ return sd_event_source_set_time(j->timer_event_source, timeout_time);
+ }
+ } else {
+ if (j->timer_event_source)
+ return 0;
+
+ j->begin_usec = now(CLOCK_MONOTONIC);
+
+ if (j->unit->job_timeout == USEC_INFINITY)
+ return 0;
+
+ timeout_time = usec_add(j->begin_usec, j->unit->job_timeout);
+ }
+
+ r = sd_event_add_time(
+ j->manager->event,
+ &j->timer_event_source,
+ CLOCK_MONOTONIC,
+ timeout_time, 0,
+ job_dispatch_timer, j);
+ if (r < 0)
+ return r;
+
+ (void) sd_event_source_set_description(j->timer_event_source, "job-start");
+
+ return 0;
+}
+
+void job_add_to_run_queue(Job *j) {
+ int r;
+
+ assert(j);
+ assert(j->installed);
+
+ if (j->in_run_queue)
+ return;
+
+ r = prioq_put(j->manager->run_queue, j, &j->run_queue_idx);
+ if (r < 0)
+ log_warning_errno(r, "Failed put job in run queue, ignoring: %m");
+ else
+ j->in_run_queue = true;
+
+ manager_trigger_run_queue(j->manager);
+}
+
+void job_add_to_dbus_queue(Job *j) {
+ assert(j);
+ assert(j->installed);
+
+ if (j->in_dbus_queue)
+ return;
+
+ /* We don't check if anybody is subscribed here, since this
+ * job might just have been created and not yet assigned to a
+ * connection/client. */
+
+ LIST_PREPEND(dbus_queue, j->manager->dbus_job_queue, j);
+ j->in_dbus_queue = true;
+}
+
+char *job_dbus_path(Job *j) {
+ char *p;
+
+ assert(j);
+
+ if (asprintf(&p, "/org/freedesktop/systemd1/job/%"PRIu32, j->id) < 0)
+ return NULL;
+
+ return p;
+}
+
+int job_serialize(Job *j, FILE *f) {
+ assert(j);
+ assert(f);
+
+ (void) serialize_item_format(f, "job-id", "%u", j->id);
+ (void) serialize_item(f, "job-type", job_type_to_string(j->type));
+ (void) serialize_item(f, "job-state", job_state_to_string(j->state));
+ (void) serialize_bool(f, "job-irreversible", j->irreversible);
+ (void) serialize_bool(f, "job-sent-dbus-new-signal", j->sent_dbus_new_signal);
+ (void) serialize_bool(f, "job-ignore-order", j->ignore_order);
+
+ if (j->begin_usec > 0)
+ (void) serialize_usec(f, "job-begin", j->begin_usec);
+ if (j->begin_running_usec > 0)
+ (void) serialize_usec(f, "job-begin-running", j->begin_running_usec);
+
+ bus_track_serialize(j->bus_track, f, "subscribed");
+
+ activation_details_serialize(j->activation_details, f);
+
+ /* End marker */
+ fputc('\n', f);
+ return 0;
+}
+
+int job_deserialize(Job *j, FILE *f) {
+ int r;
+
+ assert(j);
+ assert(f);
+
+ for (;;) {
+ _cleanup_free_ char *line = NULL;
+ char *l, *v;
+ size_t k;
+
+ r = read_line(f, LONG_LINE_MAX, &line);
+ if (r < 0)
+ return log_error_errno(r, "Failed to read serialization line: %m");
+ if (r == 0)
+ return 0;
+
+ l = strstrip(line);
+
+ /* End marker */
+ if (isempty(l))
+ return 0;
+
+ k = strcspn(l, "=");
+
+ if (l[k] == '=') {
+ l[k] = 0;
+ v = l+k+1;
+ } else
+ v = l+k;
+
+ if (streq(l, "job-id")) {
+
+ if (safe_atou32(v, &j->id) < 0)
+ log_debug("Failed to parse job id value: %s", v);
+
+ } else if (streq(l, "job-type")) {
+ JobType t;
+
+ t = job_type_from_string(v);
+ if (t < 0)
+ log_debug("Failed to parse job type: %s", v);
+ else if (t >= _JOB_TYPE_MAX_IN_TRANSACTION)
+ log_debug("Cannot deserialize job of type: %s", v);
+ else
+ j->type = t;
+
+ } else if (streq(l, "job-state")) {
+ JobState s;
+
+ s = job_state_from_string(v);
+ if (s < 0)
+ log_debug("Failed to parse job state: %s", v);
+ else
+ job_set_state(j, s);
+
+ } else if (streq(l, "job-irreversible")) {
+ int b;
+
+ b = parse_boolean(v);
+ if (b < 0)
+ log_debug("Failed to parse job irreversible flag: %s", v);
+ else
+ j->irreversible = j->irreversible || b;
+
+ } else if (streq(l, "job-sent-dbus-new-signal")) {
+ int b;
+
+ b = parse_boolean(v);
+ if (b < 0)
+ log_debug("Failed to parse job sent_dbus_new_signal flag: %s", v);
+ else
+ j->sent_dbus_new_signal = j->sent_dbus_new_signal || b;
+
+ } else if (streq(l, "job-ignore-order")) {
+ int b;
+
+ b = parse_boolean(v);
+ if (b < 0)
+ log_debug("Failed to parse job ignore_order flag: %s", v);
+ else
+ j->ignore_order = j->ignore_order || b;
+
+ } else if (streq(l, "job-begin"))
+ (void) deserialize_usec(v, &j->begin_usec);
+
+ else if (streq(l, "job-begin-running"))
+ (void) deserialize_usec(v, &j->begin_running_usec);
+
+ else if (streq(l, "subscribed")) {
+ if (strv_extend(&j->deserialized_clients, v) < 0)
+ return log_oom();
+
+ } else if (startswith(l, "activation-details")) {
+ if (activation_details_deserialize(l, v, &j->activation_details) < 0)
+ log_debug("Failed to parse job ActivationDetails element: %s", v);
+
+ } else
+ log_debug("Unknown job serialization key: %s", l);
+ }
+}
+
+int job_coldplug(Job *j) {
+ int r;
+ usec_t timeout_time = USEC_INFINITY;
+
+ assert(j);
+
+ /* After deserialization is complete and the bus connection
+ * set up again, let's start watching our subscribers again */
+ (void) bus_job_coldplug_bus_track(j);
+
+ if (j->state == JOB_WAITING)
+ job_add_to_run_queue(j);
+
+ /* Maybe due to new dependencies we don't actually need this job anymore? */
+ job_add_to_gc_queue(j);
+
+ /* Create timer only when job began or began running and the respective timeout is finite.
+ * Follow logic of job_start_timer() if both timeouts are finite */
+ if (j->begin_usec == 0)
+ return 0;
+
+ if (j->unit->job_timeout != USEC_INFINITY)
+ timeout_time = usec_add(j->begin_usec, j->unit->job_timeout);
+
+ if (timestamp_is_set(j->begin_running_usec))
+ timeout_time = MIN(timeout_time, usec_add(j->begin_running_usec, j->unit->job_running_timeout));
+
+ if (timeout_time == USEC_INFINITY)
+ return 0;
+
+ j->timer_event_source = sd_event_source_disable_unref(j->timer_event_source);
+
+ r = sd_event_add_time(
+ j->manager->event,
+ &j->timer_event_source,
+ CLOCK_MONOTONIC,
+ timeout_time, 0,
+ job_dispatch_timer, j);
+ if (r < 0)
+ log_debug_errno(r, "Failed to restart timeout for job: %m");
+
+ (void) sd_event_source_set_description(j->timer_event_source, "job-timeout");
+
+ return r;
+}
+
+void job_shutdown_magic(Job *j) {
+ assert(j);
+
+ /* The shutdown target gets some special treatment here: we
+ * tell the kernel to begin with flushing its disk caches, to
+ * optimize shutdown time a bit. Ideally we wouldn't hardcode
+ * this magic into PID 1. However all other processes aren't
+ * options either since they'd exit much sooner than PID 1 and
+ * asynchronous sync() would cause their exit to be
+ * delayed. */
+
+ if (j->type != JOB_START)
+ return;
+
+ if (!MANAGER_IS_SYSTEM(j->unit->manager))
+ return;
+
+ if (!unit_has_name(j->unit, SPECIAL_SHUTDOWN_TARGET))
+ return;
+
+ /* In case messages on console has been disabled on boot */
+ j->unit->manager->no_console_output = false;
+
+ manager_invalidate_startup_units(j->unit->manager);
+
+ if (detect_container() > 0)
+ return;
+
+ (void) asynchronous_sync(NULL);
+}
+
+int job_get_timeout(Job *j, usec_t *timeout) {
+ usec_t x = USEC_INFINITY, y = USEC_INFINITY;
+ Unit *u = ASSERT_PTR(ASSERT_PTR(j)->unit);
+ int r;
+
+ if (j->timer_event_source) {
+ r = sd_event_source_get_time(j->timer_event_source, &x);
+ if (r < 0)
+ return r;
+ }
+
+ if (UNIT_VTABLE(u)->get_timeout) {
+ r = UNIT_VTABLE(u)->get_timeout(u, &y);
+ if (r < 0)
+ return r;
+ }
+
+ if (x == USEC_INFINITY && y == USEC_INFINITY)
+ return 0;
+
+ *timeout = MIN(x, y);
+ return 1;
+}
+
+bool job_may_gc(Job *j) {
+ Unit *other;
+
+ assert(j);
+
+ /* Checks whether this job should be GC'ed away. We only do this for jobs of units that have no effect on their
+ * own and just track external state. For now the only unit type that qualifies for this are .device units.
+ * Returns true if the job can be collected. */
+
+ if (!UNIT_VTABLE(j->unit)->gc_jobs)
+ return false;
+
+ if (sd_bus_track_count(j->bus_track) > 0)
+ return false;
+
+ /* FIXME: So this is a bit ugly: for now we don't properly track references made via private bus connections
+ * (because it's nasty, as sd_bus_track doesn't apply to it). We simply remember that the job was once
+ * referenced by one, and reset this whenever we notice that no private bus connections are around. This means
+ * the GC is a bit too conservative when it comes to jobs created by private bus connections. */
+ if (j->ref_by_private_bus) {
+ if (set_isempty(j->unit->manager->private_buses))
+ j->ref_by_private_bus = false;
+ else
+ return false;
+ }
+
+ if (j->type == JOB_NOP)
+ return false;
+
+ /* The logic is inverse to job_is_runnable, we cannot GC as long as we block any job. */
+ UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_BEFORE)
+ if (other->job && job_compare(j, other->job, UNIT_ATOM_BEFORE) < 0)
+ return false;
+
+ UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_AFTER)
+ if (other->job && job_compare(j, other->job, UNIT_ATOM_AFTER) < 0)
+ return false;
+
+ return true;
+}
+
+void job_add_to_gc_queue(Job *j) {
+ assert(j);
+
+ if (j->in_gc_queue)
+ return;
+
+ if (!job_may_gc(j))
+ return;
+
+ LIST_PREPEND(gc_queue, j->unit->manager->gc_job_queue, j);
+ j->in_gc_queue = true;
+}
+
+static int job_compare_id(Job * const *a, Job * const *b) {
+ return CMP((*a)->id, (*b)->id);
+}
+
+static size_t sort_job_list(Job **list, size_t n) {
+ Job *previous = NULL;
+ size_t a, b;
+
+ /* Order by numeric IDs */
+ typesafe_qsort(list, n, job_compare_id);
+
+ /* Filter out duplicates */
+ for (a = 0, b = 0; a < n; a++) {
+
+ if (previous == list[a])
+ continue;
+
+ previous = list[b++] = list[a];
+ }
+
+ return b;
+}
+
+int job_get_before(Job *j, Job*** ret) {
+ _cleanup_free_ Job** list = NULL;
+ Unit *other = NULL;
+ size_t n = 0;
+
+ /* Returns a list of all pending jobs that need to finish before this job may be started. */
+
+ assert(j);
+ assert(ret);
+
+ if (j->ignore_order) {
+ *ret = NULL;
+ return 0;
+ }
+
+ UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_AFTER) {
+ if (!other->job)
+ continue;
+ if (job_compare(j, other->job, UNIT_ATOM_AFTER) <= 0)
+ continue;
+
+ if (!GREEDY_REALLOC(list, n+1))
+ return -ENOMEM;
+ list[n++] = other->job;
+ }
+
+ UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_BEFORE) {
+ if (!other->job)
+ continue;
+ if (job_compare(j, other->job, UNIT_ATOM_BEFORE) <= 0)
+ continue;
+
+ if (!GREEDY_REALLOC(list, n+1))
+ return -ENOMEM;
+ list[n++] = other->job;
+ }
+
+ n = sort_job_list(list, n);
+
+ *ret = TAKE_PTR(list);
+
+ return (int) n;
+}
+
+int job_get_after(Job *j, Job*** ret) {
+ _cleanup_free_ Job** list = NULL;
+ Unit *other = NULL;
+ size_t n = 0;
+
+ assert(j);
+ assert(ret);
+
+ /* Returns a list of all pending jobs that are waiting for this job to finish. */
+
+ UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_BEFORE) {
+ if (!other->job)
+ continue;
+
+ if (other->job->ignore_order)
+ continue;
+
+ if (job_compare(j, other->job, UNIT_ATOM_BEFORE) >= 0)
+ continue;
+
+ if (!GREEDY_REALLOC(list, n+1))
+ return -ENOMEM;
+ list[n++] = other->job;
+ }
+
+ UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_AFTER) {
+ if (!other->job)
+ continue;
+
+ if (other->job->ignore_order)
+ continue;
+
+ if (job_compare(j, other->job, UNIT_ATOM_AFTER) >= 0)
+ continue;
+
+ if (!GREEDY_REALLOC(list, n+1))
+ return -ENOMEM;
+ list[n++] = other->job;
+ }
+
+ n = sort_job_list(list, n);
+
+ *ret = TAKE_PTR(list);
+
+ return (int) n;
+}
+
+static const char* const job_state_table[_JOB_STATE_MAX] = {
+ [JOB_WAITING] = "waiting",
+ [JOB_RUNNING] = "running",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(job_state, JobState);
+
+static const char* const job_type_table[_JOB_TYPE_MAX] = {
+ [JOB_START] = "start",
+ [JOB_VERIFY_ACTIVE] = "verify-active",
+ [JOB_STOP] = "stop",
+ [JOB_RELOAD] = "reload",
+ [JOB_RELOAD_OR_START] = "reload-or-start",
+ [JOB_RESTART] = "restart",
+ [JOB_TRY_RESTART] = "try-restart",
+ [JOB_TRY_RELOAD] = "try-reload",
+ [JOB_NOP] = "nop",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(job_type, JobType);
+
+static const char* const job_mode_table[_JOB_MODE_MAX] = {
+ [JOB_FAIL] = "fail",
+ [JOB_REPLACE] = "replace",
+ [JOB_REPLACE_IRREVERSIBLY] = "replace-irreversibly",
+ [JOB_ISOLATE] = "isolate",
+ [JOB_FLUSH] = "flush",
+ [JOB_IGNORE_DEPENDENCIES] = "ignore-dependencies",
+ [JOB_IGNORE_REQUIREMENTS] = "ignore-requirements",
+ [JOB_TRIGGERING] = "triggering",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(job_mode, JobMode);
+
+static const char* const job_result_table[_JOB_RESULT_MAX] = {
+ [JOB_DONE] = "done",
+ [JOB_CANCELED] = "canceled",
+ [JOB_TIMEOUT] = "timeout",
+ [JOB_FAILED] = "failed",
+ [JOB_DEPENDENCY] = "dependency",
+ [JOB_SKIPPED] = "skipped",
+ [JOB_INVALID] = "invalid",
+ [JOB_ASSERT] = "assert",
+ [JOB_UNSUPPORTED] = "unsupported",
+ [JOB_COLLECTED] = "collected",
+ [JOB_ONCE] = "once",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(job_result, JobResult);
+
+const char* job_type_to_access_method(JobType t) {
+ assert(t >= 0);
+ assert(t < _JOB_TYPE_MAX);
+
+ if (IN_SET(t, JOB_START, JOB_RESTART, JOB_TRY_RESTART))
+ return "start";
+ else if (t == JOB_STOP)
+ return "stop";
+ else
+ return "reload";
+}
+
+/*
+ * assume_dep assumed dependency between units (a is before/after b)
+ *
+ * Returns
+ * 0 jobs are independent,
+ * >0 a should run after b,
+ * <0 a should run before b,
+ *
+ * The logic means that for a service a and a service b where b.After=a:
+ *
+ * start a + start b → 1st step start a, 2nd step start b
+ * start a + stop b → 1st step stop b, 2nd step start a
+ * stop a + start b → 1st step stop a, 2nd step start b
+ * stop a + stop b → 1st step stop b, 2nd step stop a
+ *
+ * This has the side effect that restarts are properly synchronized too.
+ */
+int job_compare(Job *a, Job *b, UnitDependencyAtom assume_dep) {
+ assert(a);
+ assert(b);
+ assert(a->type < _JOB_TYPE_MAX_IN_TRANSACTION);
+ assert(b->type < _JOB_TYPE_MAX_IN_TRANSACTION);
+ assert(IN_SET(assume_dep, UNIT_ATOM_AFTER, UNIT_ATOM_BEFORE));
+
+ /* Trivial cases first */
+ if (a->type == JOB_NOP || b->type == JOB_NOP)
+ return 0;
+
+ if (a->ignore_order || b->ignore_order)
+ return 0;
+
+ if (assume_dep == UNIT_ATOM_AFTER)
+ return -job_compare(b, a, UNIT_ATOM_BEFORE);
+
+ /* Let's make it simple, JOB_STOP goes always first (in case both ua and ub stop, then ub's stop goes
+ * first anyway). JOB_RESTART is JOB_STOP in disguise (before it is patched to JOB_START). */
+ if (IN_SET(b->type, JOB_STOP, JOB_RESTART))
+ return 1;
+ else
+ return -1;
+}
+
+void job_set_activation_details(Job *j, ActivationDetails *info) {
+ /* Existing (older) ActivationDetails win, newer ones are discarded. */
+ if (!j || j->activation_details || !info)
+ return; /* Nothing to do. */
+
+ j->activation_details = activation_details_ref(info);
+}
diff --git a/src/core/job.h b/src/core/job.h
new file mode 100644
index 0000000..807c258
--- /dev/null
+++ b/src/core/job.h
@@ -0,0 +1,251 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "sd-event.h"
+
+#include "list.h"
+#include "unit-dependency-atom.h"
+#include "unit-name.h"
+#include "unit.h"
+
+typedef struct ActivationDetails ActivationDetails;
+typedef struct Job Job;
+typedef struct JobDependency JobDependency;
+typedef enum JobType JobType;
+typedef enum JobState JobState;
+typedef enum JobMode JobMode;
+typedef enum JobResult JobResult;
+
+/* Be careful when changing the job types! Adjust job_merging_table[] accordingly! */
+enum JobType {
+ JOB_START, /* if a unit does not support being started, we'll just wait until it becomes active */
+ JOB_VERIFY_ACTIVE,
+
+ JOB_STOP,
+
+ JOB_RELOAD, /* if running, reload */
+
+ /* Note that restarts are first treated like JOB_STOP, but
+ * then instead of finishing are patched to become
+ * JOB_START. */
+ JOB_RESTART, /* If running, stop. Then start unconditionally. */
+
+ _JOB_TYPE_MAX_MERGING,
+
+ /* JOB_NOP can enter into a transaction, but as it won't pull in
+ * any dependencies and it uses the special 'nop_job' slot in Unit,
+ * it won't have to merge with anything (except possibly into another
+ * JOB_NOP, previously installed). JOB_NOP is special-cased in
+ * job_type_is_*() functions so that the transaction can be
+ * activated. */
+ JOB_NOP = _JOB_TYPE_MAX_MERGING, /* do nothing */
+
+ _JOB_TYPE_MAX_IN_TRANSACTION,
+
+ /* JOB_TRY_RESTART can never appear in a transaction, because
+ * it always collapses into JOB_RESTART or JOB_NOP before entering.
+ * Thus we never need to merge it with anything. */
+ JOB_TRY_RESTART = _JOB_TYPE_MAX_IN_TRANSACTION, /* if running, stop and then start */
+
+ /* Similar to JOB_TRY_RESTART but collapses to JOB_RELOAD or JOB_NOP */
+ JOB_TRY_RELOAD,
+
+ /* JOB_RELOAD_OR_START won't enter into a transaction and cannot result
+ * from transaction merging (there's no way for JOB_RELOAD and
+ * JOB_START to meet in one transaction). It can result from a merge
+ * during job installation, but then it will immediately collapse into
+ * one of the two simpler types. */
+ JOB_RELOAD_OR_START, /* if running, reload, otherwise start */
+
+ _JOB_TYPE_MAX,
+ _JOB_TYPE_INVALID = -EINVAL,
+};
+
+enum JobState {
+ JOB_WAITING,
+ JOB_RUNNING,
+ _JOB_STATE_MAX,
+ _JOB_STATE_INVALID = -EINVAL,
+};
+
+enum JobMode {
+ JOB_FAIL, /* Fail if a conflicting job is already queued */
+ JOB_REPLACE, /* Replace an existing conflicting job */
+ JOB_REPLACE_IRREVERSIBLY,/* Like JOB_REPLACE + produce irreversible jobs */
+ JOB_ISOLATE, /* Start a unit, and stop all others */
+ JOB_FLUSH, /* Flush out all other queued jobs when queueing this one */
+ JOB_IGNORE_DEPENDENCIES, /* Ignore both requirement and ordering dependencies */
+ JOB_IGNORE_REQUIREMENTS, /* Ignore requirement dependencies */
+ JOB_TRIGGERING, /* Adds TRIGGERED_BY dependencies to the same transaction */
+ _JOB_MODE_MAX,
+ _JOB_MODE_INVALID = -EINVAL,
+};
+
+enum JobResult {
+ JOB_DONE, /* Job completed successfully (or skipped due to an unmet ConditionXYZ=) */
+ JOB_CANCELED, /* Job canceled by a conflicting job installation or by explicit cancel request */
+ JOB_TIMEOUT, /* Job timeout elapsed */
+ JOB_FAILED, /* Job failed */
+ JOB_DEPENDENCY, /* A required dependency job did not result in JOB_DONE */
+ JOB_SKIPPED, /* Negative result of JOB_VERIFY_ACTIVE or skip due to ExecCondition= */
+ JOB_INVALID, /* JOB_RELOAD of inactive unit */
+ JOB_ASSERT, /* Couldn't start a unit, because an assert didn't hold */
+ JOB_UNSUPPORTED, /* Couldn't start a unit, because the unit type is not supported on the system */
+ JOB_COLLECTED, /* Job was garbage collected, since nothing needed it anymore */
+ JOB_ONCE, /* Unit was started before, and hence can't be started again */
+ _JOB_RESULT_MAX,
+ _JOB_RESULT_INVALID = -EINVAL,
+};
+
+#include "unit.h"
+
+struct JobDependency {
+ /* Encodes that the 'subject' job needs the 'object' job in
+ * some way. This structure is used only while building a transaction. */
+ Job *subject;
+ Job *object;
+
+ LIST_FIELDS(JobDependency, subject);
+ LIST_FIELDS(JobDependency, object);
+
+ bool matters:1;
+ bool conflicts:1;
+};
+
+struct Job {
+ Manager *manager;
+ Unit *unit;
+
+ LIST_FIELDS(Job, transaction);
+ LIST_FIELDS(Job, dbus_queue);
+ LIST_FIELDS(Job, gc_queue);
+
+ LIST_HEAD(JobDependency, subject_list);
+ LIST_HEAD(JobDependency, object_list);
+
+ /* Used for graph algs as a "I have been here" marker */
+ Job* marker;
+ unsigned generation;
+
+ uint32_t id;
+
+ JobType type;
+ JobState state;
+
+ sd_event_source *timer_event_source;
+ usec_t begin_usec;
+ usec_t begin_running_usec;
+
+ /*
+ * This tracks where to send signals, and also which clients
+ * are allowed to call DBus methods on the job (other than
+ * root).
+ *
+ * There can be more than one client, because of job merging.
+ */
+ sd_bus_track *bus_track;
+ char **deserialized_clients;
+
+ JobResult result;
+
+ unsigned run_queue_idx;
+
+ /* If the job had a specific trigger that needs to be advertised (eg: a path unit), store it. */
+ ActivationDetails *activation_details;
+
+ bool installed:1;
+ bool in_run_queue:1;
+ bool matters_to_anchor:1;
+ bool in_dbus_queue:1;
+ bool sent_dbus_new_signal:1;
+ bool ignore_order:1;
+ bool irreversible:1;
+ bool in_gc_queue:1;
+ bool ref_by_private_bus:1;
+};
+
+Job* job_new(Unit *unit, JobType type);
+Job* job_new_raw(Unit *unit);
+void job_unlink(Job *job);
+Job* job_free(Job *job);
+Job* job_install(Job *j);
+int job_install_deserialized(Job *j);
+void job_uninstall(Job *j);
+void job_dump(Job *j, FILE *f, const char *prefix);
+int job_serialize(Job *j, FILE *f);
+int job_deserialize(Job *j, FILE *f);
+int job_coldplug(Job *j);
+
+JobDependency* job_dependency_new(Job *subject, Job *object, bool matters, bool conflicts);
+void job_dependency_free(JobDependency *l);
+
+int job_merge(Job *j, Job *other);
+
+JobType job_type_lookup_merge(JobType a, JobType b) _pure_;
+
+_pure_ static inline bool job_type_is_mergeable(JobType a, JobType b) {
+ return job_type_lookup_merge(a, b) >= 0;
+}
+
+_pure_ static inline bool job_type_is_conflicting(JobType a, JobType b) {
+ return a != JOB_NOP && b != JOB_NOP && !job_type_is_mergeable(a, b);
+}
+
+_pure_ static inline bool job_type_is_superset(JobType a, JobType b) {
+ /* Checks whether operation a is a "superset" of b in its actions */
+ if (b == JOB_NOP)
+ return true;
+ if (a == JOB_NOP)
+ return false;
+ return a == job_type_lookup_merge(a, b);
+}
+
+bool job_type_is_redundant(JobType a, UnitActiveState b) _pure_;
+
+/* Collapses a state-dependent job type into a simpler type by observing
+ * the state of the unit which it is going to be applied to. */
+JobType job_type_collapse(JobType t, Unit *u);
+
+int job_type_merge_and_collapse(JobType *a, JobType b, Unit *u);
+
+void job_add_to_run_queue(Job *j);
+void job_add_to_dbus_queue(Job *j);
+
+int job_start_timer(Job *j, bool job_running);
+
+int job_run_and_invalidate(Job *j);
+int job_finish_and_invalidate(Job *j, JobResult result, bool recursive, bool already);
+
+char *job_dbus_path(Job *j);
+
+void job_shutdown_magic(Job *j);
+
+int job_get_timeout(Job *j, usec_t *timeout);
+
+bool job_may_gc(Job *j);
+void job_add_to_gc_queue(Job *j);
+
+int job_get_before(Job *j, Job*** ret);
+int job_get_after(Job *j, Job*** ret);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(Job*, job_free);
+
+const char* job_type_to_string(JobType t) _const_;
+JobType job_type_from_string(const char *s) _pure_;
+
+const char* job_state_to_string(JobState t) _const_;
+JobState job_state_from_string(const char *s) _pure_;
+
+const char* job_mode_to_string(JobMode t) _const_;
+JobMode job_mode_from_string(const char *s) _pure_;
+
+const char* job_result_to_string(JobResult t) _const_;
+JobResult job_result_from_string(const char *s) _pure_;
+
+const char* job_type_to_access_method(JobType t);
+
+int job_compare(Job *a, Job *b, UnitDependencyAtom assume_dep);
+
+void job_set_activation_details(Job *j, ActivationDetails *info);
diff --git a/src/core/kill.c b/src/core/kill.c
new file mode 100644
index 0000000..e858ae9
--- /dev/null
+++ b/src/core/kill.c
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "kill.h"
+#include "signal-util.h"
+#include "string-table.h"
+#include "util.h"
+
+void kill_context_init(KillContext *c) {
+ assert(c);
+
+ c->kill_signal = SIGTERM;
+ /* restart_kill_signal is unset by default and we fall back to kill_signal */
+ c->final_kill_signal = SIGKILL;
+ c->send_sigkill = true;
+ c->send_sighup = false;
+ c->watchdog_signal = SIGABRT;
+}
+
+void kill_context_dump(KillContext *c, FILE *f, const char *prefix) {
+ assert(c);
+
+ prefix = strempty(prefix);
+
+ fprintf(f,
+ "%sKillMode: %s\n"
+ "%sKillSignal: SIG%s\n"
+ "%sRestartKillSignal: SIG%s\n"
+ "%sFinalKillSignal: SIG%s\n"
+ "%sSendSIGKILL: %s\n"
+ "%sSendSIGHUP: %s\n",
+ prefix, kill_mode_to_string(c->kill_mode),
+ prefix, signal_to_string(c->kill_signal),
+ prefix, signal_to_string(restart_kill_signal(c)),
+ prefix, signal_to_string(c->final_kill_signal),
+ prefix, yes_no(c->send_sigkill),
+ prefix, yes_no(c->send_sighup));
+}
+
+static const char* const kill_mode_table[_KILL_MODE_MAX] = {
+ [KILL_CONTROL_GROUP] = "control-group",
+ [KILL_PROCESS] = "process",
+ [KILL_MIXED] = "mixed",
+ [KILL_NONE] = "none",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(kill_mode, KillMode);
+
+static const char* const kill_who_table[_KILL_WHO_MAX] = {
+ [KILL_MAIN] = "main",
+ [KILL_CONTROL] = "control",
+ [KILL_ALL] = "all",
+ [KILL_MAIN_FAIL] = "main-fail",
+ [KILL_CONTROL_FAIL] = "control-fail",
+ [KILL_ALL_FAIL] = "all-fail",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(kill_who, KillWho);
diff --git a/src/core/kill.h b/src/core/kill.h
new file mode 100644
index 0000000..dbf884d
--- /dev/null
+++ b/src/core/kill.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct KillContext KillContext;
+
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "macro.h"
+
+typedef enum KillMode {
+ /* The kill mode is a property of a unit. */
+ KILL_CONTROL_GROUP = 0,
+ KILL_PROCESS,
+ KILL_MIXED,
+ KILL_NONE,
+ _KILL_MODE_MAX,
+ _KILL_MODE_INVALID = -EINVAL,
+} KillMode;
+
+struct KillContext {
+ KillMode kill_mode;
+ int kill_signal;
+ int restart_kill_signal;
+ int final_kill_signal;
+ int watchdog_signal;
+ bool send_sigkill;
+ bool send_sighup;
+};
+
+typedef enum KillWho {
+ /* Kill who is a property of an operation */
+ KILL_MAIN,
+ KILL_CONTROL,
+ KILL_ALL,
+ KILL_MAIN_FAIL,
+ KILL_CONTROL_FAIL,
+ KILL_ALL_FAIL,
+ _KILL_WHO_MAX,
+ _KILL_WHO_INVALID = -EINVAL,
+} KillWho;
+
+void kill_context_init(KillContext *c);
+void kill_context_dump(KillContext *c, FILE *f, const char *prefix);
+
+const char *kill_mode_to_string(KillMode k) _const_;
+KillMode kill_mode_from_string(const char *s) _pure_;
+
+const char *kill_who_to_string(KillWho k) _const_;
+KillWho kill_who_from_string(const char *s) _pure_;
+
+static inline int restart_kill_signal(const KillContext *c) {
+ if (c->restart_kill_signal != 0)
+ return c->restart_kill_signal;
+ return c->kill_signal;
+}
diff --git a/src/core/kmod-setup.c b/src/core/kmod-setup.c
new file mode 100644
index 0000000..15337d0
--- /dev/null
+++ b/src/core/kmod-setup.c
@@ -0,0 +1,158 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "bus-util.h"
+#include "capability-util.h"
+#include "fileio.h"
+#include "kmod-setup.h"
+#include "macro.h"
+#include "recurse-dir.h"
+#include "string-util.h"
+#include "strv.h"
+#include "virt.h"
+
+#if HAVE_KMOD
+#include "module-util.h"
+
+static void systemd_kmod_log(
+ void *data,
+ int priority,
+ const char *file, int line,
+ const char *fn,
+ const char *format,
+ va_list args) {
+
+ /* library logging is enabled at debug only */
+ DISABLE_WARNING_FORMAT_NONLITERAL;
+ log_internalv(LOG_DEBUG, 0, file, line, fn, format, args);
+ REENABLE_WARNING;
+}
+
+static int has_virtio_rng_recurse_dir_cb(
+ RecurseDirEvent event,
+ const char *path,
+ int dir_fd,
+ int inode_fd,
+ const struct dirent *de,
+ const struct statx *sx,
+ void *userdata) {
+
+ _cleanup_free_ char *alias = NULL;
+ int r;
+
+ if (event != RECURSE_DIR_ENTRY)
+ return RECURSE_DIR_CONTINUE;
+
+ if (de->d_type != DT_REG)
+ return RECURSE_DIR_CONTINUE;
+
+ if (!streq(de->d_name, "modalias"))
+ return RECURSE_DIR_CONTINUE;
+
+ r = read_one_line_file(path, &alias);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to read %s, ignoring: %m", path);
+ return RECURSE_DIR_LEAVE_DIRECTORY;
+ }
+
+ if (STARTSWITH_SET(alias, "pci:v00001AF4d00001005", "pci:v00001AF4d00001044"))
+ return 1;
+
+ return RECURSE_DIR_LEAVE_DIRECTORY;
+}
+
+static bool has_virtio_rng(void) {
+ int r;
+
+ /* Directory traversal might be slow, hence let's do a cheap check first if it's even worth it */
+ if (detect_vm() == VIRTUALIZATION_NONE)
+ return false;
+
+ r = recurse_dir_at(
+ AT_FDCWD,
+ "/sys/devices/pci0000:00",
+ /* statx_mask= */ 0,
+ /* n_depth_max= */ 2,
+ RECURSE_DIR_ENSURE_TYPE,
+ has_virtio_rng_recurse_dir_cb,
+ NULL);
+ if (r < 0)
+ log_debug_errno(r, "Failed to determine whether host has virtio-rng device, ignoring: %m");
+
+ return r > 0;
+}
+
+static bool in_qemu(void) {
+ return IN_SET(detect_vm(), VIRTUALIZATION_KVM, VIRTUALIZATION_QEMU);
+}
+#endif
+
+int kmod_setup(void) {
+#if HAVE_KMOD
+
+ static const struct {
+ const char *module;
+ const char *path;
+ bool warn_if_unavailable:1;
+ bool warn_if_module:1;
+ bool (*condition_fn)(void);
+ } kmod_table[] = {
+ /* This one we need to load explicitly, since auto-loading on use doesn't work
+ * before udev created the ghost device nodes, and we need it earlier than that. */
+ { "autofs4", "/sys/class/misc/autofs", true, false, NULL },
+
+ /* This one we need to load explicitly, since auto-loading of IPv6 is not done when
+ * we try to configure ::1 on the loopback device. */
+ { "ipv6", "/sys/module/ipv6", false, true, NULL },
+
+ /* This should never be a module */
+ { "unix", "/proc/net/unix", true, true, NULL },
+
+#if HAVE_LIBIPTC
+ /* netfilter is needed by networkd, nspawn among others, and cannot be autoloaded */
+ { "ip_tables", "/proc/net/ip_tables_names", false, false, NULL },
+#endif
+ /* virtio_rng would be loaded by udev later, but real entropy might be needed very early */
+ { "virtio_rng", NULL, false, false, has_virtio_rng },
+
+ /* qemu_fw_cfg would be loaded by udev later, but we want to import credentials from it super early */
+ { "qemu_fw_cfg", "/sys/firmware/qemu_fw_cfg", false, false, in_qemu },
+
+ /* dmi-sysfs is needed to import credentials from it super early */
+ { "dmi-sysfs", "/sys/firmware/dmi/entries", false, false, NULL },
+ };
+ _cleanup_(kmod_unrefp) struct kmod_ctx *ctx = NULL;
+ unsigned i;
+
+ if (have_effective_cap(CAP_SYS_MODULE) == 0)
+ return 0;
+
+ for (i = 0; i < ELEMENTSOF(kmod_table); i++) {
+ if (kmod_table[i].path && access(kmod_table[i].path, F_OK) >= 0)
+ continue;
+
+ if (kmod_table[i].condition_fn && !kmod_table[i].condition_fn())
+ continue;
+
+ if (kmod_table[i].warn_if_module)
+ log_debug("Your kernel apparently lacks built-in %s support. Might be "
+ "a good idea to compile it in. We'll now try to work around "
+ "this by loading the module...", kmod_table[i].module);
+
+ if (!ctx) {
+ ctx = kmod_new(NULL, NULL);
+ if (!ctx)
+ return log_oom();
+
+ kmod_set_log_fn(ctx, systemd_kmod_log, NULL);
+ kmod_load_resources(ctx);
+ }
+
+ (void) module_load_and_warn(ctx, kmod_table[i].module, kmod_table[i].warn_if_unavailable);
+ }
+
+#endif
+ return 0;
+}
diff --git a/src/core/kmod-setup.h b/src/core/kmod-setup.h
new file mode 100644
index 0000000..1c842d3
--- /dev/null
+++ b/src/core/kmod-setup.h
@@ -0,0 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+int kmod_setup(void);
diff --git a/src/core/load-dropin.c b/src/core/load-dropin.c
new file mode 100644
index 0000000..53d2a3d
--- /dev/null
+++ b/src/core/load-dropin.c
@@ -0,0 +1,126 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "conf-parser.h"
+#include "fs-util.h"
+#include "load-dropin.h"
+#include "load-fragment.h"
+#include "log.h"
+#include "stat-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit-name.h"
+#include "unit.h"
+
+static int process_deps(Unit *u, UnitDependency dependency, const char *dir_suffix) {
+ _cleanup_strv_free_ char **paths = NULL;
+ int r;
+
+ r = unit_file_find_dropin_paths(NULL,
+ u->manager->lookup_paths.search_path,
+ u->manager->unit_path_cache,
+ dir_suffix, NULL,
+ u->id, u->aliases,
+ &paths);
+ if (r < 0)
+ return r;
+
+ STRV_FOREACH(p, paths) {
+ _cleanup_free_ char *target = NULL;
+ const char *entry;
+
+ entry = basename(*p);
+
+ if (null_or_empty_path(*p) > 0) {
+ /* an error usually means an invalid symlink, which is not a mask */
+ log_unit_debug(u, "%s dependency on %s is masked by %s, ignoring.",
+ unit_dependency_to_string(dependency), entry, *p);
+ continue;
+ }
+
+ r = is_symlink(*p);
+ if (r < 0) {
+ log_unit_warning_errno(u, r, "%s dropin %s unreadable, ignoring: %m",
+ unit_dependency_to_string(dependency), *p);
+ continue;
+ }
+ if (r == 0) {
+ log_unit_warning(u, "%s dependency dropin %s is not a symlink, ignoring.",
+ unit_dependency_to_string(dependency), *p);
+ continue;
+ }
+
+ if (!unit_name_is_valid(entry, UNIT_NAME_ANY)) {
+ log_unit_warning(u, "%s dependency dropin %s is not a valid unit name, ignoring.",
+ unit_dependency_to_string(dependency), *p);
+ continue;
+ }
+
+ r = readlink_malloc(*p, &target);
+ if (r < 0) {
+ log_unit_warning_errno(u, r, "readlink(\"%s\") failed, ignoring: %m", *p);
+ continue;
+ }
+
+ /* We don't treat this as an error, especially because we didn't check this for a
+ * long time. Nevertheless, we warn, because such mismatch can be mighty confusing. */
+ r = unit_symlink_name_compatible(entry, basename(target), u->instance);
+ if (r < 0) {
+ log_unit_warning_errno(u, r, "Can't check if names %s and %s are compatible, ignoring: %m",
+ entry, basename(target));
+ continue;
+ }
+ if (r == 0)
+ log_unit_warning(u, "%s dependency dropin %s target %s has different name",
+ unit_dependency_to_string(dependency), *p, target);
+
+ r = unit_add_dependency_by_name(u, dependency, entry, true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ log_unit_warning_errno(u, r, "Cannot add %s dependency on %s, ignoring: %m",
+ unit_dependency_to_string(dependency), entry);
+ }
+
+ return 0;
+}
+
+int unit_load_dropin(Unit *u) {
+ _cleanup_strv_free_ char **l = NULL;
+ int r;
+
+ assert(u);
+
+ /* Load dependencies from .wants and .requires directories */
+ r = process_deps(u, UNIT_WANTS, ".wants");
+ if (r < 0)
+ return r;
+
+ r = process_deps(u, UNIT_REQUIRES, ".requires");
+ if (r < 0)
+ return r;
+
+ /* Load .conf dropins */
+ r = unit_find_dropin_paths(u, &l);
+ if (r <= 0)
+ return 0;
+
+ if (!u->dropin_paths)
+ u->dropin_paths = TAKE_PTR(l);
+ else {
+ r = strv_extend_strv(&u->dropin_paths, l, true);
+ if (r < 0)
+ return log_oom();
+ }
+
+ u->dropin_mtime = 0;
+ STRV_FOREACH(f, u->dropin_paths) {
+ struct stat st;
+
+ r = config_parse(u->id, *f, NULL,
+ UNIT_VTABLE(u)->sections,
+ config_item_perf_lookup, load_fragment_gperf_lookup,
+ 0, u, &st);
+ if (r > 0)
+ u->dropin_mtime = MAX(u->dropin_mtime, timespec_load(&st.st_mtim));
+ }
+
+ return 0;
+}
diff --git a/src/core/load-dropin.h b/src/core/load-dropin.h
new file mode 100644
index 0000000..f0b87d3
--- /dev/null
+++ b/src/core/load-dropin.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "dropin.h"
+#include "unit.h"
+
+/* Read service data supplementary drop-in directories */
+
+static inline int unit_find_dropin_paths(Unit *u, char ***paths) {
+ assert(u);
+
+ return unit_file_find_dropin_paths(NULL,
+ u->manager->lookup_paths.search_path,
+ u->manager->unit_path_cache,
+ ".d", ".conf",
+ u->id, u->aliases,
+ paths);
+}
+
+int unit_load_dropin(Unit *u);
diff --git a/src/core/load-fragment-gperf-nulstr.awk b/src/core/load-fragment-gperf-nulstr.awk
new file mode 100644
index 0000000..a1b7d1c
--- /dev/null
+++ b/src/core/load-fragment-gperf-nulstr.awk
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+BEGIN{
+ keywords=0 ; FS="," ;
+ print "extern const char load_fragment_gperf_nulstr[];" ;
+ print "const char load_fragment_gperf_nulstr[] ="
+}
+keyword==1 {
+ print "\"" $1 "\\0\""
+}
+/%%/ {
+ keyword=1
+}
+END {
+ print ";"
+}
diff --git a/src/core/load-fragment-gperf.gperf.in b/src/core/load-fragment-gperf.gperf.in
new file mode 100644
index 0000000..81a5971
--- /dev/null
+++ b/src/core/load-fragment-gperf.gperf.in
@@ -0,0 +1,564 @@
+{# SPDX-License-Identifier: LGPL-2.1-or-later #}
+
+{%- macro EXEC_CONTEXT_CONFIG_ITEMS(type) -%}
+{# Define the context options only once #}
+{{type}}.WorkingDirectory, config_parse_working_directory, 0, offsetof({{type}}, exec_context)
+{{type}}.RootDirectory, config_parse_unit_path_printf, true, offsetof({{type}}, exec_context.root_directory)
+{{type}}.RootImage, config_parse_unit_path_printf, true, offsetof({{type}}, exec_context.root_image)
+{{type}}.RootImageOptions, config_parse_root_image_options, 0, offsetof({{type}}, exec_context)
+{{type}}.RootHash, config_parse_exec_root_hash, 0, offsetof({{type}}, exec_context)
+{{type}}.RootHashSignature, config_parse_exec_root_hash_sig, 0, offsetof({{type}}, exec_context)
+{{type}}.RootVerity, config_parse_unit_path_printf, true, offsetof({{type}}, exec_context.root_verity)
+{{type}}.ExtensionDirectories, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.extension_directories)
+{{type}}.ExtensionImages, config_parse_extension_images, 0, offsetof({{type}}, exec_context)
+{{type}}.MountImages, config_parse_mount_images, 0, offsetof({{type}}, exec_context)
+{{type}}.User, config_parse_user_group_compat, 0, offsetof({{type}}, exec_context.user)
+{{type}}.Group, config_parse_user_group_compat, 0, offsetof({{type}}, exec_context.group)
+{{type}}.SupplementaryGroups, config_parse_user_group_strv_compat, 0, offsetof({{type}}, exec_context.supplementary_groups)
+{{type}}.Nice, config_parse_exec_nice, 0, offsetof({{type}}, exec_context)
+{{type}}.OOMScoreAdjust, config_parse_exec_oom_score_adjust, 0, offsetof({{type}}, exec_context)
+{{type}}.CoredumpFilter, config_parse_exec_coredump_filter, 0, offsetof({{type}}, exec_context)
+{{type}}.IOSchedulingClass, config_parse_exec_io_class, 0, offsetof({{type}}, exec_context)
+{{type}}.IOSchedulingPriority, config_parse_exec_io_priority, 0, offsetof({{type}}, exec_context)
+{{type}}.CPUSchedulingPolicy, config_parse_exec_cpu_sched_policy, 0, offsetof({{type}}, exec_context)
+{{type}}.CPUSchedulingPriority, config_parse_exec_cpu_sched_prio, 0, offsetof({{type}}, exec_context)
+{{type}}.CPUSchedulingResetOnFork, config_parse_bool, 0, offsetof({{type}}, exec_context.cpu_sched_reset_on_fork)
+{{type}}.CPUAffinity, config_parse_exec_cpu_affinity, 0, offsetof({{type}}, exec_context)
+{{type}}.NUMAPolicy, config_parse_numa_policy, 0, offsetof({{type}}, exec_context.numa_policy.type)
+{{type}}.NUMAMask, config_parse_numa_mask, 0, offsetof({{type}}, exec_context.numa_policy)
+{{type}}.UMask, config_parse_mode, 0, offsetof({{type}}, exec_context.umask)
+{{type}}.Environment, config_parse_environ, 0, offsetof({{type}}, exec_context.environment)
+{{type}}.EnvironmentFile, config_parse_unit_env_file, 0, offsetof({{type}}, exec_context.environment_files)
+{{type}}.PassEnvironment, config_parse_pass_environ, 0, offsetof({{type}}, exec_context.pass_environment)
+{{type}}.UnsetEnvironment, config_parse_unset_environ, 0, offsetof({{type}}, exec_context.unset_environment)
+{{type}}.DynamicUser, config_parse_bool, true, offsetof({{type}}, exec_context.dynamic_user)
+{{type}}.RemoveIPC, config_parse_bool, 0, offsetof({{type}}, exec_context.remove_ipc)
+{{type}}.StandardInput, config_parse_exec_input, 0, offsetof({{type}}, exec_context)
+{{type}}.StandardOutput, config_parse_exec_output, 0, offsetof({{type}}, exec_context)
+{{type}}.StandardError, config_parse_exec_output, 0, offsetof({{type}}, exec_context)
+{{type}}.StandardInputText, config_parse_exec_input_text, 0, offsetof({{type}}, exec_context)
+{{type}}.StandardInputData, config_parse_exec_input_data, 0, offsetof({{type}}, exec_context)
+{{type}}.TTYPath, config_parse_unit_path_printf, 0, offsetof({{type}}, exec_context.tty_path)
+{{type}}.TTYReset, config_parse_bool, 0, offsetof({{type}}, exec_context.tty_reset)
+{{type}}.TTYVHangup, config_parse_bool, 0, offsetof({{type}}, exec_context.tty_vhangup)
+{{type}}.TTYVTDisallocate, config_parse_bool, 0, offsetof({{type}}, exec_context.tty_vt_disallocate)
+{{type}}.TTYRows, config_parse_tty_size, 0, offsetof({{type}}, exec_context.tty_rows)
+{{type}}.TTYColumns, config_parse_tty_size, 0, offsetof({{type}}, exec_context.tty_cols)
+{{type}}.SyslogIdentifier, config_parse_unit_string_printf, 0, offsetof({{type}}, exec_context.syslog_identifier)
+{{type}}.SyslogFacility, config_parse_log_facility, 0, offsetof({{type}}, exec_context.syslog_priority)
+{{type}}.SyslogLevel, config_parse_log_level, 0, offsetof({{type}}, exec_context.syslog_priority)
+{{type}}.SyslogLevelPrefix, config_parse_bool, 0, offsetof({{type}}, exec_context.syslog_level_prefix)
+{{type}}.LogLevelMax, config_parse_log_level, 0, offsetof({{type}}, exec_context.log_level_max)
+{{type}}.LogRateLimitIntervalSec, config_parse_sec, 0, offsetof({{type}}, exec_context.log_ratelimit_interval_usec)
+{{type}}.LogRateLimitBurst, config_parse_unsigned, 0, offsetof({{type}}, exec_context.log_ratelimit_burst)
+{{type}}.LogExtraFields, config_parse_log_extra_fields, 0, offsetof({{type}}, exec_context)
+{{type}}.Capabilities, config_parse_warn_compat, DISABLED_LEGACY, offsetof({{type}}, exec_context)
+{{type}}.SecureBits, config_parse_exec_secure_bits, 0, offsetof({{type}}, exec_context.secure_bits)
+{{type}}.CapabilityBoundingSet, config_parse_capability_set, 0, offsetof({{type}}, exec_context.capability_bounding_set)
+{{type}}.AmbientCapabilities, config_parse_capability_set, 0, offsetof({{type}}, exec_context.capability_ambient_set)
+{{type}}.TimerSlackNSec, config_parse_nsec, 0, offsetof({{type}}, exec_context.timer_slack_nsec)
+{{type}}.NoNewPrivileges, config_parse_bool, 0, offsetof({{type}}, exec_context.no_new_privileges)
+{{type}}.KeyringMode, config_parse_exec_keyring_mode, 0, offsetof({{type}}, exec_context.keyring_mode)
+{{type}}.ProtectProc, config_parse_protect_proc, 0, offsetof({{type}}, exec_context.protect_proc)
+{{type}}.ProcSubset, config_parse_proc_subset, 0, offsetof({{type}}, exec_context.proc_subset)
+{% if HAVE_SECCOMP %}
+{{type}}.SystemCallFilter, config_parse_syscall_filter, 0, offsetof({{type}}, exec_context)
+{{type}}.SystemCallArchitectures, config_parse_syscall_archs, 0, offsetof({{type}}, exec_context.syscall_archs)
+{{type}}.SystemCallErrorNumber, config_parse_syscall_errno, 0, offsetof({{type}}, exec_context)
+{{type}}.SystemCallLog, config_parse_syscall_log, 0, offsetof({{type}}, exec_context)
+{{type}}.MemoryDenyWriteExecute, config_parse_bool, 0, offsetof({{type}}, exec_context.memory_deny_write_execute)
+{{type}}.RestrictNamespaces, config_parse_restrict_namespaces, 0, offsetof({{type}}, exec_context)
+{{type}}.RestrictRealtime, config_parse_bool, 0, offsetof({{type}}, exec_context.restrict_realtime)
+{{type}}.RestrictSUIDSGID, config_parse_bool, 0, offsetof({{type}}, exec_context.restrict_suid_sgid)
+{{type}}.RestrictAddressFamilies, config_parse_address_families, 0, offsetof({{type}}, exec_context)
+{{type}}.LockPersonality, config_parse_bool, 0, offsetof({{type}}, exec_context.lock_personality)
+{% else %}
+{{type}}.SystemCallFilter, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{{type}}.SystemCallArchitectures, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{{type}}.SystemCallErrorNumber, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{{type}}.SystemCallLog, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{{type}}.MemoryDenyWriteExecute, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{{type}}.RestrictNamespaces, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{{type}}.RestrictRealtime, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{{type}}.RestrictSUIDSGID, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{{type}}.RestrictAddressFamilies, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{{type}}.LockPersonality, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{% endif %}
+{{type}}.RestrictFileSystems, config_parse_restrict_filesystems, 0, offsetof({{type}}, exec_context)
+{{type}}.LimitCPU, config_parse_rlimit, RLIMIT_CPU, offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitFSIZE, config_parse_rlimit, RLIMIT_FSIZE, offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitDATA, config_parse_rlimit, RLIMIT_DATA, offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitSTACK, config_parse_rlimit, RLIMIT_STACK, offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitCORE, config_parse_rlimit, RLIMIT_CORE, offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitRSS, config_parse_rlimit, RLIMIT_RSS, offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitNOFILE, config_parse_rlimit, RLIMIT_NOFILE, offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitAS, config_parse_rlimit, RLIMIT_AS, offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitNPROC, config_parse_rlimit, RLIMIT_NPROC, offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitMEMLOCK, config_parse_rlimit, RLIMIT_MEMLOCK, offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitLOCKS, config_parse_rlimit, RLIMIT_LOCKS, offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitSIGPENDING, config_parse_rlimit, RLIMIT_SIGPENDING, offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitMSGQUEUE, config_parse_rlimit, RLIMIT_MSGQUEUE, offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitNICE, config_parse_rlimit, RLIMIT_NICE, offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitRTPRIO, config_parse_rlimit, RLIMIT_RTPRIO, offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitRTTIME, config_parse_rlimit, RLIMIT_RTTIME, offsetof({{type}}, exec_context.rlimit)
+{{type}}.ReadWriteDirectories, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.read_write_paths)
+{{type}}.ReadOnlyDirectories, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.read_only_paths)
+{{type}}.InaccessibleDirectories, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.inaccessible_paths)
+{{type}}.ReadWritePaths, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.read_write_paths)
+{{type}}.ReadOnlyPaths, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.read_only_paths)
+{{type}}.InaccessiblePaths, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.inaccessible_paths)
+{{type}}.ExecPaths, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.exec_paths)
+{{type}}.NoExecPaths, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.no_exec_paths)
+{{type}}.ExecSearchPath, config_parse_colon_separated_paths, 0, offsetof({{type}}, exec_context.exec_search_path)
+{{type}}.BindPaths, config_parse_bind_paths, 0, offsetof({{type}}, exec_context)
+{{type}}.BindReadOnlyPaths, config_parse_bind_paths, 0, offsetof({{type}}, exec_context)
+{{type}}.TemporaryFileSystem, config_parse_temporary_filesystems, 0, offsetof({{type}}, exec_context)
+{{type}}.PrivateTmp, config_parse_bool, 0, offsetof({{type}}, exec_context.private_tmp)
+{{type}}.PrivateDevices, config_parse_bool, 0, offsetof({{type}}, exec_context.private_devices)
+{{type}}.ProtectKernelTunables, config_parse_bool, 0, offsetof({{type}}, exec_context.protect_kernel_tunables)
+{{type}}.ProtectKernelModules, config_parse_bool, 0, offsetof({{type}}, exec_context.protect_kernel_modules)
+{{type}}.ProtectKernelLogs, config_parse_bool, 0, offsetof({{type}}, exec_context.protect_kernel_logs)
+{{type}}.ProtectClock, config_parse_bool, 0, offsetof({{type}}, exec_context.protect_clock)
+{{type}}.ProtectControlGroups, config_parse_bool, 0, offsetof({{type}}, exec_context.protect_control_groups)
+{{type}}.NetworkNamespacePath, config_parse_unit_path_printf, 0, offsetof({{type}}, exec_context.network_namespace_path)
+{{type}}.IPCNamespacePath, config_parse_unit_path_printf, 0, offsetof({{type}}, exec_context.ipc_namespace_path)
+{{type}}.LogNamespace, config_parse_log_namespace, 0, offsetof({{type}}, exec_context)
+{{type}}.PrivateNetwork, config_parse_bool, 0, offsetof({{type}}, exec_context.private_network)
+{{type}}.PrivateUsers, config_parse_bool, 0, offsetof({{type}}, exec_context.private_users)
+{{type}}.PrivateMounts, config_parse_bool, 0, offsetof({{type}}, exec_context.private_mounts)
+{{type}}.PrivateIPC, config_parse_bool, 0, offsetof({{type}}, exec_context.private_ipc)
+{{type}}.ProtectSystem, config_parse_protect_system, 0, offsetof({{type}}, exec_context.protect_system)
+{{type}}.ProtectHome, config_parse_protect_home, 0, offsetof({{type}}, exec_context.protect_home)
+{{type}}.MountFlags, config_parse_exec_mount_flags, 0, offsetof({{type}}, exec_context.mount_flags)
+{{type}}.MountAPIVFS, config_parse_exec_mount_apivfs, 0, offsetof({{type}}, exec_context)
+{{type}}.Personality, config_parse_personality, 0, offsetof({{type}}, exec_context.personality)
+{{type}}.RuntimeDirectoryPreserve, config_parse_runtime_preserve_mode, 0, offsetof({{type}}, exec_context.runtime_directory_preserve_mode)
+{{type}}.RuntimeDirectoryMode, config_parse_mode, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_RUNTIME].mode)
+{{type}}.RuntimeDirectory, config_parse_exec_directories, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_RUNTIME])
+{{type}}.StateDirectoryMode, config_parse_mode, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_STATE].mode)
+{{type}}.StateDirectory, config_parse_exec_directories, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_STATE])
+{{type}}.CacheDirectoryMode, config_parse_mode, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_CACHE].mode)
+{{type}}.CacheDirectory, config_parse_exec_directories, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_CACHE])
+{{type}}.LogsDirectoryMode, config_parse_mode, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_LOGS].mode)
+{{type}}.LogsDirectory, config_parse_exec_directories, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_LOGS])
+{{type}}.ConfigurationDirectoryMode, config_parse_mode, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_CONFIGURATION].mode)
+{{type}}.ConfigurationDirectory, config_parse_exec_directories, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_CONFIGURATION])
+{{type}}.SetCredential, config_parse_set_credential, 0, offsetof({{type}}, exec_context)
+{{type}}.SetCredentialEncrypted, config_parse_set_credential, 1, offsetof({{type}}, exec_context)
+{{type}}.LoadCredential, config_parse_load_credential, 0, offsetof({{type}}, exec_context)
+{{type}}.LoadCredentialEncrypted, config_parse_load_credential, 1, offsetof({{type}}, exec_context)
+{{type}}.TimeoutCleanSec, config_parse_sec, 0, offsetof({{type}}, exec_context.timeout_clean_usec)
+{% if HAVE_PAM %}
+{{type}}.PAMName, config_parse_unit_string_printf, 0, offsetof({{type}}, exec_context.pam_name)
+{% else %}
+{{type}}.PAMName, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{% endif %}
+{{type}}.IgnoreSIGPIPE, config_parse_bool, 0, offsetof({{type}}, exec_context.ignore_sigpipe)
+{{type}}.UtmpIdentifier, config_parse_unit_string_printf, 0, offsetof({{type}}, exec_context.utmp_id)
+{{type}}.UtmpMode, config_parse_exec_utmp_mode, 0, offsetof({{type}}, exec_context.utmp_mode)
+{% if HAVE_SELINUX %}
+{{type}}.SELinuxContext, config_parse_exec_selinux_context, 0, offsetof({{type}}, exec_context)
+{% else %}
+{{type}}.SELinuxContext, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{% endif %}
+{% if HAVE_APPARMOR %}
+{{type}}.AppArmorProfile, config_parse_exec_apparmor_profile, 0, offsetof({{type}}, exec_context)
+{% else %}
+{{type}}.AppArmorProfile, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{% endif %}
+{% if ENABLE_SMACK %}
+{{type}}.SmackProcessLabel, config_parse_exec_smack_process_label, 0, offsetof({{type}}, exec_context)
+{% else %}
+{{type}}.SmackProcessLabel, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{% endif %}
+{{type}}.ProtectHostname, config_parse_bool, 0, offsetof({{type}}, exec_context.protect_hostname)
+{%- endmacro -%}
+
+{%- macro KILL_CONTEXT_CONFIG_ITEMS(type) -%}
+{{type}}.SendSIGKILL, config_parse_bool, 0, offsetof({{type}}, kill_context.send_sigkill)
+{{type}}.SendSIGHUP, config_parse_bool, 0, offsetof({{type}}, kill_context.send_sighup)
+{{type}}.KillMode, config_parse_kill_mode, 0, offsetof({{type}}, kill_context.kill_mode)
+{{type}}.KillSignal, config_parse_signal, 0, offsetof({{type}}, kill_context.kill_signal)
+{{type}}.RestartKillSignal, config_parse_signal, 0, offsetof({{type}}, kill_context.restart_kill_signal)
+{{type}}.FinalKillSignal, config_parse_signal, 0, offsetof({{type}}, kill_context.final_kill_signal)
+{{type}}.WatchdogSignal, config_parse_signal, 0, offsetof({{type}}, kill_context.watchdog_signal)
+{%- endmacro -%}
+
+{%- macro CGROUP_CONTEXT_CONFIG_ITEMS(type) -%}
+{{type}}.Slice, config_parse_unit_slice, 0, 0
+{{type}}.AllowedCPUs, config_parse_allowed_cpuset, 0, offsetof({{type}}, cgroup_context.cpuset_cpus)
+{{type}}.StartupAllowedCPUs, config_parse_allowed_cpuset, 0, offsetof({{type}}, cgroup_context.startup_cpuset_cpus)
+{{type}}.AllowedMemoryNodes, config_parse_allowed_cpuset, 0, offsetof({{type}}, cgroup_context.cpuset_mems)
+{{type}}.StartupAllowedMemoryNodes, config_parse_allowed_cpuset, 0, offsetof({{type}}, cgroup_context.startup_cpuset_mems)
+{{type}}.CPUAccounting, config_parse_bool, 0, offsetof({{type}}, cgroup_context.cpu_accounting)
+{{type}}.CPUWeight, config_parse_cg_cpu_weight, 0, offsetof({{type}}, cgroup_context.cpu_weight)
+{{type}}.StartupCPUWeight, config_parse_cg_cpu_weight, 0, offsetof({{type}}, cgroup_context.startup_cpu_weight)
+{{type}}.CPUShares, config_parse_cpu_shares, 0, offsetof({{type}}, cgroup_context.cpu_shares)
+{{type}}.StartupCPUShares, config_parse_cpu_shares, 0, offsetof({{type}}, cgroup_context.startup_cpu_shares)
+{{type}}.CPUQuota, config_parse_cpu_quota, 0, offsetof({{type}}, cgroup_context)
+{{type}}.CPUQuotaPeriodSec, config_parse_sec_def_infinity, 0, offsetof({{type}}, cgroup_context.cpu_quota_period_usec)
+{{type}}.MemoryAccounting, config_parse_bool, 0, offsetof({{type}}, cgroup_context.memory_accounting)
+{{type}}.MemoryMin, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.DefaultMemoryMin, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.DefaultMemoryLow, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.MemoryLow, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.MemoryHigh, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.MemoryMax, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.MemorySwapMax, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.MemoryLimit, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.DeviceAllow, config_parse_device_allow, 0, offsetof({{type}}, cgroup_context)
+{{type}}.DevicePolicy, config_parse_device_policy, 0, offsetof({{type}}, cgroup_context.device_policy)
+{{type}}.IOAccounting, config_parse_bool, 0, offsetof({{type}}, cgroup_context.io_accounting)
+{{type}}.IOWeight, config_parse_cg_weight, 0, offsetof({{type}}, cgroup_context.io_weight)
+{{type}}.StartupIOWeight, config_parse_cg_weight, 0, offsetof({{type}}, cgroup_context.startup_io_weight)
+{{type}}.IODeviceWeight, config_parse_io_device_weight, 0, offsetof({{type}}, cgroup_context)
+{{type}}.IOReadBandwidthMax, config_parse_io_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.IOWriteBandwidthMax, config_parse_io_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.IOReadIOPSMax, config_parse_io_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.IOWriteIOPSMax, config_parse_io_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.IODeviceLatencyTargetSec, config_parse_io_device_latency, 0, offsetof({{type}}, cgroup_context)
+{{type}}.BlockIOAccounting, config_parse_bool, 0, offsetof({{type}}, cgroup_context.blockio_accounting)
+{{type}}.BlockIOWeight, config_parse_blockio_weight, 0, offsetof({{type}}, cgroup_context.blockio_weight)
+{{type}}.StartupBlockIOWeight, config_parse_blockio_weight, 0, offsetof({{type}}, cgroup_context.startup_blockio_weight)
+{{type}}.BlockIODeviceWeight, config_parse_blockio_device_weight, 0, offsetof({{type}}, cgroup_context)
+{{type}}.BlockIOReadBandwidth, config_parse_blockio_bandwidth, 0, offsetof({{type}}, cgroup_context)
+{{type}}.BlockIOWriteBandwidth, config_parse_blockio_bandwidth, 0, offsetof({{type}}, cgroup_context)
+{{type}}.TasksAccounting, config_parse_bool, 0, offsetof({{type}}, cgroup_context.tasks_accounting)
+{{type}}.TasksMax, config_parse_tasks_max, 0, offsetof({{type}}, cgroup_context.tasks_max)
+{{type}}.Delegate, config_parse_delegate, 0, offsetof({{type}}, cgroup_context)
+{{type}}.DisableControllers, config_parse_disable_controllers, 0, offsetof({{type}}, cgroup_context)
+{{type}}.IPAccounting, config_parse_bool, 0, offsetof({{type}}, cgroup_context.ip_accounting)
+{{type}}.IPAddressAllow, config_parse_in_addr_prefixes, AF_UNSPEC, offsetof({{type}}, cgroup_context.ip_address_allow)
+{{type}}.IPAddressDeny, config_parse_in_addr_prefixes, AF_UNSPEC, offsetof({{type}}, cgroup_context.ip_address_deny)
+{{type}}.IPIngressFilterPath, config_parse_ip_filter_bpf_progs, 0, offsetof({{type}}, cgroup_context.ip_filters_ingress)
+{{type}}.IPEgressFilterPath, config_parse_ip_filter_bpf_progs, 0, offsetof({{type}}, cgroup_context.ip_filters_egress)
+{{type}}.ManagedOOMSwap, config_parse_managed_oom_mode, 0, offsetof({{type}}, cgroup_context.moom_swap)
+{{type}}.ManagedOOMMemoryPressure, config_parse_managed_oom_mode, 0, offsetof({{type}}, cgroup_context.moom_mem_pressure)
+{{type}}.ManagedOOMMemoryPressureLimit, config_parse_managed_oom_mem_pressure_limit, 0, offsetof({{type}}, cgroup_context.moom_mem_pressure_limit)
+{{type}}.ManagedOOMPreference, config_parse_managed_oom_preference, 0, offsetof({{type}}, cgroup_context.moom_preference)
+{{type}}.NetClass, config_parse_warn_compat, DISABLED_LEGACY, 0
+{{type}}.BPFProgram, config_parse_bpf_foreign_program, 0, offsetof({{type}}, cgroup_context)
+{{type}}.SocketBindAllow, config_parse_cgroup_socket_bind, 0, offsetof({{type}}, cgroup_context.socket_bind_allow)
+{{type}}.SocketBindDeny, config_parse_cgroup_socket_bind, 0, offsetof({{type}}, cgroup_context.socket_bind_deny)
+{{type}}.RestrictNetworkInterfaces, config_parse_restrict_network_interfaces, 0, offsetof({{type}}, cgroup_context)
+{%- endmacro -%}
+
+%{
+#if __GNUC__ >= 7
+_Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
+#endif
+#include <stddef.h>
+#include "all-units.h"
+#include "conf-parser.h"
+#include "in-addr-prefix-util.h"
+#include "load-fragment.h"
+%}
+struct ConfigPerfItem;
+%null_strings
+%language=ANSI-C
+%define slot-name section_and_lvalue
+%define hash-function-name load_fragment_gperf_hash
+%define lookup-function-name load_fragment_gperf_lookup
+%readonly-tables
+%omit-struct-type
+%struct-type
+%includes
+%%
+Unit.Description, config_parse_unit_string_printf, 0, offsetof(Unit, description)
+Unit.Documentation, config_parse_documentation, 0, offsetof(Unit, documentation)
+Unit.SourcePath, config_parse_unit_path_printf, 0, offsetof(Unit, source_path)
+Unit.Requires, config_parse_unit_deps, UNIT_REQUIRES, 0
+Unit.Requisite, config_parse_unit_deps, UNIT_REQUISITE, 0
+Unit.Wants, config_parse_unit_deps, UNIT_WANTS, 0
+Unit.BindsTo, config_parse_unit_deps, UNIT_BINDS_TO, 0
+Unit.BindTo, config_parse_unit_deps, UNIT_BINDS_TO, 0
+Unit.Upholds, config_parse_unit_deps, UNIT_UPHOLDS, 0
+Unit.Conflicts, config_parse_unit_deps, UNIT_CONFLICTS, 0
+Unit.Before, config_parse_unit_deps, UNIT_BEFORE, 0
+Unit.After, config_parse_unit_deps, UNIT_AFTER, 0
+Unit.OnSuccess, config_parse_unit_deps, UNIT_ON_SUCCESS, 0
+Unit.OnFailure, config_parse_unit_deps, UNIT_ON_FAILURE, 0
+Unit.PropagatesReloadTo, config_parse_unit_deps, UNIT_PROPAGATES_RELOAD_TO, 0
+Unit.PropagateReloadTo, config_parse_unit_deps, UNIT_PROPAGATES_RELOAD_TO, 0
+Unit.ReloadPropagatedFrom, config_parse_unit_deps, UNIT_RELOAD_PROPAGATED_FROM, 0
+Unit.PropagateReloadFrom, config_parse_unit_deps, UNIT_RELOAD_PROPAGATED_FROM, 0
+Unit.PropagatesStopTo, config_parse_unit_deps, UNIT_PROPAGATES_STOP_TO, 0
+Unit.StopPropagatedFrom, config_parse_unit_deps, UNIT_STOP_PROPAGATED_FROM, 0
+Unit.PartOf, config_parse_unit_deps, UNIT_PART_OF, 0
+Unit.JoinsNamespaceOf, config_parse_unit_deps, UNIT_JOINS_NAMESPACE_OF, 0
+Unit.RequiresOverridable, config_parse_obsolete_unit_deps, UNIT_REQUIRES, 0
+Unit.RequisiteOverridable, config_parse_obsolete_unit_deps, UNIT_REQUISITE, 0
+Unit.RequiresMountsFor, config_parse_unit_requires_mounts_for, 0, 0
+Unit.StopWhenUnneeded, config_parse_bool, 0, offsetof(Unit, stop_when_unneeded)
+Unit.RefuseManualStart, config_parse_bool, 0, offsetof(Unit, refuse_manual_start)
+Unit.RefuseManualStop, config_parse_bool, 0, offsetof(Unit, refuse_manual_stop)
+Unit.AllowIsolate, config_parse_bool, 0, offsetof(Unit, allow_isolate)
+Unit.DefaultDependencies, config_parse_bool, 0, offsetof(Unit, default_dependencies)
+Unit.OnSuccessJobMode, config_parse_job_mode, 0, offsetof(Unit, on_success_job_mode)
+Unit.OnFailureJobMode, config_parse_job_mode, 0, offsetof(Unit, on_failure_job_mode)
+{# The following is a legacy alias name for compatibility #}
+Unit.OnFailureIsolate, config_parse_job_mode_isolate, 0, offsetof(Unit, on_failure_job_mode)
+Unit.IgnoreOnIsolate, config_parse_bool, 0, offsetof(Unit, ignore_on_isolate)
+Unit.IgnoreOnSnapshot, config_parse_warn_compat, DISABLED_LEGACY, 0
+Unit.JobTimeoutSec, config_parse_job_timeout_sec, 0, 0
+Unit.JobRunningTimeoutSec, config_parse_job_running_timeout_sec, 0, 0
+Unit.JobTimeoutAction, config_parse_emergency_action, 0, offsetof(Unit, job_timeout_action)
+Unit.JobTimeoutRebootArgument, config_parse_unit_string_printf, 0, offsetof(Unit, job_timeout_reboot_arg)
+Unit.StartLimitIntervalSec, config_parse_sec, 0, offsetof(Unit, start_ratelimit.interval)
+{# The following is a legacy alias name for compatibility #}
+Unit.StartLimitInterval, config_parse_sec, 0, offsetof(Unit, start_ratelimit.interval)
+Unit.StartLimitBurst, config_parse_unsigned, 0, offsetof(Unit, start_ratelimit.burst)
+Unit.StartLimitAction, config_parse_emergency_action, 0, offsetof(Unit, start_limit_action)
+Unit.FailureAction, config_parse_emergency_action, 0, offsetof(Unit, failure_action)
+Unit.SuccessAction, config_parse_emergency_action, 0, offsetof(Unit, success_action)
+Unit.FailureActionExitStatus, config_parse_exit_status, 0, offsetof(Unit, failure_action_exit_status)
+Unit.SuccessActionExitStatus, config_parse_exit_status, 0, offsetof(Unit, success_action_exit_status)
+Unit.RebootArgument, config_parse_unit_string_printf, 0, offsetof(Unit, reboot_arg)
+Unit.ConditionPathExists, config_parse_unit_condition_path, CONDITION_PATH_EXISTS, offsetof(Unit, conditions)
+Unit.ConditionPathExistsGlob, config_parse_unit_condition_path, CONDITION_PATH_EXISTS_GLOB, offsetof(Unit, conditions)
+Unit.ConditionPathIsDirectory, config_parse_unit_condition_path, CONDITION_PATH_IS_DIRECTORY, offsetof(Unit, conditions)
+Unit.ConditionPathIsSymbolicLink, config_parse_unit_condition_path, CONDITION_PATH_IS_SYMBOLIC_LINK, offsetof(Unit, conditions)
+Unit.ConditionPathIsMountPoint, config_parse_unit_condition_path, CONDITION_PATH_IS_MOUNT_POINT, offsetof(Unit, conditions)
+Unit.ConditionPathIsReadWrite, config_parse_unit_condition_path, CONDITION_PATH_IS_READ_WRITE, offsetof(Unit, conditions)
+Unit.ConditionPathIsEncrypted, config_parse_unit_condition_path, CONDITION_PATH_IS_ENCRYPTED, offsetof(Unit, conditions)
+Unit.ConditionDirectoryNotEmpty, config_parse_unit_condition_path, CONDITION_DIRECTORY_NOT_EMPTY, offsetof(Unit, conditions)
+Unit.ConditionFileNotEmpty, config_parse_unit_condition_path, CONDITION_FILE_NOT_EMPTY, offsetof(Unit, conditions)
+Unit.ConditionFileIsExecutable, config_parse_unit_condition_path, CONDITION_FILE_IS_EXECUTABLE, offsetof(Unit, conditions)
+Unit.ConditionNeedsUpdate, config_parse_unit_condition_path, CONDITION_NEEDS_UPDATE, offsetof(Unit, conditions)
+Unit.ConditionFirstBoot, config_parse_unit_condition_string, CONDITION_FIRST_BOOT, offsetof(Unit, conditions)
+Unit.ConditionArchitecture, config_parse_unit_condition_string, CONDITION_ARCHITECTURE, offsetof(Unit, conditions)
+Unit.ConditionFirmware, config_parse_unit_condition_string, CONDITION_FIRMWARE, offsetof(Unit, conditions)
+Unit.ConditionVirtualization, config_parse_unit_condition_string, CONDITION_VIRTUALIZATION, offsetof(Unit, conditions)
+Unit.ConditionHost, config_parse_unit_condition_string, CONDITION_HOST, offsetof(Unit, conditions)
+Unit.ConditionKernelCommandLine, config_parse_unit_condition_string, CONDITION_KERNEL_COMMAND_LINE, offsetof(Unit, conditions)
+Unit.ConditionKernelVersion, config_parse_unit_condition_string, CONDITION_KERNEL_VERSION, offsetof(Unit, conditions)
+Unit.ConditionCredential, config_parse_unit_condition_string, CONDITION_CREDENTIAL, offsetof(Unit, conditions)
+Unit.ConditionSecurity, config_parse_unit_condition_string, CONDITION_SECURITY, offsetof(Unit, conditions)
+Unit.ConditionCapability, config_parse_unit_condition_string, CONDITION_CAPABILITY, offsetof(Unit, conditions)
+Unit.ConditionACPower, config_parse_unit_condition_string, CONDITION_AC_POWER, offsetof(Unit, conditions)
+Unit.ConditionMemory, config_parse_unit_condition_string, CONDITION_MEMORY, offsetof(Unit, conditions)
+Unit.ConditionCPUFeature, config_parse_unit_condition_string, CONDITION_CPU_FEATURE, offsetof(Unit, conditions)
+Unit.ConditionCPUs, config_parse_unit_condition_string, CONDITION_CPUS, offsetof(Unit, conditions)
+Unit.ConditionEnvironment, config_parse_unit_condition_string, CONDITION_ENVIRONMENT, offsetof(Unit, conditions)
+Unit.ConditionUser, config_parse_unit_condition_string, CONDITION_USER, offsetof(Unit, conditions)
+Unit.ConditionGroup, config_parse_unit_condition_string, CONDITION_GROUP, offsetof(Unit, conditions)
+Unit.ConditionControlGroupController, config_parse_unit_condition_string, CONDITION_CONTROL_GROUP_CONTROLLER, offsetof(Unit, conditions)
+Unit.ConditionOSRelease, config_parse_unit_condition_string, CONDITION_OS_RELEASE, offsetof(Unit, conditions)
+Unit.ConditionMemoryPressure, config_parse_unit_condition_string, CONDITION_MEMORY_PRESSURE, offsetof(Unit, conditions)
+Unit.ConditionCPUPressure, config_parse_unit_condition_string, CONDITION_CPU_PRESSURE, offsetof(Unit, conditions)
+Unit.ConditionIOPressure, config_parse_unit_condition_string, CONDITION_IO_PRESSURE, offsetof(Unit, conditions)
+Unit.AssertPathExists, config_parse_unit_condition_path, CONDITION_PATH_EXISTS, offsetof(Unit, asserts)
+Unit.AssertPathExistsGlob, config_parse_unit_condition_path, CONDITION_PATH_EXISTS_GLOB, offsetof(Unit, asserts)
+Unit.AssertPathIsDirectory, config_parse_unit_condition_path, CONDITION_PATH_IS_DIRECTORY, offsetof(Unit, asserts)
+Unit.AssertPathIsSymbolicLink, config_parse_unit_condition_path, CONDITION_PATH_IS_SYMBOLIC_LINK, offsetof(Unit, asserts)
+Unit.AssertPathIsMountPoint, config_parse_unit_condition_path, CONDITION_PATH_IS_MOUNT_POINT, offsetof(Unit, asserts)
+Unit.AssertPathIsReadWrite, config_parse_unit_condition_path, CONDITION_PATH_IS_READ_WRITE, offsetof(Unit, asserts)
+Unit.AssertPathIsEncrypted, config_parse_unit_condition_path, CONDITION_PATH_IS_ENCRYPTED, offsetof(Unit, asserts)
+Unit.AssertDirectoryNotEmpty, config_parse_unit_condition_path, CONDITION_DIRECTORY_NOT_EMPTY, offsetof(Unit, asserts)
+Unit.AssertFileNotEmpty, config_parse_unit_condition_path, CONDITION_FILE_NOT_EMPTY, offsetof(Unit, asserts)
+Unit.AssertFileIsExecutable, config_parse_unit_condition_path, CONDITION_FILE_IS_EXECUTABLE, offsetof(Unit, asserts)
+Unit.AssertNeedsUpdate, config_parse_unit_condition_path, CONDITION_NEEDS_UPDATE, offsetof(Unit, asserts)
+Unit.AssertFirstBoot, config_parse_unit_condition_string, CONDITION_FIRST_BOOT, offsetof(Unit, asserts)
+Unit.AssertArchitecture, config_parse_unit_condition_string, CONDITION_ARCHITECTURE, offsetof(Unit, asserts)
+Unit.AssertVirtualization, config_parse_unit_condition_string, CONDITION_VIRTUALIZATION, offsetof(Unit, asserts)
+Unit.AssertHost, config_parse_unit_condition_string, CONDITION_HOST, offsetof(Unit, asserts)
+Unit.AssertKernelCommandLine, config_parse_unit_condition_string, CONDITION_KERNEL_COMMAND_LINE, offsetof(Unit, asserts)
+Unit.AssertKernelVersion, config_parse_unit_condition_string, CONDITION_KERNEL_VERSION, offsetof(Unit, asserts)
+Unit.AssertCredential, config_parse_unit_condition_string, CONDITION_CREDENTIAL, offsetof(Unit, asserts)
+Unit.AssertSecurity, config_parse_unit_condition_string, CONDITION_SECURITY, offsetof(Unit, asserts)
+Unit.AssertCapability, config_parse_unit_condition_string, CONDITION_CAPABILITY, offsetof(Unit, asserts)
+Unit.AssertACPower, config_parse_unit_condition_string, CONDITION_AC_POWER, offsetof(Unit, asserts)
+Unit.AssertMemory, config_parse_unit_condition_string, CONDITION_MEMORY, offsetof(Unit, asserts)
+Unit.AssertCPUFeature, config_parse_unit_condition_string, CONDITION_CPU_FEATURE, offsetof(Unit, asserts)
+Unit.AssertCPUs, config_parse_unit_condition_string, CONDITION_CPUS, offsetof(Unit, asserts)
+Unit.AssertEnvironment, config_parse_unit_condition_string, CONDITION_ENVIRONMENT, offsetof(Unit, asserts)
+Unit.AssertUser, config_parse_unit_condition_string, CONDITION_USER, offsetof(Unit, asserts)
+Unit.AssertGroup, config_parse_unit_condition_string, CONDITION_GROUP, offsetof(Unit, asserts)
+Unit.AssertControlGroupController, config_parse_unit_condition_string, CONDITION_CONTROL_GROUP_CONTROLLER, offsetof(Unit, asserts)
+Unit.AssertOSRelease, config_parse_unit_condition_string, CONDITION_OS_RELEASE, offsetof(Unit, asserts)
+Unit.AssertMemoryPressure, config_parse_unit_condition_string, CONDITION_MEMORY_PRESSURE, offsetof(Unit, asserts)
+Unit.AssertCPUPressure, config_parse_unit_condition_string, CONDITION_CPU_PRESSURE, offsetof(Unit, asserts)
+Unit.AssertIOPressure, config_parse_unit_condition_string, CONDITION_IO_PRESSURE, offsetof(Unit, asserts)
+Unit.CollectMode, config_parse_collect_mode, 0, offsetof(Unit, collect_mode)
+Service.PIDFile, config_parse_pid_file, 0, offsetof(Service, pid_file)
+Service.ExecCondition, config_parse_exec, SERVICE_EXEC_CONDITION, offsetof(Service, exec_command)
+Service.ExecStartPre, config_parse_exec, SERVICE_EXEC_START_PRE, offsetof(Service, exec_command)
+Service.ExecStart, config_parse_exec, SERVICE_EXEC_START, offsetof(Service, exec_command)
+Service.ExecStartPost, config_parse_exec, SERVICE_EXEC_START_POST, offsetof(Service, exec_command)
+Service.ExecReload, config_parse_exec, SERVICE_EXEC_RELOAD, offsetof(Service, exec_command)
+Service.ExecStop, config_parse_exec, SERVICE_EXEC_STOP, offsetof(Service, exec_command)
+Service.ExecStopPost, config_parse_exec, SERVICE_EXEC_STOP_POST, offsetof(Service, exec_command)
+Service.RestartSec, config_parse_sec, 0, offsetof(Service, restart_usec)
+Service.TimeoutSec, config_parse_service_timeout, 0, 0
+Service.TimeoutStartSec, config_parse_service_timeout, 0, 0
+Service.TimeoutStopSec, config_parse_sec_fix_0, 0, offsetof(Service, timeout_stop_usec)
+Service.TimeoutAbortSec, config_parse_service_timeout_abort, 0, 0
+Service.TimeoutStartFailureMode, config_parse_service_timeout_failure_mode, 0, offsetof(Service, timeout_start_failure_mode)
+Service.TimeoutStopFailureMode, config_parse_service_timeout_failure_mode, 0, offsetof(Service, timeout_stop_failure_mode)
+Service.RuntimeMaxSec, config_parse_sec, 0, offsetof(Service, runtime_max_usec)
+Service.RuntimeRandomizedExtraSec, config_parse_sec, 0, offsetof(Service, runtime_rand_extra_usec)
+Service.WatchdogSec, config_parse_sec, 0, offsetof(Service, watchdog_usec)
+{# The following five only exist for compatibility, they moved into Unit, see above #}
+Service.StartLimitInterval, config_parse_sec, 0, offsetof(Unit, start_ratelimit.interval)
+Service.StartLimitBurst, config_parse_unsigned, 0, offsetof(Unit, start_ratelimit.burst)
+Service.StartLimitAction, config_parse_emergency_action, 0, offsetof(Unit, start_limit_action)
+Service.FailureAction, config_parse_emergency_action, 0, offsetof(Unit, failure_action)
+Service.RebootArgument, config_parse_unit_string_printf, 0, offsetof(Unit, reboot_arg)
+Service.Type, config_parse_service_type, 0, offsetof(Service, type)
+Service.ExitType, config_parse_service_exit_type, 0, offsetof(Service, exit_type)
+Service.Restart, config_parse_service_restart, 0, offsetof(Service, restart)
+Service.PermissionsStartOnly, config_parse_bool, 0, offsetof(Service, permissions_start_only)
+Service.RootDirectoryStartOnly, config_parse_bool, 0, offsetof(Service, root_directory_start_only)
+Service.RemainAfterExit, config_parse_bool, 0, offsetof(Service, remain_after_exit)
+Service.GuessMainPID, config_parse_bool, 0, offsetof(Service, guess_main_pid)
+Service.RestartPreventExitStatus, config_parse_set_status, 0, offsetof(Service, restart_prevent_status)
+Service.RestartForceExitStatus, config_parse_set_status, 0, offsetof(Service, restart_force_status)
+Service.SuccessExitStatus, config_parse_set_status, 0, offsetof(Service, success_status)
+Service.SysVStartPriority, config_parse_warn_compat, DISABLED_LEGACY, 0
+Service.NonBlocking, config_parse_bool, 0, offsetof(Service, exec_context.non_blocking)
+Service.BusName, config_parse_bus_name, 0, offsetof(Service, bus_name)
+Service.FileDescriptorStoreMax, config_parse_unsigned, 0, offsetof(Service, n_fd_store_max)
+Service.NotifyAccess, config_parse_notify_access, 0, offsetof(Service, notify_access)
+Service.Sockets, config_parse_service_sockets, 0, 0
+Service.BusPolicy, config_parse_warn_compat, DISABLED_LEGACY, 0
+Service.USBFunctionDescriptors, config_parse_unit_path_printf, 0, offsetof(Service, usb_function_descriptors)
+Service.USBFunctionStrings, config_parse_unit_path_printf, 0, offsetof(Service, usb_function_strings)
+Service.OOMPolicy, config_parse_oom_policy, 0, offsetof(Service, oom_policy)
+{{ EXEC_CONTEXT_CONFIG_ITEMS('Service') }}
+{{ CGROUP_CONTEXT_CONFIG_ITEMS('Service') }}
+{{ KILL_CONTEXT_CONFIG_ITEMS('Service') }}
+Socket.ListenStream, config_parse_socket_listen, SOCKET_SOCKET, 0
+Socket.ListenDatagram, config_parse_socket_listen, SOCKET_SOCKET, 0
+Socket.ListenSequentialPacket, config_parse_socket_listen, SOCKET_SOCKET, 0
+Socket.ListenFIFO, config_parse_socket_listen, SOCKET_FIFO, 0
+Socket.ListenNetlink, config_parse_socket_listen, SOCKET_SOCKET, 0
+Socket.ListenSpecial, config_parse_socket_listen, SOCKET_SPECIAL, 0
+Socket.ListenMessageQueue, config_parse_socket_listen, SOCKET_MQUEUE, 0
+Socket.ListenUSBFunction, config_parse_socket_listen, SOCKET_USB_FUNCTION, 0
+Socket.SocketProtocol, config_parse_socket_protocol, 0, offsetof(Socket, socket_protocol)
+Socket.BindIPv6Only, config_parse_socket_bind, 0, offsetof(Socket, bind_ipv6_only)
+Socket.Backlog, config_parse_unsigned, 0, offsetof(Socket, backlog)
+Socket.BindToDevice, config_parse_socket_bindtodevice, 0, 0
+Socket.ExecStartPre, config_parse_exec, SOCKET_EXEC_START_PRE, offsetof(Socket, exec_command)
+Socket.ExecStartPost, config_parse_exec, SOCKET_EXEC_START_POST, offsetof(Socket, exec_command)
+Socket.ExecStopPre, config_parse_exec, SOCKET_EXEC_STOP_PRE, offsetof(Socket, exec_command)
+Socket.ExecStopPost, config_parse_exec, SOCKET_EXEC_STOP_POST, offsetof(Socket, exec_command)
+Socket.TimeoutSec, config_parse_sec_fix_0, 0, offsetof(Socket, timeout_usec)
+Socket.SocketUser, config_parse_user_group_compat, 0, offsetof(Socket, user)
+Socket.SocketGroup, config_parse_user_group_compat, 0, offsetof(Socket, group)
+Socket.SocketMode, config_parse_mode, 0, offsetof(Socket, socket_mode)
+Socket.DirectoryMode, config_parse_mode, 0, offsetof(Socket, directory_mode)
+Socket.Accept, config_parse_bool, 0, offsetof(Socket, accept)
+Socket.FlushPending, config_parse_bool, 0, offsetof(Socket, flush_pending)
+Socket.Writable, config_parse_bool, 0, offsetof(Socket, writable)
+Socket.MaxConnections, config_parse_unsigned, 0, offsetof(Socket, max_connections)
+Socket.MaxConnectionsPerSource, config_parse_unsigned, 0, offsetof(Socket, max_connections_per_source)
+Socket.KeepAlive, config_parse_bool, 0, offsetof(Socket, keep_alive)
+Socket.KeepAliveTimeSec, config_parse_sec, 0, offsetof(Socket, keep_alive_time)
+Socket.KeepAliveIntervalSec, config_parse_sec, 0, offsetof(Socket, keep_alive_interval)
+Socket.KeepAliveProbes, config_parse_unsigned, 0, offsetof(Socket, keep_alive_cnt)
+Socket.DeferAcceptSec, config_parse_sec, 0, offsetof(Socket, defer_accept)
+Socket.NoDelay, config_parse_bool, 0, offsetof(Socket, no_delay)
+Socket.Priority, config_parse_int, 0, offsetof(Socket, priority)
+Socket.ReceiveBuffer, config_parse_iec_size, 0, offsetof(Socket, receive_buffer)
+Socket.SendBuffer, config_parse_iec_size, 0, offsetof(Socket, send_buffer)
+Socket.IPTOS, config_parse_ip_tos, 0, offsetof(Socket, ip_tos)
+Socket.IPTTL, config_parse_int, 0, offsetof(Socket, ip_ttl)
+Socket.Mark, config_parse_int, 0, offsetof(Socket, mark)
+Socket.PipeSize, config_parse_iec_size, 0, offsetof(Socket, pipe_size)
+Socket.FreeBind, config_parse_bool, 0, offsetof(Socket, free_bind)
+Socket.Transparent, config_parse_bool, 0, offsetof(Socket, transparent)
+Socket.Broadcast, config_parse_bool, 0, offsetof(Socket, broadcast)
+Socket.PassCredentials, config_parse_bool, 0, offsetof(Socket, pass_cred)
+Socket.PassSecurity, config_parse_bool, 0, offsetof(Socket, pass_sec)
+Socket.PassPacketInfo, config_parse_bool, 0, offsetof(Socket, pass_pktinfo)
+Socket.Timestamping, config_parse_socket_timestamping, 0, offsetof(Socket, timestamping)
+Socket.TCPCongestion, config_parse_string, 0, offsetof(Socket, tcp_congestion)
+Socket.ReusePort, config_parse_bool, 0, offsetof(Socket, reuse_port)
+Socket.MessageQueueMaxMessages, config_parse_long, 0, offsetof(Socket, mq_maxmsg)
+Socket.MessageQueueMessageSize, config_parse_long, 0, offsetof(Socket, mq_msgsize)
+Socket.RemoveOnStop, config_parse_bool, 0, offsetof(Socket, remove_on_stop)
+Socket.Symlinks, config_parse_unit_path_strv_printf, 0, offsetof(Socket, symlinks)
+Socket.FileDescriptorName, config_parse_fdname, 0, 0
+Socket.Service, config_parse_socket_service, 0, 0
+Socket.TriggerLimitIntervalSec, config_parse_sec, 0, offsetof(Socket, trigger_limit.interval)
+Socket.TriggerLimitBurst, config_parse_unsigned, 0, offsetof(Socket, trigger_limit.burst)
+{% if ENABLE_SMACK %}
+Socket.SmackLabel, config_parse_unit_string_printf, 0, offsetof(Socket, smack)
+Socket.SmackLabelIPIn, config_parse_unit_string_printf, 0, offsetof(Socket, smack_ip_in)
+Socket.SmackLabelIPOut, config_parse_unit_string_printf, 0, offsetof(Socket, smack_ip_out)
+{% else %}
+Socket.SmackLabel, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+Socket.SmackLabelIPIn, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+Socket.SmackLabelIPOut, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{% endif %}
+{% if HAVE_SELINUX %}
+Socket.SELinuxContextFromNet, config_parse_bool, 0, offsetof(Socket, selinux_context_from_net)
+{% else %}
+Socket.SELinuxContextFromNet, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{% endif %}
+{{ EXEC_CONTEXT_CONFIG_ITEMS('Socket') }}
+{{ CGROUP_CONTEXT_CONFIG_ITEMS('Socket') }}
+{{ KILL_CONTEXT_CONFIG_ITEMS('Socket') }}
+Mount.What, config_parse_unit_string_printf, 0, offsetof(Mount, parameters_fragment.what)
+Mount.Where, config_parse_unit_path_printf, 0, offsetof(Mount, where)
+Mount.Options, config_parse_unit_string_printf, 0, offsetof(Mount, parameters_fragment.options)
+Mount.Type, config_parse_unit_string_printf, 0, offsetof(Mount, parameters_fragment.fstype)
+Mount.TimeoutSec, config_parse_sec_fix_0, 0, offsetof(Mount, timeout_usec)
+Mount.DirectoryMode, config_parse_mode, 0, offsetof(Mount, directory_mode)
+Mount.SloppyOptions, config_parse_bool, 0, offsetof(Mount, sloppy_options)
+Mount.LazyUnmount, config_parse_bool, 0, offsetof(Mount, lazy_unmount)
+Mount.ForceUnmount, config_parse_bool, 0, offsetof(Mount, force_unmount)
+Mount.ReadWriteOnly, config_parse_bool, 0, offsetof(Mount, read_write_only)
+{{ EXEC_CONTEXT_CONFIG_ITEMS('Mount') }}
+{{ CGROUP_CONTEXT_CONFIG_ITEMS('Mount') }}
+{{ KILL_CONTEXT_CONFIG_ITEMS('Mount') }}
+Automount.Where, config_parse_unit_path_printf, 0, offsetof(Automount, where)
+Automount.ExtraOptions, config_parse_unit_string_printf, 0, offsetof(Automount, extra_options)
+Automount.DirectoryMode, config_parse_mode, 0, offsetof(Automount, directory_mode)
+Automount.TimeoutIdleSec, config_parse_sec_fix_0, 0, offsetof(Automount, timeout_idle_usec)
+Swap.What, config_parse_unit_path_printf, 0, offsetof(Swap, parameters_fragment.what)
+Swap.Priority, config_parse_swap_priority, 0, 0
+Swap.Options, config_parse_unit_string_printf, 0, offsetof(Swap, parameters_fragment.options)
+Swap.TimeoutSec, config_parse_sec_fix_0, 0, offsetof(Swap, timeout_usec)
+{{ EXEC_CONTEXT_CONFIG_ITEMS('Swap') }}
+{{ CGROUP_CONTEXT_CONFIG_ITEMS('Swap') }}
+{{ KILL_CONTEXT_CONFIG_ITEMS('Swap') }}
+Timer.OnCalendar, config_parse_timer, TIMER_CALENDAR, 0
+Timer.OnActiveSec, config_parse_timer, TIMER_ACTIVE, 0
+Timer.OnBootSec, config_parse_timer, TIMER_BOOT, 0
+Timer.OnStartupSec, config_parse_timer, TIMER_STARTUP, 0
+Timer.OnUnitActiveSec, config_parse_timer, TIMER_UNIT_ACTIVE, 0
+Timer.OnUnitInactiveSec, config_parse_timer, TIMER_UNIT_INACTIVE, 0
+Timer.OnClockChange, config_parse_bool, 0, offsetof(Timer, on_clock_change)
+Timer.OnTimezoneChange, config_parse_bool, 0, offsetof(Timer, on_timezone_change)
+Timer.Persistent, config_parse_bool, 0, offsetof(Timer, persistent)
+Timer.WakeSystem, config_parse_bool, 0, offsetof(Timer, wake_system)
+Timer.RemainAfterElapse, config_parse_bool, 0, offsetof(Timer, remain_after_elapse)
+Timer.FixedRandomDelay, config_parse_bool, 0, offsetof(Timer, fixed_random_delay)
+Timer.AccuracySec, config_parse_sec, 0, offsetof(Timer, accuracy_usec)
+Timer.RandomizedDelaySec, config_parse_sec, 0, offsetof(Timer, random_usec)
+Timer.Unit, config_parse_trigger_unit, 0, 0
+Path.PathExists, config_parse_path_spec, 0, 0
+Path.PathExistsGlob, config_parse_path_spec, 0, 0
+Path.PathChanged, config_parse_path_spec, 0, 0
+Path.PathModified, config_parse_path_spec, 0, 0
+Path.DirectoryNotEmpty, config_parse_path_spec, 0, 0
+Path.Unit, config_parse_trigger_unit, 0, 0
+Path.MakeDirectory, config_parse_bool, 0, offsetof(Path, make_directory)
+Path.DirectoryMode, config_parse_mode, 0, offsetof(Path, directory_mode)
+Path.TriggerLimitIntervalSec, config_parse_sec, 0, offsetof(Path, trigger_limit.interval)
+Path.TriggerLimitBurst, config_parse_unsigned, 0, offsetof(Path, trigger_limit.burst)
+{{ CGROUP_CONTEXT_CONFIG_ITEMS('Slice') }}
+{{ CGROUP_CONTEXT_CONFIG_ITEMS('Scope') }}
+{{ KILL_CONTEXT_CONFIG_ITEMS('Scope') }}
+Scope.RuntimeMaxSec, config_parse_sec, 0, offsetof(Scope, runtime_max_usec)
+Scope.RuntimeRandomizedExtraSec, config_parse_sec, 0, offsetof(Scope, runtime_rand_extra_usec)
+Scope.TimeoutStopSec, config_parse_sec, 0, offsetof(Scope, timeout_stop_usec)
+Scope.OOMPolicy, config_parse_oom_policy, 0, offsetof(Scope, oom_policy)
+{# The [Install] section is ignored here #}
+Install.Alias, NULL, 0, 0
+Install.WantedBy, NULL, 0, 0
+Install.RequiredBy, NULL, 0, 0
+Install.Also, NULL, 0, 0
+Install.DefaultInstance, NULL, 0, 0
diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c
new file mode 100644
index 0000000..1001faa
--- /dev/null
+++ b/src/core/load-fragment.c
@@ -0,0 +1,6505 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+/***
+ Copyright © 2012 Holger Hans Peter Freyther
+***/
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/fs.h>
+#include <linux/oom.h>
+#if HAVE_SECCOMP
+#include <seccomp.h>
+#endif
+#include <sched.h>
+#include <sys/resource.h>
+
+#include "sd-messages.h"
+
+#include "af-list.h"
+#include "all-units.h"
+#include "alloc-util.h"
+#include "bpf-firewall.h"
+#include "bpf-lsm.h"
+#include "bpf-program.h"
+#include "bpf-socket-bind.h"
+#include "bus-error.h"
+#include "bus-internal.h"
+#include "bus-util.h"
+#include "cap-list.h"
+#include "capability-util.h"
+#include "cgroup-setup.h"
+#include "conf-parser.h"
+#include "core-varlink.h"
+#include "cpu-set-util.h"
+#include "creds-util.h"
+#include "env-util.h"
+#include "errno-list.h"
+#include "escape.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "hexdecoct.h"
+#include "io-util.h"
+#include "ioprio-util.h"
+#include "ip-protocol-list.h"
+#include "journal-file.h"
+#include "limits-util.h"
+#include "load-fragment.h"
+#include "log.h"
+#include "missing_ioprio.h"
+#include "mountpoint-util.h"
+#include "nulstr-util.h"
+#include "parse-helpers.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "percent-util.h"
+#include "process-util.h"
+#if HAVE_SECCOMP
+#include "seccomp-util.h"
+#endif
+#include "securebits-util.h"
+#include "selinux-util.h"
+#include "signal-util.h"
+#include "socket-netlink.h"
+#include "specifier.h"
+#include "stat-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "syslog-util.h"
+#include "time-util.h"
+#include "unit-name.h"
+#include "unit-printf.h"
+#include "user-util.h"
+#include "utf8.h"
+#include "web-util.h"
+
+static int parse_socket_protocol(const char *s) {
+ int r;
+
+ r = parse_ip_protocol(s);
+ if (r < 0)
+ return r;
+ if (!IN_SET(r, IPPROTO_UDPLITE, IPPROTO_SCTP))
+ return -EPROTONOSUPPORT;
+
+ return r;
+}
+
+int parse_crash_chvt(const char *value, int *data) {
+ int b;
+
+ if (safe_atoi(value, data) >= 0)
+ return 0;
+
+ b = parse_boolean(value);
+ if (b < 0)
+ return b;
+
+ if (b > 0)
+ *data = 0; /* switch to where kmsg goes */
+ else
+ *data = -1; /* turn off switching */
+
+ return 0;
+}
+
+int parse_confirm_spawn(const char *value, char **console) {
+ char *s;
+ int r;
+
+ r = value ? parse_boolean(value) : 1;
+ if (r == 0) {
+ *console = NULL;
+ return 0;
+ } else if (r > 0) /* on with default tty */
+ s = strdup("/dev/console");
+ else if (is_path(value)) /* on with fully qualified path */
+ s = strdup(value);
+ else /* on with only a tty file name, not a fully qualified path */
+ s = path_join("/dev/", value);
+ if (!s)
+ return -ENOMEM;
+
+ *console = s;
+ return 0;
+}
+
+DEFINE_CONFIG_PARSE(config_parse_socket_protocol, parse_socket_protocol, "Failed to parse socket protocol");
+DEFINE_CONFIG_PARSE(config_parse_exec_secure_bits, secure_bits_from_string, "Failed to parse secure bits");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_collect_mode, collect_mode, CollectMode, "Failed to parse garbage collection mode");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_device_policy, cgroup_device_policy, CGroupDevicePolicy, "Failed to parse device policy");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode, "Failed to parse keyring mode");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_proc, protect_proc, ProtectProc, "Failed to parse /proc/ protection mode");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_proc_subset, proc_subset, ProcSubset, "Failed to parse /proc/ subset mode");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_utmp_mode, exec_utmp_mode, ExecUtmpMode, "Failed to parse utmp mode");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_job_mode, job_mode, JobMode, "Failed to parse job mode");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_notify_access, notify_access, NotifyAccess, "Failed to parse notify access specifier");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_home, protect_home, ProtectHome, "Failed to parse protect home value");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_system, protect_system, ProtectSystem, "Failed to parse protect system value");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_runtime_preserve_mode, exec_preserve_mode, ExecPreserveMode, "Failed to parse runtime directory preserve mode");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_service_type, service_type, ServiceType, "Failed to parse service type");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_service_exit_type, service_exit_type, ServiceExitType, "Failed to parse service exit type");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_service_restart, service_restart, ServiceRestart, "Failed to parse service restart specifier");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_service_timeout_failure_mode, service_timeout_failure_mode, ServiceTimeoutFailureMode, "Failed to parse timeout failure mode");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_socket_bind, socket_address_bind_ipv6_only_or_bool, SocketAddressBindIPv6Only, "Failed to parse bind IPv6 only value");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_oom_policy, oom_policy, OOMPolicy, "Failed to parse OOM policy");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_managed_oom_preference, managed_oom_preference, ManagedOOMPreference, "Failed to parse ManagedOOMPreference=");
+DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_ip_tos, ip_tos, int, -1, "Failed to parse IP TOS value");
+DEFINE_CONFIG_PARSE_PTR(config_parse_blockio_weight, cg_blkio_weight_parse, uint64_t, "Invalid block IO weight");
+DEFINE_CONFIG_PARSE_PTR(config_parse_cg_weight, cg_weight_parse, uint64_t, "Invalid weight");
+DEFINE_CONFIG_PARSE_PTR(config_parse_cg_cpu_weight, cg_cpu_weight_parse, uint64_t, "Invalid CPU weight");
+static DEFINE_CONFIG_PARSE_PTR(config_parse_cpu_shares_internal, cg_cpu_shares_parse, uint64_t, "Invalid CPU shares");
+DEFINE_CONFIG_PARSE_PTR(config_parse_exec_mount_flags, mount_propagation_flags_from_string, unsigned long, "Failed to parse mount flag");
+DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_numa_policy, mpol, int, -1, "Invalid NUMA policy type");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_status_unit_format, status_unit_format, StatusUnitFormat, "Failed to parse status unit format");
+DEFINE_CONFIG_PARSE_ENUM_FULL(config_parse_socket_timestamping, socket_timestamping_from_string_harder, SocketTimestamping, "Failed to parse timestamping precision");
+
+int config_parse_cpu_shares(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+
+ log_syntax(unit, LOG_WARNING, filename, line, 0,
+ "Unit uses %s=; please use CPUWeight= instead. Support for %s= will be removed soon.",
+ lvalue, lvalue);
+
+ return config_parse_cpu_shares_internal(unit, filename, line, section, section_line, lvalue, ltype, rvalue, data, userdata);
+}
+
+bool contains_instance_specifier_superset(const char *s) {
+ const char *p, *q;
+ bool percent = false;
+
+ assert(s);
+
+ p = strchr(s, '@');
+ if (!p)
+ return false;
+
+ p++; /* Skip '@' */
+
+ q = strrchr(p, '.');
+ if (!q)
+ q = p + strlen(p);
+
+ /* If the string is just the instance specifier, it's not a superset of the instance. */
+ if (memcmp_nn(p, q - p, "%i", strlen("%i")) == 0)
+ return false;
+
+ /* %i, %n and %N all expand to the instance or a superset of it. */
+ for (; p < q; p++) {
+ if (*p == '%')
+ percent = !percent;
+ else if (percent) {
+ if (IN_SET(*p, 'n', 'N', 'i'))
+ return true;
+ percent = false;
+ }
+ }
+
+ return false;
+}
+
+/* `name` is the rendered version of `format` via `unit_printf` or similar functions. */
+int unit_is_likely_recursive_template_dependency(Unit *u, const char *name, const char *format) {
+ const char *fragment_path;
+ int r;
+
+ assert(u);
+ assert(name);
+
+ /* If a template unit has a direct dependency on itself that includes the unit instance as part of
+ * the template instance via a unit specifier (%i, %n or %N), this will almost certainly lead to
+ * infinite recursion as systemd will keep instantiating new instances of the template unit.
+ * https://github.com/systemd/systemd/issues/17602 shows a good example of how this can happen in
+ * practice. To guard against this, we check for templates that depend on themselves and have the
+ * instantiated unit instance included as part of the template instance of the dependency via a
+ * specifier.
+ *
+ * For example, if systemd-notify@.service depends on systemd-notify@%n.service, this will result in
+ * infinite recursion.
+ */
+
+ if (!unit_name_is_valid(name, UNIT_NAME_INSTANCE))
+ return false;
+
+ if (!unit_name_prefix_equal(u->id, name))
+ return false;
+
+ if (u->type != unit_name_to_type(name))
+ return false;
+
+ r = unit_file_find_fragment(u->manager->unit_id_map, u->manager->unit_name_map, name, &fragment_path, NULL);
+ if (r < 0)
+ return r;
+
+ /* Fragment paths should also be equal as a custom fragment for a specific template instance
+ * wouldn't necessarily lead to infinite recursion. */
+ if (!path_equal_ptr(u->fragment_path, fragment_path))
+ return false;
+
+ if (!contains_instance_specifier_superset(format))
+ return false;
+
+ return true;
+}
+
+int config_parse_unit_deps(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ UnitDependency d = ltype;
+ Unit *u = userdata;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *word = NULL, *k = NULL;
+ int r;
+
+ r = extract_first_word(&p, &word, NULL, EXTRACT_RETAIN_ESCAPE);
+ if (r == 0)
+ return 0;
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ r = unit_name_printf(u, word, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", word);
+ continue;
+ }
+
+ r = unit_is_likely_recursive_template_dependency(u, k, word);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to determine if '%s' is a recursive dependency, ignoring: %m", k);
+ continue;
+ }
+ if (r > 0) {
+ log_syntax(unit, LOG_DEBUG, filename, line, 0,
+ "Dropping dependency %s=%s that likely leads to infinite recursion.",
+ unit_dependency_to_string(d), word);
+ continue;
+ }
+
+ r = unit_add_dependency_by_name(u, d, k, true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to add dependency on %s, ignoring: %m", k);
+ }
+}
+
+int config_parse_obsolete_unit_deps(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ log_syntax(unit, LOG_WARNING, filename, line, 0,
+ "Unit dependency type %s= is obsolete, replacing by %s=, please update your unit file", lvalue, unit_dependency_to_string(ltype));
+
+ return config_parse_unit_deps(unit, filename, line, section, section_line, lvalue, ltype, rvalue, data, userdata);
+}
+
+int config_parse_unit_string_printf(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *k = NULL;
+ const Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ r = unit_full_printf(u, rvalue, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
+ return 0;
+ }
+
+ return config_parse_string(unit, filename, line, section, section_line, lvalue, ltype, k, data, userdata);
+}
+
+int config_parse_unit_strv_printf(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ const Unit *u = ASSERT_PTR(userdata);
+ _cleanup_free_ char *k = NULL;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ r = unit_full_printf(u, rvalue, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
+ return 0;
+ }
+
+ return config_parse_strv(unit, filename, line, section, section_line, lvalue, ltype, k, data, userdata);
+}
+
+int config_parse_unit_path_printf(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *k = NULL;
+ const Unit *u = ASSERT_PTR(userdata);
+ int r;
+ bool fatal = ltype;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ r = unit_path_printf(u, rvalue, &k);
+ if (r < 0) {
+ log_syntax(unit, fatal ? LOG_ERR : LOG_WARNING, filename, line, r,
+ "Failed to resolve unit specifiers in '%s'%s: %m",
+ rvalue, fatal ? "" : ", ignoring");
+ return fatal ? -ENOEXEC : 0;
+ }
+
+ return config_parse_path(unit, filename, line, section, section_line, lvalue, ltype, k, data, userdata);
+}
+
+int config_parse_colon_separated_paths(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+ char ***sv = ASSERT_PTR(data);
+ const Unit *u = userdata;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment resets the list */
+ *sv = strv_free(*sv);
+ return 0;
+ }
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *word = NULL, *k = NULL;
+
+ r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to extract first word, ignoring: %s", rvalue);
+ return 0;
+ }
+ if (r == 0)
+ break;
+
+ r = unit_path_printf(u, word, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to resolve unit specifiers in '%s', ignoring: %m", word);
+ return 0;
+ }
+
+ r = path_simplify_and_warn(k, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+ if (r < 0)
+ return 0;
+
+ r = strv_consume(sv, TAKE_PTR(k));
+ if (r < 0)
+ return log_oom();
+ }
+
+ return 0;
+}
+
+int config_parse_unit_path_strv_printf(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ char ***x = data;
+ const Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ *x = strv_free(*x);
+ return 0;
+ }
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *word = NULL, *k = NULL;
+
+ r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE);
+ if (r == 0)
+ return 0;
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Invalid syntax, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ r = unit_path_printf(u, word, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to resolve unit specifiers in '%s', ignoring: %m", word);
+ return 0;
+ }
+
+ r = path_simplify_and_warn(k, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+ if (r < 0)
+ return 0;
+
+ r = strv_consume(x, TAKE_PTR(k));
+ if (r < 0)
+ return log_oom();
+ }
+}
+
+static int patch_var_run(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *lvalue,
+ char **path) {
+
+ const char *e;
+ char *z;
+
+ e = path_startswith(*path, "/var/run/");
+ if (!e)
+ return 0;
+
+ z = path_join("/run/", e);
+ if (!z)
+ return log_oom();
+
+ log_syntax(unit, LOG_NOTICE, filename, line, 0,
+ "%s= references a path below legacy directory /var/run/, updating %s → %s; "
+ "please update the unit file accordingly.", lvalue, *path, z);
+
+ free_and_replace(*path, z);
+
+ return 1;
+}
+
+int config_parse_socket_listen(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ SocketPort *p = NULL;
+ SocketPort *tail;
+ Socket *s;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+ assert(data);
+
+ s = SOCKET(data);
+
+ if (isempty(rvalue)) {
+ /* An empty assignment removes all ports */
+ socket_free_ports(s);
+ return 0;
+ }
+
+ p = new0(SocketPort, 1);
+ if (!p)
+ return log_oom();
+
+ if (ltype != SOCKET_SOCKET) {
+ _cleanup_free_ char *k = NULL;
+
+ r = unit_path_printf(UNIT(s), rvalue, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
+ return 0;
+ }
+
+ r = path_simplify_and_warn(k, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+ if (r < 0)
+ return 0;
+
+ if (ltype == SOCKET_FIFO) {
+ r = patch_var_run(unit, filename, line, lvalue, &k);
+ if (r < 0)
+ return r;
+ }
+
+ free_and_replace(p->path, k);
+ p->type = ltype;
+
+ } else if (streq(lvalue, "ListenNetlink")) {
+ _cleanup_free_ char *k = NULL;
+
+ r = unit_path_printf(UNIT(s), rvalue, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
+ return 0;
+ }
+
+ r = socket_address_parse_netlink(&p->address, k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse address value in '%s', ignoring: %m", k);
+ return 0;
+ }
+
+ p->type = SOCKET_SOCKET;
+
+ } else {
+ _cleanup_free_ char *k = NULL;
+
+ r = unit_path_printf(UNIT(s), rvalue, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
+ return 0;
+ }
+
+ if (k[0] == '/') { /* Only for AF_UNIX file system sockets… */
+ r = patch_var_run(unit, filename, line, lvalue, &k);
+ if (r < 0)
+ return r;
+ }
+
+ r = socket_address_parse_and_warn(&p->address, k);
+ if (r < 0) {
+ if (r != -EAFNOSUPPORT)
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse address value in '%s', ignoring: %m", k);
+ return 0;
+ }
+
+ if (streq(lvalue, "ListenStream"))
+ p->address.type = SOCK_STREAM;
+ else if (streq(lvalue, "ListenDatagram"))
+ p->address.type = SOCK_DGRAM;
+ else {
+ assert(streq(lvalue, "ListenSequentialPacket"));
+ p->address.type = SOCK_SEQPACKET;
+ }
+
+ if (socket_address_family(&p->address) != AF_UNIX && p->address.type == SOCK_SEQPACKET) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Address family not supported, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ p->type = SOCKET_SOCKET;
+ }
+
+ p->fd = -1;
+ p->auxiliary_fds = NULL;
+ p->n_auxiliary_fds = 0;
+ p->socket = s;
+
+ LIST_FIND_TAIL(port, s->ports, tail);
+ LIST_INSERT_AFTER(port, s->ports, tail, p);
+
+ p = NULL;
+
+ return 0;
+}
+
+int config_parse_exec_nice(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = ASSERT_PTR(data);
+ int priority, r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ c->nice_set = false;
+ return 0;
+ }
+
+ r = parse_nice(rvalue, &priority);
+ if (r < 0) {
+ if (r == -ERANGE)
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Nice priority out of range, ignoring: %s", rvalue);
+ else
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse nice priority '%s', ignoring: %m", rvalue);
+ return 0;
+ }
+
+ c->nice = priority;
+ c->nice_set = true;
+
+ return 0;
+}
+
+int config_parse_exec_oom_score_adjust(
+ const char* unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = ASSERT_PTR(data);
+ int oa, r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ c->oom_score_adjust_set = false;
+ return 0;
+ }
+
+ r = parse_oom_score_adjust(rvalue, &oa);
+ if (r < 0) {
+ if (r == -ERANGE)
+ log_syntax(unit, LOG_WARNING, filename, line, r, "OOM score adjust value out of range, ignoring: %s", rvalue);
+ else
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse the OOM score adjust value '%s', ignoring: %m", rvalue);
+ return 0;
+ }
+
+ c->oom_score_adjust = oa;
+ c->oom_score_adjust_set = true;
+
+ return 0;
+}
+
+int config_parse_exec_coredump_filter(
+ const char* unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = ASSERT_PTR(data);
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ c->coredump_filter = 0;
+ c->coredump_filter_set = false;
+ return 0;
+ }
+
+ uint64_t f;
+ r = coredump_filter_mask_from_string(rvalue, &f);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to parse the CoredumpFilter=%s, ignoring: %m", rvalue);
+ return 0;
+ }
+
+ c->coredump_filter |= f;
+ c->coredump_filter_set = true;
+ return 0;
+}
+
+int config_parse_kill_mode(
+ const char* unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ KillMode *k = data, m;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+ assert(data);
+
+ if (isempty(rvalue)) {
+ *k = KILL_CONTROL_GROUP;
+ return 0;
+ }
+
+ m = kill_mode_from_string(rvalue);
+ if (m < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, m,
+ "Failed to parse kill mode specification, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ if (m == KILL_NONE)
+ log_syntax(unit, LOG_WARNING, filename, line, 0,
+ "Unit uses KillMode=none. "
+ "This is unsafe, as it disables systemd's process lifecycle management for the service. "
+ "Please update the service to use a safer KillMode=, such as 'mixed' or 'control-group'. "
+ "Support for KillMode=none is deprecated and will eventually be removed.");
+
+ *k = m;
+ return 0;
+}
+
+int config_parse_exec(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecCommand **e = ASSERT_PTR(data);
+ const Unit *u = userdata;
+ const char *p;
+ bool semicolon;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ e += ltype;
+
+ if (isempty(rvalue)) {
+ /* An empty assignment resets the list */
+ *e = exec_command_free_list(*e);
+ return 0;
+ }
+
+ p = rvalue;
+ do {
+ _cleanup_free_ char *path = NULL, *firstword = NULL;
+ ExecCommandFlags flags = 0;
+ bool ignore = false, separate_argv0 = false;
+ _cleanup_free_ ExecCommand *nce = NULL;
+ _cleanup_strv_free_ char **n = NULL;
+ size_t nlen = 0;
+ const char *f;
+
+ semicolon = false;
+
+ r = extract_first_word_and_warn(&p, &firstword, NULL, EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE, unit, filename, line, rvalue);
+ if (r <= 0)
+ return 0;
+
+ /* A lone ";" is a separator. Let's make sure we don't treat it as an executable name. */
+ if (streq(firstword, ";")) {
+ semicolon = true;
+ continue;
+ }
+
+ f = firstword;
+ for (;;) {
+ /* We accept an absolute path as first argument. If it's prefixed with - and the path doesn't
+ * exist, we ignore it instead of erroring out; if it's prefixed with @, we allow overriding of
+ * argv[0]; if it's prefixed with :, we will not do environment variable substitution;
+ * if it's prefixed with +, it will be run with full privileges and no sandboxing; if
+ * it's prefixed with '!' we apply sandboxing, but do not change user/group credentials; if
+ * it's prefixed with '!!', then we apply user/group credentials if the kernel supports ambient
+ * capabilities -- if it doesn't we don't apply the credentials themselves, but do apply most
+ * other sandboxing, with some special exceptions for changing UID.
+ *
+ * The idea is that '!!' may be used to write services that can take benefit of systemd's
+ * UID/GID dropping if the kernel supports ambient creds, but provide an automatic fallback to
+ * privilege dropping within the daemon if the kernel does not offer that. */
+
+ if (*f == '-' && !(flags & EXEC_COMMAND_IGNORE_FAILURE)) {
+ flags |= EXEC_COMMAND_IGNORE_FAILURE;
+ ignore = true;
+ } else if (*f == '@' && !separate_argv0)
+ separate_argv0 = true;
+ else if (*f == ':' && !(flags & EXEC_COMMAND_NO_ENV_EXPAND))
+ flags |= EXEC_COMMAND_NO_ENV_EXPAND;
+ else if (*f == '+' && !(flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID|EXEC_COMMAND_AMBIENT_MAGIC)))
+ flags |= EXEC_COMMAND_FULLY_PRIVILEGED;
+ else if (*f == '!' && !(flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID|EXEC_COMMAND_AMBIENT_MAGIC)))
+ flags |= EXEC_COMMAND_NO_SETUID;
+ else if (*f == '!' && !(flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_AMBIENT_MAGIC))) {
+ flags &= ~EXEC_COMMAND_NO_SETUID;
+ flags |= EXEC_COMMAND_AMBIENT_MAGIC;
+ } else
+ break;
+ f++;
+ }
+
+ r = unit_path_printf(u, f, &path);
+ if (r < 0) {
+ log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, r,
+ "Failed to resolve unit specifiers in '%s'%s: %m",
+ f, ignore ? ", ignoring" : "");
+ return ignore ? 0 : -ENOEXEC;
+ }
+
+ if (isempty(path)) {
+ /* First word is either "-" or "@" with no command. */
+ log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0,
+ "Empty path in command line%s: '%s'",
+ ignore ? ", ignoring" : "", rvalue);
+ return ignore ? 0 : -ENOEXEC;
+ }
+ if (!string_is_safe(path)) {
+ log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0,
+ "Executable name contains special characters%s: %s",
+ ignore ? ", ignoring" : "", path);
+ return ignore ? 0 : -ENOEXEC;
+ }
+ if (endswith(path, "/")) {
+ log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0,
+ "Executable path specifies a directory%s: %s",
+ ignore ? ", ignoring" : "", path);
+ return ignore ? 0 : -ENOEXEC;
+ }
+
+ if (!(path_is_absolute(path) ? path_is_valid(path) : filename_is_valid(path))) {
+ log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0,
+ "Neither a valid executable name nor an absolute path%s: %s",
+ ignore ? ", ignoring" : "", path);
+ return ignore ? 0 : -ENOEXEC;
+ }
+
+ if (!separate_argv0) {
+ char *w = NULL;
+
+ if (!GREEDY_REALLOC0(n, nlen + 2))
+ return log_oom();
+
+ w = strdup(path);
+ if (!w)
+ return log_oom();
+ n[nlen++] = w;
+ n[nlen] = NULL;
+ }
+
+ path_simplify(path);
+
+ while (!isempty(p)) {
+ _cleanup_free_ char *word = NULL, *resolved = NULL;
+
+ /* Check explicitly for an unquoted semicolon as
+ * command separator token. */
+ if (p[0] == ';' && (!p[1] || strchr(WHITESPACE, p[1]))) {
+ p++;
+ p += strspn(p, WHITESPACE);
+ semicolon = true;
+ break;
+ }
+
+ /* Check for \; explicitly, to not confuse it with \\; or "\;" or "\\;" etc.
+ * extract_first_word() would return the same for all of those. */
+ if (p[0] == '\\' && p[1] == ';' && (!p[2] || strchr(WHITESPACE, p[2]))) {
+ char *w;
+
+ p += 2;
+ p += strspn(p, WHITESPACE);
+
+ if (!GREEDY_REALLOC0(n, nlen + 2))
+ return log_oom();
+
+ w = strdup(";");
+ if (!w)
+ return log_oom();
+ n[nlen++] = w;
+ n[nlen] = NULL;
+ continue;
+ }
+
+ r = extract_first_word_and_warn(&p, &word, NULL, EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE, unit, filename, line, rvalue);
+ if (r == 0)
+ break;
+ if (r < 0)
+ return ignore ? 0 : -ENOEXEC;
+
+ r = unit_full_printf(u, word, &resolved);
+ if (r < 0) {
+ log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, r,
+ "Failed to resolve unit specifiers in %s%s: %m",
+ word, ignore ? ", ignoring" : "");
+ return ignore ? 0 : -ENOEXEC;
+ }
+
+ if (!GREEDY_REALLOC(n, nlen + 2))
+ return log_oom();
+
+ n[nlen++] = TAKE_PTR(resolved);
+ n[nlen] = NULL;
+ }
+
+ if (!n || !n[0]) {
+ log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0,
+ "Empty executable name or zeroeth argument%s: %s",
+ ignore ? ", ignoring" : "", rvalue);
+ return ignore ? 0 : -ENOEXEC;
+ }
+
+ nce = new0(ExecCommand, 1);
+ if (!nce)
+ return log_oom();
+
+ nce->argv = TAKE_PTR(n);
+ nce->path = TAKE_PTR(path);
+ nce->flags = flags;
+
+ exec_command_append_list(e, nce);
+
+ /* Do not _cleanup_free_ these. */
+ nce = NULL;
+
+ rvalue = p;
+ } while (semicolon);
+
+ return 0;
+}
+
+int config_parse_socket_bindtodevice(
+ const char* unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ Socket *s = ASSERT_PTR(data);
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue) || streq(rvalue, "*")) {
+ s->bind_to_device = mfree(s->bind_to_device);
+ return 0;
+ }
+
+ if (!ifname_valid(rvalue)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid interface name, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ return free_and_strdup_warn(&s->bind_to_device, rvalue);
+}
+
+int config_parse_exec_input(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = ASSERT_PTR(data);
+ const Unit *u = userdata;
+ const char *n;
+ ExecInput ei;
+ int r;
+
+ assert(filename);
+ assert(line);
+ assert(rvalue);
+
+ n = startswith(rvalue, "fd:");
+ if (n) {
+ _cleanup_free_ char *resolved = NULL;
+
+ r = unit_fd_printf(u, n, &resolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", n);
+ return 0;
+ }
+
+ if (isempty(resolved))
+ resolved = mfree(resolved);
+ else if (!fdname_is_valid(resolved)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid file descriptor name, ignoring: %s", resolved);
+ return 0;
+ }
+
+ free_and_replace(c->stdio_fdname[STDIN_FILENO], resolved);
+
+ ei = EXEC_INPUT_NAMED_FD;
+
+ } else if ((n = startswith(rvalue, "file:"))) {
+ _cleanup_free_ char *resolved = NULL;
+
+ r = unit_path_printf(u, n, &resolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", n);
+ return 0;
+ }
+
+ r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE | PATH_CHECK_FATAL, unit, filename, line, lvalue);
+ if (r < 0)
+ return 0;
+
+ free_and_replace(c->stdio_file[STDIN_FILENO], resolved);
+
+ ei = EXEC_INPUT_FILE;
+
+ } else {
+ ei = exec_input_from_string(rvalue);
+ if (ei < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, ei, "Failed to parse input specifier, ignoring: %s", rvalue);
+ return 0;
+ }
+ }
+
+ c->std_input = ei;
+ return 0;
+}
+
+int config_parse_exec_input_text(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *unescaped = NULL, *resolved = NULL;
+ ExecContext *c = ASSERT_PTR(data);
+ const Unit *u = userdata;
+ int r;
+
+ assert(filename);
+ assert(line);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Reset if the empty string is assigned */
+ c->stdin_data = mfree(c->stdin_data);
+ c->stdin_data_size = 0;
+ return 0;
+ }
+
+ ssize_t l = cunescape(rvalue, 0, &unescaped);
+ if (l < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, l,
+ "Failed to decode C escaped text '%s', ignoring: %m", rvalue);
+ return 0;
+ }
+
+ r = unit_full_printf_full(u, unescaped, EXEC_STDIN_DATA_MAX, &resolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to resolve unit specifiers in '%s', ignoring: %m", unescaped);
+ return 0;
+ }
+
+ size_t sz = strlen(resolved);
+ if (c->stdin_data_size + sz + 1 < c->stdin_data_size || /* check for overflow */
+ c->stdin_data_size + sz + 1 > EXEC_STDIN_DATA_MAX) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0,
+ "Standard input data too large (%zu), maximum of %zu permitted, ignoring.",
+ c->stdin_data_size + sz, (size_t) EXEC_STDIN_DATA_MAX);
+ return 0;
+ }
+
+ void *p = realloc(c->stdin_data, c->stdin_data_size + sz + 1);
+ if (!p)
+ return log_oom();
+
+ *((char*) mempcpy((char*) p + c->stdin_data_size, resolved, sz)) = '\n';
+
+ c->stdin_data = p;
+ c->stdin_data_size += sz + 1;
+
+ return 0;
+}
+
+int config_parse_exec_input_data(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ void *p = NULL;
+ ExecContext *c = ASSERT_PTR(data);
+ size_t sz;
+ void *q;
+ int r;
+
+ assert(filename);
+ assert(line);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Reset if the empty string is assigned */
+ c->stdin_data = mfree(c->stdin_data);
+ c->stdin_data_size = 0;
+ return 0;
+ }
+
+ r = unbase64mem(rvalue, SIZE_MAX, &p, &sz);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to decode base64 data, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ assert(sz > 0);
+
+ if (c->stdin_data_size + sz < c->stdin_data_size || /* check for overflow */
+ c->stdin_data_size + sz > EXEC_STDIN_DATA_MAX) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0,
+ "Standard input data too large (%zu), maximum of %zu permitted, ignoring.",
+ c->stdin_data_size + sz, (size_t) EXEC_STDIN_DATA_MAX);
+ return 0;
+ }
+
+ q = realloc(c->stdin_data, c->stdin_data_size + sz);
+ if (!q)
+ return log_oom();
+
+ memcpy((uint8_t*) q + c->stdin_data_size, p, sz);
+
+ c->stdin_data = q;
+ c->stdin_data_size += sz;
+
+ return 0;
+}
+
+int config_parse_exec_output(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *resolved = NULL;
+ const char *n;
+ ExecContext *c = ASSERT_PTR(data);
+ const Unit *u = userdata;
+ bool obsolete = false;
+ ExecOutput eo;
+ int r;
+
+ assert(filename);
+ assert(line);
+ assert(lvalue);
+ assert(rvalue);
+
+ n = startswith(rvalue, "fd:");
+ if (n) {
+ r = unit_fd_printf(u, n, &resolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s: %m", n);
+ return 0;
+ }
+
+ if (isempty(resolved))
+ resolved = mfree(resolved);
+ else if (!fdname_is_valid(resolved)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid file descriptor name, ignoring: %s", resolved);
+ return 0;
+ }
+
+ eo = EXEC_OUTPUT_NAMED_FD;
+
+ } else if (streq(rvalue, "syslog")) {
+ eo = EXEC_OUTPUT_JOURNAL;
+ obsolete = true;
+
+ } else if (streq(rvalue, "syslog+console")) {
+ eo = EXEC_OUTPUT_JOURNAL_AND_CONSOLE;
+ obsolete = true;
+
+ } else if ((n = startswith(rvalue, "file:"))) {
+
+ r = unit_path_printf(u, n, &resolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", n);
+ return 0;
+ }
+
+ r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE | PATH_CHECK_FATAL, unit, filename, line, lvalue);
+ if (r < 0)
+ return 0;
+
+ eo = EXEC_OUTPUT_FILE;
+
+ } else if ((n = startswith(rvalue, "append:"))) {
+
+ r = unit_path_printf(u, n, &resolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", n);
+ return 0;
+ }
+
+ r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE | PATH_CHECK_FATAL, unit, filename, line, lvalue);
+ if (r < 0)
+ return 0;
+
+ eo = EXEC_OUTPUT_FILE_APPEND;
+
+ } else if ((n = startswith(rvalue, "truncate:"))) {
+
+ r = unit_path_printf(u, n, &resolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", n);
+ return 0;
+ }
+
+ r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE | PATH_CHECK_FATAL, unit, filename, line, lvalue);
+ if (r < 0)
+ return 0;
+
+ eo = EXEC_OUTPUT_FILE_TRUNCATE;
+ } else {
+ eo = exec_output_from_string(rvalue);
+ if (eo < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, eo, "Failed to parse output specifier, ignoring: %s", rvalue);
+ return 0;
+ }
+ }
+
+ if (obsolete)
+ log_syntax(unit, LOG_NOTICE, filename, line, 0,
+ "Standard output type %s is obsolete, automatically updating to %s. Please update your unit file, and consider removing the setting altogether.",
+ rvalue, exec_output_to_string(eo));
+
+ if (streq(lvalue, "StandardOutput")) {
+ if (eo == EXEC_OUTPUT_NAMED_FD)
+ free_and_replace(c->stdio_fdname[STDOUT_FILENO], resolved);
+ else
+ free_and_replace(c->stdio_file[STDOUT_FILENO], resolved);
+
+ c->std_output = eo;
+
+ } else {
+ assert(streq(lvalue, "StandardError"));
+
+ if (eo == EXEC_OUTPUT_NAMED_FD)
+ free_and_replace(c->stdio_fdname[STDERR_FILENO], resolved);
+ else
+ free_and_replace(c->stdio_file[STDERR_FILENO], resolved);
+
+ c->std_error = eo;
+ }
+
+ return 0;
+}
+
+int config_parse_exec_io_class(const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = ASSERT_PTR(data);
+ int x;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ c->ioprio_set = false;
+ c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
+ return 0;
+ }
+
+ x = ioprio_class_from_string(rvalue);
+ if (x < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, x, "Failed to parse IO scheduling class, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ c->ioprio = ioprio_normalize(ioprio_prio_value(x, ioprio_prio_data(c->ioprio)));
+ c->ioprio_set = true;
+
+ return 0;
+}
+
+int config_parse_exec_io_priority(const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = ASSERT_PTR(data);
+ int i, r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ c->ioprio_set = false;
+ c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
+ return 0;
+ }
+
+ r = ioprio_parse_priority(rvalue, &i);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse IO priority, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ c->ioprio = ioprio_normalize(ioprio_prio_value(ioprio_prio_class(c->ioprio), i));
+ c->ioprio_set = true;
+
+ return 0;
+}
+
+int config_parse_exec_cpu_sched_policy(const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = ASSERT_PTR(data);
+ int x;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ c->cpu_sched_set = false;
+ c->cpu_sched_policy = SCHED_OTHER;
+ c->cpu_sched_priority = 0;
+ return 0;
+ }
+
+ x = sched_policy_from_string(rvalue);
+ if (x < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, x, "Failed to parse CPU scheduling policy, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ c->cpu_sched_policy = x;
+ /* Moving to or from real-time policy? We need to adjust the priority */
+ c->cpu_sched_priority = CLAMP(c->cpu_sched_priority, sched_get_priority_min(x), sched_get_priority_max(x));
+ c->cpu_sched_set = true;
+
+ return 0;
+}
+
+int config_parse_exec_mount_apivfs(const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = ASSERT_PTR(data);
+ int k;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ c->mount_apivfs_set = false;
+ c->mount_apivfs = false;
+ return 0;
+ }
+
+ k = parse_boolean(rvalue);
+ if (k < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, k,
+ "Failed to parse boolean value, ignoring: %s",
+ rvalue);
+ return 0;
+ }
+
+ c->mount_apivfs_set = true;
+ c->mount_apivfs = k;
+ return 0;
+}
+
+int config_parse_numa_mask(const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+ int r;
+ NUMAPolicy *p = ASSERT_PTR(data);
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (streq(rvalue, "all")) {
+ r = numa_mask_add_all(&p->nodes);
+ if (r < 0)
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to create NUMA mask representing \"all\" NUMA nodes, ignoring: %m");
+ } else {
+ r = parse_cpu_set_extend(rvalue, &p->nodes, true, unit, filename, line, lvalue);
+ if (r < 0)
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse NUMA node mask, ignoring: %s", rvalue);
+ }
+
+ return 0;
+}
+
+int config_parse_exec_cpu_sched_prio(const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = ASSERT_PTR(data);
+ int i, r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ r = safe_atoi(rvalue, &i);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse CPU scheduling priority, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ /* On Linux RR/FIFO range from 1 to 99 and OTHER/BATCH may only be 0. Policy might be set later so
+ * we do not check the precise range, but only the generic outer bounds. */
+ if (i < 0 || i > 99) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "CPU scheduling priority is out of range, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ c->cpu_sched_priority = i;
+ c->cpu_sched_set = true;
+
+ return 0;
+}
+
+int config_parse_root_image_options(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
+ _cleanup_strv_free_ char **l = NULL;
+ ExecContext *c = ASSERT_PTR(data);
+ const Unit *u = userdata;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ c->root_image_options = mount_options_free_all(c->root_image_options);
+ return 0;
+ }
+
+ r = strv_split_colon_pairs(&l, rvalue);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s, ignoring: %s", lvalue, rvalue);
+ return 0;
+ }
+
+ STRV_FOREACH_PAIR(first, second, l) {
+ MountOptions *o = NULL;
+ _cleanup_free_ char *mount_options_resolved = NULL;
+ const char *mount_options = NULL, *partition = "root";
+ PartitionDesignator partition_designator;
+
+ /* Format is either 'root:foo' or 'foo' (root is implied) */
+ if (!isempty(*second)) {
+ partition = *first;
+ mount_options = *second;
+ } else
+ mount_options = *first;
+
+ partition_designator = partition_designator_from_string(partition);
+ if (partition_designator < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, partition_designator,
+ "Invalid partition name %s, ignoring", partition);
+ continue;
+ }
+ r = unit_full_printf(u, mount_options, &mount_options_resolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", mount_options);
+ continue;
+ }
+
+ o = new(MountOptions, 1);
+ if (!o)
+ return log_oom();
+ *o = (MountOptions) {
+ .partition_designator = partition_designator,
+ .options = TAKE_PTR(mount_options_resolved),
+ };
+ LIST_APPEND(mount_options, options, TAKE_PTR(o));
+ }
+
+ if (options)
+ LIST_JOIN(mount_options, c->root_image_options, options);
+ else
+ /* empty spaces/separators only */
+ c->root_image_options = mount_options_free_all(c->root_image_options);
+
+ return 0;
+}
+
+int config_parse_exec_root_hash(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ void *roothash_decoded = NULL;
+ ExecContext *c = ASSERT_PTR(data);
+ size_t roothash_decoded_size = 0;
+ int r;
+
+ assert(filename);
+ assert(line);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Reset if the empty string is assigned */
+ c->root_hash_path = mfree(c->root_hash_path);
+ c->root_hash = mfree(c->root_hash);
+ c->root_hash_size = 0;
+ return 0;
+ }
+
+ if (path_is_absolute(rvalue)) {
+ /* We have the path to a roothash to load and decode, eg: RootHash=/foo/bar.roothash */
+ _cleanup_free_ char *p = NULL;
+
+ p = strdup(rvalue);
+ if (!p)
+ return -ENOMEM;
+
+ free_and_replace(c->root_hash_path, p);
+ c->root_hash = mfree(c->root_hash);
+ c->root_hash_size = 0;
+ return 0;
+ }
+
+ /* We have a roothash to decode, eg: RootHash=012345789abcdef */
+ r = unhexmem(rvalue, strlen(rvalue), &roothash_decoded, &roothash_decoded_size);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to decode RootHash=, ignoring: %s", rvalue);
+ return 0;
+ }
+ if (roothash_decoded_size < sizeof(sd_id128_t)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "RootHash= is too short, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ free_and_replace(c->root_hash, roothash_decoded);
+ c->root_hash_size = roothash_decoded_size;
+ c->root_hash_path = mfree(c->root_hash_path);
+
+ return 0;
+}
+
+int config_parse_exec_root_hash_sig(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ void *roothash_sig_decoded = NULL;
+ char *value;
+ ExecContext *c = ASSERT_PTR(data);
+ size_t roothash_sig_decoded_size = 0;
+ int r;
+
+ assert(filename);
+ assert(line);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Reset if the empty string is assigned */
+ c->root_hash_sig_path = mfree(c->root_hash_sig_path);
+ c->root_hash_sig = mfree(c->root_hash_sig);
+ c->root_hash_sig_size = 0;
+ return 0;
+ }
+
+ if (path_is_absolute(rvalue)) {
+ /* We have the path to a roothash signature to load and decode, eg: RootHashSignature=/foo/bar.roothash.p7s */
+ _cleanup_free_ char *p = NULL;
+
+ p = strdup(rvalue);
+ if (!p)
+ return log_oom();
+
+ free_and_replace(c->root_hash_sig_path, p);
+ c->root_hash_sig = mfree(c->root_hash_sig);
+ c->root_hash_sig_size = 0;
+ return 0;
+ }
+
+ if (!(value = startswith(rvalue, "base64:"))) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0,
+ "Failed to decode RootHashSignature=, not a path but doesn't start with 'base64:', ignoring: %s", rvalue);
+ return 0;
+ }
+
+ /* We have a roothash signature to decode, eg: RootHashSignature=base64:012345789abcdef */
+ r = unbase64mem(value, strlen(value), &roothash_sig_decoded, &roothash_sig_decoded_size);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to decode RootHashSignature=, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ free_and_replace(c->root_hash_sig, roothash_sig_decoded);
+ c->root_hash_sig_size = roothash_sig_decoded_size;
+ c->root_hash_sig_path = mfree(c->root_hash_sig_path);
+
+ return 0;
+}
+
+int config_parse_exec_cpu_affinity(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = ASSERT_PTR(data);
+ const Unit *u = userdata;
+ _cleanup_free_ char *k = NULL;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (streq(rvalue, "numa")) {
+ c->cpu_affinity_from_numa = true;
+ cpu_set_reset(&c->cpu_set);
+
+ return 0;
+ }
+
+ r = unit_full_printf(u, rvalue, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to resolve unit specifiers in '%s', ignoring: %m",
+ rvalue);
+ return 0;
+ }
+
+ r = parse_cpu_set_extend(k, &c->cpu_set, true, unit, filename, line, lvalue);
+ if (r >= 0)
+ c->cpu_affinity_from_numa = false;
+
+ return 0;
+}
+
+int config_parse_capability_set(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ uint64_t *capability_set = ASSERT_PTR(data);
+ uint64_t sum = 0, initial = 0;
+ bool invert = false;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (rvalue[0] == '~') {
+ invert = true;
+ rvalue++;
+ }
+
+ if (streq(lvalue, "CapabilityBoundingSet"))
+ initial = CAP_ALL; /* initialized to all bits on */
+ /* else "AmbientCapabilities" initialized to all bits off */
+
+ r = capability_set_from_string(rvalue, &sum);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s= specifier '%s', ignoring: %m", lvalue, rvalue);
+ return 0;
+ }
+
+ if (sum == 0 || *capability_set == initial)
+ /* "", "~" or uninitialized data -> replace */
+ *capability_set = invert ? ~sum : sum;
+ else {
+ /* previous data -> merge */
+ if (invert)
+ *capability_set &= ~sum;
+ else
+ *capability_set |= sum;
+ }
+
+ return 0;
+}
+
+int config_parse_exec_selinux_context(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = ASSERT_PTR(data);
+ const Unit *u = userdata;
+ bool ignore;
+ char *k;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ c->selinux_context = mfree(c->selinux_context);
+ c->selinux_context_ignore = false;
+ return 0;
+ }
+
+ if (rvalue[0] == '-') {
+ ignore = true;
+ rvalue++;
+ } else
+ ignore = false;
+
+ r = unit_full_printf(u, rvalue, &k);
+ if (r < 0) {
+ log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, r,
+ "Failed to resolve unit specifiers in '%s'%s: %m",
+ rvalue, ignore ? ", ignoring" : "");
+ return ignore ? 0 : -ENOEXEC;
+ }
+
+ free_and_replace(c->selinux_context, k);
+ c->selinux_context_ignore = ignore;
+
+ return 0;
+}
+
+int config_parse_exec_apparmor_profile(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = ASSERT_PTR(data);
+ const Unit *u = userdata;
+ bool ignore;
+ char *k;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ c->apparmor_profile = mfree(c->apparmor_profile);
+ c->apparmor_profile_ignore = false;
+ return 0;
+ }
+
+ if (rvalue[0] == '-') {
+ ignore = true;
+ rvalue++;
+ } else
+ ignore = false;
+
+ r = unit_full_printf(u, rvalue, &k);
+ if (r < 0) {
+ log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, r,
+ "Failed to resolve unit specifiers in '%s'%s: %m",
+ rvalue, ignore ? ", ignoring" : "");
+ return ignore ? 0 : -ENOEXEC;
+ }
+
+ free_and_replace(c->apparmor_profile, k);
+ c->apparmor_profile_ignore = ignore;
+
+ return 0;
+}
+
+int config_parse_exec_smack_process_label(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = ASSERT_PTR(data);
+ const Unit *u = userdata;
+ bool ignore;
+ char *k;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ c->smack_process_label = mfree(c->smack_process_label);
+ c->smack_process_label_ignore = false;
+ return 0;
+ }
+
+ if (rvalue[0] == '-') {
+ ignore = true;
+ rvalue++;
+ } else
+ ignore = false;
+
+ r = unit_full_printf(u, rvalue, &k);
+ if (r < 0) {
+ log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, r,
+ "Failed to resolve unit specifiers in '%s'%s: %m",
+ rvalue, ignore ? ", ignoring" : "");
+ return ignore ? 0 : -ENOEXEC;
+ }
+
+ free_and_replace(c->smack_process_label, k);
+ c->smack_process_label_ignore = ignore;
+
+ return 0;
+}
+
+int config_parse_timer(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_(calendar_spec_freep) CalendarSpec *c = NULL;
+ _cleanup_free_ char *k = NULL;
+ const Unit *u = userdata;
+ Timer *t = ASSERT_PTR(data);
+ usec_t usec = 0;
+ TimerValue *v;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment resets list */
+ timer_free_values(t);
+ return 0;
+ }
+
+ r = unit_full_printf(u, rvalue, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
+ return 0;
+ }
+
+ if (ltype == TIMER_CALENDAR) {
+ r = calendar_spec_from_string(k, &c);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse calendar specification, ignoring: %s", k);
+ return 0;
+ }
+ } else {
+ r = parse_sec(k, &usec);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse timer value, ignoring: %s", k);
+ return 0;
+ }
+ }
+
+ v = new(TimerValue, 1);
+ if (!v)
+ return log_oom();
+
+ *v = (TimerValue) {
+ .base = ltype,
+ .value = usec,
+ .calendar_spec = TAKE_PTR(c),
+ };
+
+ LIST_PREPEND(value, t->values, v);
+
+ return 0;
+}
+
+int config_parse_trigger_unit(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *p = NULL;
+ Unit *u = ASSERT_PTR(data);
+ UnitType type;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (UNIT_TRIGGER(u)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Multiple units to trigger specified, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ r = unit_name_printf(u, rvalue, &p);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue);
+ return 0;
+ }
+
+ type = unit_name_to_type(p);
+ if (type < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, type, "Unit type not valid, ignoring: %s", rvalue);
+ return 0;
+ }
+ if (unit_has_name(u, p)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Units cannot trigger themselves, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ r = unit_add_two_dependencies_by_name(u, UNIT_BEFORE, UNIT_TRIGGERS, p, true, UNIT_DEPENDENCY_FILE);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to add trigger on %s, ignoring: %m", p);
+ return 0;
+ }
+
+ return 0;
+}
+
+int config_parse_path_spec(const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ Path *p = ASSERT_PTR(data);
+ PathSpec *s;
+ PathType b;
+ _cleanup_free_ char *k = NULL;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment clears list */
+ path_free_specs(p);
+ return 0;
+ }
+
+ b = path_type_from_string(lvalue);
+ if (b < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, b, "Failed to parse path type, ignoring: %s", lvalue);
+ return 0;
+ }
+
+ r = unit_path_printf(UNIT(p), rvalue, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue);
+ return 0;
+ }
+
+ r = path_simplify_and_warn(k, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+ if (r < 0)
+ return 0;
+
+ s = new0(PathSpec, 1);
+ if (!s)
+ return log_oom();
+
+ s->unit = UNIT(p);
+ s->path = TAKE_PTR(k);
+ s->type = b;
+ s->inotify_fd = -1;
+
+ LIST_PREPEND(spec, p->specs, s);
+
+ return 0;
+}
+
+int config_parse_socket_service(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ _cleanup_free_ char *p = NULL;
+ Socket *s = ASSERT_PTR(data);
+ Unit *x;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ r = unit_name_printf(UNIT(s), rvalue, &p);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue);
+ return 0;
+ }
+
+ if (!endswith(p, ".service")) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Unit must be of type service, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ r = manager_load_unit(UNIT(s)->manager, p, NULL, &error, &x);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to load unit %s, ignoring: %s", rvalue, bus_error_message(&error, r));
+ return 0;
+ }
+
+ unit_ref_set(&s->service, UNIT(s), x);
+
+ return 0;
+}
+
+int config_parse_fdname(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *p = NULL;
+ Socket *s = ASSERT_PTR(data);
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ s->fdname = mfree(s->fdname);
+ return 0;
+ }
+
+ r = unit_fd_printf(UNIT(s), rvalue, &p);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
+ return 0;
+ }
+
+ if (!fdname_is_valid(p)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid file descriptor name, ignoring: %s", p);
+ return 0;
+ }
+
+ return free_and_replace(s->fdname, p);
+}
+
+int config_parse_service_sockets(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ Service *s = ASSERT_PTR(data);
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *word = NULL, *k = NULL;
+
+ r = extract_first_word(&p, &word, NULL, 0);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Trailing garbage in sockets, ignoring: %s", rvalue);
+ return 0;
+ }
+ if (r == 0)
+ return 0;
+
+ r = unit_name_printf(UNIT(s), word, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", word);
+ continue;
+ }
+
+ if (!endswith(k, ".socket")) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Unit must be of type socket, ignoring: %s", k);
+ continue;
+ }
+
+ r = unit_add_two_dependencies_by_name(UNIT(s), UNIT_WANTS, UNIT_AFTER, k, true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to add dependency on %s, ignoring: %m", k);
+
+ r = unit_add_dependency_by_name(UNIT(s), UNIT_TRIGGERED_BY, k, true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to add dependency on %s, ignoring: %m", k);
+ }
+}
+
+int config_parse_bus_name(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *k = NULL;
+ const Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ r = unit_full_printf_full(u, rvalue, SD_BUS_MAXIMUM_NAME_LENGTH, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue);
+ return 0;
+ }
+
+ if (!sd_bus_service_name_is_valid(k)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid bus name, ignoring: %s", k);
+ return 0;
+ }
+
+ return config_parse_string(unit, filename, line, section, section_line, lvalue, ltype, k, data, userdata);
+}
+
+int config_parse_service_timeout(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ Service *s = ASSERT_PTR(userdata);
+ usec_t usec;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ /* This is called for two cases: TimeoutSec= and TimeoutStartSec=. */
+
+ /* Traditionally, these options accepted 0 to disable the timeouts. However, a timeout of 0 suggests it happens
+ * immediately, hence fix this to become USEC_INFINITY instead. This is in-line with how we internally handle
+ * all other timeouts. */
+ r = parse_sec_fix_0(rvalue, &usec);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s= parameter, ignoring: %s", lvalue, rvalue);
+ return 0;
+ }
+
+ s->start_timeout_defined = true;
+ s->timeout_start_usec = usec;
+
+ if (streq(lvalue, "TimeoutSec"))
+ s->timeout_stop_usec = usec;
+
+ return 0;
+}
+
+int config_parse_timeout_abort(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ usec_t *ret = ASSERT_PTR(data);
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ /* Note: apart from setting the arg, this returns an extra bit of information in the return value. */
+
+ if (isempty(rvalue)) {
+ *ret = 0;
+ return 0; /* "not set" */
+ }
+
+ r = parse_sec(rvalue, ret);
+ if (r < 0)
+ return log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s= setting, ignoring: %s", lvalue, rvalue);
+
+ return 1; /* "set" */
+}
+
+int config_parse_service_timeout_abort(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ Service *s = ASSERT_PTR(userdata);
+ int r;
+
+ r = config_parse_timeout_abort(unit, filename, line, section, section_line, lvalue, ltype, rvalue,
+ &s->timeout_abort_usec, s);
+ if (r >= 0)
+ s->timeout_abort_set = r;
+ return 0;
+}
+
+int config_parse_user_group_compat(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *k = NULL;
+ char **user = data;
+ const Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ *user = mfree(*user);
+ return 0;
+ }
+
+ r = unit_full_printf(u, rvalue, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in %s: %m", rvalue);
+ return -ENOEXEC;
+ }
+
+ if (!valid_user_group_name(k, VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX|VALID_USER_WARN)) {
+ log_syntax(unit, LOG_ERR, filename, line, 0, "Invalid user/group name or numeric ID: %s", k);
+ return -ENOEXEC;
+ }
+
+ if (strstr(lvalue, "User") && streq(k, NOBODY_USER_NAME))
+ log_struct(LOG_NOTICE,
+ "MESSAGE=%s:%u: Special user %s configured, this is not safe!", filename, line, k,
+ "UNIT=%s", unit,
+ "MESSAGE_ID=" SD_MESSAGE_NOBODY_USER_UNSUITABLE_STR,
+ "OFFENDING_USER=%s", k,
+ "CONFIG_FILE=%s", filename,
+ "CONFIG_LINE=%u", line);
+
+ return free_and_replace(*user, k);
+}
+
+int config_parse_user_group_strv_compat(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ char ***users = data;
+ const Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ *users = strv_free(*users);
+ return 0;
+ }
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *word = NULL, *k = NULL;
+
+ r = extract_first_word(&p, &word, NULL, 0);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_ERR, filename, line, r, "Invalid syntax: %s", rvalue);
+ return -ENOEXEC;
+ }
+ if (r == 0)
+ return 0;
+
+ r = unit_full_printf(u, word, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in %s: %m", word);
+ return -ENOEXEC;
+ }
+
+ if (!valid_user_group_name(k, VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX|VALID_USER_WARN)) {
+ log_syntax(unit, LOG_ERR, filename, line, 0, "Invalid user/group name or numeric ID: %s", k);
+ return -ENOEXEC;
+ }
+
+ r = strv_push(users, k);
+ if (r < 0)
+ return log_oom();
+
+ k = NULL;
+ }
+}
+
+int config_parse_working_directory(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = ASSERT_PTR(data);
+ const Unit *u = ASSERT_PTR(userdata);
+ bool missing_ok;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ c->working_directory_home = false;
+ c->working_directory = mfree(c->working_directory);
+ return 0;
+ }
+
+ if (rvalue[0] == '-') {
+ missing_ok = true;
+ rvalue++;
+ } else
+ missing_ok = false;
+
+ if (streq(rvalue, "~")) {
+ c->working_directory_home = true;
+ c->working_directory = mfree(c->working_directory);
+ } else {
+ _cleanup_free_ char *k = NULL;
+
+ r = unit_path_printf(u, rvalue, &k);
+ if (r < 0) {
+ log_syntax(unit, missing_ok ? LOG_WARNING : LOG_ERR, filename, line, r,
+ "Failed to resolve unit specifiers in working directory path '%s'%s: %m",
+ rvalue, missing_ok ? ", ignoring" : "");
+ return missing_ok ? 0 : -ENOEXEC;
+ }
+
+ r = path_simplify_and_warn(k, PATH_CHECK_ABSOLUTE | (missing_ok ? 0 : PATH_CHECK_FATAL), unit, filename, line, lvalue);
+ if (r < 0)
+ return missing_ok ? 0 : -ENOEXEC;
+
+ c->working_directory_home = false;
+ free_and_replace(c->working_directory, k);
+ }
+
+ c->working_directory_missing_ok = missing_ok;
+ return 0;
+}
+
+int config_parse_unit_env_file(const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ char ***env = ASSERT_PTR(data);
+ const Unit *u = userdata;
+ _cleanup_free_ char *n = NULL;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment frees the list */
+ *env = strv_free(*env);
+ return 0;
+ }
+
+ r = unit_full_printf_full(u, rvalue, PATH_MAX, &n);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
+ return 0;
+ }
+
+ r = path_simplify_and_warn(n[0] == '-' ? n + 1 : n, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+ if (r < 0)
+ return 0;
+
+ r = strv_push(env, n);
+ if (r < 0)
+ return log_oom();
+
+ n = NULL;
+
+ return 0;
+}
+
+int config_parse_environ(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ const Unit *u = userdata;
+ char ***env = ASSERT_PTR(data);
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment resets the list */
+ *env = strv_free(*env);
+ return 0;
+ }
+
+ for (const char *p = rvalue;; ) {
+ _cleanup_free_ char *word = NULL, *resolved = NULL;
+
+ r = extract_first_word(&p, &word, NULL, EXTRACT_CUNESCAPE|EXTRACT_UNQUOTE);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Invalid syntax, ignoring: %s", rvalue);
+ return 0;
+ }
+ if (r == 0)
+ return 0;
+
+ if (u)
+ r = unit_env_printf(u, word, &resolved);
+ else
+ r = specifier_printf(word, sc_arg_max(), system_and_tmp_specifier_table, NULL, NULL, &resolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to resolve specifiers in %s, ignoring: %m", word);
+ continue;
+ }
+
+ if (!env_assignment_is_valid(resolved)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0,
+ "Invalid environment assignment, ignoring: %s", resolved);
+ continue;
+ }
+
+ r = strv_env_replace_consume(env, TAKE_PTR(resolved));
+ if (r < 0)
+ return log_error_errno(r, "Failed to update environment: %m");
+ }
+}
+
+int config_parse_pass_environ(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_strv_free_ char **n = NULL;
+ const Unit *u = userdata;
+ char*** passenv = ASSERT_PTR(data);
+ size_t nlen = 0;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment resets the list */
+ *passenv = strv_free(*passenv);
+ return 0;
+ }
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *word = NULL, *k = NULL;
+
+ r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Trailing garbage in %s, ignoring: %s", lvalue, rvalue);
+ break;
+ }
+ if (r == 0)
+ break;
+
+ if (u) {
+ r = unit_env_printf(u, word, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to resolve specifiers in %s, ignoring: %m", word);
+ continue;
+ }
+ } else
+ k = TAKE_PTR(word);
+
+ if (!env_name_is_valid(k)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0,
+ "Invalid environment name for %s, ignoring: %s", lvalue, k);
+ continue;
+ }
+
+ if (!GREEDY_REALLOC(n, nlen + 2))
+ return log_oom();
+
+ n[nlen++] = TAKE_PTR(k);
+ n[nlen] = NULL;
+ }
+
+ if (n) {
+ r = strv_extend_strv(passenv, n, true);
+ if (r < 0)
+ return log_oom();
+ }
+
+ return 0;
+}
+
+int config_parse_unset_environ(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_strv_free_ char **n = NULL;
+ char*** unsetenv = ASSERT_PTR(data);
+ const Unit *u = userdata;
+ size_t nlen = 0;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment resets the list */
+ *unsetenv = strv_free(*unsetenv);
+ return 0;
+ }
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *word = NULL, *k = NULL;
+
+ r = extract_first_word(&p, &word, NULL, EXTRACT_CUNESCAPE|EXTRACT_UNQUOTE);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Trailing garbage in %s, ignoring: %s", lvalue, rvalue);
+ break;
+ }
+ if (r == 0)
+ break;
+
+ if (u) {
+ r = unit_env_printf(u, word, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to resolve unit specifiers in %s, ignoring: %m", word);
+ continue;
+ }
+ } else
+ k = TAKE_PTR(word);
+
+ if (!env_assignment_is_valid(k) && !env_name_is_valid(k)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0,
+ "Invalid environment name or assignment %s, ignoring: %s", lvalue, k);
+ continue;
+ }
+
+ if (!GREEDY_REALLOC(n, nlen + 2))
+ return log_oom();
+
+ n[nlen++] = TAKE_PTR(k);
+ n[nlen] = NULL;
+ }
+
+ if (n) {
+ r = strv_extend_strv(unsetenv, n, true);
+ if (r < 0)
+ return log_oom();
+ }
+
+ return 0;
+}
+
+int config_parse_log_extra_fields(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = ASSERT_PTR(data);
+ const Unit *u = userdata;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ exec_context_free_log_extra_fields(c);
+ return 0;
+ }
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *word = NULL, *k = NULL;
+ struct iovec *t;
+ const char *eq;
+
+ r = extract_first_word(&p, &word, NULL, EXTRACT_CUNESCAPE|EXTRACT_UNQUOTE);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue);
+ return 0;
+ }
+ if (r == 0)
+ return 0;
+
+ r = unit_full_printf(u, word, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", word);
+ continue;
+ }
+
+ eq = strchr(k, '=');
+ if (!eq) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Log field lacks '=' character, ignoring: %s", k);
+ continue;
+ }
+
+ if (!journal_field_valid(k, eq-k, false)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Log field name is invalid, ignoring: %s", k);
+ continue;
+ }
+
+ t = reallocarray(c->log_extra_fields, c->n_log_extra_fields+1, sizeof(struct iovec));
+ if (!t)
+ return log_oom();
+
+ c->log_extra_fields = t;
+ c->log_extra_fields[c->n_log_extra_fields++] = IOVEC_MAKE_STRING(k);
+
+ k = NULL;
+ }
+}
+
+int config_parse_log_namespace(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *k = NULL;
+ ExecContext *c = ASSERT_PTR(data);
+ const Unit *u = userdata;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ c->log_namespace = mfree(c->log_namespace);
+ return 0;
+ }
+
+ r = unit_full_printf_full(u, rvalue, NAME_MAX, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue);
+ return 0;
+ }
+
+ if (!log_namespace_name_valid(k)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Specified log namespace name is not valid, ignoring: %s", k);
+ return 0;
+ }
+
+ free_and_replace(c->log_namespace, k);
+ return 0;
+}
+
+int config_parse_unit_condition_path(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *p = NULL;
+ Condition **list = ASSERT_PTR(data), *c;
+ ConditionType t = ltype;
+ bool trigger, negate;
+ const Unit *u = userdata;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment resets the list */
+ *list = condition_free_list(*list);
+ return 0;
+ }
+
+ trigger = rvalue[0] == '|';
+ if (trigger)
+ rvalue++;
+
+ negate = rvalue[0] == '!';
+ if (negate)
+ rvalue++;
+
+ r = unit_path_printf(u, rvalue, &p);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue);
+ return 0;
+ }
+
+ r = path_simplify_and_warn(p, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+ if (r < 0)
+ return 0;
+
+ c = condition_new(t, p, trigger, negate);
+ if (!c)
+ return log_oom();
+
+ LIST_PREPEND(conditions, *list, c);
+ return 0;
+}
+
+int config_parse_unit_condition_string(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *s = NULL;
+ Condition **list = ASSERT_PTR(data), *c;
+ ConditionType t = ltype;
+ bool trigger, negate;
+ const Unit *u = userdata;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment resets the list */
+ *list = condition_free_list(*list);
+ return 0;
+ }
+
+ trigger = *rvalue == '|';
+ if (trigger)
+ rvalue += 1 + strspn(rvalue + 1, WHITESPACE);
+
+ negate = *rvalue == '!';
+ if (negate)
+ rvalue += 1 + strspn(rvalue + 1, WHITESPACE);
+
+ r = unit_full_printf(u, rvalue, &s);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
+ return 0;
+ }
+
+ c = condition_new(t, s, trigger, negate);
+ if (!c)
+ return log_oom();
+
+ LIST_PREPEND(conditions, *list, c);
+ return 0;
+}
+
+int config_parse_unit_requires_mounts_for(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ Unit *u = userdata;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+ assert(data);
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *word = NULL, *resolved = NULL;
+
+ r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Invalid syntax, ignoring: %s", rvalue);
+ return 0;
+ }
+ if (r == 0)
+ return 0;
+
+ r = unit_path_printf(u, word, &resolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", word);
+ continue;
+ }
+
+ r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+ if (r < 0)
+ continue;
+
+ r = unit_require_mounts_for(u, resolved, UNIT_DEPENDENCY_FILE);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to add required mount '%s', ignoring: %m", resolved);
+ continue;
+ }
+ }
+}
+
+int config_parse_documentation(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ Unit *u = ASSERT_PTR(userdata);
+ int r;
+ char **a, **b;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment resets the list */
+ u->documentation = strv_free(u->documentation);
+ return 0;
+ }
+
+ r = config_parse_unit_strv_printf(unit, filename, line, section, section_line, lvalue, ltype,
+ rvalue, data, userdata);
+ if (r < 0)
+ return r;
+
+ for (a = b = u->documentation; a && *a; a++) {
+
+ if (documentation_url_is_valid(*a))
+ *(b++) = *a;
+ else {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid URL, ignoring: %s", *a);
+ free(*a);
+ }
+ }
+ if (b)
+ *b = NULL;
+
+ return 0;
+}
+
+#if HAVE_SECCOMP
+int config_parse_syscall_filter(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = data;
+ _unused_ const Unit *u = ASSERT_PTR(userdata);
+ bool invert = false;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment resets the list */
+ c->syscall_filter = hashmap_free(c->syscall_filter);
+ c->syscall_allow_list = false;
+ return 0;
+ }
+
+ if (rvalue[0] == '~') {
+ invert = true;
+ rvalue++;
+ }
+
+ if (!c->syscall_filter) {
+ c->syscall_filter = hashmap_new(NULL);
+ if (!c->syscall_filter)
+ return log_oom();
+
+ if (invert)
+ /* Allow everything but the ones listed */
+ c->syscall_allow_list = false;
+ else {
+ /* Allow nothing but the ones listed */
+ c->syscall_allow_list = true;
+
+ /* Accept default syscalls if we are on an allow_list */
+ r = seccomp_parse_syscall_filter(
+ "@default", -1, c->syscall_filter,
+ SECCOMP_PARSE_PERMISSIVE|SECCOMP_PARSE_ALLOW_LIST,
+ unit,
+ NULL, 0);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *word = NULL, *name = NULL;
+ int num;
+
+ r = extract_first_word(&p, &word, NULL, 0);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Invalid syntax, ignoring: %s", rvalue);
+ return 0;
+ }
+ if (r == 0)
+ return 0;
+
+ r = parse_syscall_and_errno(word, &name, &num);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to parse syscall:errno, ignoring: %s", word);
+ continue;
+ }
+ if (!invert && num >= 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0,
+ "Allow-listed system calls cannot take error number, ignoring: %s", word);
+ continue;
+ }
+
+ r = seccomp_parse_syscall_filter(
+ name, num, c->syscall_filter,
+ SECCOMP_PARSE_LOG|SECCOMP_PARSE_PERMISSIVE|
+ (invert ? SECCOMP_PARSE_INVERT : 0)|
+ (c->syscall_allow_list ? SECCOMP_PARSE_ALLOW_LIST : 0),
+ unit, filename, line);
+ if (r < 0)
+ return r;
+ }
+}
+
+int config_parse_syscall_log(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = data;
+ _unused_ const Unit *u = ASSERT_PTR(userdata);
+ bool invert = false;
+ const char *p;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment resets the list */
+ c->syscall_log = hashmap_free(c->syscall_log);
+ c->syscall_log_allow_list = false;
+ return 0;
+ }
+
+ if (rvalue[0] == '~') {
+ invert = true;
+ rvalue++;
+ }
+
+ if (!c->syscall_log) {
+ c->syscall_log = hashmap_new(NULL);
+ if (!c->syscall_log)
+ return log_oom();
+
+ if (invert)
+ /* Log everything but the ones listed */
+ c->syscall_log_allow_list = false;
+ else
+ /* Log nothing but the ones listed */
+ c->syscall_log_allow_list = true;
+ }
+
+ p = rvalue;
+ for (;;) {
+ _cleanup_free_ char *word = NULL;
+
+ r = extract_first_word(&p, &word, NULL, 0);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue);
+ return 0;
+ }
+ if (r == 0)
+ return 0;
+
+ r = seccomp_parse_syscall_filter(
+ word, -1, c->syscall_log,
+ SECCOMP_PARSE_LOG|SECCOMP_PARSE_PERMISSIVE|
+ (invert ? SECCOMP_PARSE_INVERT : 0)|
+ (c->syscall_log_allow_list ? SECCOMP_PARSE_ALLOW_LIST : 0),
+ unit, filename, line);
+ if (r < 0)
+ return r;
+ }
+}
+
+int config_parse_syscall_archs(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ Set **archs = data;
+ int r;
+
+ if (isempty(rvalue)) {
+ *archs = set_free(*archs);
+ return 0;
+ }
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *word = NULL;
+ uint32_t a;
+
+ r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Invalid syntax, ignoring: %s", rvalue);
+ return 0;
+ }
+ if (r == 0)
+ return 0;
+
+ r = seccomp_arch_from_string(word, &a);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to parse system call architecture \"%s\", ignoring: %m", word);
+ continue;
+ }
+
+ r = set_ensure_put(archs, NULL, UINT32_TO_PTR(a + 1));
+ if (r < 0)
+ return log_oom();
+ }
+}
+
+int config_parse_syscall_errno(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = data;
+ int e;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue) || streq(rvalue, "kill")) {
+ /* Empty assignment resets to KILL */
+ c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
+ return 0;
+ }
+
+ e = parse_errno(rvalue);
+ if (e < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, e, "Failed to parse error number, ignoring: %s", rvalue);
+ return 0;
+ }
+ if (e == 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid error number, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ c->syscall_errno = e;
+ return 0;
+}
+
+int config_parse_address_families(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = data;
+ bool invert = false;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment resets the list */
+ c->address_families = set_free(c->address_families);
+ c->address_families_allow_list = false;
+ return 0;
+ }
+
+ if (streq(rvalue, "none")) {
+ /* Forbid all address families. */
+ c->address_families = set_free(c->address_families);
+ c->address_families_allow_list = true;
+ return 0;
+ }
+
+ if (rvalue[0] == '~') {
+ invert = true;
+ rvalue++;
+ }
+
+ if (!c->address_families) {
+ c->address_families = set_new(NULL);
+ if (!c->address_families)
+ return log_oom();
+
+ c->address_families_allow_list = !invert;
+ }
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *word = NULL;
+ int af;
+
+ r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Invalid syntax, ignoring: %s", rvalue);
+ return 0;
+ }
+ if (r == 0)
+ return 0;
+
+ af = af_from_name(word);
+ if (af < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, af,
+ "Failed to parse address family, ignoring: %s", word);
+ continue;
+ }
+
+ /* If we previously wanted to forbid an address family and now
+ * we want to allow it, then just remove it from the list.
+ */
+ if (!invert == c->address_families_allow_list) {
+ r = set_put(c->address_families, INT_TO_PTR(af));
+ if (r < 0)
+ return log_oom();
+ } else
+ set_remove(c->address_families, INT_TO_PTR(af));
+ }
+}
+
+int config_parse_restrict_namespaces(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = data;
+ unsigned long flags;
+ bool invert = false;
+ int r;
+
+ if (isempty(rvalue)) {
+ /* Reset to the default. */
+ c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
+ return 0;
+ }
+
+ /* Boolean parameter ignores the previous settings */
+ r = parse_boolean(rvalue);
+ if (r > 0) {
+ c->restrict_namespaces = 0;
+ return 0;
+ } else if (r == 0) {
+ c->restrict_namespaces = NAMESPACE_FLAGS_ALL;
+ return 0;
+ }
+
+ if (rvalue[0] == '~') {
+ invert = true;
+ rvalue++;
+ }
+
+ /* Not a boolean argument, in this case it's a list of namespace types. */
+ r = namespace_flags_from_string(rvalue, &flags);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse namespace type string, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ if (c->restrict_namespaces == NAMESPACE_FLAGS_INITIAL)
+ /* Initial assignment. Just set the value. */
+ c->restrict_namespaces = invert ? (~flags) & NAMESPACE_FLAGS_ALL : flags;
+ else
+ /* Merge the value with the previous one. */
+ SET_FLAG(c->restrict_namespaces, flags, !invert);
+
+ return 0;
+}
+#endif
+
+int config_parse_restrict_filesystems(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+ ExecContext *c = ASSERT_PTR(data);
+ bool invert = false;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment resets the list */
+ c->restrict_filesystems = set_free_free(c->restrict_filesystems);
+ c->restrict_filesystems_allow_list = false;
+ return 0;
+ }
+
+ if (rvalue[0] == '~') {
+ invert = true;
+ rvalue++;
+ }
+
+ if (!c->restrict_filesystems) {
+ if (invert)
+ /* Allow everything but the ones listed */
+ c->restrict_filesystems_allow_list = false;
+ else
+ /* Allow nothing but the ones listed */
+ c->restrict_filesystems_allow_list = true;
+ }
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *word = NULL;
+
+ r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE);
+ if (r == 0)
+ break;
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Trailing garbage in %s, ignoring: %s", lvalue, rvalue);
+ break;
+ }
+
+ r = lsm_bpf_parse_filesystem(
+ word,
+ &c->restrict_filesystems,
+ FILESYSTEM_PARSE_LOG|
+ (invert ? FILESYSTEM_PARSE_INVERT : 0)|
+ (c->restrict_filesystems_allow_list ? FILESYSTEM_PARSE_ALLOW_LIST : 0),
+ unit, filename, line);
+
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+int config_parse_unit_slice(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ _cleanup_free_ char *k = NULL;
+ Unit *u = userdata, *slice;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+ assert(u);
+
+ r = unit_name_printf(u, rvalue, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue);
+ return 0;
+ }
+
+ r = manager_load_unit(u->manager, k, NULL, &error, &slice);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to load slice unit %s, ignoring: %s", k, bus_error_message(&error, r));
+ return 0;
+ }
+
+ r = unit_set_slice(u, slice);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to assign slice %s to unit %s, ignoring: %m", slice->id, u->id);
+ return 0;
+ }
+
+ return 0;
+}
+
+int config_parse_cpu_quota(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ CGroupContext *c = data;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ c->cpu_quota_per_sec_usec = USEC_INFINITY;
+ return 0;
+ }
+
+ r = parse_permyriad_unbounded(rvalue);
+ if (r <= 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid CPU quota '%s', ignoring.", rvalue);
+ return 0;
+ }
+
+ c->cpu_quota_per_sec_usec = ((usec_t) r * USEC_PER_SEC) / 10000U;
+ return 0;
+}
+
+int config_parse_allowed_cpuset(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ CPUSet *c = data;
+ const Unit *u = userdata;
+ _cleanup_free_ char *k = NULL;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ r = unit_full_printf(u, rvalue, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to resolve unit specifiers in '%s', ignoring: %m",
+ rvalue);
+ return 0;
+ }
+
+ (void) parse_cpu_set_extend(k, c, true, unit, filename, line, lvalue);
+ return 0;
+}
+
+int config_parse_memory_limit(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ CGroupContext *c = data;
+ uint64_t bytes = CGROUP_LIMIT_MAX;
+ int r;
+
+ if (isempty(rvalue) && STR_IN_SET(lvalue, "DefaultMemoryLow",
+ "DefaultMemoryMin",
+ "MemoryLow",
+ "MemoryMin"))
+ bytes = CGROUP_LIMIT_MIN;
+ else if (!isempty(rvalue) && !streq(rvalue, "infinity")) {
+
+ r = parse_permyriad(rvalue);
+ if (r < 0) {
+ r = parse_size(rvalue, 1024, &bytes);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid memory limit '%s', ignoring: %m", rvalue);
+ return 0;
+ }
+ } else
+ bytes = physical_memory_scale(r, 10000U);
+
+ if (bytes >= UINT64_MAX ||
+ (bytes <= 0 && !STR_IN_SET(lvalue, "MemorySwapMax", "MemoryLow", "MemoryMin", "DefaultMemoryLow", "DefaultMemoryMin"))) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Memory limit '%s' out of range, ignoring.", rvalue);
+ return 0;
+ }
+ }
+
+ if (streq(lvalue, "DefaultMemoryLow")) {
+ c->default_memory_low = bytes;
+ c->default_memory_low_set = true;
+ } else if (streq(lvalue, "DefaultMemoryMin")) {
+ c->default_memory_min = bytes;
+ c->default_memory_min_set = true;
+ } else if (streq(lvalue, "MemoryMin")) {
+ c->memory_min = bytes;
+ c->memory_min_set = true;
+ } else if (streq(lvalue, "MemoryLow")) {
+ c->memory_low = bytes;
+ c->memory_low_set = true;
+ } else if (streq(lvalue, "MemoryHigh"))
+ c->memory_high = bytes;
+ else if (streq(lvalue, "MemoryMax"))
+ c->memory_max = bytes;
+ else if (streq(lvalue, "MemorySwapMax"))
+ c->memory_swap_max = bytes;
+ else if (streq(lvalue, "MemoryLimit")) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0,
+ "Unit uses MemoryLimit=; please use MemoryMax= instead. Support for MemoryLimit= will be removed soon.");
+ c->memory_limit = bytes;
+ } else
+ return -EINVAL;
+
+ return 0;
+}
+
+int config_parse_tasks_max(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ const Unit *u = userdata;
+ TasksMax *tasks_max = data;
+ uint64_t v;
+ int r;
+
+ if (isempty(rvalue)) {
+ *tasks_max = u ? u->manager->default_tasks_max : TASKS_MAX_UNSET;
+ return 0;
+ }
+
+ if (streq(rvalue, "infinity")) {
+ *tasks_max = TASKS_MAX_UNSET;
+ return 0;
+ }
+
+ r = parse_permyriad(rvalue);
+ if (r >= 0)
+ *tasks_max = (TasksMax) { r, 10000U }; /* r‱ */
+ else {
+ r = safe_atou64(rvalue, &v);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid maximum tasks value '%s', ignoring: %m", rvalue);
+ return 0;
+ }
+
+ if (v <= 0 || v >= UINT64_MAX) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Maximum tasks value '%s' out of range, ignoring.", rvalue);
+ return 0;
+ }
+
+ *tasks_max = (TasksMax) { v };
+ }
+
+ return 0;
+}
+
+int config_parse_delegate(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ CGroupContext *c = data;
+ UnitType t;
+ int r;
+
+ t = unit_name_to_type(unit);
+ assert(t != _UNIT_TYPE_INVALID);
+
+ if (!unit_vtable[t]->can_delegate) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Delegate= setting not supported for this unit type, ignoring.");
+ return 0;
+ }
+
+ /* We either accept a boolean value, which may be used to turn on delegation for all controllers, or
+ * turn it off for all. Or it takes a list of controller names, in which case we add the specified
+ * controllers to the mask to delegate. Delegate= enables delegation without any controllers. */
+
+ if (isempty(rvalue)) {
+ /* An empty string resets controllers and sets Delegate=yes. */
+ c->delegate = true;
+ c->delegate_controllers = 0;
+ return 0;
+ }
+
+ r = parse_boolean(rvalue);
+ if (r < 0) {
+ CGroupMask mask = 0;
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *word = NULL;
+ CGroupController cc;
+
+ r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue);
+ return 0;
+ }
+ if (r == 0)
+ break;
+
+ cc = cgroup_controller_from_string(word);
+ if (cc < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid controller name '%s', ignoring", word);
+ continue;
+ }
+
+ mask |= CGROUP_CONTROLLER_TO_MASK(cc);
+ }
+
+ c->delegate = true;
+ c->delegate_controllers |= mask;
+
+ } else if (r > 0) {
+ c->delegate = true;
+ c->delegate_controllers = _CGROUP_MASK_ALL;
+ } else {
+ c->delegate = false;
+ c->delegate_controllers = 0;
+ }
+
+ return 0;
+}
+
+int config_parse_managed_oom_mode(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ManagedOOMMode *mode = data, m;
+ UnitType t;
+
+ t = unit_name_to_type(unit);
+ assert(t != _UNIT_TYPE_INVALID);
+
+ if (!unit_vtable[t]->can_set_managed_oom)
+ return log_syntax(unit, LOG_WARNING, filename, line, 0, "%s= is not supported for this unit type, ignoring.", lvalue);
+
+ if (isempty(rvalue)) {
+ *mode = MANAGED_OOM_AUTO;
+ return 0;
+ }
+
+ m = managed_oom_mode_from_string(rvalue);
+ if (m < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, m, "Invalid syntax, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ *mode = m;
+ return 0;
+}
+
+int config_parse_managed_oom_mem_pressure_limit(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ uint32_t *limit = data;
+ UnitType t;
+ int r;
+
+ t = unit_name_to_type(unit);
+ assert(t != _UNIT_TYPE_INVALID);
+
+ if (!unit_vtable[t]->can_set_managed_oom)
+ return log_syntax(unit, LOG_WARNING, filename, line, 0, "%s= is not supported for this unit type, ignoring.", lvalue);
+
+ if (isempty(rvalue)) {
+ *limit = 0;
+ return 0;
+ }
+
+ r = parse_permyriad(rvalue);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse memory pressure limit value, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ /* Normalize to 2^32-1 == 100% */
+ *limit = UINT32_SCALE_FROM_PERMYRIAD(r);
+ return 0;
+}
+
+int config_parse_device_allow(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *path = NULL, *resolved = NULL;
+ CGroupContext *c = data;
+ const char *p = rvalue;
+ int r;
+
+ if (isempty(rvalue)) {
+ while (c->device_allow)
+ cgroup_context_free_device_allow(c, c->device_allow);
+
+ return 0;
+ }
+
+ r = extract_first_word(&p, &path, NULL, EXTRACT_UNQUOTE);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r <= 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to extract device path and rights from '%s', ignoring.", rvalue);
+ return 0;
+ }
+
+ r = unit_path_printf(userdata, path, &resolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to resolve unit specifiers in '%s', ignoring: %m", path);
+ return 0;
+ }
+
+ if (!STARTSWITH_SET(resolved, "block-", "char-")) {
+
+ r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue);
+ if (r < 0)
+ return 0;
+
+ if (!valid_device_node_path(resolved)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid device node path '%s', ignoring.", resolved);
+ return 0;
+ }
+ }
+
+ if (!isempty(p) && !in_charset(p, "rwm")) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid device rights '%s', ignoring.", p);
+ return 0;
+ }
+
+ return cgroup_add_device_allow(c, resolved, p);
+}
+
+int config_parse_io_device_weight(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *path = NULL, *resolved = NULL;
+ CGroupIODeviceWeight *w;
+ CGroupContext *c = data;
+ const char *p = ASSERT_PTR(rvalue);
+ uint64_t u;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+
+ if (isempty(rvalue)) {
+ while (c->io_device_weights)
+ cgroup_context_free_io_device_weight(c, c->io_device_weights);
+
+ return 0;
+ }
+
+ r = extract_first_word(&p, &path, NULL, EXTRACT_UNQUOTE);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to extract device path and weight from '%s', ignoring.", rvalue);
+ return 0;
+ }
+ if (r == 0 || isempty(p)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0,
+ "Invalid device path or weight specified in '%s', ignoring.", rvalue);
+ return 0;
+ }
+
+ r = unit_path_printf(userdata, path, &resolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to resolve unit specifiers in '%s', ignoring: %m", path);
+ return 0;
+ }
+
+ r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue);
+ if (r < 0)
+ return 0;
+
+ r = cg_weight_parse(p, &u);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "IO weight '%s' invalid, ignoring: %m", p);
+ return 0;
+ }
+
+ assert(u != CGROUP_WEIGHT_INVALID);
+
+ w = new0(CGroupIODeviceWeight, 1);
+ if (!w)
+ return log_oom();
+
+ w->path = TAKE_PTR(resolved);
+ w->weight = u;
+
+ LIST_PREPEND(device_weights, c->io_device_weights, w);
+ return 0;
+}
+
+int config_parse_io_device_latency(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *path = NULL, *resolved = NULL;
+ CGroupIODeviceLatency *l;
+ CGroupContext *c = data;
+ const char *p = ASSERT_PTR(rvalue);
+ usec_t usec;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+
+ if (isempty(rvalue)) {
+ while (c->io_device_latencies)
+ cgroup_context_free_io_device_latency(c, c->io_device_latencies);
+
+ return 0;
+ }
+
+ r = extract_first_word(&p, &path, NULL, EXTRACT_UNQUOTE);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to extract device path and latency from '%s', ignoring.", rvalue);
+ return 0;
+ }
+ if (r == 0 || isempty(p)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0,
+ "Invalid device path or latency specified in '%s', ignoring.", rvalue);
+ return 0;
+ }
+
+ r = unit_path_printf(userdata, path, &resolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to resolve unit specifiers in '%s', ignoring: %m", path);
+ return 0;
+ }
+
+ r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue);
+ if (r < 0)
+ return 0;
+
+ r = parse_sec(p, &usec);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse timer value, ignoring: %s", p);
+ return 0;
+ }
+
+ l = new0(CGroupIODeviceLatency, 1);
+ if (!l)
+ return log_oom();
+
+ l->path = TAKE_PTR(resolved);
+ l->target_usec = usec;
+
+ LIST_PREPEND(device_latencies, c->io_device_latencies, l);
+ return 0;
+}
+
+int config_parse_io_limit(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *path = NULL, *resolved = NULL;
+ CGroupIODeviceLimit *l = NULL;
+ CGroupContext *c = data;
+ CGroupIOLimitType type;
+ const char *p = ASSERT_PTR(rvalue);
+ uint64_t num;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+
+ type = cgroup_io_limit_type_from_string(lvalue);
+ assert(type >= 0);
+
+ if (isempty(rvalue)) {
+ LIST_FOREACH(device_limits, t, c->io_device_limits)
+ t->limits[type] = cgroup_io_limit_defaults[type];
+ return 0;
+ }
+
+ r = extract_first_word(&p, &path, NULL, EXTRACT_UNQUOTE);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to extract device node and bandwidth from '%s', ignoring.", rvalue);
+ return 0;
+ }
+ if (r == 0 || isempty(p)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0,
+ "Invalid device node or bandwidth specified in '%s', ignoring.", rvalue);
+ return 0;
+ }
+
+ r = unit_path_printf(userdata, path, &resolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to resolve unit specifiers in '%s', ignoring: %m", path);
+ return 0;
+ }
+
+ r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue);
+ if (r < 0)
+ return 0;
+
+ if (streq("infinity", p))
+ num = CGROUP_LIMIT_MAX;
+ else {
+ r = parse_size(p, 1000, &num);
+ if (r < 0 || num <= 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid IO limit '%s', ignoring.", p);
+ return 0;
+ }
+ }
+
+ LIST_FOREACH(device_limits, t, c->io_device_limits)
+ if (path_equal(resolved, t->path)) {
+ l = t;
+ break;
+ }
+
+ if (!l) {
+ CGroupIOLimitType ttype;
+
+ l = new0(CGroupIODeviceLimit, 1);
+ if (!l)
+ return log_oom();
+
+ l->path = TAKE_PTR(resolved);
+ for (ttype = 0; ttype < _CGROUP_IO_LIMIT_TYPE_MAX; ttype++)
+ l->limits[ttype] = cgroup_io_limit_defaults[ttype];
+
+ LIST_PREPEND(device_limits, c->io_device_limits, l);
+ }
+
+ l->limits[type] = num;
+
+ return 0;
+}
+
+int config_parse_blockio_device_weight(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *path = NULL, *resolved = NULL;
+ CGroupBlockIODeviceWeight *w;
+ CGroupContext *c = data;
+ const char *p = ASSERT_PTR(rvalue);
+ uint64_t u;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+
+ log_syntax(unit, LOG_WARNING, filename, line, 0,
+ "Unit uses %s=; please use IO*= settings instead. Support for %s= will be removed soon.",
+ lvalue, lvalue);
+
+ if (isempty(rvalue)) {
+ while (c->blockio_device_weights)
+ cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
+
+ return 0;
+ }
+
+ r = extract_first_word(&p, &path, NULL, EXTRACT_UNQUOTE);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to extract device node and weight from '%s', ignoring.", rvalue);
+ return 0;
+ }
+ if (r == 0 || isempty(p)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0,
+ "Invalid device node or weight specified in '%s', ignoring.", rvalue);
+ return 0;
+ }
+
+ r = unit_path_printf(userdata, path, &resolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to resolve unit specifiers in '%s', ignoring: %m", path);
+ return 0;
+ }
+
+ r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue);
+ if (r < 0)
+ return 0;
+
+ r = cg_blkio_weight_parse(p, &u);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid block IO weight '%s', ignoring: %m", p);
+ return 0;
+ }
+
+ assert(u != CGROUP_BLKIO_WEIGHT_INVALID);
+
+ w = new0(CGroupBlockIODeviceWeight, 1);
+ if (!w)
+ return log_oom();
+
+ w->path = TAKE_PTR(resolved);
+ w->weight = u;
+
+ LIST_PREPEND(device_weights, c->blockio_device_weights, w);
+ return 0;
+}
+
+int config_parse_blockio_bandwidth(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *path = NULL, *resolved = NULL;
+ CGroupBlockIODeviceBandwidth *b = NULL;
+ CGroupContext *c = data;
+ const char *p = ASSERT_PTR(rvalue);
+ uint64_t bytes;
+ bool read;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+
+ log_syntax(unit, LOG_WARNING, filename, line, 0,
+ "Unit uses %s=; please use IO*= settings instead. Support for %s= will be removed soon.",
+ lvalue, lvalue);
+
+ read = streq("BlockIOReadBandwidth", lvalue);
+
+ if (isempty(rvalue)) {
+ LIST_FOREACH(device_bandwidths, t, c->blockio_device_bandwidths) {
+ t->rbps = CGROUP_LIMIT_MAX;
+ t->wbps = CGROUP_LIMIT_MAX;
+ }
+ return 0;
+ }
+
+ r = extract_first_word(&p, &path, NULL, EXTRACT_UNQUOTE);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to extract device node and bandwidth from '%s', ignoring.", rvalue);
+ return 0;
+ }
+ if (r == 0 || isempty(p)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0,
+ "Invalid device node or bandwidth specified in '%s', ignoring.", rvalue);
+ return 0;
+ }
+
+ r = unit_path_printf(userdata, path, &resolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to resolve unit specifiers in '%s', ignoring: %m", path);
+ return 0;
+ }
+
+ r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue);
+ if (r < 0)
+ return 0;
+
+ r = parse_size(p, 1000, &bytes);
+ if (r < 0 || bytes <= 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid Block IO Bandwidth '%s', ignoring.", p);
+ return 0;
+ }
+
+ LIST_FOREACH(device_bandwidths, t, c->blockio_device_bandwidths)
+ if (path_equal(resolved, t->path)) {
+ b = t;
+ break;
+ }
+
+ if (!b) {
+ b = new0(CGroupBlockIODeviceBandwidth, 1);
+ if (!b)
+ return log_oom();
+
+ b->path = TAKE_PTR(resolved);
+ b->rbps = CGROUP_LIMIT_MAX;
+ b->wbps = CGROUP_LIMIT_MAX;
+
+ LIST_PREPEND(device_bandwidths, c->blockio_device_bandwidths, b);
+ }
+
+ if (read)
+ b->rbps = bytes;
+ else
+ b->wbps = bytes;
+
+ return 0;
+}
+
+int config_parse_job_mode_isolate(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ JobMode *m = data;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ r = parse_boolean(rvalue);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse boolean, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ log_notice("%s is deprecated. Please use OnFailureJobMode= instead", lvalue);
+
+ *m = r ? JOB_ISOLATE : JOB_REPLACE;
+ return 0;
+}
+
+int config_parse_exec_directories(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecDirectory *ed = ASSERT_PTR(data);
+ const Unit *u = userdata;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment resets the list */
+ exec_directory_done(ed);
+ return 0;
+ }
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *tuple = NULL;
+
+ r = extract_first_word(&p, &tuple, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Invalid syntax %s=%s, ignoring: %m", lvalue, rvalue);
+ return 0;
+ }
+ if (r == 0)
+ return 0;
+
+ _cleanup_free_ char *src = NULL, *dest = NULL;
+ const char *q = tuple;
+ r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &src, &dest, NULL);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r <= 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r ?: SYNTHETIC_ERRNO(EINVAL),
+ "Invalid syntax in %s=, ignoring: %s", lvalue, tuple);
+ return 0;
+ }
+
+ _cleanup_free_ char *sresolved = NULL;
+ r = unit_path_printf(u, src, &sresolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to resolve unit specifiers in \"%s\", ignoring: %m", src);
+ continue;
+ }
+
+ r = path_simplify_and_warn(sresolved, PATH_CHECK_RELATIVE, unit, filename, line, lvalue);
+ if (r < 0)
+ continue;
+
+ if (path_startswith(sresolved, "private")) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0,
+ "%s= path can't be 'private', ignoring assignment: %s", lvalue, tuple);
+ continue;
+ }
+
+ /* For State and Runtime directories we support an optional destination parameter, which
+ * will be used to create a symlink to the source. */
+ _cleanup_free_ char *dresolved = NULL;
+ if (!isempty(dest)) {
+ if (streq(lvalue, "ConfigurationDirectory")) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0,
+ "Destination parameter is not supported for ConfigurationDirectory, ignoring: %s", tuple);
+ continue;
+ }
+
+ r = unit_path_printf(u, dest, &dresolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to resolve unit specifiers in \"%s\", ignoring: %m", dest);
+ continue;
+ }
+
+ r = path_simplify_and_warn(dresolved, PATH_CHECK_RELATIVE, unit, filename, line, lvalue);
+ if (r < 0)
+ continue;
+ }
+
+ r = exec_directory_add(ed, sresolved, dresolved);
+ if (r < 0)
+ return log_oom();
+ }
+}
+
+int config_parse_set_credential(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *word = NULL, *k = NULL;
+ _cleanup_free_ void *d = NULL;
+ ExecContext *context = ASSERT_PTR(data);
+ ExecSetCredential *old;
+ Unit *u = userdata;
+ bool encrypted = ltype;
+ const char *p = ASSERT_PTR(rvalue);
+ size_t size;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment resets the list */
+ context->set_credentials = hashmap_free(context->set_credentials);
+ return 0;
+ }
+
+ r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to extract credential name, ignoring: %s", rvalue);
+ return 0;
+ }
+ if (r == 0 || isempty(p)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid syntax, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ r = unit_cred_printf(u, word, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in \"%s\", ignoring: %m", word);
+ return 0;
+ }
+ if (!credential_name_valid(k)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Credential name \"%s\" not valid, ignoring.", k);
+ return 0;
+ }
+
+ if (encrypted) {
+ r = unbase64mem_full(p, SIZE_MAX, true, &d, &size);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Encrypted credential data not valid Base64 data, ignoring.");
+ return 0;
+ }
+ } else {
+ char *unescaped;
+ ssize_t l;
+
+ /* We support escape codes here, so that users can insert trailing \n if they like */
+ l = cunescape(p, UNESCAPE_ACCEPT_NUL, &unescaped);
+ if (l < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, l, "Can't unescape \"%s\", ignoring: %m", p);
+ return 0;
+ }
+
+ d = unescaped;
+ size = l;
+ }
+
+ old = hashmap_get(context->set_credentials, k);
+ if (old) {
+ free_and_replace(old->data, d);
+ old->size = size;
+ old->encrypted = encrypted;
+ } else {
+ _cleanup_(exec_set_credential_freep) ExecSetCredential *sc = NULL;
+
+ sc = new(ExecSetCredential, 1);
+ if (!sc)
+ return log_oom();
+
+ *sc = (ExecSetCredential) {
+ .id = TAKE_PTR(k),
+ .data = TAKE_PTR(d),
+ .size = size,
+ .encrypted = encrypted,
+ };
+
+ r = hashmap_ensure_put(&context->set_credentials, &exec_set_credential_hash_ops, sc->id, sc);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Duplicated credential value '%s', ignoring assignment: %s", sc->id, rvalue);
+ return 0;
+ }
+
+ TAKE_PTR(sc);
+ }
+
+ return 0;
+}
+
+int config_parse_load_credential(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *word = NULL, *k = NULL, *q = NULL;
+ ExecContext *context = ASSERT_PTR(data);
+ ExecLoadCredential *old;
+ bool encrypted = ltype;
+ Unit *u = userdata;
+ const char *p;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment resets the list */
+ context->load_credentials = hashmap_free(context->load_credentials);
+ return 0;
+ }
+
+ p = rvalue;
+ r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r <= 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ r = unit_cred_printf(u, word, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in \"%s\", ignoring: %m", word);
+ return 0;
+ }
+ if (!credential_name_valid(k)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Credential name \"%s\" not valid, ignoring.", k);
+ return 0;
+ }
+
+ if (isempty(p)) {
+ /* If only one field is specified take it as shortcut for inheriting a credential named
+ * the same way from our parent */
+ q = strdup(k);
+ if (!q)
+ return log_oom();
+ } else {
+ r = unit_path_printf(u, p, &q);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in \"%s\", ignoring: %m", p);
+ return 0;
+ }
+ if (path_is_absolute(q) ? !path_is_normalized(q) : !credential_name_valid(q)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Credential source \"%s\" not valid, ignoring.", q);
+ return 0;
+ }
+ }
+
+ old = hashmap_get(context->load_credentials, k);
+ if (old) {
+ free_and_replace(old->path, q);
+ old->encrypted = encrypted;
+ } else {
+ _cleanup_(exec_load_credential_freep) ExecLoadCredential *lc = NULL;
+
+ lc = new(ExecLoadCredential, 1);
+ if (!lc)
+ return log_oom();
+
+ *lc = (ExecLoadCredential) {
+ .id = TAKE_PTR(k),
+ .path = TAKE_PTR(q),
+ .encrypted = encrypted,
+ };
+
+ r = hashmap_ensure_put(&context->load_credentials, &exec_load_credential_hash_ops, lc->id, lc);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Duplicated credential value '%s', ignoring assignment: %s", lc->id, rvalue);
+ return 0;
+ }
+
+ TAKE_PTR(lc);
+ }
+
+ return 0;
+}
+
+int config_parse_set_status(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExitStatusSet *status_set = ASSERT_PTR(data);
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ /* Empty assignment resets the list */
+ if (isempty(rvalue)) {
+ exit_status_set_free(status_set);
+ return 0;
+ }
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *word = NULL;
+ Bitmap *bitmap;
+
+ r = extract_first_word(&p, &word, NULL, 0);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to parse %s=%s, ignoring: %m", lvalue, rvalue);
+ return 0;
+ }
+ if (r == 0)
+ return 0;
+
+ /* We need to call exit_status_from_string() first, because we want
+ * to parse numbers as exit statuses, not signals. */
+
+ r = exit_status_from_string(word);
+ if (r >= 0) {
+ assert(r >= 0 && r < 256);
+ bitmap = &status_set->status;
+ } else {
+ r = signal_from_string(word);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to parse value, ignoring: %s", word);
+ continue;
+ }
+ bitmap = &status_set->signal;
+ }
+
+ r = bitmap_set(bitmap, r);
+ if (r < 0)
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to set signal or status %s, ignoring: %m", word);
+ }
+}
+
+int config_parse_namespace_path_strv(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ const Unit *u = userdata;
+ char*** sv = ASSERT_PTR(data);
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment resets the list */
+ *sv = strv_free(*sv);
+ return 0;
+ }
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *word = NULL, *resolved = NULL, *joined = NULL;
+ const char *w;
+ bool ignore_enoent = false, shall_prefix = false;
+
+ r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to extract first word, ignoring: %s", rvalue);
+ return 0;
+ }
+ if (r == 0)
+ break;
+
+ w = word;
+ if (startswith(w, "-")) {
+ ignore_enoent = true;
+ w++;
+ }
+ if (startswith(w, "+")) {
+ shall_prefix = true;
+ w++;
+ }
+
+ r = unit_path_printf(u, w, &resolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s: %m", w);
+ continue;
+ }
+
+ r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+ if (r < 0)
+ continue;
+
+ joined = strjoin(ignore_enoent ? "-" : "",
+ shall_prefix ? "+" : "",
+ resolved);
+
+ r = strv_push(sv, joined);
+ if (r < 0)
+ return log_oom();
+
+ joined = NULL;
+ }
+
+ return 0;
+}
+
+int config_parse_temporary_filesystems(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ const Unit *u = userdata;
+ ExecContext *c = ASSERT_PTR(data);
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment resets the list */
+ temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
+ c->temporary_filesystems = NULL;
+ c->n_temporary_filesystems = 0;
+ return 0;
+ }
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *word = NULL, *path = NULL, *resolved = NULL;
+ const char *w;
+
+ r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to extract first word, ignoring: %s", rvalue);
+ return 0;
+ }
+ if (r == 0)
+ return 0;
+
+ w = word;
+ r = extract_first_word(&w, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to extract first word, ignoring: %s", word);
+ continue;
+ }
+ if (r == 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid syntax, ignoring: %s", word);
+ continue;
+ }
+
+ r = unit_path_printf(u, path, &resolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", path);
+ continue;
+ }
+
+ r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+ if (r < 0)
+ continue;
+
+ r = temporary_filesystem_add(&c->temporary_filesystems, &c->n_temporary_filesystems, resolved, w);
+ if (r < 0)
+ return log_oom();
+ }
+}
+
+int config_parse_bind_paths(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = ASSERT_PTR(data);
+ const Unit *u = userdata;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment resets the list */
+ bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
+ c->bind_mounts = NULL;
+ c->n_bind_mounts = 0;
+ return 0;
+ }
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *source = NULL, *destination = NULL;
+ _cleanup_free_ char *sresolved = NULL, *dresolved = NULL;
+ char *s = NULL, *d = NULL;
+ bool rbind = true, ignore_enoent = false;
+
+ r = extract_first_word(&p, &source, ":" WHITESPACE, EXTRACT_UNQUOTE|EXTRACT_DONT_COALESCE_SEPARATORS);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s, ignoring: %s", lvalue, rvalue);
+ return 0;
+ }
+ if (r == 0)
+ break;
+
+ r = unit_full_printf_full(u, source, PATH_MAX, &sresolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to resolve unit specifiers in \"%s\", ignoring: %m", source);
+ continue;
+ }
+
+ s = sresolved;
+ if (s[0] == '-') {
+ ignore_enoent = true;
+ s++;
+ }
+
+ r = path_simplify_and_warn(s, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+ if (r < 0)
+ continue;
+
+ /* Optionally, the destination is specified. */
+ if (p && p[-1] == ':') {
+ r = extract_first_word(&p, &destination, ":" WHITESPACE, EXTRACT_UNQUOTE|EXTRACT_DONT_COALESCE_SEPARATORS);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s, ignoring: %s", lvalue, rvalue);
+ return 0;
+ }
+ if (r == 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Missing argument after ':', ignoring: %s", s);
+ continue;
+ }
+
+ r = unit_path_printf(u, destination, &dresolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to resolve specifiers in \"%s\", ignoring: %m", destination);
+ continue;
+ }
+
+ r = path_simplify_and_warn(dresolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+ if (r < 0)
+ continue;
+
+ d = dresolved;
+
+ /* Optionally, there's also a short option string specified */
+ if (p && p[-1] == ':') {
+ _cleanup_free_ char *options = NULL;
+
+ r = extract_first_word(&p, &options, NULL, EXTRACT_UNQUOTE);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s=, ignoring: %s", lvalue, rvalue);
+ return 0;
+ }
+
+ if (isempty(options) || streq(options, "rbind"))
+ rbind = true;
+ else if (streq(options, "norbind"))
+ rbind = false;
+ else {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid option string, ignoring setting: %s", options);
+ continue;
+ }
+ }
+ } else
+ d = s;
+
+ r = bind_mount_add(&c->bind_mounts, &c->n_bind_mounts,
+ &(BindMount) {
+ .source = s,
+ .destination = d,
+ .read_only = !!strstr(lvalue, "ReadOnly"),
+ .recursive = rbind,
+ .ignore_enoent = ignore_enoent,
+ });
+ if (r < 0)
+ return log_oom();
+ }
+
+ return 0;
+}
+
+int config_parse_mount_images(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = ASSERT_PTR(data);
+ const Unit *u = userdata;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment resets the list */
+ c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
+ return 0;
+ }
+
+ for (const char *p = rvalue;;) {
+ _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
+ _cleanup_free_ char *first = NULL, *second = NULL, *tuple = NULL;
+ _cleanup_free_ char *sresolved = NULL, *dresolved = NULL;
+ const char *q = NULL;
+ char *s = NULL;
+ bool permissive = false;
+
+ r = extract_first_word(&p, &tuple, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Invalid syntax %s=%s, ignoring: %m", lvalue, rvalue);
+ return 0;
+ }
+ if (r == 0)
+ return 0;
+
+ q = tuple;
+ r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &first, &second, NULL);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Invalid syntax in %s=, ignoring: %s", lvalue, tuple);
+ return 0;
+ }
+ if (r == 0)
+ continue;
+
+ s = first;
+ if (s[0] == '-') {
+ permissive = true;
+ s++;
+ }
+
+ r = unit_path_printf(u, s, &sresolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to resolve unit specifiers in \"%s\", ignoring: %m", s);
+ continue;
+ }
+
+ r = path_simplify_and_warn(sresolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+ if (r < 0)
+ continue;
+
+ if (isempty(second)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Missing destination in %s, ignoring: %s", lvalue, rvalue);
+ continue;
+ }
+
+ r = unit_path_printf(u, second, &dresolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to resolve specifiers in \"%s\", ignoring: %m", second);
+ continue;
+ }
+
+ r = path_simplify_and_warn(dresolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+ if (r < 0)
+ continue;
+
+ for (;;) {
+ _cleanup_free_ char *partition = NULL, *mount_options = NULL, *mount_options_resolved = NULL;
+ MountOptions *o = NULL;
+ PartitionDesignator partition_designator;
+
+ r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &partition, &mount_options, NULL);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", q);
+ return 0;
+ }
+ if (r == 0)
+ break;
+ /* Single set of options, applying to the root partition/single filesystem */
+ if (r == 1) {
+ r = unit_full_printf(u, partition, &mount_options_resolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", first);
+ continue;
+ }
+
+ o = new(MountOptions, 1);
+ if (!o)
+ return log_oom();
+ *o = (MountOptions) {
+ .partition_designator = PARTITION_ROOT,
+ .options = TAKE_PTR(mount_options_resolved),
+ };
+ LIST_APPEND(mount_options, options, o);
+
+ break;
+ }
+
+ partition_designator = partition_designator_from_string(partition);
+ if (partition_designator < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, partition_designator,
+ "Invalid partition name %s, ignoring", partition);
+ continue;
+ }
+ r = unit_full_printf(u, mount_options, &mount_options_resolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", mount_options);
+ continue;
+ }
+
+ o = new(MountOptions, 1);
+ if (!o)
+ return log_oom();
+ *o = (MountOptions) {
+ .partition_designator = partition_designator,
+ .options = TAKE_PTR(mount_options_resolved),
+ };
+ LIST_APPEND(mount_options, options, o);
+ }
+
+ r = mount_image_add(&c->mount_images, &c->n_mount_images,
+ &(MountImage) {
+ .source = sresolved,
+ .destination = dresolved,
+ .mount_options = options,
+ .ignore_enoent = permissive,
+ .type = MOUNT_IMAGE_DISCRETE,
+ });
+ if (r < 0)
+ return log_oom();
+ }
+}
+
+int config_parse_extension_images(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = ASSERT_PTR(data);
+ const Unit *u = userdata;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment resets the list */
+ c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
+ return 0;
+ }
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *source = NULL, *tuple = NULL, *sresolved = NULL;
+ _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
+ bool permissive = false;
+ const char *q = NULL;
+ char *s = NULL;
+
+ r = extract_first_word(&p, &tuple, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Invalid syntax %s=%s, ignoring: %m", lvalue, rvalue);
+ return 0;
+ }
+ if (r == 0)
+ return 0;
+
+ q = tuple;
+ r = extract_first_word(&q, &source, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Invalid syntax in %s=, ignoring: %s", lvalue, tuple);
+ return 0;
+ }
+ if (r == 0)
+ continue;
+
+ s = source;
+ if (s[0] == '-') {
+ permissive = true;
+ s++;
+ }
+
+ r = unit_path_printf(u, s, &sresolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to resolve unit specifiers in \"%s\", ignoring: %m", s);
+ continue;
+ }
+
+ r = path_simplify_and_warn(sresolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+ if (r < 0)
+ continue;
+
+ for (;;) {
+ _cleanup_free_ char *partition = NULL, *mount_options = NULL, *mount_options_resolved = NULL;
+ MountOptions *o = NULL;
+ PartitionDesignator partition_designator;
+
+ r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &partition, &mount_options, NULL);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", q);
+ return 0;
+ }
+ if (r == 0)
+ break;
+ /* Single set of options, applying to the root partition/single filesystem */
+ if (r == 1) {
+ r = unit_full_printf(u, partition, &mount_options_resolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", partition);
+ continue;
+ }
+
+ o = new(MountOptions, 1);
+ if (!o)
+ return log_oom();
+ *o = (MountOptions) {
+ .partition_designator = PARTITION_ROOT,
+ .options = TAKE_PTR(mount_options_resolved),
+ };
+ LIST_APPEND(mount_options, options, o);
+
+ break;
+ }
+
+ partition_designator = partition_designator_from_string(partition);
+ if (partition_designator < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid partition name %s, ignoring", partition);
+ continue;
+ }
+ r = unit_full_printf(u, mount_options, &mount_options_resolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", mount_options);
+ continue;
+ }
+
+ o = new(MountOptions, 1);
+ if (!o)
+ return log_oom();
+ *o = (MountOptions) {
+ .partition_designator = partition_designator,
+ .options = TAKE_PTR(mount_options_resolved),
+ };
+ LIST_APPEND(mount_options, options, o);
+ }
+
+ r = mount_image_add(&c->extension_images, &c->n_extension_images,
+ &(MountImage) {
+ .source = sresolved,
+ .mount_options = options,
+ .ignore_enoent = permissive,
+ .type = MOUNT_IMAGE_EXTENSION,
+ });
+ if (r < 0)
+ return log_oom();
+ }
+}
+
+int config_parse_job_timeout_sec(
+ const char* unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ Unit *u = ASSERT_PTR(data);
+ usec_t usec;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ r = parse_sec_fix_0(rvalue, &usec);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse JobTimeoutSec= parameter, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ /* If the user explicitly changed JobTimeoutSec= also change JobRunningTimeoutSec=, for compatibility with old
+ * versions. If JobRunningTimeoutSec= was explicitly set, avoid this however as whatever the user picked should
+ * count. */
+
+ if (!u->job_running_timeout_set)
+ u->job_running_timeout = usec;
+
+ u->job_timeout = usec;
+
+ return 0;
+}
+
+int config_parse_job_running_timeout_sec(
+ const char* unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ Unit *u = ASSERT_PTR(data);
+ usec_t usec;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ r = parse_sec_fix_0(rvalue, &usec);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse JobRunningTimeoutSec= parameter, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ u->job_running_timeout = usec;
+ u->job_running_timeout_set = true;
+
+ return 0;
+}
+
+int config_parse_emergency_action(
+ const char* unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ Manager *m = NULL;
+ EmergencyAction *x = ASSERT_PTR(data);
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (unit)
+ m = ((Unit*) userdata)->manager;
+ else
+ m = data;
+
+ r = parse_emergency_action(rvalue, MANAGER_IS_SYSTEM(m), x);
+ if (r < 0) {
+ if (r == -EOPNOTSUPP && MANAGER_IS_USER(m)) {
+ /* Compat mode: remove for systemd 241. */
+
+ log_syntax(unit, LOG_INFO, filename, line, r,
+ "%s= in user mode specified as \"%s\", using \"exit-force\" instead.",
+ lvalue, rvalue);
+ *x = EMERGENCY_ACTION_EXIT_FORCE;
+ return 0;
+ }
+
+ if (r == -EOPNOTSUPP)
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "%s= specified as %s mode action, ignoring: %s",
+ lvalue, MANAGER_IS_SYSTEM(m) ? "user" : "system", rvalue);
+ else
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to parse %s=, ignoring: %s", lvalue, rvalue);
+ return 0;
+ }
+
+ return 0;
+}
+
+int config_parse_pid_file(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *k = NULL, *n = NULL;
+ const Unit *u = ASSERT_PTR(userdata);
+ char **s = data;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* An empty assignment removes already set value. */
+ *s = mfree(*s);
+ return 0;
+ }
+
+ r = unit_path_printf(u, rvalue, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
+ return 0;
+ }
+
+ /* If this is a relative path make it absolute by prefixing the /run */
+ n = path_make_absolute(k, u->manager->prefix[EXEC_DIRECTORY_RUNTIME]);
+ if (!n)
+ return log_oom();
+
+ /* Check that the result is a sensible path */
+ r = path_simplify_and_warn(n, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+ if (r < 0)
+ return r;
+
+ r = patch_var_run(unit, filename, line, lvalue, &n);
+ if (r < 0)
+ return r;
+
+ free_and_replace(*s, n);
+ return 0;
+}
+
+int config_parse_exit_status(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ int *exit_status = data, r;
+ uint8_t u;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+ assert(exit_status);
+
+ if (isempty(rvalue)) {
+ *exit_status = -1;
+ return 0;
+ }
+
+ r = safe_atou8(rvalue, &u);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse exit status '%s', ignoring: %m", rvalue);
+ return 0;
+ }
+
+ *exit_status = u;
+ return 0;
+}
+
+int config_parse_disable_controllers(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ int r;
+ CGroupContext *c = data;
+ CGroupMask disabled_mask;
+
+ /* 1. If empty, make all controllers eligible for use again.
+ * 2. If non-empty, merge all listed controllers, space separated. */
+
+ if (isempty(rvalue)) {
+ c->disable_controllers = 0;
+ return 0;
+ }
+
+ r = cg_mask_from_string(rvalue, &disabled_mask);
+ if (r < 0 || disabled_mask <= 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid cgroup string: %s, ignoring", rvalue);
+ return 0;
+ }
+
+ c->disable_controllers |= disabled_mask;
+
+ return 0;
+}
+
+int config_parse_ip_filter_bpf_progs(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *resolved = NULL;
+ const Unit *u = userdata;
+ char ***paths = ASSERT_PTR(data);
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ *paths = strv_free(*paths);
+ return 0;
+ }
+
+ r = unit_path_printf(u, rvalue, &resolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
+ return 0;
+ }
+
+ r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+ if (r < 0)
+ return 0;
+
+ if (strv_contains(*paths, resolved))
+ return 0;
+
+ r = strv_extend(paths, resolved);
+ if (r < 0)
+ return log_oom();
+
+ r = bpf_firewall_supported();
+ if (r < 0)
+ return r;
+ if (r != BPF_FIREWALL_SUPPORTED_WITH_MULTI) {
+ static bool warned = false;
+
+ log_full(warned ? LOG_DEBUG : LOG_WARNING,
+ "File %s:%u configures an IP firewall with BPF programs (%s=%s), but the local system does not support BPF/cgroup based firewalling with multiple filters.\n"
+ "Starting this unit will fail! (This warning is only shown for the first loaded unit using IP firewalling.)", filename, line, lvalue, rvalue);
+
+ warned = true;
+ }
+
+ return 0;
+}
+
+int config_parse_bpf_foreign_program(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+ _cleanup_free_ char *resolved = NULL, *word = NULL;
+ CGroupContext *c = data;
+ const char *p = ASSERT_PTR(rvalue);
+ Unit *u = userdata;
+ int attach_type, r;
+
+ assert(filename);
+ assert(lvalue);
+
+ if (isempty(rvalue)) {
+ while (c->bpf_foreign_programs)
+ cgroup_context_remove_bpf_foreign_program(c, c->bpf_foreign_programs);
+
+ return 0;
+ }
+
+ r = extract_first_word(&p, &word, ":", 0);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse foreign BPF program, ignoring: %s", rvalue);
+ return 0;
+ }
+ if (r == 0 || isempty(p)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid syntax in %s=, ignoring: %s", lvalue, rvalue);
+ return 0;
+ }
+
+ attach_type = bpf_cgroup_attach_type_from_string(word);
+ if (attach_type < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Unknown BPF attach type=%s, ignoring: %s", word, rvalue);
+ return 0;
+ }
+
+ r = unit_path_printf(u, p, &resolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %s", p, rvalue);
+ return 0;
+ }
+
+ r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+ if (r < 0)
+ return 0;
+
+ r = cgroup_add_bpf_foreign_program(c, attach_type, resolved);
+ if (r < 0)
+ return log_error_errno(r, "Failed to add foreign BPF program to cgroup context: %m");
+
+ return 0;
+}
+
+int config_parse_cgroup_socket_bind(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+ _cleanup_free_ CGroupSocketBindItem *item = NULL;
+ CGroupSocketBindItem **head = data;
+ uint16_t nr_ports, port_min;
+ int af, ip_protocol, r;
+
+ if (isempty(rvalue)) {
+ cgroup_context_remove_socket_bind(head);
+ return 0;
+ }
+
+ r = parse_socket_bind_item(rvalue, &af, &ip_protocol, &nr_ports, &port_min);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Unable to parse %s= assignment, ignoring: %s", lvalue, rvalue);
+ return 0;
+ }
+
+ item = new(CGroupSocketBindItem, 1);
+ if (!item)
+ return log_oom();
+ *item = (CGroupSocketBindItem) {
+ .address_family = af,
+ .ip_protocol = ip_protocol,
+ .nr_ports = nr_ports,
+ .port_min = port_min,
+ };
+
+ LIST_PREPEND(socket_bind_items, *head, TAKE_PTR(item));
+
+ return 0;
+}
+
+int config_parse_restrict_network_interfaces(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+ CGroupContext *c = ASSERT_PTR(data);
+ bool is_allow_rule = true;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment resets the list */
+ c->restrict_network_interfaces = set_free_free(c->restrict_network_interfaces);
+ return 0;
+ }
+
+ if (rvalue[0] == '~') {
+ is_allow_rule = false;
+ rvalue++;
+ }
+
+ if (set_isempty(c->restrict_network_interfaces))
+ /* Only initialize this when creating the set */
+ c->restrict_network_interfaces_is_allow_list = is_allow_rule;
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *word = NULL;
+
+ r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE);
+ if (r == 0)
+ break;
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Trailing garbage in %s, ignoring: %s", lvalue, rvalue);
+ break;
+ }
+
+ if (!ifname_valid(word)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid interface name, ignoring: %s", word);
+ continue;
+ }
+
+ if (c->restrict_network_interfaces_is_allow_list != is_allow_rule)
+ free(set_remove(c->restrict_network_interfaces, word));
+ else {
+ r = set_put_strdup(&c->restrict_network_interfaces, word);
+ if (r < 0)
+ return log_oom();
+ }
+ }
+
+ return 0;
+}
+
+static int merge_by_names(Unit **u, Set *names, const char *id) {
+ char *k;
+ int r;
+
+ assert(u);
+ assert(*u);
+
+ /* Let's try to add in all names that are aliases of this unit */
+ while ((k = set_steal_first(names))) {
+ _cleanup_free_ _unused_ char *free_k = k;
+
+ /* First try to merge in the other name into our unit */
+ r = unit_merge_by_name(*u, k);
+ if (r < 0) {
+ Unit *other;
+
+ /* Hmm, we couldn't merge the other unit into ours? Then let's try it the other way
+ * round. */
+
+ other = manager_get_unit((*u)->manager, k);
+ if (!other)
+ return r; /* return previous failure */
+
+ r = unit_merge(other, *u);
+ if (r < 0)
+ return r;
+
+ *u = other;
+ return merge_by_names(u, names, NULL);
+ }
+
+ if (streq_ptr(id, k))
+ unit_choose_id(*u, id);
+ }
+
+ return 0;
+}
+
+int unit_load_fragment(Unit *u) {
+ const char *fragment;
+ _cleanup_set_free_free_ Set *names = NULL;
+ int r;
+
+ assert(u);
+ assert(u->load_state == UNIT_STUB);
+ assert(u->id);
+
+ if (u->transient) {
+ u->access_selinux_context = mfree(u->access_selinux_context);
+ u->load_state = UNIT_LOADED;
+ return 0;
+ }
+
+ /* Possibly rebuild the fragment map to catch new units */
+ r = unit_file_build_name_map(&u->manager->lookup_paths,
+ &u->manager->unit_cache_timestamp_hash,
+ &u->manager->unit_id_map,
+ &u->manager->unit_name_map,
+ &u->manager->unit_path_cache);
+ if (r < 0)
+ return log_error_errno(r, "Failed to rebuild name map: %m");
+
+ r = unit_file_find_fragment(u->manager->unit_id_map,
+ u->manager->unit_name_map,
+ u->id,
+ &fragment,
+ &names);
+ if (r < 0 && r != -ENOENT)
+ return r;
+
+ if (fragment) {
+ /* Open the file, check if this is a mask, otherwise read. */
+ _cleanup_fclose_ FILE *f = NULL;
+ struct stat st;
+
+ /* Try to open the file name. A symlink is OK, for example for linked files or masks. We
+ * expect that all symlinks within the lookup paths have been already resolved, but we don't
+ * verify this here. */
+ f = fopen(fragment, "re");
+ if (!f)
+ return log_unit_notice_errno(u, errno, "Failed to open %s: %m", fragment);
+
+ if (fstat(fileno(f), &st) < 0)
+ return -errno;
+
+ r = free_and_strdup(&u->fragment_path, fragment);
+ if (r < 0)
+ return r;
+
+ if (null_or_empty(&st)) {
+ /* Unit file is masked */
+
+ u->load_state = u->perpetual ? UNIT_LOADED : UNIT_MASKED; /* don't allow perpetual units to ever be masked */
+ u->fragment_mtime = 0;
+ u->access_selinux_context = mfree(u->access_selinux_context);
+ } else {
+#if HAVE_SELINUX
+ if (mac_selinux_use()) {
+ _cleanup_freecon_ char *selcon = NULL;
+
+ /* Cache the SELinux context of the unit file here. We'll make use of when checking access permissions to loaded units */
+ r = fgetfilecon_raw(fileno(f), &selcon);
+ if (r < 0)
+ log_unit_warning_errno(u, r, "Failed to read SELinux context of '%s', ignoring: %m", fragment);
+
+ r = free_and_strdup(&u->access_selinux_context, selcon);
+ if (r < 0)
+ return r;
+ } else
+#endif
+ u->access_selinux_context = mfree(u->access_selinux_context);
+
+ u->load_state = UNIT_LOADED;
+ u->fragment_mtime = timespec_load(&st.st_mtim);
+
+ /* Now, parse the file contents */
+ r = config_parse(u->id, fragment, f,
+ UNIT_VTABLE(u)->sections,
+ config_item_perf_lookup, load_fragment_gperf_lookup,
+ 0,
+ u,
+ NULL);
+ if (r == -ENOEXEC)
+ log_unit_notice_errno(u, r, "Unit configuration has fatal error, unit will not be started.");
+ if (r < 0)
+ return r;
+ }
+ }
+
+ /* Call merge_by_names with the name derived from the fragment path as the preferred name.
+ *
+ * We do the merge dance here because for some unit types, the unit might have aliases which are not
+ * declared in the file system. In particular, this is true (and frequent) for device and swap units.
+ */
+ const char *id = u->id;
+ _cleanup_free_ char *free_id = NULL;
+
+ if (fragment) {
+ id = basename(fragment);
+ if (unit_name_is_valid(id, UNIT_NAME_TEMPLATE)) {
+ assert(u->instance); /* If we're not trying to use a template for non-instanced unit,
+ * this must be set. */
+
+ r = unit_name_replace_instance(id, u->instance, &free_id);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to build id (%s + %s): %m", id, u->instance);
+ id = free_id;
+ }
+ }
+
+ Unit *merged = u;
+ r = merge_by_names(&merged, names, id);
+ if (r < 0)
+ return r;
+
+ if (merged != u)
+ u->load_state = UNIT_MERGED;
+
+ return 0;
+}
+
+void unit_dump_config_items(FILE *f) {
+ static const struct {
+ const ConfigParserCallback callback;
+ const char *rvalue;
+ } table[] = {
+ { config_parse_warn_compat, "NOTSUPPORTED" },
+ { config_parse_int, "INTEGER" },
+ { config_parse_unsigned, "UNSIGNED" },
+ { config_parse_iec_size, "SIZE" },
+ { config_parse_iec_uint64, "SIZE" },
+ { config_parse_si_uint64, "SIZE" },
+ { config_parse_bool, "BOOLEAN" },
+ { config_parse_string, "STRING" },
+ { config_parse_path, "PATH" },
+ { config_parse_unit_path_printf, "PATH" },
+ { config_parse_colon_separated_paths, "PATH" },
+ { config_parse_strv, "STRING [...]" },
+ { config_parse_exec_nice, "NICE" },
+ { config_parse_exec_oom_score_adjust, "OOMSCOREADJUST" },
+ { config_parse_exec_io_class, "IOCLASS" },
+ { config_parse_exec_io_priority, "IOPRIORITY" },
+ { config_parse_exec_cpu_sched_policy, "CPUSCHEDPOLICY" },
+ { config_parse_exec_cpu_sched_prio, "CPUSCHEDPRIO" },
+ { config_parse_exec_cpu_affinity, "CPUAFFINITY" },
+ { config_parse_mode, "MODE" },
+ { config_parse_unit_env_file, "FILE" },
+ { config_parse_exec_output, "OUTPUT" },
+ { config_parse_exec_input, "INPUT" },
+ { config_parse_log_facility, "FACILITY" },
+ { config_parse_log_level, "LEVEL" },
+ { config_parse_exec_secure_bits, "SECUREBITS" },
+ { config_parse_capability_set, "BOUNDINGSET" },
+ { config_parse_rlimit, "LIMIT" },
+ { config_parse_unit_deps, "UNIT [...]" },
+ { config_parse_exec, "PATH [ARGUMENT [...]]" },
+ { config_parse_service_type, "SERVICETYPE" },
+ { config_parse_service_exit_type, "SERVICEEXITTYPE" },
+ { config_parse_service_restart, "SERVICERESTART" },
+ { config_parse_service_timeout_failure_mode, "TIMEOUTMODE" },
+ { config_parse_kill_mode, "KILLMODE" },
+ { config_parse_signal, "SIGNAL" },
+ { config_parse_socket_listen, "SOCKET [...]" },
+ { config_parse_socket_bind, "SOCKETBIND" },
+ { config_parse_socket_bindtodevice, "NETWORKINTERFACE" },
+ { config_parse_sec, "SECONDS" },
+ { config_parse_nsec, "NANOSECONDS" },
+ { config_parse_namespace_path_strv, "PATH [...]" },
+ { config_parse_bind_paths, "PATH[:PATH[:OPTIONS]] [...]" },
+ { config_parse_unit_requires_mounts_for, "PATH [...]" },
+ { config_parse_exec_mount_flags, "MOUNTFLAG [...]" },
+ { config_parse_unit_string_printf, "STRING" },
+ { config_parse_trigger_unit, "UNIT" },
+ { config_parse_timer, "TIMER" },
+ { config_parse_path_spec, "PATH" },
+ { config_parse_notify_access, "ACCESS" },
+ { config_parse_ip_tos, "TOS" },
+ { config_parse_unit_condition_path, "CONDITION" },
+ { config_parse_unit_condition_string, "CONDITION" },
+ { config_parse_unit_slice, "SLICE" },
+ { config_parse_documentation, "URL" },
+ { config_parse_service_timeout, "SECONDS" },
+ { config_parse_emergency_action, "ACTION" },
+ { config_parse_set_status, "STATUS" },
+ { config_parse_service_sockets, "SOCKETS" },
+ { config_parse_environ, "ENVIRON" },
+#if HAVE_SECCOMP
+ { config_parse_syscall_filter, "SYSCALLS" },
+ { config_parse_syscall_archs, "ARCHS" },
+ { config_parse_syscall_errno, "ERRNO" },
+ { config_parse_syscall_log, "SYSCALLS" },
+ { config_parse_address_families, "FAMILIES" },
+ { config_parse_restrict_namespaces, "NAMESPACES" },
+#endif
+ { config_parse_restrict_filesystems, "FILESYSTEMS" },
+ { config_parse_cpu_shares, "SHARES" },
+ { config_parse_cg_weight, "WEIGHT" },
+ { config_parse_cg_cpu_weight, "CPUWEIGHT" },
+ { config_parse_memory_limit, "LIMIT" },
+ { config_parse_device_allow, "DEVICE" },
+ { config_parse_device_policy, "POLICY" },
+ { config_parse_io_limit, "LIMIT" },
+ { config_parse_io_device_weight, "DEVICEWEIGHT" },
+ { config_parse_io_device_latency, "DEVICELATENCY" },
+ { config_parse_blockio_bandwidth, "BANDWIDTH" },
+ { config_parse_blockio_weight, "WEIGHT" },
+ { config_parse_blockio_device_weight, "DEVICEWEIGHT" },
+ { config_parse_long, "LONG" },
+ { config_parse_socket_service, "SERVICE" },
+#if HAVE_SELINUX
+ { config_parse_exec_selinux_context, "LABEL" },
+#endif
+ { config_parse_job_mode, "MODE" },
+ { config_parse_job_mode_isolate, "BOOLEAN" },
+ { config_parse_personality, "PERSONALITY" },
+ };
+
+ const char *prev = NULL;
+ const char *i;
+
+ assert(f);
+
+ NULSTR_FOREACH(i, load_fragment_gperf_nulstr) {
+ const char *rvalue = "OTHER", *lvalue;
+ const ConfigPerfItem *p;
+ const char *dot;
+
+ assert_se(p = load_fragment_gperf_lookup(i, strlen(i)));
+
+ /* Hide legacy settings */
+ if (p->parse == config_parse_warn_compat &&
+ p->ltype == DISABLED_LEGACY)
+ continue;
+
+ for (size_t j = 0; j < ELEMENTSOF(table); j++)
+ if (p->parse == table[j].callback) {
+ rvalue = table[j].rvalue;
+ break;
+ }
+
+ dot = strchr(i, '.');
+ lvalue = dot ? dot + 1 : i;
+
+ if (dot) {
+ size_t prefix_len = dot - i;
+
+ if (!prev || !strneq(prev, i, prefix_len+1)) {
+ if (prev)
+ fputc('\n', f);
+
+ fprintf(f, "[%.*s]\n", (int) prefix_len, i);
+ }
+ }
+
+ fprintf(f, "%s=%s\n", lvalue, rvalue);
+ prev = i;
+ }
+}
+
+int config_parse_cpu_affinity2(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ CPUSet *affinity = ASSERT_PTR(data);
+
+ (void) parse_cpu_set_extend(rvalue, affinity, true, unit, filename, line, lvalue);
+
+ return 0;
+}
+
+int config_parse_show_status(
+ const char* unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ int k;
+ ShowStatus *b = ASSERT_PTR(data);
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ k = parse_show_status(rvalue, b);
+ if (k < 0)
+ log_syntax(unit, LOG_WARNING, filename, line, k, "Failed to parse show status setting, ignoring: %s", rvalue);
+
+ return 0;
+}
+
+int config_parse_output_restricted(
+ const char* unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecOutput t, *eo = ASSERT_PTR(data);
+ bool obsolete = false;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (streq(rvalue, "syslog")) {
+ t = EXEC_OUTPUT_JOURNAL;
+ obsolete = true;
+ } else if (streq(rvalue, "syslog+console")) {
+ t = EXEC_OUTPUT_JOURNAL_AND_CONSOLE;
+ obsolete = true;
+ } else {
+ t = exec_output_from_string(rvalue);
+ if (t < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, t, "Failed to parse output type, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ if (IN_SET(t, EXEC_OUTPUT_SOCKET, EXEC_OUTPUT_NAMED_FD, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Standard output types socket, fd:, file:, append:, truncate: are not supported as defaults, ignoring: %s", rvalue);
+ return 0;
+ }
+ }
+
+ if (obsolete)
+ log_syntax(unit, LOG_NOTICE, filename, line, 0,
+ "Standard output type %s is obsolete, automatically updating to %s. Please update your configuration.",
+ rvalue, exec_output_to_string(t));
+
+ *eo = t;
+ return 0;
+}
+
+int config_parse_crash_chvt(
+ const char* unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+ assert(data);
+
+ r = parse_crash_chvt(rvalue, data);
+ if (r < 0)
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse CrashChangeVT= setting, ignoring: %s", rvalue);
+
+ return 0;
+}
+
+int config_parse_swap_priority(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ Swap *s = ASSERT_PTR(userdata);
+ int r, priority;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+ assert(data);
+
+ if (isempty(rvalue)) {
+ s->parameters_fragment.priority = -1;
+ s->parameters_fragment.priority_set = false;
+ return 0;
+ }
+
+ r = safe_atoi(rvalue, &priority);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid swap priority '%s', ignoring.", rvalue);
+ return 0;
+ }
+
+ if (priority < -1) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Sorry, swap priorities smaller than -1 may only be assigned by the kernel itself, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ if (priority > 32767) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Swap priority out of range, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ s->parameters_fragment.priority = priority;
+ s->parameters_fragment.priority_set = true;
+ return 0;
+}
+
+int config_parse_watchdog_sec(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ usec_t *usec = data;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ /* This is called for {Runtime,Reboot,KExec}WatchdogSec= where "default" maps to
+ * USEC_INFINITY internally. */
+
+ if (streq(rvalue, "default"))
+ *usec = USEC_INFINITY;
+ else if (streq(rvalue, "off"))
+ *usec = 0;
+ else
+ return config_parse_sec(unit, filename, line, section, section_line, lvalue, ltype, rvalue, data, userdata);
+
+ return 0;
+}
+
+int config_parse_tty_size(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ unsigned *sz = data;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ *sz = UINT_MAX;
+ return 0;
+ }
+
+ return config_parse_unsigned(unit, filename, line, section, section_line, lvalue, ltype, rvalue, data, userdata);
+}
diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h
new file mode 100644
index 0000000..c57a6b2
--- /dev/null
+++ b/src/core/load-fragment.h
@@ -0,0 +1,156 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "conf-parser.h"
+#include "unit.h"
+
+/* These functions are declared in the header to make them accessible to unit tests. */
+bool contains_instance_specifier_superset(const char *s);
+int unit_is_likely_recursive_template_dependency(Unit *u, const char *name, const char *format);
+
+/* Config-parsing helpers relevant only for sources under src/core/ */
+int parse_crash_chvt(const char *value, int *data);
+int parse_confirm_spawn(const char *value, char **console);
+
+/* Read service data from .desktop file style configuration fragments */
+
+int unit_load_fragment(Unit *u);
+
+void unit_dump_config_items(FILE *f);
+
+CONFIG_PARSER_PROTOTYPE(config_parse_unit_deps);
+CONFIG_PARSER_PROTOTYPE(config_parse_obsolete_unit_deps);
+CONFIG_PARSER_PROTOTYPE(config_parse_unit_string_printf);
+CONFIG_PARSER_PROTOTYPE(config_parse_unit_strv_printf);
+CONFIG_PARSER_PROTOTYPE(config_parse_unit_path_printf);
+CONFIG_PARSER_PROTOTYPE(config_parse_colon_separated_paths);
+CONFIG_PARSER_PROTOTYPE(config_parse_unit_path_strv_printf);
+CONFIG_PARSER_PROTOTYPE(config_parse_documentation);
+CONFIG_PARSER_PROTOTYPE(config_parse_socket_listen);
+CONFIG_PARSER_PROTOTYPE(config_parse_socket_protocol);
+CONFIG_PARSER_PROTOTYPE(config_parse_socket_bind);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_nice);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_oom_score_adjust);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_coredump_filter);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec);
+CONFIG_PARSER_PROTOTYPE(config_parse_service_timeout);
+CONFIG_PARSER_PROTOTYPE(config_parse_service_timeout_abort);
+CONFIG_PARSER_PROTOTYPE(config_parse_service_timeout_failure_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_service_type);
+CONFIG_PARSER_PROTOTYPE(config_parse_service_exit_type);
+CONFIG_PARSER_PROTOTYPE(config_parse_service_restart);
+CONFIG_PARSER_PROTOTYPE(config_parse_socket_bindtodevice);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_output);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_input);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_input_text);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_input_data);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_io_class);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_io_priority);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_cpu_sched_policy);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_cpu_sched_prio);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_cpu_affinity);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_mount_apivfs);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_secure_bits);
+CONFIG_PARSER_PROTOTYPE(config_parse_root_image_options);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_root_hash);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_root_hash_sig);
+CONFIG_PARSER_PROTOTYPE(config_parse_capability_set);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_mount_flags);
+CONFIG_PARSER_PROTOTYPE(config_parse_timer);
+CONFIG_PARSER_PROTOTYPE(config_parse_trigger_unit);
+CONFIG_PARSER_PROTOTYPE(config_parse_path_spec);
+CONFIG_PARSER_PROTOTYPE(config_parse_socket_service);
+CONFIG_PARSER_PROTOTYPE(config_parse_service_sockets);
+CONFIG_PARSER_PROTOTYPE(config_parse_unit_env_file);
+CONFIG_PARSER_PROTOTYPE(config_parse_ip_tos);
+CONFIG_PARSER_PROTOTYPE(config_parse_unit_condition_path);
+CONFIG_PARSER_PROTOTYPE(config_parse_unit_condition_string);
+CONFIG_PARSER_PROTOTYPE(config_parse_kill_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_notify_access);
+CONFIG_PARSER_PROTOTYPE(config_parse_emergency_action);
+CONFIG_PARSER_PROTOTYPE(config_parse_unit_requires_mounts_for);
+CONFIG_PARSER_PROTOTYPE(config_parse_syscall_filter);
+CONFIG_PARSER_PROTOTYPE(config_parse_syscall_archs);
+CONFIG_PARSER_PROTOTYPE(config_parse_syscall_errno);
+CONFIG_PARSER_PROTOTYPE(config_parse_syscall_log);
+CONFIG_PARSER_PROTOTYPE(config_parse_environ);
+CONFIG_PARSER_PROTOTYPE(config_parse_pass_environ);
+CONFIG_PARSER_PROTOTYPE(config_parse_unset_environ);
+CONFIG_PARSER_PROTOTYPE(config_parse_unit_slice);
+CONFIG_PARSER_PROTOTYPE(config_parse_cg_weight);
+CONFIG_PARSER_PROTOTYPE(config_parse_cg_cpu_weight);
+CONFIG_PARSER_PROTOTYPE(config_parse_cpu_shares);
+CONFIG_PARSER_PROTOTYPE(config_parse_memory_limit);
+CONFIG_PARSER_PROTOTYPE(config_parse_tasks_max);
+CONFIG_PARSER_PROTOTYPE(config_parse_delegate);
+CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mem_pressure_limit);
+CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_preference);
+CONFIG_PARSER_PROTOTYPE(config_parse_device_policy);
+CONFIG_PARSER_PROTOTYPE(config_parse_device_allow);
+CONFIG_PARSER_PROTOTYPE(config_parse_io_device_latency);
+CONFIG_PARSER_PROTOTYPE(config_parse_io_device_weight);
+CONFIG_PARSER_PROTOTYPE(config_parse_io_limit);
+CONFIG_PARSER_PROTOTYPE(config_parse_blockio_weight);
+CONFIG_PARSER_PROTOTYPE(config_parse_blockio_device_weight);
+CONFIG_PARSER_PROTOTYPE(config_parse_blockio_bandwidth);
+CONFIG_PARSER_PROTOTYPE(config_parse_job_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_job_mode_isolate);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_selinux_context);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_apparmor_profile);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_smack_process_label);
+CONFIG_PARSER_PROTOTYPE(config_parse_address_families);
+CONFIG_PARSER_PROTOTYPE(config_parse_runtime_preserve_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_directories);
+CONFIG_PARSER_PROTOTYPE(config_parse_set_credential);
+CONFIG_PARSER_PROTOTYPE(config_parse_load_credential);
+CONFIG_PARSER_PROTOTYPE(config_parse_set_status);
+CONFIG_PARSER_PROTOTYPE(config_parse_namespace_path_strv);
+CONFIG_PARSER_PROTOTYPE(config_parse_temporary_filesystems);
+CONFIG_PARSER_PROTOTYPE(config_parse_cpu_quota);
+CONFIG_PARSER_PROTOTYPE(config_parse_allowed_cpuset);
+CONFIG_PARSER_PROTOTYPE(config_parse_protect_home);
+CONFIG_PARSER_PROTOTYPE(config_parse_protect_system);
+CONFIG_PARSER_PROTOTYPE(config_parse_bus_name);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_utmp_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_working_directory);
+CONFIG_PARSER_PROTOTYPE(config_parse_fdname);
+CONFIG_PARSER_PROTOTYPE(config_parse_user_group_compat);
+CONFIG_PARSER_PROTOTYPE(config_parse_user_group_strv_compat);
+CONFIG_PARSER_PROTOTYPE(config_parse_restrict_namespaces);
+CONFIG_PARSER_PROTOTYPE(config_parse_restrict_filesystems);
+CONFIG_PARSER_PROTOTYPE(config_parse_bind_paths);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_keyring_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_protect_proc);
+CONFIG_PARSER_PROTOTYPE(config_parse_proc_subset);
+CONFIG_PARSER_PROTOTYPE(config_parse_job_timeout_sec);
+CONFIG_PARSER_PROTOTYPE(config_parse_job_running_timeout_sec);
+CONFIG_PARSER_PROTOTYPE(config_parse_log_extra_fields);
+CONFIG_PARSER_PROTOTYPE(config_parse_log_namespace);
+CONFIG_PARSER_PROTOTYPE(config_parse_collect_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_pid_file);
+CONFIG_PARSER_PROTOTYPE(config_parse_exit_status);
+CONFIG_PARSER_PROTOTYPE(config_parse_disable_controllers);
+CONFIG_PARSER_PROTOTYPE(config_parse_oom_policy);
+CONFIG_PARSER_PROTOTYPE(config_parse_numa_policy);
+CONFIG_PARSER_PROTOTYPE(config_parse_numa_mask);
+CONFIG_PARSER_PROTOTYPE(config_parse_ip_filter_bpf_progs);
+CONFIG_PARSER_PROTOTYPE(config_parse_cpu_affinity2);
+CONFIG_PARSER_PROTOTYPE(config_parse_show_status);
+CONFIG_PARSER_PROTOTYPE(config_parse_status_unit_format);
+CONFIG_PARSER_PROTOTYPE(config_parse_output_restricted);
+CONFIG_PARSER_PROTOTYPE(config_parse_crash_chvt);
+CONFIG_PARSER_PROTOTYPE(config_parse_timeout_abort);
+CONFIG_PARSER_PROTOTYPE(config_parse_swap_priority);
+CONFIG_PARSER_PROTOTYPE(config_parse_mount_images);
+CONFIG_PARSER_PROTOTYPE(config_parse_socket_timestamping);
+CONFIG_PARSER_PROTOTYPE(config_parse_extension_images);
+CONFIG_PARSER_PROTOTYPE(config_parse_bpf_foreign_program);
+CONFIG_PARSER_PROTOTYPE(config_parse_cgroup_socket_bind);
+CONFIG_PARSER_PROTOTYPE(config_parse_restrict_network_interfaces);
+CONFIG_PARSER_PROTOTYPE(config_parse_watchdog_sec);
+CONFIG_PARSER_PROTOTYPE(config_parse_tty_size);
+
+/* gperf prototypes */
+const struct ConfigPerfItem* load_fragment_gperf_lookup(const char *key, GPERF_LEN_TYPE length);
+extern const char load_fragment_gperf_nulstr[];
diff --git a/src/core/main.c b/src/core/main.c
new file mode 100644
index 0000000..1c4b464
--- /dev/null
+++ b/src/core/main.c
@@ -0,0 +1,3096 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <linux/oom.h>
+#include <sys/mount.h>
+#include <sys/prctl.h>
+#include <sys/utsname.h>
+#include <unistd.h>
+#if HAVE_SECCOMP
+#include <seccomp.h>
+#endif
+#if HAVE_VALGRIND_VALGRIND_H
+#include <valgrind/valgrind.h>
+#endif
+
+#include "sd-bus.h"
+#include "sd-daemon.h"
+#include "sd-messages.h"
+
+#include "alloc-util.h"
+#include "apparmor-setup.h"
+#include "architecture.h"
+#if HAVE_LIBBPF
+#include "bpf-lsm.h"
+#endif
+#include "build.h"
+#include "bus-error.h"
+#include "bus-util.h"
+#include "capability-util.h"
+#include "cgroup-util.h"
+#include "clock-util.h"
+#include "conf-parser.h"
+#include "cpu-set-util.h"
+#include "crash-handler.h"
+#include "dbus-manager.h"
+#include "dbus.h"
+#include "def.h"
+#include "dev-setup.h"
+#include "efi-random.h"
+#include "efivars.h"
+#include "emergency-action.h"
+#include "env-util.h"
+#include "exit-status.h"
+#include "fd-util.h"
+#include "fdset.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "fs-util.h"
+#include "hexdecoct.h"
+#include "hostname-setup.h"
+#include "ima-setup.h"
+#include "import-creds.h"
+#include "killall.h"
+#include "kmod-setup.h"
+#include "limits-util.h"
+#include "load-fragment.h"
+#include "log.h"
+#include "loopback-setup.h"
+#include "machine-id-setup.h"
+#include "main.h"
+#include "manager.h"
+#include "manager-dump.h"
+#include "manager-serialize.h"
+#include "mkdir-label.h"
+#include "mount-setup.h"
+#include "os-util.h"
+#include "pager.h"
+#include "parse-argument.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "pretty-print.h"
+#include "proc-cmdline.h"
+#include "process-util.h"
+#include "random-util.h"
+#include "rlimit-util.h"
+#if HAVE_SECCOMP
+#include "seccomp-util.h"
+#endif
+#include "selinux-setup.h"
+#include "selinux-util.h"
+#include "signal-util.h"
+#include "smack-setup.h"
+#include "special.h"
+#include "stat-util.h"
+#include "stdio-util.h"
+#include "strv.h"
+#include "switch-root.h"
+#include "sysctl-util.h"
+#include "terminal-util.h"
+#include "time-util.h"
+#include "umask-util.h"
+#include "user-util.h"
+#include "util.h"
+#include "virt.h"
+#include "watchdog.h"
+
+#if HAS_FEATURE_ADDRESS_SANITIZER
+#include <sanitizer/lsan_interface.h>
+#endif
+
+#define DEFAULT_TASKS_MAX ((TasksMax) { 15U, 100U }) /* 15% */
+
+static enum {
+ ACTION_RUN,
+ ACTION_HELP,
+ ACTION_VERSION,
+ ACTION_TEST,
+ ACTION_DUMP_CONFIGURATION_ITEMS,
+ ACTION_DUMP_BUS_PROPERTIES,
+ ACTION_BUS_INTROSPECT,
+} arg_action = ACTION_RUN;
+
+static const char *arg_bus_introspect = NULL;
+
+/* Those variables are initialized to 0 automatically, so we avoid uninitialized memory access. Real
+ * defaults are assigned in reset_arguments() below. */
+static char *arg_default_unit;
+static bool arg_system;
+bool arg_dump_core;
+int arg_crash_chvt;
+bool arg_crash_shell;
+bool arg_crash_reboot;
+static char *arg_confirm_spawn;
+static ShowStatus arg_show_status;
+static StatusUnitFormat arg_status_unit_format;
+static bool arg_switched_root;
+static PagerFlags arg_pager_flags;
+static bool arg_service_watchdogs;
+static ExecOutput arg_default_std_output;
+static ExecOutput arg_default_std_error;
+static usec_t arg_default_restart_usec;
+static usec_t arg_default_timeout_start_usec;
+static usec_t arg_default_timeout_stop_usec;
+static usec_t arg_default_timeout_abort_usec;
+static usec_t arg_default_device_timeout_usec;
+static bool arg_default_timeout_abort_set;
+static usec_t arg_default_start_limit_interval;
+static unsigned arg_default_start_limit_burst;
+static usec_t arg_runtime_watchdog;
+static usec_t arg_reboot_watchdog;
+static usec_t arg_kexec_watchdog;
+static usec_t arg_pretimeout_watchdog;
+static char *arg_early_core_pattern;
+static char *arg_watchdog_pretimeout_governor;
+static char *arg_watchdog_device;
+static char **arg_default_environment;
+static char **arg_manager_environment;
+static struct rlimit *arg_default_rlimit[_RLIMIT_MAX];
+static uint64_t arg_capability_bounding_set;
+static bool arg_no_new_privs;
+static nsec_t arg_timer_slack_nsec;
+static usec_t arg_default_timer_accuracy_usec;
+static Set* arg_syscall_archs;
+static FILE* arg_serialization;
+static int arg_default_cpu_accounting;
+static bool arg_default_io_accounting;
+static bool arg_default_ip_accounting;
+static bool arg_default_blockio_accounting;
+static bool arg_default_memory_accounting;
+static bool arg_default_tasks_accounting;
+static TasksMax arg_default_tasks_max;
+static sd_id128_t arg_machine_id;
+static EmergencyAction arg_cad_burst_action;
+static OOMPolicy arg_default_oom_policy;
+static CPUSet arg_cpu_affinity;
+static NUMAPolicy arg_numa_policy;
+static usec_t arg_clock_usec;
+static void *arg_random_seed;
+static size_t arg_random_seed_size;
+static int arg_default_oom_score_adjust;
+static bool arg_default_oom_score_adjust_set;
+static char *arg_default_smack_process_label;
+
+/* A copy of the original environment block */
+static char **saved_env = NULL;
+
+static int parse_configuration(const struct rlimit *saved_rlimit_nofile,
+ const struct rlimit *saved_rlimit_memlock);
+
+static int manager_find_user_config_paths(char ***ret_files, char ***ret_dirs) {
+ _cleanup_free_ char *base = NULL;
+ _cleanup_strv_free_ char **files = NULL, **dirs = NULL;
+ int r;
+
+ r = xdg_user_config_dir(&base, "/systemd");
+ if (r < 0)
+ return r;
+
+ r = strv_extendf(&files, "%s/user.conf", base);
+ if (r < 0)
+ return r;
+
+ r = strv_extend(&files, PKGSYSCONFDIR "/user.conf");
+ if (r < 0)
+ return r;
+
+ r = strv_consume(&dirs, TAKE_PTR(base));
+ if (r < 0)
+ return r;
+
+ r = strv_extend_strv(&dirs, CONF_PATHS_STRV("systemd"), false);
+ if (r < 0)
+ return r;
+
+ *ret_files = TAKE_PTR(files);
+ *ret_dirs = TAKE_PTR(dirs);
+ return 0;
+}
+
+static int console_setup(void) {
+ _cleanup_close_ int tty_fd = -1;
+ int r;
+
+ tty_fd = open_terminal("/dev/console", O_WRONLY|O_NOCTTY|O_CLOEXEC);
+ if (tty_fd < 0)
+ return log_error_errno(tty_fd, "Failed to open /dev/console: %m");
+
+ /* We don't want to force text mode. plymouth may be showing
+ * pictures already from initrd. */
+ r = reset_terminal_fd(tty_fd, false);
+ if (r < 0)
+ return log_error_errno(r, "Failed to reset /dev/console: %m");
+
+ return 0;
+}
+
+static int set_machine_id(const char *m) {
+ sd_id128_t t;
+ assert(m);
+
+ if (sd_id128_from_string(m, &t) < 0)
+ return -EINVAL;
+
+ if (sd_id128_is_null(t))
+ return -EINVAL;
+
+ arg_machine_id = t;
+ return 0;
+}
+
+static int parse_proc_cmdline_item(const char *key, const char *value, void *data) {
+ int r;
+
+ assert(key);
+
+ if (STR_IN_SET(key, "systemd.unit", "rd.systemd.unit")) {
+
+ if (proc_cmdline_value_missing(key, value))
+ return 0;
+
+ if (!unit_name_is_valid(value, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
+ log_warning("Unit name specified on %s= is not valid, ignoring: %s", key, value);
+ else if (in_initrd() == !!startswith(key, "rd."))
+ return free_and_strdup_warn(&arg_default_unit, value);
+
+ } else if (proc_cmdline_key_streq(key, "systemd.dump_core")) {
+
+ r = value ? parse_boolean(value) : true;
+ if (r < 0)
+ log_warning_errno(r, "Failed to parse dump core switch %s, ignoring: %m", value);
+ else
+ arg_dump_core = r;
+
+ } else if (proc_cmdline_key_streq(key, "systemd.early_core_pattern")) {
+
+ if (proc_cmdline_value_missing(key, value))
+ return 0;
+
+ if (path_is_absolute(value))
+ (void) parse_path_argument(value, false, &arg_early_core_pattern);
+ else
+ log_warning("Specified core pattern '%s' is not an absolute path, ignoring.", value);
+
+ } else if (proc_cmdline_key_streq(key, "systemd.crash_chvt")) {
+
+ if (!value)
+ arg_crash_chvt = 0; /* turn on */
+ else {
+ r = parse_crash_chvt(value, &arg_crash_chvt);
+ if (r < 0)
+ log_warning_errno(r, "Failed to parse crash chvt switch %s, ignoring: %m", value);
+ }
+
+ } else if (proc_cmdline_key_streq(key, "systemd.crash_shell")) {
+
+ r = value ? parse_boolean(value) : true;
+ if (r < 0)
+ log_warning_errno(r, "Failed to parse crash shell switch %s, ignoring: %m", value);
+ else
+ arg_crash_shell = r;
+
+ } else if (proc_cmdline_key_streq(key, "systemd.crash_reboot")) {
+
+ r = value ? parse_boolean(value) : true;
+ if (r < 0)
+ log_warning_errno(r, "Failed to parse crash reboot switch %s, ignoring: %m", value);
+ else
+ arg_crash_reboot = r;
+
+ } else if (proc_cmdline_key_streq(key, "systemd.confirm_spawn")) {
+ char *s;
+
+ r = parse_confirm_spawn(value, &s);
+ if (r < 0)
+ log_warning_errno(r, "Failed to parse confirm_spawn switch %s, ignoring: %m", value);
+ else
+ free_and_replace(arg_confirm_spawn, s);
+
+ } else if (proc_cmdline_key_streq(key, "systemd.service_watchdogs")) {
+
+ r = value ? parse_boolean(value) : true;
+ if (r < 0)
+ log_warning_errno(r, "Failed to parse service watchdog switch %s, ignoring: %m", value);
+ else
+ arg_service_watchdogs = r;
+
+ } else if (proc_cmdline_key_streq(key, "systemd.show_status")) {
+
+ if (value) {
+ r = parse_show_status(value, &arg_show_status);
+ if (r < 0)
+ log_warning_errno(r, "Failed to parse show status switch %s, ignoring: %m", value);
+ } else
+ arg_show_status = SHOW_STATUS_YES;
+
+ } else if (proc_cmdline_key_streq(key, "systemd.status_unit_format")) {
+
+ if (proc_cmdline_value_missing(key, value))
+ return 0;
+
+ r = status_unit_format_from_string(value);
+ if (r < 0)
+ log_warning_errno(r, "Failed to parse %s=%s, ignoring: %m", key, value);
+ else
+ arg_status_unit_format = r;
+
+ } else if (proc_cmdline_key_streq(key, "systemd.default_standard_output")) {
+
+ if (proc_cmdline_value_missing(key, value))
+ return 0;
+
+ r = exec_output_from_string(value);
+ if (r < 0)
+ log_warning_errno(r, "Failed to parse default standard output switch %s, ignoring: %m", value);
+ else
+ arg_default_std_output = r;
+
+ } else if (proc_cmdline_key_streq(key, "systemd.default_standard_error")) {
+
+ if (proc_cmdline_value_missing(key, value))
+ return 0;
+
+ r = exec_output_from_string(value);
+ if (r < 0)
+ log_warning_errno(r, "Failed to parse default standard error switch %s, ignoring: %m", value);
+ else
+ arg_default_std_error = r;
+
+ } else if (streq(key, "systemd.setenv")) {
+
+ if (proc_cmdline_value_missing(key, value))
+ return 0;
+
+ if (!env_assignment_is_valid(value))
+ log_warning("Environment variable assignment '%s' is not valid. Ignoring.", value);
+ else {
+ r = strv_env_replace_strdup(&arg_default_environment, value);
+ if (r < 0)
+ return log_oom();
+ }
+
+ } else if (proc_cmdline_key_streq(key, "systemd.machine_id")) {
+
+ if (proc_cmdline_value_missing(key, value))
+ return 0;
+
+ r = set_machine_id(value);
+ if (r < 0)
+ log_warning_errno(r, "MachineID '%s' is not valid, ignoring: %m", value);
+
+ } else if (proc_cmdline_key_streq(key, "systemd.default_timeout_start_sec")) {
+
+ if (proc_cmdline_value_missing(key, value))
+ return 0;
+
+ r = parse_sec(value, &arg_default_timeout_start_usec);
+ if (r < 0)
+ log_warning_errno(r, "Failed to parse default start timeout '%s', ignoring: %m", value);
+
+ if (arg_default_timeout_start_usec <= 0)
+ arg_default_timeout_start_usec = USEC_INFINITY;
+
+ } else if (proc_cmdline_key_streq(key, "systemd.cpu_affinity")) {
+
+ if (proc_cmdline_value_missing(key, value))
+ return 0;
+
+ r = parse_cpu_set(value, &arg_cpu_affinity);
+ if (r < 0)
+ log_warning_errno(r, "Failed to parse CPU affinity mask '%s', ignoring: %m", value);
+
+ } else if (proc_cmdline_key_streq(key, "systemd.watchdog_device")) {
+
+ if (proc_cmdline_value_missing(key, value))
+ return 0;
+
+ (void) parse_path_argument(value, false, &arg_watchdog_device);
+
+ } else if (proc_cmdline_key_streq(key, "systemd.watchdog_sec")) {
+
+ if (proc_cmdline_value_missing(key, value))
+ return 0;
+
+ if (streq(value, "default"))
+ arg_runtime_watchdog = USEC_INFINITY;
+ else if (streq(value, "off"))
+ arg_runtime_watchdog = 0;
+ else {
+ r = parse_sec(value, &arg_runtime_watchdog);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to parse systemd.watchdog_sec= argument '%s', ignoring: %m", value);
+ return 0;
+ }
+ }
+
+ arg_kexec_watchdog = arg_reboot_watchdog = arg_runtime_watchdog;
+
+ } else if (proc_cmdline_key_streq(key, "systemd.watchdog_pre_sec")) {
+
+ if (proc_cmdline_value_missing(key, value))
+ return 0;
+
+ if (streq(value, "default"))
+ arg_pretimeout_watchdog = USEC_INFINITY;
+ else if (streq(value, "off"))
+ arg_pretimeout_watchdog = 0;
+ else {
+ r = parse_sec(value, &arg_pretimeout_watchdog);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to parse systemd.watchdog_pre_sec= argument '%s', ignoring: %m", value);
+ return 0;
+ }
+ }
+
+ } else if (proc_cmdline_key_streq(key, "systemd.watchdog_pretimeout_governor")) {
+
+ if (proc_cmdline_value_missing(key, value) || isempty(value)) {
+ arg_watchdog_pretimeout_governor = mfree(arg_watchdog_pretimeout_governor);
+ return 0;
+ }
+
+ if (!string_is_safe(value)) {
+ log_warning("Watchdog pretimeout governor '%s' is not valid, ignoring.", value);
+ return 0;
+ }
+
+ return free_and_strdup_warn(&arg_watchdog_pretimeout_governor, value);
+
+ } else if (proc_cmdline_key_streq(key, "systemd.clock_usec")) {
+
+ if (proc_cmdline_value_missing(key, value))
+ return 0;
+
+ r = safe_atou64(value, &arg_clock_usec);
+ if (r < 0)
+ log_warning_errno(r, "Failed to parse systemd.clock_usec= argument, ignoring: %s", value);
+
+ } else if (proc_cmdline_key_streq(key, "systemd.random_seed")) {
+ void *p;
+ size_t sz;
+
+ if (proc_cmdline_value_missing(key, value))
+ return 0;
+
+ r = unbase64mem(value, SIZE_MAX, &p, &sz);
+ if (r < 0)
+ log_warning_errno(r, "Failed to parse systemd.random_seed= argument, ignoring: %s", value);
+
+ free(arg_random_seed);
+ arg_random_seed = sz > 0 ? p : mfree(p);
+ arg_random_seed_size = sz;
+
+ } else if (streq(key, "quiet") && !value) {
+
+ if (arg_show_status == _SHOW_STATUS_INVALID)
+ arg_show_status = SHOW_STATUS_ERROR;
+
+ } else if (streq(key, "debug") && !value) {
+
+ /* Note that log_parse_environment() handles 'debug'
+ * too, and sets the log level to LOG_DEBUG. */
+
+ if (detect_container() > 0)
+ log_set_target(LOG_TARGET_CONSOLE);
+
+ } else if (!value) {
+ const char *target;
+
+ /* Compatible with SysV, but supported independently even if SysV compatibility is disabled. */
+ target = runlevel_to_target(key);
+ if (target)
+ return free_and_strdup_warn(&arg_default_unit, target);
+ }
+
+ return 0;
+}
+
+#define DEFINE_SETTER(name, func, descr) \
+ static int name(const char *unit, \
+ const char *filename, \
+ unsigned line, \
+ const char *section, \
+ unsigned section_line, \
+ const char *lvalue, \
+ int ltype, \
+ const char *rvalue, \
+ void *data, \
+ void *userdata) { \
+ \
+ int r; \
+ \
+ assert(filename); \
+ assert(lvalue); \
+ assert(rvalue); \
+ \
+ r = func(rvalue); \
+ if (r < 0) \
+ log_syntax(unit, LOG_ERR, filename, line, r, \
+ "Invalid " descr "'%s': %m", \
+ rvalue); \
+ \
+ return 0; \
+ }
+
+DEFINE_SETTER(config_parse_level2, log_set_max_level_from_string, "log level");
+DEFINE_SETTER(config_parse_target, log_set_target_from_string, "target");
+DEFINE_SETTER(config_parse_color, log_show_color_from_string, "color");
+DEFINE_SETTER(config_parse_location, log_show_location_from_string, "location");
+DEFINE_SETTER(config_parse_time, log_show_time_from_string, "time");
+
+static int config_parse_default_timeout_abort(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+ int r;
+
+ r = config_parse_timeout_abort(unit, filename, line, section, section_line, lvalue, ltype, rvalue,
+ &arg_default_timeout_abort_usec, userdata);
+ if (r >= 0)
+ arg_default_timeout_abort_set = r;
+ return 0;
+}
+
+static int config_parse_oom_score_adjust(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ int oa, r;
+
+ if (isempty(rvalue)) {
+ arg_default_oom_score_adjust_set = false;
+ return 0;
+ }
+
+ r = parse_oom_score_adjust(rvalue, &oa);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse the OOM score adjust value '%s', ignoring: %m", rvalue);
+ return 0;
+ }
+
+ arg_default_oom_score_adjust = oa;
+ arg_default_oom_score_adjust_set = true;
+
+ return 0;
+}
+
+static int parse_config_file(void) {
+ const ConfigTableItem items[] = {
+ { "Manager", "LogLevel", config_parse_level2, 0, NULL },
+ { "Manager", "LogTarget", config_parse_target, 0, NULL },
+ { "Manager", "LogColor", config_parse_color, 0, NULL },
+ { "Manager", "LogLocation", config_parse_location, 0, NULL },
+ { "Manager", "LogTime", config_parse_time, 0, NULL },
+ { "Manager", "DumpCore", config_parse_bool, 0, &arg_dump_core },
+ { "Manager", "CrashChVT", /* legacy */ config_parse_crash_chvt, 0, &arg_crash_chvt },
+ { "Manager", "CrashChangeVT", config_parse_crash_chvt, 0, &arg_crash_chvt },
+ { "Manager", "CrashShell", config_parse_bool, 0, &arg_crash_shell },
+ { "Manager", "CrashReboot", config_parse_bool, 0, &arg_crash_reboot },
+ { "Manager", "ShowStatus", config_parse_show_status, 0, &arg_show_status },
+ { "Manager", "StatusUnitFormat", config_parse_status_unit_format, 0, &arg_status_unit_format },
+ { "Manager", "CPUAffinity", config_parse_cpu_affinity2, 0, &arg_cpu_affinity },
+ { "Manager", "NUMAPolicy", config_parse_numa_policy, 0, &arg_numa_policy.type },
+ { "Manager", "NUMAMask", config_parse_numa_mask, 0, &arg_numa_policy },
+ { "Manager", "JoinControllers", config_parse_warn_compat, DISABLED_CONFIGURATION, NULL },
+ { "Manager", "RuntimeWatchdogSec", config_parse_watchdog_sec, 0, &arg_runtime_watchdog },
+ { "Manager", "RuntimeWatchdogPreSec", config_parse_watchdog_sec, 0, &arg_pretimeout_watchdog },
+ { "Manager", "RebootWatchdogSec", config_parse_watchdog_sec, 0, &arg_reboot_watchdog },
+ { "Manager", "ShutdownWatchdogSec", config_parse_watchdog_sec, 0, &arg_reboot_watchdog }, /* obsolete alias */
+ { "Manager", "KExecWatchdogSec", config_parse_watchdog_sec, 0, &arg_kexec_watchdog },
+ { "Manager", "WatchdogDevice", config_parse_path, 0, &arg_watchdog_device },
+ { "Manager", "RuntimeWatchdogPreGovernor", config_parse_string, CONFIG_PARSE_STRING_SAFE, &arg_watchdog_pretimeout_governor },
+ { "Manager", "CapabilityBoundingSet", config_parse_capability_set, 0, &arg_capability_bounding_set },
+ { "Manager", "NoNewPrivileges", config_parse_bool, 0, &arg_no_new_privs },
+#if HAVE_SECCOMP
+ { "Manager", "SystemCallArchitectures", config_parse_syscall_archs, 0, &arg_syscall_archs },
+#else
+ { "Manager", "SystemCallArchitectures", config_parse_warn_compat, DISABLED_CONFIGURATION, NULL },
+
+#endif
+ { "Manager", "TimerSlackNSec", config_parse_nsec, 0, &arg_timer_slack_nsec },
+ { "Manager", "DefaultTimerAccuracySec", config_parse_sec, 0, &arg_default_timer_accuracy_usec },
+ { "Manager", "DefaultStandardOutput", config_parse_output_restricted, 0, &arg_default_std_output },
+ { "Manager", "DefaultStandardError", config_parse_output_restricted, 0, &arg_default_std_error },
+ { "Manager", "DefaultTimeoutStartSec", config_parse_sec, 0, &arg_default_timeout_start_usec },
+ { "Manager", "DefaultTimeoutStopSec", config_parse_sec, 0, &arg_default_timeout_stop_usec },
+ { "Manager", "DefaultTimeoutAbortSec", config_parse_default_timeout_abort, 0, NULL },
+ { "Manager", "DefaultDeviceTimeoutSec", config_parse_sec, 0, &arg_default_device_timeout_usec },
+ { "Manager", "DefaultRestartSec", config_parse_sec, 0, &arg_default_restart_usec },
+ { "Manager", "DefaultStartLimitInterval", config_parse_sec, 0, &arg_default_start_limit_interval }, /* obsolete alias */
+ { "Manager", "DefaultStartLimitIntervalSec", config_parse_sec, 0, &arg_default_start_limit_interval },
+ { "Manager", "DefaultStartLimitBurst", config_parse_unsigned, 0, &arg_default_start_limit_burst },
+ { "Manager", "DefaultEnvironment", config_parse_environ, 0, &arg_default_environment },
+ { "Manager", "ManagerEnvironment", config_parse_environ, 0, &arg_manager_environment },
+ { "Manager", "DefaultLimitCPU", config_parse_rlimit, RLIMIT_CPU, arg_default_rlimit },
+ { "Manager", "DefaultLimitFSIZE", config_parse_rlimit, RLIMIT_FSIZE, arg_default_rlimit },
+ { "Manager", "DefaultLimitDATA", config_parse_rlimit, RLIMIT_DATA, arg_default_rlimit },
+ { "Manager", "DefaultLimitSTACK", config_parse_rlimit, RLIMIT_STACK, arg_default_rlimit },
+ { "Manager", "DefaultLimitCORE", config_parse_rlimit, RLIMIT_CORE, arg_default_rlimit },
+ { "Manager", "DefaultLimitRSS", config_parse_rlimit, RLIMIT_RSS, arg_default_rlimit },
+ { "Manager", "DefaultLimitNOFILE", config_parse_rlimit, RLIMIT_NOFILE, arg_default_rlimit },
+ { "Manager", "DefaultLimitAS", config_parse_rlimit, RLIMIT_AS, arg_default_rlimit },
+ { "Manager", "DefaultLimitNPROC", config_parse_rlimit, RLIMIT_NPROC, arg_default_rlimit },
+ { "Manager", "DefaultLimitMEMLOCK", config_parse_rlimit, RLIMIT_MEMLOCK, arg_default_rlimit },
+ { "Manager", "DefaultLimitLOCKS", config_parse_rlimit, RLIMIT_LOCKS, arg_default_rlimit },
+ { "Manager", "DefaultLimitSIGPENDING", config_parse_rlimit, RLIMIT_SIGPENDING, arg_default_rlimit },
+ { "Manager", "DefaultLimitMSGQUEUE", config_parse_rlimit, RLIMIT_MSGQUEUE, arg_default_rlimit },
+ { "Manager", "DefaultLimitNICE", config_parse_rlimit, RLIMIT_NICE, arg_default_rlimit },
+ { "Manager", "DefaultLimitRTPRIO", config_parse_rlimit, RLIMIT_RTPRIO, arg_default_rlimit },
+ { "Manager", "DefaultLimitRTTIME", config_parse_rlimit, RLIMIT_RTTIME, arg_default_rlimit },
+ { "Manager", "DefaultCPUAccounting", config_parse_tristate, 0, &arg_default_cpu_accounting },
+ { "Manager", "DefaultIOAccounting", config_parse_bool, 0, &arg_default_io_accounting },
+ { "Manager", "DefaultIPAccounting", config_parse_bool, 0, &arg_default_ip_accounting },
+ { "Manager", "DefaultBlockIOAccounting", config_parse_bool, 0, &arg_default_blockio_accounting },
+ { "Manager", "DefaultMemoryAccounting", config_parse_bool, 0, &arg_default_memory_accounting },
+ { "Manager", "DefaultTasksAccounting", config_parse_bool, 0, &arg_default_tasks_accounting },
+ { "Manager", "DefaultTasksMax", config_parse_tasks_max, 0, &arg_default_tasks_max },
+ { "Manager", "CtrlAltDelBurstAction", config_parse_emergency_action, 0, &arg_cad_burst_action },
+ { "Manager", "DefaultOOMPolicy", config_parse_oom_policy, 0, &arg_default_oom_policy },
+ { "Manager", "DefaultOOMScoreAdjust", config_parse_oom_score_adjust, 0, NULL },
+#if ENABLE_SMACK
+ { "Manager", "DefaultSmackProcessLabel", config_parse_string, 0, &arg_default_smack_process_label },
+#else
+ { "Manager", "DefaultSmackProcessLabel", config_parse_warn_compat, DISABLED_CONFIGURATION, NULL },
+#endif
+ {}
+ };
+
+ _cleanup_strv_free_ char **files = NULL, **dirs = NULL;
+ const char *suffix;
+ int r;
+
+ if (arg_system)
+ suffix = "system.conf.d";
+ else {
+ r = manager_find_user_config_paths(&files, &dirs);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine config file paths: %m");
+
+ suffix = "user.conf.d";
+ }
+
+ (void) config_parse_many(
+ (const char* const*) (files ?: STRV_MAKE(PKGSYSCONFDIR "/system.conf")),
+ (const char* const*) (dirs ?: CONF_PATHS_STRV("systemd")),
+ suffix,
+ "Manager\0",
+ config_item_table_lookup, items,
+ CONFIG_PARSE_WARN,
+ NULL,
+ NULL,
+ NULL);
+
+ /* Traditionally "0" was used to turn off the default unit timeouts. Fix this up so that we use
+ * USEC_INFINITY like everywhere else. */
+ if (arg_default_timeout_start_usec <= 0)
+ arg_default_timeout_start_usec = USEC_INFINITY;
+ if (arg_default_timeout_stop_usec <= 0)
+ arg_default_timeout_stop_usec = USEC_INFINITY;
+
+ return 0;
+}
+
+static void set_manager_defaults(Manager *m) {
+
+ assert(m);
+
+ /* Propagates the various default unit property settings into the manager object, i.e. properties that do not
+ * affect the manager itself, but are just what newly allocated units will have set if they haven't set
+ * anything else. (Also see set_manager_settings() for the settings that affect the manager's own behaviour) */
+
+ m->default_timer_accuracy_usec = arg_default_timer_accuracy_usec;
+ m->default_std_output = arg_default_std_output;
+ m->default_std_error = arg_default_std_error;
+ m->default_timeout_start_usec = arg_default_timeout_start_usec;
+ m->default_timeout_stop_usec = arg_default_timeout_stop_usec;
+ m->default_timeout_abort_usec = arg_default_timeout_abort_usec;
+ m->default_timeout_abort_set = arg_default_timeout_abort_set;
+ m->default_device_timeout_usec = arg_default_device_timeout_usec;
+ m->default_restart_usec = arg_default_restart_usec;
+ m->default_start_limit_interval = arg_default_start_limit_interval;
+ m->default_start_limit_burst = arg_default_start_limit_burst;
+
+ /* On 4.15+ with unified hierarchy, CPU accounting is essentially free as it doesn't require the CPU
+ * controller to be enabled, so the default is to enable it unless we got told otherwise. */
+ if (arg_default_cpu_accounting >= 0)
+ m->default_cpu_accounting = arg_default_cpu_accounting;
+ else
+ m->default_cpu_accounting = cpu_accounting_is_cheap();
+
+ m->default_io_accounting = arg_default_io_accounting;
+ m->default_ip_accounting = arg_default_ip_accounting;
+ m->default_blockio_accounting = arg_default_blockio_accounting;
+ m->default_memory_accounting = arg_default_memory_accounting;
+ m->default_tasks_accounting = arg_default_tasks_accounting;
+ m->default_tasks_max = arg_default_tasks_max;
+ m->default_oom_policy = arg_default_oom_policy;
+ m->default_oom_score_adjust_set = arg_default_oom_score_adjust_set;
+ m->default_oom_score_adjust = arg_default_oom_score_adjust;
+
+ (void) manager_set_default_smack_process_label(m, arg_default_smack_process_label);
+
+ (void) manager_set_default_rlimits(m, arg_default_rlimit);
+
+ (void) manager_default_environment(m);
+ (void) manager_transient_environment_add(m, arg_default_environment);
+}
+
+static void set_manager_settings(Manager *m) {
+ int r;
+
+ assert(m);
+
+ /* Propagates the various manager settings into the manager object, i.e. properties that
+ * effect the manager itself (as opposed to just being inherited into newly allocated
+ * units, see set_manager_defaults() above). */
+
+ m->confirm_spawn = arg_confirm_spawn;
+ m->service_watchdogs = arg_service_watchdogs;
+ m->cad_burst_action = arg_cad_burst_action;
+
+ manager_set_watchdog(m, WATCHDOG_RUNTIME, arg_runtime_watchdog);
+ manager_set_watchdog(m, WATCHDOG_REBOOT, arg_reboot_watchdog);
+ manager_set_watchdog(m, WATCHDOG_KEXEC, arg_kexec_watchdog);
+ manager_set_watchdog(m, WATCHDOG_PRETIMEOUT, arg_pretimeout_watchdog);
+ r = manager_set_watchdog_pretimeout_governor(m, arg_watchdog_pretimeout_governor);
+ if (r < 0)
+ log_warning_errno(r, "Failed to set watchdog pretimeout governor to '%s', ignoring: %m", arg_watchdog_pretimeout_governor);
+
+ manager_set_show_status(m, arg_show_status, "commandline");
+ m->status_unit_format = arg_status_unit_format;
+}
+
+static int parse_argv(int argc, char *argv[]) {
+ enum {
+ ARG_LOG_LEVEL = 0x100,
+ ARG_LOG_TARGET,
+ ARG_LOG_COLOR,
+ ARG_LOG_LOCATION,
+ ARG_LOG_TIME,
+ ARG_UNIT,
+ ARG_SYSTEM,
+ ARG_USER,
+ ARG_TEST,
+ ARG_NO_PAGER,
+ ARG_VERSION,
+ ARG_DUMP_CONFIGURATION_ITEMS,
+ ARG_DUMP_BUS_PROPERTIES,
+ ARG_BUS_INTROSPECT,
+ ARG_DUMP_CORE,
+ ARG_CRASH_CHVT,
+ ARG_CRASH_SHELL,
+ ARG_CRASH_REBOOT,
+ ARG_CONFIRM_SPAWN,
+ ARG_SHOW_STATUS,
+ ARG_DESERIALIZE,
+ ARG_SWITCHED_ROOT,
+ ARG_DEFAULT_STD_OUTPUT,
+ ARG_DEFAULT_STD_ERROR,
+ ARG_MACHINE_ID,
+ ARG_SERVICE_WATCHDOGS,
+ };
+
+ static const struct option options[] = {
+ { "log-level", required_argument, NULL, ARG_LOG_LEVEL },
+ { "log-target", required_argument, NULL, ARG_LOG_TARGET },
+ { "log-color", optional_argument, NULL, ARG_LOG_COLOR },
+ { "log-location", optional_argument, NULL, ARG_LOG_LOCATION },
+ { "log-time", optional_argument, NULL, ARG_LOG_TIME },
+ { "unit", required_argument, NULL, ARG_UNIT },
+ { "system", no_argument, NULL, ARG_SYSTEM },
+ { "user", no_argument, NULL, ARG_USER },
+ { "test", no_argument, NULL, ARG_TEST },
+ { "no-pager", no_argument, NULL, ARG_NO_PAGER },
+ { "help", no_argument, NULL, 'h' },
+ { "version", no_argument, NULL, ARG_VERSION },
+ { "dump-configuration-items", no_argument, NULL, ARG_DUMP_CONFIGURATION_ITEMS },
+ { "dump-bus-properties", no_argument, NULL, ARG_DUMP_BUS_PROPERTIES },
+ { "bus-introspect", required_argument, NULL, ARG_BUS_INTROSPECT },
+ { "dump-core", optional_argument, NULL, ARG_DUMP_CORE },
+ { "crash-chvt", required_argument, NULL, ARG_CRASH_CHVT },
+ { "crash-shell", optional_argument, NULL, ARG_CRASH_SHELL },
+ { "crash-reboot", optional_argument, NULL, ARG_CRASH_REBOOT },
+ { "confirm-spawn", optional_argument, NULL, ARG_CONFIRM_SPAWN },
+ { "show-status", optional_argument, NULL, ARG_SHOW_STATUS },
+ { "deserialize", required_argument, NULL, ARG_DESERIALIZE },
+ { "switched-root", no_argument, NULL, ARG_SWITCHED_ROOT },
+ { "default-standard-output", required_argument, NULL, ARG_DEFAULT_STD_OUTPUT, },
+ { "default-standard-error", required_argument, NULL, ARG_DEFAULT_STD_ERROR, },
+ { "machine-id", required_argument, NULL, ARG_MACHINE_ID },
+ { "service-watchdogs", required_argument, NULL, ARG_SERVICE_WATCHDOGS },
+ {}
+ };
+
+ int c, r;
+ bool user_arg_seen = false;
+
+ assert(argc >= 1);
+ assert(argv);
+
+ if (getpid_cached() == 1)
+ opterr = 0;
+
+ while ((c = getopt_long(argc, argv, "hDbsz:", options, NULL)) >= 0)
+
+ switch (c) {
+
+ case ARG_LOG_LEVEL:
+ r = log_set_max_level_from_string(optarg);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse log level \"%s\": %m", optarg);
+
+ break;
+
+ case ARG_LOG_TARGET:
+ r = log_set_target_from_string(optarg);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse log target \"%s\": %m", optarg);
+
+ break;
+
+ case ARG_LOG_COLOR:
+
+ if (optarg) {
+ r = log_show_color_from_string(optarg);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse log color setting \"%s\": %m",
+ optarg);
+ } else
+ log_show_color(true);
+
+ break;
+
+ case ARG_LOG_LOCATION:
+ if (optarg) {
+ r = log_show_location_from_string(optarg);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse log location setting \"%s\": %m",
+ optarg);
+ } else
+ log_show_location(true);
+
+ break;
+
+ case ARG_LOG_TIME:
+
+ if (optarg) {
+ r = log_show_time_from_string(optarg);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse log time setting \"%s\": %m",
+ optarg);
+ } else
+ log_show_time(true);
+
+ break;
+
+ case ARG_DEFAULT_STD_OUTPUT:
+ r = exec_output_from_string(optarg);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse default standard output setting \"%s\": %m",
+ optarg);
+ arg_default_std_output = r;
+ break;
+
+ case ARG_DEFAULT_STD_ERROR:
+ r = exec_output_from_string(optarg);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse default standard error output setting \"%s\": %m",
+ optarg);
+ arg_default_std_error = r;
+ break;
+
+ case ARG_UNIT:
+ r = free_and_strdup(&arg_default_unit, optarg);
+ if (r < 0)
+ return log_error_errno(r, "Failed to set default unit \"%s\": %m", optarg);
+
+ break;
+
+ case ARG_SYSTEM:
+ arg_system = true;
+ break;
+
+ case ARG_USER:
+ arg_system = false;
+ user_arg_seen = true;
+ break;
+
+ case ARG_TEST:
+ arg_action = ACTION_TEST;
+ break;
+
+ case ARG_NO_PAGER:
+ arg_pager_flags |= PAGER_DISABLE;
+ break;
+
+ case ARG_VERSION:
+ arg_action = ACTION_VERSION;
+ break;
+
+ case ARG_DUMP_CONFIGURATION_ITEMS:
+ arg_action = ACTION_DUMP_CONFIGURATION_ITEMS;
+ break;
+
+ case ARG_DUMP_BUS_PROPERTIES:
+ arg_action = ACTION_DUMP_BUS_PROPERTIES;
+ break;
+
+ case ARG_BUS_INTROSPECT:
+ arg_bus_introspect = optarg;
+ arg_action = ACTION_BUS_INTROSPECT;
+ break;
+
+ case ARG_DUMP_CORE:
+ r = parse_boolean_argument("--dump-core", optarg, &arg_dump_core);
+ if (r < 0)
+ return r;
+ break;
+
+ case ARG_CRASH_CHVT:
+ r = parse_crash_chvt(optarg, &arg_crash_chvt);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse crash virtual terminal index: \"%s\": %m",
+ optarg);
+ break;
+
+ case ARG_CRASH_SHELL:
+ r = parse_boolean_argument("--crash-shell", optarg, &arg_crash_shell);
+ if (r < 0)
+ return r;
+ break;
+
+ case ARG_CRASH_REBOOT:
+ r = parse_boolean_argument("--crash-reboot", optarg, &arg_crash_reboot);
+ if (r < 0)
+ return r;
+ break;
+
+ case ARG_CONFIRM_SPAWN:
+ arg_confirm_spawn = mfree(arg_confirm_spawn);
+
+ r = parse_confirm_spawn(optarg, &arg_confirm_spawn);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse confirm spawn option: \"%s\": %m",
+ optarg);
+ break;
+
+ case ARG_SERVICE_WATCHDOGS:
+ r = parse_boolean_argument("--service-watchdogs=", optarg, &arg_service_watchdogs);
+ if (r < 0)
+ return r;
+ break;
+
+ case ARG_SHOW_STATUS:
+ if (optarg) {
+ r = parse_show_status(optarg, &arg_show_status);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse show status boolean: \"%s\": %m",
+ optarg);
+ } else
+ arg_show_status = SHOW_STATUS_YES;
+ break;
+
+ case ARG_DESERIALIZE: {
+ int fd;
+ FILE *f;
+
+ r = safe_atoi(optarg, &fd);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse deserialize option \"%s\": %m", optarg);
+ if (fd < 0)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Invalid deserialize fd: %d",
+ fd);
+
+ (void) fd_cloexec(fd, true);
+
+ f = fdopen(fd, "r");
+ if (!f)
+ return log_error_errno(errno, "Failed to open serialization fd %d: %m", fd);
+
+ safe_fclose(arg_serialization);
+ arg_serialization = f;
+
+ break;
+ }
+
+ case ARG_SWITCHED_ROOT:
+ arg_switched_root = true;
+ break;
+
+ case ARG_MACHINE_ID:
+ r = set_machine_id(optarg);
+ if (r < 0)
+ return log_error_errno(r, "MachineID '%s' is not valid: %m", optarg);
+ break;
+
+ case 'h':
+ arg_action = ACTION_HELP;
+ break;
+
+ case 'D':
+ log_set_max_level(LOG_DEBUG);
+ break;
+
+ case 'b':
+ case 's':
+ case 'z':
+ /* Just to eat away the sysvinit kernel cmdline args that we'll parse in
+ * parse_proc_cmdline_item() or ignore, without any getopt() error messages.
+ */
+ case '?':
+ if (getpid_cached() != 1)
+ return -EINVAL;
+ else
+ return 0;
+
+ default:
+ assert_not_reached();
+ }
+
+ if (optind < argc && getpid_cached() != 1)
+ /* Hmm, when we aren't run as init system let's complain about excess arguments */
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Excess arguments.");
+
+ if (arg_action == ACTION_RUN && !arg_system && !user_arg_seen)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Explicit --user argument required to run as user manager.");
+
+ return 0;
+}
+
+static int help(void) {
+ _cleanup_free_ char *link = NULL;
+ int r;
+
+ r = terminal_urlify_man("systemd", "1", &link);
+ if (r < 0)
+ return log_oom();
+
+ printf("%s [OPTIONS...]\n\n"
+ "%sStarts and monitors system and user services.%s\n\n"
+ "This program takes no positional arguments.\n\n"
+ "%sOptions%s:\n"
+ " -h --help Show this help\n"
+ " --version Show version\n"
+ " --test Determine initial transaction, dump it and exit\n"
+ " --system Combined with --test: operate in system mode\n"
+ " --user Combined with --test: operate in user mode\n"
+ " --dump-configuration-items Dump understood unit configuration items\n"
+ " --dump-bus-properties Dump exposed bus properties\n"
+ " --bus-introspect=PATH Write XML introspection data\n"
+ " --unit=UNIT Set default unit\n"
+ " --dump-core[=BOOL] Dump core on crash\n"
+ " --crash-vt=NR Change to specified VT on crash\n"
+ " --crash-reboot[=BOOL] Reboot on crash\n"
+ " --crash-shell[=BOOL] Run shell on crash\n"
+ " --confirm-spawn[=BOOL] Ask for confirmation when spawning processes\n"
+ " --show-status[=BOOL] Show status updates on the console during boot\n"
+ " --log-target=TARGET Set log target (console, journal, kmsg,\n"
+ " journal-or-kmsg, null)\n"
+ " --log-level=LEVEL Set log level (debug, info, notice, warning,\n"
+ " err, crit, alert, emerg)\n"
+ " --log-color[=BOOL] Highlight important log messages\n"
+ " --log-location[=BOOL] Include code location in log messages\n"
+ " --log-time[=BOOL] Prefix log messages with current time\n"
+ " --default-standard-output= Set default standard output for services\n"
+ " --default-standard-error= Set default standard error output for services\n"
+ " --no-pager Do not pipe output into a pager\n"
+ "\nSee the %s for details.\n",
+ program_invocation_short_name,
+ ansi_highlight(),
+ ansi_normal(),
+ ansi_underline(),
+ ansi_normal(),
+ link);
+
+ return 0;
+}
+
+static int prepare_reexecute(
+ Manager *m,
+ FILE **ret_f,
+ FDSet **ret_fds,
+ bool switching_root) {
+
+ _cleanup_fdset_free_ FDSet *fds = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+ int r;
+
+ assert(m);
+ assert(ret_f);
+ assert(ret_fds);
+
+ r = manager_open_serialization(m, &f);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create serialization file: %m");
+
+ /* Make sure nothing is really destructed when we shut down */
+ m->n_reloading++;
+ bus_manager_send_reloading(m, true);
+
+ fds = fdset_new();
+ if (!fds)
+ return log_oom();
+
+ r = manager_serialize(m, f, fds, switching_root);
+ if (r < 0)
+ return r;
+
+ if (fseeko(f, 0, SEEK_SET) == (off_t) -1)
+ return log_error_errno(errno, "Failed to rewind serialization fd: %m");
+
+ r = fd_cloexec(fileno(f), false);
+ if (r < 0)
+ return log_error_errno(r, "Failed to disable O_CLOEXEC for serialization: %m");
+
+ r = fdset_cloexec(fds, false);
+ if (r < 0)
+ return log_error_errno(r, "Failed to disable O_CLOEXEC for serialization fds: %m");
+
+ *ret_f = TAKE_PTR(f);
+ *ret_fds = TAKE_PTR(fds);
+
+ return 0;
+}
+
+static void bump_file_max_and_nr_open(void) {
+
+ /* Let's bump fs.file-max and fs.nr_open to their respective maximums. On current kernels large
+ * numbers of file descriptors are no longer a performance problem and their memory is properly
+ * tracked by memcg, thus counting them and limiting them in another two layers of limits is
+ * unnecessary and just complicates things. This function hence turns off 2 of the 4 levels of limits
+ * on file descriptors, and makes RLIMIT_NOLIMIT (soft + hard) the only ones that really matter. */
+
+#if BUMP_PROC_SYS_FS_FILE_MAX || BUMP_PROC_SYS_FS_NR_OPEN
+ int r;
+#endif
+
+#if BUMP_PROC_SYS_FS_FILE_MAX
+ /* The maximum the kernel allows for this since 5.2 is LONG_MAX, use that. (Previously things were
+ * different, but the operation would fail silently.) */
+ r = sysctl_writef("fs/file-max", "%li", LONG_MAX);
+ if (r < 0)
+ log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING,
+ r, "Failed to bump fs.file-max, ignoring: %m");
+#endif
+
+#if BUMP_PROC_SYS_FS_NR_OPEN
+ int v = INT_MAX;
+
+ /* Argh! The kernel enforces maximum and minimum values on the fs.nr_open, but we don't really know
+ * what they are. The expression by which the maximum is determined is dependent on the architecture,
+ * and is something we don't really want to copy to userspace, as it is dependent on implementation
+ * details of the kernel. Since the kernel doesn't expose the maximum value to us, we can only try
+ * and hope. Hence, let's start with INT_MAX, and then keep halving the value until we find one that
+ * works. Ugly? Yes, absolutely, but kernel APIs are kernel APIs, so what do can we do... 🤯 */
+
+ for (;;) {
+ int k;
+
+ v &= ~(__SIZEOF_POINTER__ - 1); /* Round down to next multiple of the pointer size */
+ if (v < 1024) {
+ log_warning("Can't bump fs.nr_open, value too small.");
+ break;
+ }
+
+ k = read_nr_open();
+ if (k < 0) {
+ log_error_errno(k, "Failed to read fs.nr_open: %m");
+ break;
+ }
+ if (k >= v) { /* Already larger */
+ log_debug("Skipping bump, value is already larger.");
+ break;
+ }
+
+ r = sysctl_writef("fs/nr_open", "%i", v);
+ if (r == -EINVAL) {
+ log_debug("Couldn't write fs.nr_open as %i, halving it.", v);
+ v /= 2;
+ continue;
+ }
+ if (r < 0) {
+ log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, "Failed to bump fs.nr_open, ignoring: %m");
+ break;
+ }
+
+ log_debug("Successfully bumped fs.nr_open to %i", v);
+ break;
+ }
+#endif
+}
+
+static int bump_rlimit_nofile(const struct rlimit *saved_rlimit) {
+ struct rlimit new_rlimit;
+ int r, nr;
+
+ /* Get the underlying absolute limit the kernel enforces */
+ nr = read_nr_open();
+
+ /* Calculate the new limits to use for us. Never lower from what we inherited. */
+ new_rlimit = (struct rlimit) {
+ .rlim_cur = MAX((rlim_t) nr, saved_rlimit->rlim_cur),
+ .rlim_max = MAX((rlim_t) nr, saved_rlimit->rlim_max),
+ };
+
+ /* Shortcut if nothing changes. */
+ if (saved_rlimit->rlim_max >= new_rlimit.rlim_max &&
+ saved_rlimit->rlim_cur >= new_rlimit.rlim_cur) {
+ log_debug("RLIMIT_NOFILE is already as high or higher than we need it, not bumping.");
+ return 0;
+ }
+
+ /* Bump up the resource limit for ourselves substantially, all the way to the maximum the kernel allows, for
+ * both hard and soft. */
+ r = setrlimit_closest(RLIMIT_NOFILE, &new_rlimit);
+ if (r < 0)
+ return log_warning_errno(r, "Setting RLIMIT_NOFILE failed, ignoring: %m");
+
+ return 0;
+}
+
+static int bump_rlimit_memlock(const struct rlimit *saved_rlimit) {
+ struct rlimit new_rlimit;
+ uint64_t mm;
+ int r;
+
+ /* BPF_MAP_TYPE_LPM_TRIE bpf maps are charged against RLIMIT_MEMLOCK, even if we have CAP_IPC_LOCK
+ * which should normally disable such checks. We need them to implement IPAddressAllow= and
+ * IPAddressDeny=, hence let's bump the value high enough for our user. */
+
+ /* Using MAX() on resource limits only is safe if RLIM_INFINITY is > 0. POSIX declares that rlim_t
+ * must be unsigned, hence this is a given, but let's make this clear here. */
+ assert_cc(RLIM_INFINITY > 0);
+
+ mm = physical_memory_scale(1, 8); /* Let's scale how much we allow to be locked by the amount of
+ * physical RAM. We allow an eighth to be locked by us, just to
+ * pick a value. */
+
+ new_rlimit = (struct rlimit) {
+ .rlim_cur = MAX3(HIGH_RLIMIT_MEMLOCK, saved_rlimit->rlim_cur, mm),
+ .rlim_max = MAX3(HIGH_RLIMIT_MEMLOCK, saved_rlimit->rlim_max, mm),
+ };
+
+ if (saved_rlimit->rlim_max >= new_rlimit.rlim_cur &&
+ saved_rlimit->rlim_cur >= new_rlimit.rlim_max) {
+ log_debug("RLIMIT_MEMLOCK is already as high or higher than we need it, not bumping.");
+ return 0;
+ }
+
+ r = setrlimit_closest(RLIMIT_MEMLOCK, &new_rlimit);
+ if (r < 0)
+ return log_warning_errno(r, "Setting RLIMIT_MEMLOCK failed, ignoring: %m");
+
+ return 0;
+}
+
+static void test_usr(void) {
+
+ /* Check that /usr is either on the same file system as / or mounted already. */
+
+ if (dir_is_empty("/usr", /* ignore_hidden_or_backup= */ false) <= 0)
+ return;
+
+ log_warning("/usr appears to be on its own filesystem and is not already mounted. This is not a supported setup. "
+ "Some things will probably break (sometimes even silently) in mysterious ways. "
+ "Consult https://www.freedesktop.org/wiki/Software/systemd/separate-usr-is-broken for more information.");
+}
+
+static int enforce_syscall_archs(Set *archs) {
+#if HAVE_SECCOMP
+ int r;
+
+ if (!is_seccomp_available())
+ return 0;
+
+ r = seccomp_restrict_archs(arg_syscall_archs);
+ if (r < 0)
+ return log_error_errno(r, "Failed to enforce system call architecture restrication: %m");
+#endif
+ return 0;
+}
+
+static int os_release_status(void) {
+ _cleanup_free_ char *pretty_name = NULL, *name = NULL, *version = NULL,
+ *ansi_color = NULL, *support_end = NULL;
+ int r;
+
+ r = parse_os_release(NULL,
+ "PRETTY_NAME", &pretty_name,
+ "NAME", &name,
+ "VERSION", &version,
+ "ANSI_COLOR", &ansi_color,
+ "SUPPORT_END", &support_end);
+ if (r < 0)
+ return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to read os-release file, ignoring: %m");
+
+ const char *label = empty_to_null(pretty_name) ?: empty_to_null(name) ?: "Linux";
+
+ if (show_status_on(arg_show_status)) {
+ if (log_get_show_color())
+ status_printf(NULL, 0,
+ "\nWelcome to \x1B[%sm%s\x1B[0m!\n",
+ empty_to_null(ansi_color) ?: "1",
+ label);
+ else
+ status_printf(NULL, 0,
+ "\nWelcome to %s!\n",
+ label);
+ }
+
+ if (support_end && os_release_support_ended(support_end, false) > 0)
+ /* pretty_name may include the version already, so we'll print the version only if we
+ * have it and we're not using pretty_name. */
+ status_printf(ANSI_HIGHLIGHT_RED " !! " ANSI_NORMAL, 0,
+ "This OS version (%s%s%s) is past its end-of-support date (%s)",
+ label,
+ (pretty_name || !version) ? "" : " version ",
+ (pretty_name || !version) ? "" : version,
+ support_end);
+
+ return 0;
+}
+
+static int write_container_id(void) {
+ const char *c;
+ int r = 0; /* avoid false maybe-uninitialized warning */
+
+ c = getenv("container");
+ if (isempty(c))
+ return 0;
+
+ RUN_WITH_UMASK(0022)
+ r = write_string_file("/run/systemd/container", c, WRITE_STRING_FILE_CREATE);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to write /run/systemd/container, ignoring: %m");
+
+ return 1;
+}
+
+static int bump_unix_max_dgram_qlen(void) {
+ _cleanup_free_ char *qlen = NULL;
+ unsigned long v;
+ int r;
+
+ /* Let's bump the net.unix.max_dgram_qlen sysctl. The kernel default of 16 is simply too low. We set
+ * the value really really early during boot, so that it is actually applied to all our sockets,
+ * including the $NOTIFY_SOCKET one. */
+
+ r = read_one_line_file("/proc/sys/net/unix/max_dgram_qlen", &qlen);
+ if (r < 0)
+ return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to read AF_UNIX datagram queue length, ignoring: %m");
+
+ r = safe_atolu(qlen, &v);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to parse AF_UNIX datagram queue length '%s', ignoring: %m", qlen);
+
+ if (v >= DEFAULT_UNIX_MAX_DGRAM_QLEN)
+ return 0;
+
+ r = write_string_filef("/proc/sys/net/unix/max_dgram_qlen", WRITE_STRING_FILE_DISABLE_BUFFER,
+ "%lu", DEFAULT_UNIX_MAX_DGRAM_QLEN);
+ if (r < 0)
+ return log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to bump AF_UNIX datagram queue length, ignoring: %m");
+
+ return 1;
+}
+
+static int fixup_environment(void) {
+ _cleanup_free_ char *term = NULL;
+ const char *t;
+ int r;
+
+ /* Only fix up the environment when we are started as PID 1 */
+ if (getpid_cached() != 1)
+ return 0;
+
+ /* We expect the environment to be set correctly if run inside a container. */
+ if (detect_container() > 0)
+ return 0;
+
+ /* When started as PID1, the kernel uses /dev/console for our stdios and uses TERM=linux whatever the
+ * backend device used by the console. We try to make a better guess here since some consoles might
+ * not have support for color mode for example.
+ *
+ * However if TERM was configured through the kernel command line then leave it alone. */
+ r = proc_cmdline_get_key("TERM", 0, &term);
+ if (r < 0)
+ return r;
+
+ t = term ?: default_term_for_tty("/dev/console");
+
+ if (setenv("TERM", t, 1) < 0)
+ return -errno;
+
+ /* The kernels sets HOME=/ for init. Let's undo this. */
+ if (path_equal_ptr(getenv("HOME"), "/"))
+ assert_se(unsetenv("HOME") == 0);
+
+ return 0;
+}
+
+static void redirect_telinit(int argc, char *argv[]) {
+
+ /* This is compatibility support for SysV, where calling init as a user is identical to telinit. */
+
+#if HAVE_SYSV_COMPAT
+ if (getpid_cached() == 1)
+ return;
+
+ if (!invoked_as(argv, "init"))
+ return;
+
+ execv(SYSTEMCTL_BINARY_PATH, argv);
+ log_error_errno(errno, "Failed to exec " SYSTEMCTL_BINARY_PATH ": %m");
+ exit(EXIT_FAILURE);
+#endif
+}
+
+static int become_shutdown(
+ const char *shutdown_verb,
+ int retval) {
+
+ char log_level[STRLEN("--log-level=") + DECIMAL_STR_MAX(int)],
+ exit_code[STRLEN("--exit-code=") + DECIMAL_STR_MAX(uint8_t)],
+ timeout[STRLEN("--timeout=") + DECIMAL_STR_MAX(usec_t) + STRLEN("us")];
+
+ const char* command_line[14] = {
+ SYSTEMD_SHUTDOWN_BINARY_PATH,
+ shutdown_verb,
+ timeout,
+ log_level,
+ };
+
+ _cleanup_strv_free_ char **env_block = NULL;
+ usec_t watchdog_timer = 0;
+ size_t pos = 4;
+ int r;
+
+ assert(shutdown_verb);
+ assert(!command_line[pos]);
+ env_block = strv_copy(environ);
+
+ xsprintf(log_level, "--log-level=%d", log_get_max_level());
+ xsprintf(timeout, "--timeout=%" PRI_USEC "us", arg_default_timeout_stop_usec);
+
+ switch (log_get_target()) {
+
+ case LOG_TARGET_KMSG:
+ case LOG_TARGET_JOURNAL_OR_KMSG:
+ case LOG_TARGET_SYSLOG_OR_KMSG:
+ command_line[pos++] = "--log-target=kmsg";
+ break;
+
+ case LOG_TARGET_NULL:
+ command_line[pos++] = "--log-target=null";
+ break;
+
+ case LOG_TARGET_CONSOLE:
+ default:
+ command_line[pos++] = "--log-target=console";
+ break;
+ };
+
+ if (log_get_show_color())
+ command_line[pos++] = "--log-color";
+
+ if (log_get_show_location())
+ command_line[pos++] = "--log-location";
+
+ if (log_get_show_time())
+ command_line[pos++] = "--log-time";
+
+ if (streq(shutdown_verb, "exit")) {
+ xsprintf(exit_code, "--exit-code=%d", retval);
+ command_line[pos++] = exit_code;
+ }
+
+ assert(pos < ELEMENTSOF(command_line));
+
+ if (streq(shutdown_verb, "reboot"))
+ watchdog_timer = arg_reboot_watchdog;
+ else if (streq(shutdown_verb, "kexec"))
+ watchdog_timer = arg_kexec_watchdog;
+
+ /* If we reboot or kexec let's set the shutdown watchdog and tell the
+ * shutdown binary to repeatedly ping it.
+ * Disable the pretimeout watchdog, as we do not support it from the shutdown binary. */
+ (void) watchdog_setup_pretimeout(0);
+ (void) watchdog_setup_pretimeout_governor(NULL);
+ r = watchdog_setup(watchdog_timer);
+ watchdog_close(r < 0);
+
+ /* Tell the binary how often to ping, ignore failure */
+ (void) strv_extendf(&env_block, "WATCHDOG_USEC="USEC_FMT, watchdog_timer);
+
+ if (arg_watchdog_device)
+ (void) strv_extendf(&env_block, "WATCHDOG_DEVICE=%s", arg_watchdog_device);
+
+ /* Avoid the creation of new processes forked by the kernel; at this
+ * point, we will not listen to the signals anyway */
+ if (detect_container() <= 0)
+ (void) cg_uninstall_release_agent(SYSTEMD_CGROUP_CONTROLLER);
+
+ execve(SYSTEMD_SHUTDOWN_BINARY_PATH, (char **) command_line, env_block);
+ return -errno;
+}
+
+static void initialize_clock(void) {
+ int r;
+
+ /* This is called very early on, before we parse the kernel command line or otherwise figure out why
+ * we are running, but only once. */
+
+ if (clock_is_localtime(NULL) > 0) {
+ int min;
+
+ /* The very first call of settimeofday() also does a time warp in the kernel.
+ *
+ * In the rtc-in-local time mode, we set the kernel's timezone, and rely on external tools to
+ * take care of maintaining the RTC and do all adjustments. This matches the behavior of
+ * Windows, which leaves the RTC alone if the registry tells that the RTC runs in UTC.
+ */
+ r = clock_set_timezone(&min);
+ if (r < 0)
+ log_error_errno(r, "Failed to apply local time delta, ignoring: %m");
+ else
+ log_info("RTC configured in localtime, applying delta of %i minutes to system time.", min);
+
+ } else if (!in_initrd())
+ /*
+ * Do a dummy very first call to seal the kernel's time warp magic.
+ *
+ * Do not call this from inside the initrd. The initrd might not carry /etc/adjtime with
+ * LOCAL, but the real system could be set up that way. In such case, we need to delay the
+ * time-warp or the sealing until we reach the real system.
+ *
+ * Do no set the kernel's timezone. The concept of local time cannot be supported reliably,
+ * the time will jump or be incorrect at every daylight saving time change. All kernel local
+ * time concepts will be treated as UTC that way.
+ */
+ (void) clock_reset_timewarp();
+
+ ClockChangeDirection change_dir;
+ r = clock_apply_epoch(&change_dir);
+ if (r > 0 && change_dir == CLOCK_CHANGE_FORWARD)
+ log_info("System time before build time, advancing clock.");
+ else if (r > 0 && change_dir == CLOCK_CHANGE_BACKWARD)
+ log_info("System time is further ahead than %s after build time, resetting clock to build time.",
+ FORMAT_TIMESPAN(CLOCK_VALID_RANGE_USEC_MAX, USEC_PER_DAY));
+ else if (r < 0 && change_dir == CLOCK_CHANGE_FORWARD)
+ log_error_errno(r, "Current system time is before build time, but cannot correct: %m");
+ else if (r < 0 && change_dir == CLOCK_CHANGE_BACKWARD)
+ log_error_errno(r, "Current system time is further ahead %s after build time, but cannot correct: %m",
+ FORMAT_TIMESPAN(CLOCK_VALID_RANGE_USEC_MAX, USEC_PER_DAY));
+}
+
+static void apply_clock_update(void) {
+ /* This is called later than initialize_clock(), i.e. after we parsed configuration files/kernel
+ * command line and such. */
+
+ if (arg_clock_usec == 0)
+ return;
+
+ if (getpid_cached() != 1)
+ return;
+
+ if (clock_settime(CLOCK_REALTIME, TIMESPEC_STORE(arg_clock_usec)) < 0)
+ log_error_errno(errno, "Failed to set system clock to time specified on kernel command line: %m");
+ else
+ log_info("Set system clock to %s, as specified on the kernel command line.",
+ FORMAT_TIMESTAMP(arg_clock_usec));
+}
+
+static void cmdline_take_random_seed(void) {
+ size_t suggested;
+ int r;
+
+ if (arg_random_seed_size == 0)
+ return;
+
+ if (getpid_cached() != 1)
+ return;
+
+ assert(arg_random_seed);
+ suggested = random_pool_size();
+
+ if (arg_random_seed_size < suggested)
+ log_warning("Random seed specified on kernel command line has size %zu, but %zu bytes required to fill entropy pool.",
+ arg_random_seed_size, suggested);
+
+ r = random_write_entropy(-1, arg_random_seed, arg_random_seed_size, true);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to credit entropy specified on kernel command line, ignoring: %m");
+ return;
+ }
+
+ log_notice("Successfully credited entropy passed on kernel command line.\n"
+ "Note that the seed provided this way is accessible to unprivileged programs. "
+ "This functionality should not be used outside of testing environments.");
+}
+
+static void initialize_coredump(bool skip_setup) {
+#if ENABLE_COREDUMP
+ if (getpid_cached() != 1)
+ return;
+
+ /* Don't limit the core dump size, so that coredump handlers such as systemd-coredump (which honour
+ * the limit) will process core dumps for system services by default. */
+ if (setrlimit(RLIMIT_CORE, &RLIMIT_MAKE_CONST(RLIM_INFINITY)) < 0)
+ log_warning_errno(errno, "Failed to set RLIMIT_CORE: %m");
+
+ /* But at the same time, turn off the core_pattern logic by default, so that no coredumps are stored
+ * until the systemd-coredump tool is enabled via sysctl. However it can be changed via the kernel
+ * command line later so core dumps can still be generated during early startup and in initrd. */
+ if (!skip_setup)
+ disable_coredumps();
+#endif
+}
+
+static void initialize_core_pattern(bool skip_setup) {
+ int r;
+
+ if (skip_setup || !arg_early_core_pattern)
+ return;
+
+ if (getpid_cached() != 1)
+ return;
+
+ r = write_string_file("/proc/sys/kernel/core_pattern", arg_early_core_pattern, WRITE_STRING_FILE_DISABLE_BUFFER);
+ if (r < 0)
+ log_warning_errno(r, "Failed to write '%s' to /proc/sys/kernel/core_pattern, ignoring: %m",
+ arg_early_core_pattern);
+}
+
+static void update_cpu_affinity(bool skip_setup) {
+ _cleanup_free_ char *mask = NULL;
+
+ if (skip_setup || !arg_cpu_affinity.set)
+ return;
+
+ assert(arg_cpu_affinity.allocated > 0);
+
+ mask = cpu_set_to_range_string(&arg_cpu_affinity);
+ log_debug("Setting CPU affinity to {%s}.", strnull(mask));
+
+ if (sched_setaffinity(0, arg_cpu_affinity.allocated, arg_cpu_affinity.set) < 0)
+ log_warning_errno(errno, "Failed to set CPU affinity, ignoring: %m");
+}
+
+static void update_numa_policy(bool skip_setup) {
+ int r;
+ _cleanup_free_ char *nodes = NULL;
+ const char * policy = NULL;
+
+ if (skip_setup || !mpol_is_valid(numa_policy_get_type(&arg_numa_policy)))
+ return;
+
+ if (DEBUG_LOGGING) {
+ policy = mpol_to_string(numa_policy_get_type(&arg_numa_policy));
+ nodes = cpu_set_to_range_string(&arg_numa_policy.nodes);
+ log_debug("Setting NUMA policy to %s, with nodes {%s}.", strnull(policy), strnull(nodes));
+ }
+
+ r = apply_numa_policy(&arg_numa_policy);
+ if (r == -EOPNOTSUPP)
+ log_debug_errno(r, "NUMA support not available, ignoring.");
+ else if (r < 0)
+ log_warning_errno(r, "Failed to set NUMA memory policy, ignoring: %m");
+}
+
+static void filter_args(
+ const char* dst[],
+ size_t *dst_index,
+ char **src,
+ int argc) {
+
+ assert(dst);
+ assert(dst_index);
+
+ /* Copy some filtered arguments into the dst array from src. */
+ for (int i = 1; i < argc; i++) {
+ if (STR_IN_SET(src[i],
+ "--switched-root",
+ "--system",
+ "--user"))
+ continue;
+
+ if (startswith(src[i], "--deserialize="))
+ continue;
+ if (streq(src[i], "--deserialize")) {
+ i++; /* Skip the argument too */
+ continue;
+ }
+
+ /* Skip target unit designators. We already acted upon this information and have queued
+ * appropriate jobs. We don't want to redo all this after reexecution. */
+ if (startswith(src[i], "--unit="))
+ continue;
+ if (streq(src[i], "--unit")) {
+ i++; /* Skip the argument too */
+ continue;
+ }
+
+ /* Seems we have a good old option. Let's pass it over to the new instance. */
+ dst[(*dst_index)++] = src[i];
+ }
+}
+
+static int do_reexecute(
+ ManagerObjective objective,
+ int argc,
+ char* argv[],
+ const struct rlimit *saved_rlimit_nofile,
+ const struct rlimit *saved_rlimit_memlock,
+ FDSet *fds,
+ const char *switch_root_dir,
+ const char *switch_root_init,
+ const char **ret_error_message) {
+
+ size_t i, args_size;
+ const char **args;
+ int r;
+
+ assert(IN_SET(objective, MANAGER_REEXECUTE, MANAGER_SWITCH_ROOT));
+ assert(argc >= 0);
+ assert(saved_rlimit_nofile);
+ assert(saved_rlimit_memlock);
+ assert(ret_error_message);
+
+ /* Close and disarm the watchdog, so that the new instance can reinitialize it, but doesn't get
+ * rebooted while we do that */
+ watchdog_close(true);
+
+ /* Reset RLIMIT_NOFILE + RLIMIT_MEMLOCK back to the kernel defaults, so that the new systemd can pass
+ * the kernel default to its child processes */
+ if (saved_rlimit_nofile->rlim_cur != 0)
+ (void) setrlimit(RLIMIT_NOFILE, saved_rlimit_nofile);
+ if (saved_rlimit_memlock->rlim_cur != RLIM_INFINITY)
+ (void) setrlimit(RLIMIT_MEMLOCK, saved_rlimit_memlock);
+
+ if (switch_root_dir) {
+ /* Kill all remaining processes from the initrd, but don't wait for them, so that we can
+ * handle the SIGCHLD for them after deserializing. */
+ broadcast_signal(SIGTERM, false, true, arg_default_timeout_stop_usec);
+
+ /* And switch root with MS_MOVE, because we remove the old directory afterwards and detach it. */
+ r = switch_root(switch_root_dir, "/mnt", true, MS_MOVE);
+ if (r < 0)
+ log_error_errno(r, "Failed to switch root, trying to continue: %m");
+ }
+
+ args_size = argc + 5;
+ args = newa(const char*, args_size);
+
+ if (!switch_root_init) {
+ char sfd[STRLEN("--deserialize=") + DECIMAL_STR_MAX(int)];
+
+ /* First try to spawn ourselves with the right path, and with full serialization. We do this
+ * only if the user didn't specify an explicit init to spawn. */
+
+ assert(arg_serialization);
+ assert(fds);
+
+ xsprintf(sfd, "--deserialize=%i", fileno(arg_serialization));
+
+ i = 1; /* Leave args[0] empty for now. */
+
+ /* Put our stuff first to make sure it always gets parsed in case
+ * we get weird stuff from the kernel cmdline (like --) */
+ if (switch_root_dir)
+ args[i++] = "--switched-root";
+
+ args[i++] = arg_system ? "--system" : "--user";
+ args[i++] = sfd;
+
+ filter_args(args, &i, argv, argc);
+
+ args[i++] = NULL;
+
+ assert(i <= args_size);
+
+ /*
+ * We want valgrind to print its memory usage summary before reexecution. Valgrind won't do
+ * this is on its own on exec(), but it will do it on exit(). Hence, to ensure we get a
+ * summary here, fork() off a child, let it exit() cleanly, so that it prints the summary,
+ * and wait() for it in the parent, before proceeding into the exec().
+ */
+ valgrind_summary_hack();
+
+ args[0] = SYSTEMD_BINARY_PATH;
+ (void) execv(args[0], (char* const*) args);
+
+ if (objective == MANAGER_REEXECUTE) {
+ *ret_error_message = "Failed to execute our own binary";
+ return log_error_errno(errno, "Failed to execute our own binary %s: %m", args[0]);
+ }
+
+ log_debug_errno(errno, "Failed to execute our own binary %s, trying fallback: %m", args[0]);
+ }
+
+ /* Try the fallback, if there is any, without any serialization. We pass the original argv[] and
+ * envp[]. (Well, modulo the ordering changes due to getopt() in argv[], and some cleanups in envp[],
+ * but let's hope that doesn't matter.) */
+
+ arg_serialization = safe_fclose(arg_serialization);
+ fds = fdset_free(fds);
+
+ /* Reopen the console */
+ (void) make_console_stdio();
+
+ i = 1; /* Leave args[0] empty for now. */
+ for (int j = 1; j <= argc; j++)
+ args[i++] = argv[j];
+ assert(i <= args_size);
+
+ /* Re-enable any blocked signals, especially important if we switch from initrd to init=... */
+ (void) reset_all_signal_handlers();
+ (void) reset_signal_mask();
+ (void) rlimit_nofile_safe();
+
+ if (switch_root_init) {
+ args[0] = switch_root_init;
+ (void) execve(args[0], (char* const*) args, saved_env);
+ log_warning_errno(errno, "Failed to execute configured init %s, trying fallback: %m", args[0]);
+ }
+
+ args[0] = "/sbin/init";
+ (void) execv(args[0], (char* const*) args);
+ r = -errno;
+
+ manager_status_printf(NULL, STATUS_TYPE_EMERGENCY,
+ ANSI_HIGHLIGHT_RED " !! " ANSI_NORMAL,
+ "Failed to execute /sbin/init");
+
+ *ret_error_message = "Failed to execute fallback shell";
+ if (r == -ENOENT) {
+ log_warning("No /sbin/init, trying fallback");
+
+ args[0] = "/bin/sh";
+ args[1] = NULL;
+ (void) execve(args[0], (char* const*) args, saved_env);
+ return log_error_errno(errno, "Failed to execute /bin/sh, giving up: %m");
+ } else
+ return log_error_errno(r, "Failed to execute /sbin/init, giving up: %m");
+}
+
+static int invoke_main_loop(
+ Manager *m,
+ const struct rlimit *saved_rlimit_nofile,
+ const struct rlimit *saved_rlimit_memlock,
+ int *ret_retval, /* Return parameters relevant for shutting down */
+ const char **ret_shutdown_verb, /* … */
+ FDSet **ret_fds, /* Return parameters for reexecuting */
+ char **ret_switch_root_dir, /* … */
+ char **ret_switch_root_init, /* … */
+ const char **ret_error_message) {
+
+ int r;
+
+ assert(m);
+ assert(saved_rlimit_nofile);
+ assert(saved_rlimit_memlock);
+ assert(ret_retval);
+ assert(ret_shutdown_verb);
+ assert(ret_fds);
+ assert(ret_switch_root_dir);
+ assert(ret_switch_root_init);
+ assert(ret_error_message);
+
+ for (;;) {
+ int objective = manager_loop(m);
+ if (objective < 0) {
+ *ret_error_message = "Failed to run main loop";
+ return log_emergency_errno(objective, "Failed to run main loop: %m");
+ }
+
+ switch (objective) {
+
+ case MANAGER_RELOAD: {
+ LogTarget saved_log_target;
+ int saved_log_level;
+
+ log_info("Reloading.");
+
+ /* First, save any overridden log level/target, then parse the configuration file,
+ * which might change the log level to new settings. */
+
+ saved_log_level = m->log_level_overridden ? log_get_max_level() : -1;
+ saved_log_target = m->log_target_overridden ? log_get_target() : _LOG_TARGET_INVALID;
+
+ (void) parse_configuration(saved_rlimit_nofile, saved_rlimit_memlock);
+
+ set_manager_defaults(m);
+ set_manager_settings(m);
+
+ update_cpu_affinity(false);
+ update_numa_policy(false);
+
+ if (saved_log_level >= 0)
+ manager_override_log_level(m, saved_log_level);
+ if (saved_log_target >= 0)
+ manager_override_log_target(m, saved_log_target);
+
+ if (manager_reload(m) < 0)
+ /* Reloading failed before the point of no return.
+ * Let's continue running as if nothing happened. */
+ m->objective = MANAGER_OK;
+
+ continue;
+ }
+
+ case MANAGER_REEXECUTE:
+ r = prepare_reexecute(m, &arg_serialization, ret_fds, false);
+ if (r < 0) {
+ *ret_error_message = "Failed to prepare for reexecution";
+ return r;
+ }
+
+ log_notice("Reexecuting.");
+
+ *ret_retval = EXIT_SUCCESS;
+ *ret_shutdown_verb = NULL;
+ *ret_switch_root_dir = *ret_switch_root_init = NULL;
+
+ return objective;
+
+ case MANAGER_SWITCH_ROOT:
+ manager_set_switching_root(m, true);
+
+ if (!m->switch_root_init) {
+ r = prepare_reexecute(m, &arg_serialization, ret_fds, true);
+ if (r < 0) {
+ *ret_error_message = "Failed to prepare for reexecution";
+ return r;
+ }
+ } else
+ *ret_fds = NULL;
+
+ log_notice("Switching root.");
+
+ *ret_retval = EXIT_SUCCESS;
+ *ret_shutdown_verb = NULL;
+
+ /* Steal the switch root parameters */
+ *ret_switch_root_dir = TAKE_PTR(m->switch_root);
+ *ret_switch_root_init = TAKE_PTR(m->switch_root_init);
+
+ return objective;
+
+ case MANAGER_EXIT:
+ if (MANAGER_IS_USER(m)) {
+ log_debug("Exit.");
+
+ *ret_retval = m->return_value;
+ *ret_shutdown_verb = NULL;
+ *ret_fds = NULL;
+ *ret_switch_root_dir = *ret_switch_root_init = NULL;
+
+ return objective;
+ }
+
+ _fallthrough_;
+ case MANAGER_REBOOT:
+ case MANAGER_POWEROFF:
+ case MANAGER_HALT:
+ case MANAGER_KEXEC: {
+ static const char* const table[_MANAGER_OBJECTIVE_MAX] = {
+ [MANAGER_EXIT] = "exit",
+ [MANAGER_REBOOT] = "reboot",
+ [MANAGER_POWEROFF] = "poweroff",
+ [MANAGER_HALT] = "halt",
+ [MANAGER_KEXEC] = "kexec",
+ };
+
+ log_notice("Shutting down.");
+
+ *ret_retval = m->return_value;
+ assert_se(*ret_shutdown_verb = table[m->objective]);
+ *ret_fds = NULL;
+ *ret_switch_root_dir = *ret_switch_root_init = NULL;
+
+ return objective;
+ }
+
+ default:
+ assert_not_reached();
+ }
+ }
+}
+
+static void log_execution_mode(bool *ret_first_boot) {
+ bool first_boot = false;
+
+ assert(ret_first_boot);
+
+ if (arg_system) {
+ struct utsname uts;
+ int v;
+
+ log_info("systemd " GIT_VERSION " running in %ssystem mode (%s)",
+ arg_action == ACTION_TEST ? "test " : "",
+ systemd_features);
+
+ v = detect_virtualization();
+ if (v > 0)
+ log_info("Detected virtualization %s.", virtualization_to_string(v));
+
+ log_info("Detected architecture %s.", architecture_to_string(uname_architecture()));
+
+ if (in_initrd())
+ log_info("Running in initrd.");
+ else {
+ int r;
+ _cleanup_free_ char *id_text = NULL;
+
+ /* Let's check whether we are in first boot. First, check if an override was
+ * specified on the kernel commandline. If yes, we honour that. */
+
+ r = proc_cmdline_get_bool("systemd.condition-first-boot", &first_boot);
+ if (r < 0)
+ log_debug_errno(r, "Failed to parse systemd.condition-first-boot= kernel commandline argument, ignoring: %m");
+
+ if (r > 0)
+ log_full(first_boot ? LOG_INFO : LOG_DEBUG,
+ "Kernel commandline argument says we are %s first boot.",
+ first_boot ? "in" : "not in");
+ else {
+ /* Second, perform autodetection. We use /etc/machine-id as flag file for
+ * this: If it is missing or contains the value "uninitialized", this is the
+ * first boot. In other cases, it is not. This allows container managers and
+ * installers to provision a couple of files in /etc but still permit the
+ * first-boot initialization to occur. If the container manager wants to
+ * provision the machine ID it should pass $container_uuid to PID 1. */
+
+ r = read_one_line_file("/etc/machine-id", &id_text);
+ if (r < 0 || streq(id_text, "uninitialized")) {
+ if (r < 0 && r != -ENOENT)
+ log_warning_errno(r, "Unexpected error while reading /etc/machine-id, assuming first boot: %m");
+
+ first_boot = true;
+ log_info("Detected first boot.");
+ } else
+ log_debug("Detected initialized system, this is not the first boot.");
+ }
+ }
+
+ assert_se(uname(&uts) >= 0);
+
+ if (strverscmp_improved(uts.release, KERNEL_BASELINE_VERSION) < 0)
+ log_warning("Warning! Reported kernel version %s is older than systemd's required baseline kernel version %s. "
+ "Your mileage may vary.", uts.release, KERNEL_BASELINE_VERSION);
+ else
+ log_debug("Kernel version %s, our baseline is %s", uts.release, KERNEL_BASELINE_VERSION);
+ } else {
+ if (DEBUG_LOGGING) {
+ _cleanup_free_ char *t = NULL;
+
+ t = uid_to_name(getuid());
+ log_debug("systemd " GIT_VERSION " running in %suser mode for user " UID_FMT "/%s. (%s)",
+ arg_action == ACTION_TEST ? " test" : "",
+ getuid(), strna(t), systemd_features);
+ }
+ }
+
+ *ret_first_boot = first_boot;
+}
+
+static int initialize_runtime(
+ bool skip_setup,
+ bool first_boot,
+ struct rlimit *saved_rlimit_nofile,
+ struct rlimit *saved_rlimit_memlock,
+ const char **ret_error_message) {
+ int r;
+
+ assert(ret_error_message);
+
+ /* Sets up various runtime parameters. Many of these initializations are conditionalized:
+ *
+ * - Some only apply to --system instances
+ * - Some only apply to --user instances
+ * - Some only apply when we first start up, but not when we reexecute
+ */
+
+ if (arg_action != ACTION_RUN)
+ return 0;
+
+ update_cpu_affinity(skip_setup);
+ update_numa_policy(skip_setup);
+
+ if (arg_system) {
+ /* Make sure we leave a core dump without panicking the kernel. */
+ install_crash_handler();
+
+ if (!skip_setup) {
+ r = mount_cgroup_controllers();
+ if (r < 0) {
+ *ret_error_message = "Failed to mount cgroup hierarchies";
+ return r;
+ }
+
+ (void) os_release_status();
+ (void) hostname_setup(true);
+ /* Force transient machine-id on first boot. */
+ machine_id_setup(NULL, /* force_transient= */ first_boot, arg_machine_id, NULL);
+ (void) loopback_setup();
+ bump_unix_max_dgram_qlen();
+ bump_file_max_and_nr_open();
+ test_usr();
+ write_container_id();
+ }
+
+ r = watchdog_set_device(arg_watchdog_device);
+ if (r < 0)
+ log_warning_errno(r, "Failed to set watchdog device to %s, ignoring: %m", arg_watchdog_device);
+ } else {
+ _cleanup_free_ char *p = NULL;
+
+ /* Create the runtime directory and place the inaccessible device nodes there, if we run in
+ * user mode. In system mode mount_setup() already did that. */
+
+ r = xdg_user_runtime_dir(&p, "/systemd");
+ if (r < 0) {
+ *ret_error_message = "$XDG_RUNTIME_DIR is not set";
+ return log_emergency_errno(r, "Failed to determine $XDG_RUNTIME_DIR path: %m");
+ }
+
+ (void) mkdir_p_label(p, 0755);
+ (void) make_inaccessible_nodes(p, UID_INVALID, GID_INVALID);
+ }
+
+ if (arg_timer_slack_nsec != NSEC_INFINITY)
+ if (prctl(PR_SET_TIMERSLACK, arg_timer_slack_nsec) < 0)
+ log_warning_errno(errno, "Failed to adjust timer slack, ignoring: %m");
+
+ if (arg_system && !cap_test_all(arg_capability_bounding_set)) {
+ r = capability_bounding_set_drop_usermode(arg_capability_bounding_set);
+ if (r < 0) {
+ *ret_error_message = "Failed to drop capability bounding set of usermode helpers";
+ return log_emergency_errno(r, "Failed to drop capability bounding set of usermode helpers: %m");
+ }
+
+ r = capability_bounding_set_drop(arg_capability_bounding_set, true);
+ if (r < 0) {
+ *ret_error_message = "Failed to drop capability bounding set";
+ return log_emergency_errno(r, "Failed to drop capability bounding set: %m");
+ }
+ }
+
+ if (arg_system && arg_no_new_privs) {
+ if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
+ *ret_error_message = "Failed to disable new privileges";
+ return log_emergency_errno(errno, "Failed to disable new privileges: %m");
+ }
+ }
+
+ if (arg_syscall_archs) {
+ r = enforce_syscall_archs(arg_syscall_archs);
+ if (r < 0) {
+ *ret_error_message = "Failed to set syscall architectures";
+ return r;
+ }
+ }
+
+ if (!arg_system)
+ /* Become reaper of our children */
+ if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0)
+ log_warning_errno(errno, "Failed to make us a subreaper, ignoring: %m");
+
+ /* Bump up RLIMIT_NOFILE for systemd itself */
+ (void) bump_rlimit_nofile(saved_rlimit_nofile);
+ (void) bump_rlimit_memlock(saved_rlimit_memlock);
+
+ /* Pull credentials from various sources into a common credential directory */
+ if (arg_system && !skip_setup)
+ (void) import_credentials();
+
+ return 0;
+}
+
+static int do_queue_default_job(
+ Manager *m,
+ const char **ret_error_message) {
+
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ const char *unit;
+ Job *job;
+ Unit *target;
+ int r;
+
+ if (arg_default_unit)
+ unit = arg_default_unit;
+ else if (in_initrd())
+ unit = SPECIAL_INITRD_TARGET;
+ else
+ unit = SPECIAL_DEFAULT_TARGET;
+
+ log_debug("Activating default unit: %s", unit);
+
+ r = manager_load_startable_unit_or_warn(m, unit, NULL, &target);
+ if (r < 0 && in_initrd() && !arg_default_unit) {
+ /* Fall back to default.target, which we used to always use by default. Only do this if no
+ * explicit configuration was given. */
+
+ log_info("Falling back to " SPECIAL_DEFAULT_TARGET ".");
+
+ r = manager_load_startable_unit_or_warn(m, SPECIAL_DEFAULT_TARGET, NULL, &target);
+ }
+ if (r < 0) {
+ log_info("Falling back to " SPECIAL_RESCUE_TARGET ".");
+
+ r = manager_load_startable_unit_or_warn(m, SPECIAL_RESCUE_TARGET, NULL, &target);
+ if (r < 0) {
+ *ret_error_message = r == -ERFKILL ? SPECIAL_RESCUE_TARGET " masked"
+ : "Failed to load " SPECIAL_RESCUE_TARGET;
+ return r;
+ }
+ }
+
+ assert(target->load_state == UNIT_LOADED);
+
+ r = manager_add_job(m, JOB_START, target, JOB_ISOLATE, NULL, &error, &job);
+ if (r == -EPERM) {
+ log_debug_errno(r, "Default target could not be isolated, starting instead: %s", bus_error_message(&error, r));
+
+ sd_bus_error_free(&error);
+
+ r = manager_add_job(m, JOB_START, target, JOB_REPLACE, NULL, &error, &job);
+ if (r < 0) {
+ *ret_error_message = "Failed to start default target";
+ return log_emergency_errno(r, "Failed to start default target: %s", bus_error_message(&error, r));
+ }
+
+ } else if (r < 0) {
+ *ret_error_message = "Failed to isolate default target";
+ return log_emergency_errno(r, "Failed to isolate default target: %s", bus_error_message(&error, r));
+ } else
+ log_info("Queued %s job for default target %s.",
+ job_type_to_string(job->type),
+ unit_status_string(job->unit, NULL));
+
+ m->default_unit_job_id = job->id;
+
+ return 0;
+}
+
+static void save_rlimits(struct rlimit *saved_rlimit_nofile,
+ struct rlimit *saved_rlimit_memlock) {
+
+ assert(saved_rlimit_nofile);
+ assert(saved_rlimit_memlock);
+
+ if (getrlimit(RLIMIT_NOFILE, saved_rlimit_nofile) < 0)
+ log_warning_errno(errno, "Reading RLIMIT_NOFILE failed, ignoring: %m");
+
+ if (getrlimit(RLIMIT_MEMLOCK, saved_rlimit_memlock) < 0)
+ log_warning_errno(errno, "Reading RLIMIT_MEMLOCK failed, ignoring: %m");
+}
+
+static void fallback_rlimit_nofile(const struct rlimit *saved_rlimit_nofile) {
+ struct rlimit *rl;
+
+ if (arg_default_rlimit[RLIMIT_NOFILE])
+ return;
+
+ /* Make sure forked processes get limits based on the original kernel setting */
+
+ rl = newdup(struct rlimit, saved_rlimit_nofile, 1);
+ if (!rl) {
+ log_oom();
+ return;
+ }
+
+ /* Bump the hard limit for system services to a substantially higher value. The default
+ * hard limit current kernels set is pretty low (4K), mostly for historical
+ * reasons. According to kernel developers, the fd handling in recent kernels has been
+ * optimized substantially enough, so that we can bump the limit now, without paying too
+ * high a price in memory or performance. Note however that we only bump the hard limit,
+ * not the soft limit. That's because select() works the way it works, and chokes on fds
+ * >= 1024. If we'd bump the soft limit globally, it might accidentally happen to
+ * unexpecting programs that they get fds higher than what they can process using
+ * select(). By only bumping the hard limit but leaving the low limit as it is we avoid
+ * this pitfall: programs that are written by folks aware of the select() problem in mind
+ * (and thus use poll()/epoll instead of select(), the way everybody should) can
+ * explicitly opt into high fds by bumping their soft limit beyond 1024, to the hard limit
+ * we pass. */
+ if (arg_system) {
+ int nr;
+
+ /* Get the underlying absolute limit the kernel enforces */
+ nr = read_nr_open();
+
+ rl->rlim_max = MIN((rlim_t) nr, MAX(rl->rlim_max, (rlim_t) HIGH_RLIMIT_NOFILE));
+ }
+
+ /* If for some reason we were invoked with a soft limit above 1024 (which should never
+ * happen!, but who knows what we get passed in from pam_limit when invoked as --user
+ * instance), then lower what we pass on to not confuse our children */
+ rl->rlim_cur = MIN(rl->rlim_cur, (rlim_t) FD_SETSIZE);
+
+ arg_default_rlimit[RLIMIT_NOFILE] = rl;
+}
+
+static void fallback_rlimit_memlock(const struct rlimit *saved_rlimit_memlock) {
+ struct rlimit *rl;
+
+ /* Pass the original value down to invoked processes */
+
+ if (arg_default_rlimit[RLIMIT_MEMLOCK])
+ return;
+
+ rl = newdup(struct rlimit, saved_rlimit_memlock, 1);
+ if (!rl) {
+ log_oom();
+ return;
+ }
+
+ if (arg_system) {
+ /* Raise the default limit to 8M also on old kernels and in containers (8M is the kernel
+ * default for this since kernel 5.16) */
+ rl->rlim_max = MAX(rl->rlim_max, (rlim_t) DEFAULT_RLIMIT_MEMLOCK);
+ rl->rlim_cur = MAX(rl->rlim_cur, (rlim_t) DEFAULT_RLIMIT_MEMLOCK);
+ }
+
+ arg_default_rlimit[RLIMIT_MEMLOCK] = rl;
+}
+
+static void setenv_manager_environment(void) {
+ int r;
+
+ STRV_FOREACH(p, arg_manager_environment) {
+ log_debug("Setting '%s' in our own environment.", *p);
+
+ r = putenv_dup(*p, true);
+ if (r < 0)
+ log_warning_errno(errno, "Failed to setenv \"%s\", ignoring: %m", *p);
+ }
+}
+
+static void reset_arguments(void) {
+ /* Frees/resets arg_* variables, with a few exceptions commented below. */
+
+ arg_default_unit = mfree(arg_default_unit);
+
+ /* arg_system — ignore */
+
+ arg_dump_core = true;
+ arg_crash_chvt = -1;
+ arg_crash_shell = false;
+ arg_crash_reboot = false;
+ arg_confirm_spawn = mfree(arg_confirm_spawn);
+ arg_show_status = _SHOW_STATUS_INVALID;
+ arg_status_unit_format = STATUS_UNIT_FORMAT_DEFAULT;
+ arg_switched_root = false;
+ arg_pager_flags = 0;
+ arg_service_watchdogs = true;
+ arg_default_std_output = EXEC_OUTPUT_JOURNAL;
+ arg_default_std_error = EXEC_OUTPUT_INHERIT;
+ arg_default_restart_usec = DEFAULT_RESTART_USEC;
+ arg_default_timeout_start_usec = DEFAULT_TIMEOUT_USEC;
+ arg_default_timeout_stop_usec = DEFAULT_TIMEOUT_USEC;
+ arg_default_timeout_abort_usec = DEFAULT_TIMEOUT_USEC;
+ arg_default_timeout_abort_set = false;
+ arg_default_device_timeout_usec = DEFAULT_TIMEOUT_USEC;
+ arg_default_start_limit_interval = DEFAULT_START_LIMIT_INTERVAL;
+ arg_default_start_limit_burst = DEFAULT_START_LIMIT_BURST;
+ arg_runtime_watchdog = 0;
+ arg_reboot_watchdog = 10 * USEC_PER_MINUTE;
+ arg_kexec_watchdog = 0;
+ arg_pretimeout_watchdog = 0;
+ arg_early_core_pattern = mfree(arg_early_core_pattern);
+ arg_watchdog_device = mfree(arg_watchdog_device);
+ arg_watchdog_pretimeout_governor = mfree(arg_watchdog_pretimeout_governor);
+
+ arg_default_environment = strv_free(arg_default_environment);
+ arg_manager_environment = strv_free(arg_manager_environment);
+ rlimit_free_all(arg_default_rlimit);
+
+ arg_capability_bounding_set = CAP_ALL;
+ arg_no_new_privs = false;
+ arg_timer_slack_nsec = NSEC_INFINITY;
+ arg_default_timer_accuracy_usec = 1 * USEC_PER_MINUTE;
+
+ arg_syscall_archs = set_free(arg_syscall_archs);
+
+ /* arg_serialization — ignore */
+
+ arg_default_cpu_accounting = -1;
+ arg_default_io_accounting = false;
+ arg_default_ip_accounting = false;
+ arg_default_blockio_accounting = false;
+ arg_default_memory_accounting = MEMORY_ACCOUNTING_DEFAULT;
+ arg_default_tasks_accounting = true;
+ arg_default_tasks_max = DEFAULT_TASKS_MAX;
+ arg_machine_id = (sd_id128_t) {};
+ arg_cad_burst_action = EMERGENCY_ACTION_REBOOT_FORCE;
+ arg_default_oom_policy = OOM_STOP;
+
+ cpu_set_reset(&arg_cpu_affinity);
+ numa_policy_reset(&arg_numa_policy);
+
+ arg_random_seed = mfree(arg_random_seed);
+ arg_random_seed_size = 0;
+ arg_clock_usec = 0;
+
+ arg_default_oom_score_adjust_set = false;
+ arg_default_smack_process_label = mfree(arg_default_smack_process_label);
+}
+
+static void determine_default_oom_score_adjust(void) {
+ int r, a, b;
+
+ /* Run our services at slightly higher OOM score than ourselves. But let's be conservative here, and
+ * do this only if we don't run as root (i.e. only if we are run in user mode, for an unprivileged
+ * user). */
+
+ if (arg_default_oom_score_adjust_set)
+ return;
+
+ if (getuid() == 0)
+ return;
+
+ r = get_oom_score_adjust(&a);
+ if (r < 0)
+ return (void) log_warning_errno(r, "Failed to determine current OOM score adjustment value, ignoring: %m");
+
+ assert_cc(100 <= OOM_SCORE_ADJ_MAX);
+ b = a >= OOM_SCORE_ADJ_MAX - 100 ? OOM_SCORE_ADJ_MAX : a + 100;
+
+ if (a == b)
+ return;
+
+ arg_default_oom_score_adjust = b;
+ arg_default_oom_score_adjust_set = true;
+}
+
+static int parse_configuration(const struct rlimit *saved_rlimit_nofile,
+ const struct rlimit *saved_rlimit_memlock) {
+ int r;
+
+ assert(saved_rlimit_nofile);
+ assert(saved_rlimit_memlock);
+
+ /* Assign configuration defaults */
+ reset_arguments();
+
+ r = parse_config_file();
+ if (r < 0)
+ log_warning_errno(r, "Failed to parse config file, ignoring: %m");
+
+ if (arg_system) {
+ r = proc_cmdline_parse(parse_proc_cmdline_item, NULL, 0);
+ if (r < 0)
+ log_warning_errno(r, "Failed to parse kernel command line, ignoring: %m");
+ }
+
+ /* Initialize some default rlimits for services if they haven't been configured */
+ fallback_rlimit_nofile(saved_rlimit_nofile);
+ fallback_rlimit_memlock(saved_rlimit_memlock);
+
+ /* Note that this also parses bits from the kernel command line, including "debug". */
+ log_parse_environment();
+
+ /* Initialize the show status setting if it hasn't been set explicitly yet */
+ if (arg_show_status == _SHOW_STATUS_INVALID)
+ arg_show_status = SHOW_STATUS_YES;
+
+ /* Slightly raise the OOM score for our services if we are running for unprivileged users. */
+ determine_default_oom_score_adjust();
+
+ /* Push variables into the manager environment block */
+ setenv_manager_environment();
+
+ /* Parse log environment variables again to take into account any new environment variables. */
+ log_parse_environment();
+
+ return 0;
+}
+
+static int safety_checks(void) {
+
+ if (getpid_cached() == 1 &&
+ arg_action != ACTION_RUN)
+ return log_error_errno(SYNTHETIC_ERRNO(EPERM),
+ "Unsupported execution mode while PID 1.");
+
+ if (getpid_cached() == 1 &&
+ !arg_system)
+ return log_error_errno(SYNTHETIC_ERRNO(EPERM),
+ "Can't run --user mode as PID 1.");
+
+ if (arg_action == ACTION_RUN &&
+ arg_system &&
+ getpid_cached() != 1)
+ return log_error_errno(SYNTHETIC_ERRNO(EPERM),
+ "Can't run system mode unless PID 1.");
+
+ if (arg_action == ACTION_TEST &&
+ geteuid() == 0)
+ return log_error_errno(SYNTHETIC_ERRNO(EPERM),
+ "Don't run test mode as root.");
+
+ if (!arg_system &&
+ arg_action == ACTION_RUN &&
+ sd_booted() <= 0)
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "Trying to run as user instance, but the system has not been booted with systemd.");
+
+ if (!arg_system &&
+ arg_action == ACTION_RUN &&
+ !getenv("XDG_RUNTIME_DIR"))
+ return log_error_errno(SYNTHETIC_ERRNO(EUNATCH),
+ "Trying to run as user instance, but $XDG_RUNTIME_DIR is not set.");
+
+ if (arg_system &&
+ arg_action == ACTION_RUN &&
+ running_in_chroot() > 0)
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "Cannot be run in a chroot() environment.");
+
+ return 0;
+}
+
+static int initialize_security(
+ bool *loaded_policy,
+ dual_timestamp *security_start_timestamp,
+ dual_timestamp *security_finish_timestamp,
+ const char **ret_error_message) {
+
+ int r;
+
+ assert(loaded_policy);
+ assert(security_start_timestamp);
+ assert(security_finish_timestamp);
+ assert(ret_error_message);
+
+ dual_timestamp_get(security_start_timestamp);
+
+ r = mac_selinux_setup(loaded_policy);
+ if (r < 0) {
+ *ret_error_message = "Failed to load SELinux policy";
+ return r;
+ }
+
+ r = mac_smack_setup(loaded_policy);
+ if (r < 0) {
+ *ret_error_message = "Failed to load SMACK policy";
+ return r;
+ }
+
+ r = mac_apparmor_setup();
+ if (r < 0) {
+ *ret_error_message = "Failed to load AppArmor policy";
+ return r;
+ }
+
+ r = ima_setup();
+ if (r < 0) {
+ *ret_error_message = "Failed to load IMA policy";
+ return r;
+ }
+
+ dual_timestamp_get(security_finish_timestamp);
+ return 0;
+}
+
+static int collect_fds(FDSet **ret_fds, const char **ret_error_message) {
+ int r;
+
+ assert(ret_fds);
+ assert(ret_error_message);
+
+ /* Pick up all fds passed to us. We apply a filter here: we only take the fds that have O_CLOEXEC
+ * off. All fds passed via execve() to us must have O_CLOEXEC off, and our own code and dependencies
+ * should be clean enough to set O_CLOEXEC universally. Thus checking the bit should be a safe
+ * mechanism to distinguish passed in fds from our own.
+ *
+ * Why bother? Some subsystems we initialize early, specifically selinux might keep fds open in our
+ * process behind our back. We should not take possession of that (and then accidentally close
+ * it). SELinux thankfully sets O_CLOEXEC on its fds, so this test should work. */
+ r = fdset_new_fill(/* filter_cloexec= */ 0, ret_fds);
+ if (r < 0) {
+ *ret_error_message = "Failed to allocate fd set";
+ return log_emergency_errno(r, "Failed to allocate fd set: %m");
+ }
+
+ /* The serialization fd should have O_CLOEXEC turned on already, let's verify that we didn't pick it up here */
+ assert_se(!arg_serialization || !fdset_contains(*ret_fds, fileno(arg_serialization)));
+
+ return 0;
+}
+
+static void setup_console_terminal(bool skip_setup) {
+
+ if (!arg_system)
+ return;
+
+ /* Become a session leader if we aren't one yet. */
+ (void) setsid();
+
+ /* If we are init, we connect stdin/stdout/stderr to /dev/null and make sure we don't have a
+ * controlling tty. */
+ (void) release_terminal();
+
+ /* Reset the console, but only if this is really init and we are freshly booted */
+ if (getpid_cached() == 1 && !skip_setup)
+ (void) console_setup();
+}
+
+static bool early_skip_setup_check(int argc, char *argv[]) {
+ bool found_deserialize = false;
+
+ /* Determine if this is a reexecution or normal bootup. We do the full command line parsing much
+ * later, so let's just have a quick peek here. Note that if we have switched root, do all the
+ * special setup things anyway, even if in that case we also do deserialization. */
+
+ for (int i = 1; i < argc; i++)
+ if (streq(argv[i], "--switched-root"))
+ return false; /* If we switched root, don't skip the setup. */
+ else if (startswith(argv[i], "--deserialize=") || streq(argv[i], "--deserialize"))
+ found_deserialize = true;
+
+ return found_deserialize; /* When we are deserializing, then we are reexecuting, hence avoid the extensive setup */
+}
+
+static int save_env(void) {
+ char **l;
+
+ l = strv_copy(environ);
+ if (!l)
+ return -ENOMEM;
+
+ strv_free_and_replace(saved_env, l);
+ return 0;
+}
+
+int main(int argc, char *argv[]) {
+ dual_timestamp
+ initrd_timestamp = DUAL_TIMESTAMP_NULL,
+ userspace_timestamp = DUAL_TIMESTAMP_NULL,
+ kernel_timestamp = DUAL_TIMESTAMP_NULL,
+ security_start_timestamp = DUAL_TIMESTAMP_NULL,
+ security_finish_timestamp = DUAL_TIMESTAMP_NULL;
+ struct rlimit saved_rlimit_nofile = RLIMIT_MAKE_CONST(0),
+ saved_rlimit_memlock = RLIMIT_MAKE_CONST(RLIM_INFINITY); /* The original rlimits we passed
+ * in. Note we use different values
+ * for the two that indicate whether
+ * these fields are initialized! */
+ bool skip_setup, loaded_policy = false, queue_default_job = false, first_boot = false;
+ char *switch_root_dir = NULL, *switch_root_init = NULL;
+ usec_t before_startup, after_startup;
+ static char systemd[] = "systemd";
+ const char *shutdown_verb = NULL, *error_message = NULL;
+ int r, retval = EXIT_FAILURE;
+ Manager *m = NULL;
+ FDSet *fds = NULL;
+
+ assert_se(argc > 0 && !isempty(argv[0]));
+
+ /* SysV compatibility: redirect init → telinit */
+ redirect_telinit(argc, argv);
+
+ /* Take timestamps early on */
+ dual_timestamp_from_monotonic(&kernel_timestamp, 0);
+ dual_timestamp_get(&userspace_timestamp);
+
+ /* Figure out whether we need to do initialize the system, or if we already did that because we are
+ * reexecuting. */
+ skip_setup = early_skip_setup_check(argc, argv);
+
+ /* If we get started via the /sbin/init symlink then we are called 'init'. After a subsequent
+ * reexecution we are then called 'systemd'. That is confusing, hence let's call us systemd
+ * right-away. */
+ program_invocation_short_name = systemd;
+ (void) prctl(PR_SET_NAME, systemd);
+
+ /* Save the original command line */
+ save_argc_argv(argc, argv);
+
+ /* Save the original environment as we might need to restore it if we're requested to execute another
+ * system manager later. */
+ r = save_env();
+ if (r < 0) {
+ error_message = "Failed to copy environment block";
+ goto finish;
+ }
+
+ /* Make sure that if the user says "syslog" we actually log to the journal. */
+ log_set_upgrade_syslog_to_journal(true);
+
+ if (getpid_cached() == 1) {
+ /* When we run as PID 1 force system mode */
+ arg_system = true;
+
+ /* Disable the umask logic */
+ umask(0);
+
+ /* Make sure that at least initially we do not ever log to journald/syslogd, because it might
+ * not be activated yet (even though the log socket for it exists). */
+ log_set_prohibit_ipc(true);
+
+ /* Always reopen /dev/console when running as PID 1 or one of its pre-execve() children. This
+ * is important so that we never end up logging to any foreign stderr, for example if we have
+ * to log in a child process right before execve()'ing the actual binary, at a point in time
+ * where socket activation stderr/stdout area already set up. */
+ log_set_always_reopen_console(true);
+
+ if (detect_container() <= 0) {
+
+ /* Running outside of a container as PID 1 */
+ log_set_target(LOG_TARGET_KMSG);
+ log_open();
+
+ if (in_initrd())
+ initrd_timestamp = userspace_timestamp;
+
+ if (!skip_setup) {
+ r = mount_setup_early();
+ if (r < 0) {
+ error_message = "Failed to mount early API filesystems";
+ goto finish;
+ }
+
+ /* Let's open the log backend a second time, in case the first time didn't
+ * work. Quite possibly we have mounted /dev just now, so /dev/kmsg became
+ * available, and it previously wasn't. */
+ log_open();
+
+ disable_printk_ratelimit();
+
+ r = initialize_security(
+ &loaded_policy,
+ &security_start_timestamp,
+ &security_finish_timestamp,
+ &error_message);
+ if (r < 0)
+ goto finish;
+ }
+
+ if (mac_selinux_init() < 0) {
+ error_message = "Failed to initialize SELinux support";
+ goto finish;
+ }
+
+ if (!skip_setup)
+ initialize_clock();
+
+ /* Set the default for later on, but don't actually open the logs like this for
+ * now. Note that if we are transitioning from the initrd there might still be
+ * journal fd open, and we shouldn't attempt opening that before we parsed
+ * /proc/cmdline which might redirect output elsewhere. */
+ log_set_target(LOG_TARGET_JOURNAL_OR_KMSG);
+
+ } else {
+ /* Running inside a container, as PID 1 */
+ log_set_target(LOG_TARGET_CONSOLE);
+ log_open();
+
+ /* For later on, see above... */
+ log_set_target(LOG_TARGET_JOURNAL);
+
+ /* clear the kernel timestamp, because we are in a container */
+ kernel_timestamp = DUAL_TIMESTAMP_NULL;
+ }
+
+ initialize_coredump(skip_setup);
+
+ r = fixup_environment();
+ if (r < 0) {
+ log_emergency_errno(r, "Failed to fix up PID 1 environment: %m");
+ error_message = "Failed to fix up PID1 environment";
+ goto finish;
+ }
+
+ /* Try to figure out if we can use colors with the console. No need to do that for user
+ * instances since they never log into the console. */
+ log_show_color(colors_enabled());
+
+ r = make_null_stdio();
+ if (r < 0)
+ log_warning_errno(r, "Failed to redirect standard streams to /dev/null, ignoring: %m");
+
+ /* Load the kernel modules early. */
+ if (!skip_setup)
+ (void) kmod_setup();
+
+ /* Mount /proc, /sys and friends, so that /proc/cmdline and /proc/$PID/fd is available. */
+ r = mount_setup(loaded_policy, skip_setup);
+ if (r < 0) {
+ error_message = "Failed to mount API filesystems";
+ goto finish;
+ }
+
+ /* The efivarfs is now mounted, let's read the random seed off it */
+ (void) efi_take_random_seed();
+
+ /* Cache command-line options passed from EFI variables */
+ if (!skip_setup)
+ (void) cache_efi_options_variable();
+ } else {
+ /* Running as user instance */
+ arg_system = false;
+ log_set_always_reopen_console(true);
+ log_set_target(LOG_TARGET_AUTO);
+ log_open();
+
+ /* clear the kernel timestamp, because we are not PID 1 */
+ kernel_timestamp = DUAL_TIMESTAMP_NULL;
+
+ /* Clear ambient capabilities, so services do not inherit them implicitly. Dropping them does
+ * not affect the permitted and effective sets which are important for the manager itself to
+ * operate. */
+ capability_ambient_set_apply(0, /* also_inherit= */ false);
+
+ if (mac_selinux_init() < 0) {
+ error_message = "Failed to initialize SELinux support";
+ goto finish;
+ }
+ }
+
+ /* Save the original RLIMIT_NOFILE/RLIMIT_MEMLOCK so that we can reset it later when
+ * transitioning from the initrd to the main systemd or suchlike. */
+ save_rlimits(&saved_rlimit_nofile, &saved_rlimit_memlock);
+
+ /* Reset all signal handlers. */
+ (void) reset_all_signal_handlers();
+ (void) ignore_signals(SIGNALS_IGNORE);
+
+ (void) parse_configuration(&saved_rlimit_nofile, &saved_rlimit_memlock);
+
+ r = parse_argv(argc, argv);
+ if (r < 0) {
+ error_message = "Failed to parse commandline arguments";
+ goto finish;
+ }
+
+ r = safety_checks();
+ if (r < 0)
+ goto finish;
+
+ if (IN_SET(arg_action, ACTION_TEST, ACTION_HELP, ACTION_DUMP_CONFIGURATION_ITEMS, ACTION_DUMP_BUS_PROPERTIES, ACTION_BUS_INTROSPECT))
+ pager_open(arg_pager_flags);
+
+ if (arg_action != ACTION_RUN)
+ skip_setup = true;
+
+ if (arg_action == ACTION_HELP) {
+ retval = help() < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
+ goto finish;
+ } else if (arg_action == ACTION_VERSION) {
+ retval = version();
+ goto finish;
+ } else if (arg_action == ACTION_DUMP_CONFIGURATION_ITEMS) {
+ unit_dump_config_items(stdout);
+ retval = EXIT_SUCCESS;
+ goto finish;
+ } else if (arg_action == ACTION_DUMP_BUS_PROPERTIES) {
+ dump_bus_properties(stdout);
+ retval = EXIT_SUCCESS;
+ goto finish;
+ } else if (arg_action == ACTION_BUS_INTROSPECT) {
+ r = bus_manager_introspect_implementations(stdout, arg_bus_introspect);
+ retval = r >= 0 ? EXIT_SUCCESS : EXIT_FAILURE;
+ goto finish;
+ }
+
+ assert_se(IN_SET(arg_action, ACTION_RUN, ACTION_TEST));
+
+ /* Move out of the way, so that we won't block unmounts */
+ assert_se(chdir("/") == 0);
+
+ if (arg_action == ACTION_RUN) {
+ if (!skip_setup) {
+ /* Apply the systemd.clock_usec= kernel command line switch */
+ apply_clock_update();
+
+ /* Apply random seed from kernel command line */
+ cmdline_take_random_seed();
+ }
+
+ /* A core pattern might have been specified via the cmdline. */
+ initialize_core_pattern(skip_setup);
+
+ /* Close logging fds, in order not to confuse collecting passed fds and terminal logic below */
+ log_close();
+
+ /* Remember open file descriptors for later deserialization */
+ r = collect_fds(&fds, &error_message);
+ if (r < 0)
+ goto finish;
+
+ /* Give up any control of the console, but make sure its initialized. */
+ setup_console_terminal(skip_setup);
+
+ /* Open the logging devices, if possible and necessary */
+ log_open();
+ }
+
+ log_execution_mode(&first_boot);
+
+ r = initialize_runtime(skip_setup,
+ first_boot,
+ &saved_rlimit_nofile,
+ &saved_rlimit_memlock,
+ &error_message);
+ if (r < 0)
+ goto finish;
+
+ r = manager_new(arg_system ? LOOKUP_SCOPE_SYSTEM : LOOKUP_SCOPE_USER,
+ arg_action == ACTION_TEST ? MANAGER_TEST_FULL : 0,
+ &m);
+ if (r < 0) {
+ log_emergency_errno(r, "Failed to allocate manager object: %m");
+ error_message = "Failed to allocate manager object";
+ goto finish;
+ }
+
+ m->timestamps[MANAGER_TIMESTAMP_KERNEL] = kernel_timestamp;
+ m->timestamps[MANAGER_TIMESTAMP_INITRD] = initrd_timestamp;
+ m->timestamps[MANAGER_TIMESTAMP_USERSPACE] = userspace_timestamp;
+ m->timestamps[manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_SECURITY_START)] = security_start_timestamp;
+ m->timestamps[manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_SECURITY_FINISH)] = security_finish_timestamp;
+
+ set_manager_defaults(m);
+ set_manager_settings(m);
+ manager_set_first_boot(m, first_boot);
+ manager_set_switching_root(m, arg_switched_root);
+
+ /* Remember whether we should queue the default job */
+ queue_default_job = !arg_serialization || arg_switched_root;
+
+ before_startup = now(CLOCK_MONOTONIC);
+
+ r = manager_startup(m, arg_serialization, fds, /* root= */ NULL);
+ if (r < 0) {
+ error_message = "Failed to start up manager";
+ goto finish;
+ }
+
+ /* This will close all file descriptors that were opened, but not claimed by any unit. */
+ fds = fdset_free(fds);
+ arg_serialization = safe_fclose(arg_serialization);
+
+ if (queue_default_job) {
+ r = do_queue_default_job(m, &error_message);
+ if (r < 0)
+ goto finish;
+ }
+
+ after_startup = now(CLOCK_MONOTONIC);
+
+ log_full(arg_action == ACTION_TEST ? LOG_INFO : LOG_DEBUG,
+ "Loaded units and determined initial transaction in %s.",
+ FORMAT_TIMESPAN(after_startup - before_startup, 100 * USEC_PER_MSEC));
+
+ if (arg_action == ACTION_TEST) {
+ manager_test_summary(m);
+ retval = EXIT_SUCCESS;
+ goto finish;
+ }
+
+ r = invoke_main_loop(m,
+ &saved_rlimit_nofile,
+ &saved_rlimit_memlock,
+ &retval,
+ &shutdown_verb,
+ &fds,
+ &switch_root_dir,
+ &switch_root_init,
+ &error_message);
+ assert(r < 0 || IN_SET(r, MANAGER_EXIT, /* MANAGER_OK is not expected here. */
+ MANAGER_RELOAD,
+ MANAGER_REEXECUTE,
+ MANAGER_REBOOT,
+ MANAGER_POWEROFF,
+ MANAGER_HALT,
+ MANAGER_KEXEC,
+ MANAGER_SWITCH_ROOT));
+
+finish:
+ pager_close();
+
+ if (m) {
+ arg_reboot_watchdog = manager_get_watchdog(m, WATCHDOG_REBOOT);
+ arg_kexec_watchdog = manager_get_watchdog(m, WATCHDOG_KEXEC);
+ m = manager_free(m);
+ }
+
+ mac_selinux_finish();
+
+ if (IN_SET(r, MANAGER_REEXECUTE, MANAGER_SWITCH_ROOT))
+ r = do_reexecute(r,
+ argc, argv,
+ &saved_rlimit_nofile,
+ &saved_rlimit_memlock,
+ fds,
+ switch_root_dir,
+ switch_root_init,
+ &error_message); /* This only returns if reexecution failed */
+
+ arg_serialization = safe_fclose(arg_serialization);
+ fds = fdset_free(fds);
+
+ saved_env = strv_free(saved_env);
+
+#if HAVE_VALGRIND_VALGRIND_H
+ /* If we are PID 1 and running under valgrind, then let's exit
+ * here explicitly. valgrind will only generate nice output on
+ * exit(), not on exec(), hence let's do the former not the
+ * latter here. */
+ if (getpid_cached() == 1 && RUNNING_ON_VALGRIND) {
+ /* Cleanup watchdog_device strings for valgrind. We need them
+ * in become_shutdown() so normally we cannot free them yet. */
+ watchdog_free_device();
+ reset_arguments();
+ return retval;
+ }
+#endif
+
+#if HAS_FEATURE_ADDRESS_SANITIZER
+ __lsan_do_leak_check();
+#endif
+
+ /* Try to invoke the shutdown binary unless we already failed.
+ * If we failed above, we want to freeze after finishing cleanup. */
+ if (r >= 0 && shutdown_verb) {
+ r = become_shutdown(shutdown_verb, retval);
+ log_error_errno(r, "Failed to execute shutdown binary, %s: %m", getpid_cached() == 1 ? "freezing" : "quitting");
+ error_message = "Failed to execute shutdown binary";
+ }
+
+ watchdog_free_device();
+ arg_watchdog_device = mfree(arg_watchdog_device);
+
+ if (getpid_cached() == 1) {
+ if (error_message)
+ manager_status_printf(NULL, STATUS_TYPE_EMERGENCY,
+ ANSI_HIGHLIGHT_RED "!!!!!!" ANSI_NORMAL,
+ "%s.", error_message);
+ freeze_or_exit_or_reboot();
+ }
+
+ reset_arguments();
+ return retval;
+}
diff --git a/src/core/main.h b/src/core/main.h
new file mode 100644
index 0000000..b12a1cc
--- /dev/null
+++ b/src/core/main.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+extern bool arg_dump_core;
+extern int arg_crash_chvt;
+extern bool arg_crash_shell;
+extern bool arg_crash_reboot;
diff --git a/src/core/manager-dump.c b/src/core/manager-dump.c
new file mode 100644
index 0000000..5a92356
--- /dev/null
+++ b/src/core/manager-dump.c
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "build.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "hashmap.h"
+#include "manager-dump.h"
+#include "unit-serialize.h"
+
+void manager_dump_jobs(Manager *s, FILE *f, char **patterns, const char *prefix) {
+ Job *j;
+
+ assert(s);
+ assert(f);
+
+ HASHMAP_FOREACH(j, s->jobs) {
+
+ if (!strv_fnmatch_or_empty(patterns, j->unit->id, FNM_NOESCAPE))
+ continue;
+
+ job_dump(j, f, prefix);
+ }
+}
+
+void manager_dump_units(Manager *s, FILE *f, char **patterns, const char *prefix) {
+ Unit *u;
+ const char *t;
+
+ assert(s);
+ assert(f);
+
+ HASHMAP_FOREACH_KEY(u, t, s->units) {
+ if (u->id != t)
+ continue;
+
+ if (!strv_fnmatch_or_empty(patterns, u->id, FNM_NOESCAPE))
+ continue;
+
+ unit_dump(u, f, prefix);
+ }
+}
+
+static void manager_dump_header(Manager *m, FILE *f, const char *prefix) {
+
+ /* NB: this is a debug interface for developers. It's not supposed to be machine readable or be
+ * stable between versions. We take the liberty to restructure it entirely between versions and
+ * add/remove fields at will. */
+
+ fprintf(f, "%sManager: systemd " STRINGIFY(PROJECT_VERSION) " (" GIT_VERSION ")\n", strempty(prefix));
+ fprintf(f, "%sFeatures: %s\n", strempty(prefix), systemd_features);
+
+ for (ManagerTimestamp q = 0; q < _MANAGER_TIMESTAMP_MAX; q++) {
+ const dual_timestamp *t = m->timestamps + q;
+
+ if (dual_timestamp_is_set(t))
+ fprintf(f, "%sTimestamp %s: %s\n",
+ strempty(prefix),
+ manager_timestamp_to_string(q),
+ timestamp_is_set(t->realtime) ? FORMAT_TIMESTAMP(t->realtime) :
+ FORMAT_TIMESPAN(t->monotonic, 1));
+ }
+}
+
+void manager_dump(Manager *m, FILE *f, char **patterns, const char *prefix) {
+ assert(m);
+ assert(f);
+
+ /* If no pattern is provided, dump the full manager state including the manager version, features and
+ * so on. Otherwise limit the dump to the units/jobs matching the specified patterns. */
+ if (!patterns)
+ manager_dump_header(m, f, prefix);
+
+ manager_dump_units(m, f, patterns, prefix);
+ manager_dump_jobs(m, f, patterns, prefix);
+}
+
+int manager_get_dump_string(Manager *m, char **patterns, char **ret) {
+ _cleanup_free_ char *dump = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+ size_t size;
+ int r;
+
+ assert(m);
+ assert(ret);
+
+ f = open_memstream_unlocked(&dump, &size);
+ if (!f)
+ return -errno;
+
+ manager_dump(m, f, patterns, NULL);
+
+ r = fflush_and_check(f);
+ if (r < 0)
+ return r;
+
+ f = safe_fclose(f);
+
+ *ret = TAKE_PTR(dump);
+
+ return 0;
+}
+
+void manager_test_summary(Manager *m) {
+ assert(m);
+
+ printf("-> By units:\n");
+ manager_dump_units(m, stdout, /* patterns= */ NULL, "\t");
+
+ printf("-> By jobs:\n");
+ manager_dump_jobs(m, stdout, /* patterns= */ NULL, "\t");
+}
diff --git a/src/core/manager-dump.h b/src/core/manager-dump.h
new file mode 100644
index 0000000..714a1f0
--- /dev/null
+++ b/src/core/manager-dump.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdio.h>
+
+#include "manager.h"
+
+void manager_dump_jobs(Manager *s, FILE *f, char **patterns, const char *prefix);
+void manager_dump_units(Manager *s, FILE *f, char **patterns, const char *prefix);
+void manager_dump(Manager *s, FILE *f, char **patterns, const char *prefix);
+int manager_get_dump_string(Manager *m, char **patterns, char **ret);
+void manager_test_summary(Manager *m);
diff --git a/src/core/manager-serialize.c b/src/core/manager-serialize.c
new file mode 100644
index 0000000..3beb8fd
--- /dev/null
+++ b/src/core/manager-serialize.c
@@ -0,0 +1,596 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "clean-ipc.h"
+#include "core-varlink.h"
+#include "dbus.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "macro.h"
+#include "manager-serialize.h"
+#include "manager.h"
+#include "parse-util.h"
+#include "serialize.h"
+#include "syslog-util.h"
+#include "unit-serialize.h"
+#include "user-util.h"
+#include "varlink-internal.h"
+
+int manager_open_serialization(Manager *m, FILE **ret_f) {
+ _cleanup_close_ int fd = -1;
+ FILE *f;
+
+ assert(ret_f);
+
+ fd = open_serialization_fd("systemd-state");
+ if (fd < 0)
+ return fd;
+
+ f = take_fdopen(&fd, "w+");
+ if (!f)
+ return -errno;
+
+ *ret_f = f;
+ return 0;
+}
+
+static bool manager_timestamp_shall_serialize(ManagerTimestamp t) {
+ if (!in_initrd())
+ return true;
+
+ /* The following timestamps only apply to the host system, hence only serialize them there */
+ return !IN_SET(t,
+ MANAGER_TIMESTAMP_USERSPACE, MANAGER_TIMESTAMP_FINISH,
+ MANAGER_TIMESTAMP_SECURITY_START, MANAGER_TIMESTAMP_SECURITY_FINISH,
+ MANAGER_TIMESTAMP_GENERATORS_START, MANAGER_TIMESTAMP_GENERATORS_FINISH,
+ MANAGER_TIMESTAMP_UNITS_LOAD_START, MANAGER_TIMESTAMP_UNITS_LOAD_FINISH);
+}
+
+static void manager_serialize_uid_refs_internal(
+ FILE *f,
+ Hashmap *uid_refs,
+ const char *field_name) {
+
+ void *p, *k;
+
+ assert(f);
+ assert(field_name);
+
+ /* Serialize the UID reference table. Or actually, just the IPC destruction flag of it, as
+ * the actual counter of it is better rebuild after a reload/reexec. */
+
+ HASHMAP_FOREACH_KEY(p, k, uid_refs) {
+ uint32_t c;
+ uid_t uid;
+
+ uid = PTR_TO_UID(k);
+ c = PTR_TO_UINT32(p);
+
+ if (!(c & DESTROY_IPC_FLAG))
+ continue;
+
+ (void) serialize_item_format(f, field_name, UID_FMT, uid);
+ }
+}
+
+static void manager_serialize_uid_refs(Manager *m, FILE *f) {
+ manager_serialize_uid_refs_internal(f, m->uid_refs, "destroy-ipc-uid");
+}
+
+static void manager_serialize_gid_refs(Manager *m, FILE *f) {
+ manager_serialize_uid_refs_internal(f, m->gid_refs, "destroy-ipc-gid");
+}
+
+int manager_serialize(
+ Manager *m,
+ FILE *f,
+ FDSet *fds,
+ bool switching_root) {
+
+ const char *t;
+ Unit *u;
+ int r;
+
+ assert(m);
+ assert(f);
+ assert(fds);
+
+ _cleanup_(manager_reloading_stopp) _unused_ Manager *reloading = manager_reloading_start(m);
+
+ (void) serialize_item_format(f, "current-job-id", "%" PRIu32, m->current_job_id);
+ (void) serialize_item_format(f, "n-installed-jobs", "%u", m->n_installed_jobs);
+ (void) serialize_item_format(f, "n-failed-jobs", "%u", m->n_failed_jobs);
+ (void) serialize_bool(f, "taint-usr", m->taint_usr);
+ (void) serialize_bool(f, "ready-sent", m->ready_sent);
+ (void) serialize_bool(f, "taint-logged", m->taint_logged);
+ (void) serialize_bool(f, "service-watchdogs", m->service_watchdogs);
+
+ if (m->show_status_overridden != _SHOW_STATUS_INVALID)
+ (void) serialize_item(f, "show-status-overridden",
+ show_status_to_string(m->show_status_overridden));
+
+ if (m->log_level_overridden)
+ (void) serialize_item_format(f, "log-level-override", "%i", log_get_max_level());
+ if (m->log_target_overridden)
+ (void) serialize_item(f, "log-target-override", log_target_to_string(log_get_target()));
+
+ (void) serialize_usec(f, "runtime-watchdog-overridden", m->watchdog_overridden[WATCHDOG_RUNTIME]);
+ (void) serialize_usec(f, "reboot-watchdog-overridden", m->watchdog_overridden[WATCHDOG_REBOOT]);
+ (void) serialize_usec(f, "kexec-watchdog-overridden", m->watchdog_overridden[WATCHDOG_KEXEC]);
+ (void) serialize_usec(f, "pretimeout-watchdog-overridden", m->watchdog_overridden[WATCHDOG_PRETIMEOUT]);
+ (void) serialize_item(f, "pretimeout-watchdog-governor-overridden", m->watchdog_pretimeout_governor_overridden);
+
+ for (ManagerTimestamp q = 0; q < _MANAGER_TIMESTAMP_MAX; q++) {
+ _cleanup_free_ char *joined = NULL;
+
+ if (!manager_timestamp_shall_serialize(q))
+ continue;
+
+ joined = strjoin(manager_timestamp_to_string(q), "-timestamp");
+ if (!joined)
+ return log_oom();
+
+ (void) serialize_dual_timestamp(f, joined, m->timestamps + q);
+ }
+
+ if (!switching_root)
+ (void) serialize_strv(f, "env", m->client_environment);
+
+ if (m->notify_fd >= 0) {
+ r = serialize_fd(f, fds, "notify-fd", m->notify_fd);
+ if (r < 0)
+ return r;
+
+ (void) serialize_item(f, "notify-socket", m->notify_socket);
+ }
+
+ if (m->cgroups_agent_fd >= 0) {
+ r = serialize_fd(f, fds, "cgroups-agent-fd", m->cgroups_agent_fd);
+ if (r < 0)
+ return r;
+ }
+
+ if (m->user_lookup_fds[0] >= 0) {
+ int copy0, copy1;
+
+ copy0 = fdset_put_dup(fds, m->user_lookup_fds[0]);
+ if (copy0 < 0)
+ return log_error_errno(copy0, "Failed to add user lookup fd to serialization: %m");
+
+ copy1 = fdset_put_dup(fds, m->user_lookup_fds[1]);
+ if (copy1 < 0)
+ return log_error_errno(copy1, "Failed to add user lookup fd to serialization: %m");
+
+ (void) serialize_item_format(f, "user-lookup", "%i %i", copy0, copy1);
+ }
+
+ (void) serialize_item_format(f,
+ "dump-ratelimit",
+ USEC_FMT " " USEC_FMT " %u %u",
+ m->dump_ratelimit.begin,
+ m->dump_ratelimit.interval,
+ m->dump_ratelimit.num,
+ m->dump_ratelimit.burst);
+
+ bus_track_serialize(m->subscribed, f, "subscribed");
+
+ r = dynamic_user_serialize(m, f, fds);
+ if (r < 0)
+ return r;
+
+ manager_serialize_uid_refs(m, f);
+ manager_serialize_gid_refs(m, f);
+
+ r = exec_runtime_serialize(m, f, fds);
+ if (r < 0)
+ return r;
+
+ r = varlink_server_serialize(m->varlink_server, f, fds);
+ if (r < 0)
+ return r;
+
+ (void) fputc('\n', f);
+
+ HASHMAP_FOREACH_KEY(u, t, m->units) {
+ if (u->id != t)
+ continue;
+
+ r = unit_serialize(u, f, fds, switching_root);
+ if (r < 0)
+ return r;
+ }
+
+ r = fflush_and_check(f);
+ if (r < 0)
+ return log_error_errno(r, "Failed to flush serialization: %m");
+
+ r = bus_fdset_add_all(m, fds);
+ if (r < 0)
+ return log_error_errno(r, "Failed to add bus sockets to serialization: %m");
+
+ return 0;
+}
+
+static int manager_deserialize_one_unit(Manager *m, const char *name, FILE *f, FDSet *fds) {
+ Unit *u;
+ int r;
+
+ r = manager_load_unit(m, name, NULL, NULL, &u);
+ if (r < 0) {
+ if (r == -ENOMEM)
+ return r;
+ return log_notice_errno(r, "Failed to load unit \"%s\", skipping deserialization: %m", name);
+ }
+
+ r = unit_deserialize(u, f, fds);
+ if (r < 0) {
+ if (r == -ENOMEM)
+ return r;
+ return log_notice_errno(r, "Failed to deserialize unit \"%s\", skipping: %m", name);
+ }
+
+ return 0;
+}
+
+static int manager_deserialize_units(Manager *m, FILE *f, FDSet *fds) {
+ const char *unit_name;
+ int r;
+
+ for (;;) {
+ _cleanup_free_ char *line = NULL;
+ /* Start marker */
+ r = read_line(f, LONG_LINE_MAX, &line);
+ if (r < 0)
+ return log_error_errno(r, "Failed to read serialization line: %m");
+ if (r == 0)
+ break;
+
+ unit_name = strstrip(line);
+
+ r = manager_deserialize_one_unit(m, unit_name, f, fds);
+ if (r == -ENOMEM)
+ return r;
+ if (r < 0) {
+ r = unit_deserialize_skip(f);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ return 0;
+}
+
+static void manager_deserialize_uid_refs_one_internal(
+ Hashmap** uid_refs,
+ const char *value) {
+
+ uid_t uid;
+ uint32_t c;
+ int r;
+
+ assert(uid_refs);
+ assert(value);
+
+ r = parse_uid(value, &uid);
+ if (r < 0 || uid == 0) {
+ log_debug("Unable to parse UID/GID reference serialization: %s", value);
+ return;
+ }
+
+ if (hashmap_ensure_allocated(uid_refs, &trivial_hash_ops) < 0) {
+ log_oom();
+ return;
+ }
+
+ c = PTR_TO_UINT32(hashmap_get(*uid_refs, UID_TO_PTR(uid)));
+ if (c & DESTROY_IPC_FLAG)
+ return;
+
+ c |= DESTROY_IPC_FLAG;
+
+ r = hashmap_replace(*uid_refs, UID_TO_PTR(uid), UINT32_TO_PTR(c));
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add UID/GID reference entry: %m");
+ return;
+ }
+}
+
+static void manager_deserialize_uid_refs_one(Manager *m, const char *value) {
+ manager_deserialize_uid_refs_one_internal(&m->uid_refs, value);
+}
+
+static void manager_deserialize_gid_refs_one(Manager *m, const char *value) {
+ manager_deserialize_uid_refs_one_internal(&m->gid_refs, value);
+}
+
+int manager_deserialize(Manager *m, FILE *f, FDSet *fds) {
+ bool deserialize_varlink_sockets = false;
+ int r = 0;
+
+ assert(m);
+ assert(f);
+
+ if (DEBUG_LOGGING) {
+ if (fdset_isempty(fds))
+ log_debug("No file descriptors passed");
+ else {
+ int fd;
+
+ FDSET_FOREACH(fd, fds) {
+ _cleanup_free_ char *fn = NULL;
+
+ r = fd_get_path(fd, &fn);
+ if (r < 0)
+ log_debug_errno(r, "Received serialized fd %i %s %m",
+ fd, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT));
+ else
+ log_debug("Received serialized fd %i %s %s",
+ fd, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), strna(fn));
+ }
+ }
+ }
+
+ log_debug("Deserializing state...");
+
+ /* If we are not in reload mode yet, enter it now. Not that this is recursive, a caller might already have
+ * increased it to non-zero, which is why we just increase it by one here and down again at the end of this
+ * call. */
+ _cleanup_(manager_reloading_stopp) _unused_ Manager *reloading = manager_reloading_start(m);
+
+ for (;;) {
+ _cleanup_free_ char *line = NULL;
+ const char *val, *l;
+
+ r = read_line(f, LONG_LINE_MAX, &line);
+ if (r < 0)
+ return log_error_errno(r, "Failed to read serialization line: %m");
+ if (r == 0)
+ break;
+
+ l = strstrip(line);
+ if (isempty(l)) /* end marker */
+ break;
+
+ if ((val = startswith(l, "current-job-id="))) {
+ uint32_t id;
+
+ if (safe_atou32(val, &id) < 0)
+ log_notice("Failed to parse current job id value '%s', ignoring.", val);
+ else
+ m->current_job_id = MAX(m->current_job_id, id);
+
+ } else if ((val = startswith(l, "n-installed-jobs="))) {
+ uint32_t n;
+
+ if (safe_atou32(val, &n) < 0)
+ log_notice("Failed to parse installed jobs counter '%s', ignoring.", val);
+ else
+ m->n_installed_jobs += n;
+
+ } else if ((val = startswith(l, "n-failed-jobs="))) {
+ uint32_t n;
+
+ if (safe_atou32(val, &n) < 0)
+ log_notice("Failed to parse failed jobs counter '%s', ignoring.", val);
+ else
+ m->n_failed_jobs += n;
+
+ } else if ((val = startswith(l, "taint-usr="))) {
+ int b;
+
+ b = parse_boolean(val);
+ if (b < 0)
+ log_notice("Failed to parse taint /usr flag '%s', ignoring.", val);
+ else
+ m->taint_usr = m->taint_usr || b;
+
+ } else if ((val = startswith(l, "ready-sent="))) {
+ int b;
+
+ b = parse_boolean(val);
+ if (b < 0)
+ log_notice("Failed to parse ready-sent flag '%s', ignoring.", val);
+ else
+ m->ready_sent = m->ready_sent || b;
+
+ } else if ((val = startswith(l, "taint-logged="))) {
+ int b;
+
+ b = parse_boolean(val);
+ if (b < 0)
+ log_notice("Failed to parse taint-logged flag '%s', ignoring.", val);
+ else
+ m->taint_logged = m->taint_logged || b;
+
+ } else if ((val = startswith(l, "service-watchdogs="))) {
+ int b;
+
+ b = parse_boolean(val);
+ if (b < 0)
+ log_notice("Failed to parse service-watchdogs flag '%s', ignoring.", val);
+ else
+ m->service_watchdogs = b;
+
+ } else if ((val = startswith(l, "show-status-overridden="))) {
+ ShowStatus s;
+
+ s = show_status_from_string(val);
+ if (s < 0)
+ log_notice("Failed to parse show-status-overridden flag '%s', ignoring.", val);
+ else
+ manager_override_show_status(m, s, "deserialize");
+
+ } else if ((val = startswith(l, "log-level-override="))) {
+ int level;
+
+ level = log_level_from_string(val);
+ if (level < 0)
+ log_notice("Failed to parse log-level-override value '%s', ignoring.", val);
+ else
+ manager_override_log_level(m, level);
+
+ } else if ((val = startswith(l, "log-target-override="))) {
+ LogTarget target;
+
+ target = log_target_from_string(val);
+ if (target < 0)
+ log_notice("Failed to parse log-target-override value '%s', ignoring.", val);
+ else
+ manager_override_log_target(m, target);
+
+ } else if ((val = startswith(l, "runtime-watchdog-overridden="))) {
+ usec_t t;
+
+ if (deserialize_usec(val, &t) < 0)
+ log_notice("Failed to parse runtime-watchdog-overridden value '%s', ignoring.", val);
+ else
+ manager_override_watchdog(m, WATCHDOG_RUNTIME, t);
+
+ } else if ((val = startswith(l, "reboot-watchdog-overridden="))) {
+ usec_t t;
+
+ if (deserialize_usec(val, &t) < 0)
+ log_notice("Failed to parse reboot-watchdog-overridden value '%s', ignoring.", val);
+ else
+ manager_override_watchdog(m, WATCHDOG_REBOOT, t);
+
+ } else if ((val = startswith(l, "kexec-watchdog-overridden="))) {
+ usec_t t;
+
+ if (deserialize_usec(val, &t) < 0)
+ log_notice("Failed to parse kexec-watchdog-overridden value '%s', ignoring.", val);
+ else
+ manager_override_watchdog(m, WATCHDOG_KEXEC, t);
+
+ } else if ((val = startswith(l, "pretimeout-watchdog-overridden="))) {
+ usec_t t;
+
+ if (deserialize_usec(val, &t) < 0)
+ log_notice("Failed to parse pretimeout-watchdog-overridden value '%s', ignoring.", val);
+ else
+ manager_override_watchdog(m, WATCHDOG_PRETIMEOUT, t);
+
+ } else if ((val = startswith(l, "pretimeout-watchdog-governor-overridden="))) {
+ r = free_and_strdup(&m->watchdog_pretimeout_governor_overridden, val);
+ if (r < 0)
+ return r;
+
+ } else if (startswith(l, "env=")) {
+ r = deserialize_environment(l + 4, &m->client_environment);
+ if (r < 0)
+ log_notice_errno(r, "Failed to parse environment entry: \"%s\", ignoring: %m", l);
+
+ } else if ((val = startswith(l, "notify-fd="))) {
+ int fd;
+
+ if (safe_atoi(val, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd))
+ log_notice("Failed to parse notify fd, ignoring: \"%s\"", val);
+ else {
+ m->notify_event_source = sd_event_source_disable_unref(m->notify_event_source);
+ safe_close(m->notify_fd);
+ m->notify_fd = fdset_remove(fds, fd);
+ }
+
+ } else if ((val = startswith(l, "notify-socket="))) {
+ r = free_and_strdup(&m->notify_socket, val);
+ if (r < 0)
+ return r;
+
+ } else if ((val = startswith(l, "cgroups-agent-fd="))) {
+ int fd;
+
+ if (safe_atoi(val, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd))
+ log_notice("Failed to parse cgroups agent fd, ignoring.: %s", val);
+ else {
+ m->cgroups_agent_event_source = sd_event_source_disable_unref(m->cgroups_agent_event_source);
+ safe_close(m->cgroups_agent_fd);
+ m->cgroups_agent_fd = fdset_remove(fds, fd);
+ }
+
+ } else if ((val = startswith(l, "user-lookup="))) {
+ int fd0, fd1;
+
+ if (sscanf(val, "%i %i", &fd0, &fd1) != 2 || fd0 < 0 || fd1 < 0 || fd0 == fd1 || !fdset_contains(fds, fd0) || !fdset_contains(fds, fd1))
+ log_notice("Failed to parse user lookup fd, ignoring: %s", val);
+ else {
+ m->user_lookup_event_source = sd_event_source_disable_unref(m->user_lookup_event_source);
+ safe_close_pair(m->user_lookup_fds);
+ m->user_lookup_fds[0] = fdset_remove(fds, fd0);
+ m->user_lookup_fds[1] = fdset_remove(fds, fd1);
+ }
+
+ } else if ((val = startswith(l, "dynamic-user=")))
+ dynamic_user_deserialize_one(m, val, fds);
+ else if ((val = startswith(l, "destroy-ipc-uid=")))
+ manager_deserialize_uid_refs_one(m, val);
+ else if ((val = startswith(l, "destroy-ipc-gid=")))
+ manager_deserialize_gid_refs_one(m, val);
+ else if ((val = startswith(l, "exec-runtime=")))
+ (void) exec_runtime_deserialize_one(m, val, fds);
+ else if ((val = startswith(l, "subscribed="))) {
+
+ if (strv_extend(&m->deserialized_subscribed, val) < 0)
+ return -ENOMEM;
+ } else if ((val = startswith(l, "varlink-server-socket-address="))) {
+ if (!m->varlink_server && MANAGER_IS_SYSTEM(m)) {
+ _cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL;
+
+ r = manager_setup_varlink_server(m, &s);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to setup varlink server, ignoring: %m");
+ continue;
+ }
+
+ r = varlink_server_attach_event(s, m->event, SD_EVENT_PRIORITY_NORMAL);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to attach varlink connection to event loop, ignoring: %m");
+ continue;
+ }
+
+ m->varlink_server = TAKE_PTR(s);
+ deserialize_varlink_sockets = true;
+ }
+
+ /* To void unnecessary deserialization (i.e. during reload vs. reexec) we only deserialize
+ * the FDs if we had to create a new m->varlink_server. The deserialize_varlink_sockets flag
+ * is initialized outside of the loop, is flipped after the VarlinkServer is setup, and
+ * remains set until all serialized contents are handled. */
+ if (deserialize_varlink_sockets)
+ (void) varlink_server_deserialize_one(m->varlink_server, val, fds);
+ } else if ((val = startswith(l, "dump-ratelimit="))) {
+ usec_t begin, interval;
+ unsigned num, burst;
+
+ if (sscanf(val, USEC_FMT " " USEC_FMT " %u %u", &begin, &interval, &num, &burst) != 4)
+ log_notice("Failed to parse dump ratelimit, ignoring: %s", val);
+ else {
+ /* If we changed the values across versions, flush the counter */
+ if (interval != m->dump_ratelimit.interval || burst != m->dump_ratelimit.burst)
+ m->dump_ratelimit.num = 0;
+ else
+ m->dump_ratelimit.num = num;
+ m->dump_ratelimit.begin = begin;
+ }
+
+ } else {
+ ManagerTimestamp q;
+
+ for (q = 0; q < _MANAGER_TIMESTAMP_MAX; q++) {
+ val = startswith(l, manager_timestamp_to_string(q));
+ if (!val)
+ continue;
+
+ val = startswith(val, "-timestamp=");
+ if (val)
+ break;
+ }
+
+ if (q < _MANAGER_TIMESTAMP_MAX) /* found it */
+ (void) deserialize_dual_timestamp(val, m->timestamps + q);
+ else if (!STARTSWITH_SET(l, "kdbus-fd=", "honor-device-enumeration=")) /* ignore deprecated values */
+ log_notice("Unknown serialization item '%s', ignoring.", l);
+ }
+ }
+
+ return manager_deserialize_units(m, f, fds);
+}
diff --git a/src/core/manager-serialize.h b/src/core/manager-serialize.h
new file mode 100644
index 0000000..c52261e
--- /dev/null
+++ b/src/core/manager-serialize.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "manager.h"
+#include "fdset.h"
+
+#define DESTROY_IPC_FLAG (UINT32_C(1) << 31)
+
+int manager_open_serialization(Manager *m, FILE **ret_f);
+int manager_serialize(Manager *m, FILE *f, FDSet *fds, bool switching_root);
+int manager_deserialize(Manager *m, FILE *f, FDSet *fds);
diff --git a/src/core/manager.c b/src/core/manager.c
new file mode 100644
index 0000000..7446444
--- /dev/null
+++ b/src/core/manager.c
@@ -0,0 +1,4676 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/kd.h>
+#include <sys/epoll.h>
+#include <sys/inotify.h>
+#include <sys/ioctl.h>
+#include <sys/reboot.h>
+#include <sys/timerfd.h>
+#include <sys/utsname.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#if HAVE_AUDIT
+#include <libaudit.h>
+#endif
+
+#include "sd-daemon.h"
+#include "sd-messages.h"
+#include "sd-path.h"
+
+#include "all-units.h"
+#include "alloc-util.h"
+#include "audit-fd.h"
+#include "boot-timestamps.h"
+#include "bus-common-errors.h"
+#include "bus-error.h"
+#include "bus-kernel.h"
+#include "bus-util.h"
+#include "clean-ipc.h"
+#include "clock-util.h"
+#include "core-varlink.h"
+#include "creds-util.h"
+#include "dbus-job.h"
+#include "dbus-manager.h"
+#include "dbus-unit.h"
+#include "dbus.h"
+#include "def.h"
+#include "dirent-util.h"
+#include "env-util.h"
+#include "escape.h"
+#include "event-util.h"
+#include "exec-util.h"
+#include "execute.h"
+#include "exit-status.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "generator-setup.h"
+#include "hashmap.h"
+#include "inotify-util.h"
+#include "install.h"
+#include "io-util.h"
+#include "label.h"
+#include "load-fragment.h"
+#include "locale-setup.h"
+#include "log.h"
+#include "macro.h"
+#include "manager.h"
+#include "manager-dump.h"
+#include "manager-serialize.h"
+#include "memory-util.h"
+#include "mkdir-label.h"
+#include "os-util.h"
+#include "parse-util.h"
+#include "path-lookup.h"
+#include "path-util.h"
+#include "pretty-print.h"
+#include "process-util.h"
+#include "ratelimit.h"
+#include "rlimit-util.h"
+#include "rm-rf.h"
+#include "selinux-util.h"
+#include "signal-util.h"
+#include "socket-util.h"
+#include "special.h"
+#include "stat-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "strxcpyx.h"
+#include "sysctl-util.h"
+#include "syslog-util.h"
+#include "terminal-util.h"
+#include "time-util.h"
+#include "transaction.h"
+#include "uid-range.h"
+#include "umask-util.h"
+#include "unit-name.h"
+#include "user-util.h"
+#include "virt.h"
+#include "watchdog.h"
+
+#define NOTIFY_RCVBUF_SIZE (8*1024*1024)
+#define CGROUPS_AGENT_RCVBUF_SIZE (8*1024*1024)
+
+/* Initial delay and the interval for printing status messages about running jobs */
+#define JOBS_IN_PROGRESS_WAIT_USEC (2*USEC_PER_SEC)
+#define JOBS_IN_PROGRESS_QUIET_WAIT_USEC (25*USEC_PER_SEC)
+#define JOBS_IN_PROGRESS_PERIOD_USEC (USEC_PER_SEC / 3)
+#define JOBS_IN_PROGRESS_PERIOD_DIVISOR 3
+
+/* If there are more than 1K bus messages queue across our API and direct buses, then let's not add more on top until
+ * the queue gets more empty. */
+#define MANAGER_BUS_BUSY_THRESHOLD 1024LU
+
+/* How many units and jobs to process of the bus queue before returning to the event loop. */
+#define MANAGER_BUS_MESSAGE_BUDGET 100U
+
+static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static int manager_dispatch_cgroups_agent_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static int manager_dispatch_signal_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static int manager_dispatch_time_change_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static int manager_dispatch_idle_pipe_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static int manager_dispatch_jobs_in_progress(sd_event_source *source, usec_t usec, void *userdata);
+static int manager_dispatch_run_queue(sd_event_source *source, void *userdata);
+static int manager_dispatch_sigchld(sd_event_source *source, void *userdata);
+static int manager_dispatch_timezone_change(sd_event_source *source, const struct inotify_event *event, void *userdata);
+static int manager_run_environment_generators(Manager *m);
+static int manager_run_generators(Manager *m);
+static void manager_vacuum(Manager *m);
+
+static usec_t manager_watch_jobs_next_time(Manager *m) {
+ usec_t timeout;
+
+ if (MANAGER_IS_USER(m))
+ /* Let the user manager without a timeout show status quickly, so the system manager can make
+ * use of it, if it wants to. */
+ timeout = JOBS_IN_PROGRESS_WAIT_USEC * 2 / 3;
+ else if (show_status_on(m->show_status))
+ /* When status is on, just use the usual timeout. */
+ timeout = JOBS_IN_PROGRESS_WAIT_USEC;
+ else
+ timeout = JOBS_IN_PROGRESS_QUIET_WAIT_USEC;
+
+ return usec_add(now(CLOCK_MONOTONIC), timeout);
+}
+
+static void manager_watch_jobs_in_progress(Manager *m) {
+ usec_t next;
+ int r;
+
+ assert(m);
+
+ /* We do not want to show the cylon animation if the user
+ * needs to confirm service executions otherwise confirmation
+ * messages will be screwed by the cylon animation. */
+ if (!manager_is_confirm_spawn_disabled(m))
+ return;
+
+ if (m->jobs_in_progress_event_source)
+ return;
+
+ next = manager_watch_jobs_next_time(m);
+ r = sd_event_add_time(
+ m->event,
+ &m->jobs_in_progress_event_source,
+ CLOCK_MONOTONIC,
+ next, 0,
+ manager_dispatch_jobs_in_progress, m);
+ if (r < 0)
+ return;
+
+ (void) sd_event_source_set_description(m->jobs_in_progress_event_source, "manager-jobs-in-progress");
+}
+
+static void manager_flip_auto_status(Manager *m, bool enable, const char *reason) {
+ assert(m);
+
+ if (enable) {
+ if (m->show_status == SHOW_STATUS_AUTO)
+ manager_set_show_status(m, SHOW_STATUS_TEMPORARY, reason);
+ } else {
+ if (m->show_status == SHOW_STATUS_TEMPORARY)
+ manager_set_show_status(m, SHOW_STATUS_AUTO, reason);
+ }
+}
+
+static void manager_print_jobs_in_progress(Manager *m) {
+ Job *j;
+ unsigned counter = 0, print_nr;
+ char cylon[6 + CYLON_BUFFER_EXTRA + 1];
+ unsigned cylon_pos;
+ uint64_t timeout = 0;
+
+ assert(m);
+ assert(m->n_running_jobs > 0);
+
+ manager_flip_auto_status(m, true, "delay");
+
+ print_nr = (m->jobs_in_progress_iteration / JOBS_IN_PROGRESS_PERIOD_DIVISOR) % m->n_running_jobs;
+
+ HASHMAP_FOREACH(j, m->jobs)
+ if (j->state == JOB_RUNNING && counter++ == print_nr)
+ break;
+
+ /* m->n_running_jobs must be consistent with the contents of m->jobs,
+ * so the above loop must have succeeded in finding j. */
+ assert(counter == print_nr + 1);
+ assert(j);
+
+ cylon_pos = m->jobs_in_progress_iteration % 14;
+ if (cylon_pos >= 8)
+ cylon_pos = 14 - cylon_pos;
+ draw_cylon(cylon, sizeof(cylon), 6, cylon_pos);
+
+ m->jobs_in_progress_iteration++;
+
+ char job_of_n[STRLEN("( of ) ") + DECIMAL_STR_MAX(unsigned)*2] = "";
+ if (m->n_running_jobs > 1)
+ xsprintf(job_of_n, "(%u of %u) ", counter, m->n_running_jobs);
+
+ (void) job_get_timeout(j, &timeout);
+
+ /* We want to use enough information for the user to identify previous lines talking about the same
+ * unit, but keep the message as short as possible. So if 'Starting foo.service' or 'Starting
+ * foo.service - Description' were used, 'foo.service' is enough here. On the other hand, if we used
+ * 'Starting Description' before, then we shall also use 'Description' here. So we pass NULL as the
+ * second argument to unit_status_string(). */
+ const char *ident = unit_status_string(j->unit, NULL);
+
+ const char *time = FORMAT_TIMESPAN(now(CLOCK_MONOTONIC) - j->begin_usec, 1*USEC_PER_SEC);
+ const char *limit = timeout > 0 ? FORMAT_TIMESPAN(timeout - j->begin_usec, 1*USEC_PER_SEC) : "no limit";
+
+ if (m->status_unit_format == STATUS_UNIT_FORMAT_DESCRIPTION)
+ /* When using 'Description', we effectively don't have enough space to show the nested status
+ * without ellipsization, so let's not even try. */
+ manager_status_printf(m, STATUS_TYPE_EPHEMERAL, cylon,
+ "%sA %s job is running for %s (%s / %s)",
+ job_of_n,
+ job_type_to_string(j->type),
+ ident,
+ time, limit);
+ else {
+ const char *status_text = unit_status_text(j->unit);
+
+ manager_status_printf(m, STATUS_TYPE_EPHEMERAL, cylon,
+ "%sJob %s/%s running (%s / %s)%s%s",
+ job_of_n,
+ ident,
+ job_type_to_string(j->type),
+ time, limit,
+ status_text ? ": " : "",
+ strempty(status_text));
+ }
+
+ sd_notifyf(false,
+ "STATUS=%sUser job %s/%s running (%s / %s)...",
+ job_of_n,
+ ident,
+ job_type_to_string(j->type),
+ time, limit);
+ m->status_ready = false;
+}
+
+static int have_ask_password(void) {
+ _cleanup_closedir_ DIR *dir = NULL;
+
+ dir = opendir("/run/systemd/ask-password");
+ if (!dir) {
+ if (errno == ENOENT)
+ return false;
+ else
+ return -errno;
+ }
+
+ FOREACH_DIRENT_ALL(de, dir, return -errno)
+ if (startswith(de->d_name, "ask."))
+ return true;
+ return false;
+}
+
+static int manager_dispatch_ask_password_fd(sd_event_source *source,
+ int fd, uint32_t revents, void *userdata) {
+ Manager *m = ASSERT_PTR(userdata);
+
+ (void) flush_fd(fd);
+
+ m->have_ask_password = have_ask_password();
+ if (m->have_ask_password < 0)
+ /* Log error but continue. Negative have_ask_password
+ * is treated as unknown status. */
+ log_error_errno(m->have_ask_password, "Failed to list /run/systemd/ask-password: %m");
+
+ return 0;
+}
+
+static void manager_close_ask_password(Manager *m) {
+ assert(m);
+
+ m->ask_password_event_source = sd_event_source_disable_unref(m->ask_password_event_source);
+ m->ask_password_inotify_fd = safe_close(m->ask_password_inotify_fd);
+ m->have_ask_password = -EINVAL;
+}
+
+static int manager_check_ask_password(Manager *m) {
+ int r;
+
+ assert(m);
+
+ if (!m->ask_password_event_source) {
+ assert(m->ask_password_inotify_fd < 0);
+
+ (void) mkdir_p_label("/run/systemd/ask-password", 0755);
+
+ m->ask_password_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
+ if (m->ask_password_inotify_fd < 0)
+ return log_error_errno(errno, "Failed to create inotify object: %m");
+
+ r = inotify_add_watch_and_warn(m->ask_password_inotify_fd,
+ "/run/systemd/ask-password",
+ IN_CREATE|IN_DELETE|IN_MOVE);
+ if (r < 0) {
+ manager_close_ask_password(m);
+ return r;
+ }
+
+ r = sd_event_add_io(m->event, &m->ask_password_event_source,
+ m->ask_password_inotify_fd, EPOLLIN,
+ manager_dispatch_ask_password_fd, m);
+ if (r < 0) {
+ log_error_errno(r, "Failed to add event source for /run/systemd/ask-password: %m");
+ manager_close_ask_password(m);
+ return r;
+ }
+
+ (void) sd_event_source_set_description(m->ask_password_event_source, "manager-ask-password");
+
+ /* Queries might have been added meanwhile... */
+ manager_dispatch_ask_password_fd(m->ask_password_event_source,
+ m->ask_password_inotify_fd, EPOLLIN, m);
+ }
+
+ return m->have_ask_password;
+}
+
+static int manager_watch_idle_pipe(Manager *m) {
+ int r;
+
+ assert(m);
+
+ if (m->idle_pipe_event_source)
+ return 0;
+
+ if (m->idle_pipe[2] < 0)
+ return 0;
+
+ r = sd_event_add_io(m->event, &m->idle_pipe_event_source, m->idle_pipe[2], EPOLLIN, manager_dispatch_idle_pipe_fd, m);
+ if (r < 0)
+ return log_error_errno(r, "Failed to watch idle pipe: %m");
+
+ (void) sd_event_source_set_description(m->idle_pipe_event_source, "manager-idle-pipe");
+
+ return 0;
+}
+
+static void manager_close_idle_pipe(Manager *m) {
+ assert(m);
+
+ m->idle_pipe_event_source = sd_event_source_disable_unref(m->idle_pipe_event_source);
+
+ safe_close_pair(m->idle_pipe);
+ safe_close_pair(m->idle_pipe + 2);
+}
+
+static int manager_setup_time_change(Manager *m) {
+ int r;
+
+ assert(m);
+
+ if (MANAGER_IS_TEST_RUN(m))
+ return 0;
+
+ m->time_change_event_source = sd_event_source_disable_unref(m->time_change_event_source);
+
+ r = event_add_time_change(m->event, &m->time_change_event_source, manager_dispatch_time_change_fd, m);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create time change event source: %m");
+
+ /* Schedule this slightly earlier than the .timer event sources */
+ r = sd_event_source_set_priority(m->time_change_event_source, SD_EVENT_PRIORITY_NORMAL-1);
+ if (r < 0)
+ return log_error_errno(r, "Failed to set priority of time change event sources: %m");
+
+ log_debug("Set up TFD_TIMER_CANCEL_ON_SET timerfd.");
+
+ return 0;
+}
+
+static int manager_read_timezone_stat(Manager *m) {
+ struct stat st;
+ bool changed;
+
+ assert(m);
+
+ /* Read the current stat() data of /etc/localtime so that we detect changes */
+ if (lstat("/etc/localtime", &st) < 0) {
+ log_debug_errno(errno, "Failed to stat /etc/localtime, ignoring: %m");
+ changed = m->etc_localtime_accessible;
+ m->etc_localtime_accessible = false;
+ } else {
+ usec_t k;
+
+ k = timespec_load(&st.st_mtim);
+ changed = !m->etc_localtime_accessible || k != m->etc_localtime_mtime;
+
+ m->etc_localtime_mtime = k;
+ m->etc_localtime_accessible = true;
+ }
+
+ return changed;
+}
+
+static int manager_setup_timezone_change(Manager *m) {
+ _cleanup_(sd_event_source_unrefp) sd_event_source *new_event = NULL;
+ int r;
+
+ assert(m);
+
+ if (MANAGER_IS_TEST_RUN(m))
+ return 0;
+
+ /* We watch /etc/localtime for three events: change of the link count (which might mean removal from /etc even
+ * though another link might be kept), renames, and file close operations after writing. Note we don't bother
+ * with IN_DELETE_SELF, as that would just report when the inode is removed entirely, i.e. after the link count
+ * went to zero and all fds to it are closed.
+ *
+ * Note that we never follow symlinks here. This is a simplification, but should cover almost all cases
+ * correctly.
+ *
+ * Note that we create the new event source first here, before releasing the old one. This should optimize
+ * behaviour as this way sd-event can reuse the old watch in case the inode didn't change. */
+
+ r = sd_event_add_inotify(m->event, &new_event, "/etc/localtime",
+ IN_ATTRIB|IN_MOVE_SELF|IN_CLOSE_WRITE|IN_DONT_FOLLOW, manager_dispatch_timezone_change, m);
+ if (r == -ENOENT) {
+ /* If the file doesn't exist yet, subscribe to /etc instead, and wait until it is created either by
+ * O_CREATE or by rename() */
+
+ log_debug_errno(r, "/etc/localtime doesn't exist yet, watching /etc instead.");
+ r = sd_event_add_inotify(m->event, &new_event, "/etc",
+ IN_CREATE|IN_MOVED_TO|IN_ONLYDIR, manager_dispatch_timezone_change, m);
+ }
+ if (r < 0)
+ return log_error_errno(r, "Failed to create timezone change event source: %m");
+
+ /* Schedule this slightly earlier than the .timer event sources */
+ r = sd_event_source_set_priority(new_event, SD_EVENT_PRIORITY_NORMAL-1);
+ if (r < 0)
+ return log_error_errno(r, "Failed to set priority of timezone change event sources: %m");
+
+ sd_event_source_unref(m->timezone_change_event_source);
+ m->timezone_change_event_source = TAKE_PTR(new_event);
+
+ return 0;
+}
+
+static int enable_special_signals(Manager *m) {
+ _cleanup_close_ int fd = -1;
+
+ assert(m);
+
+ if (MANAGER_IS_TEST_RUN(m))
+ return 0;
+
+ /* Enable that we get SIGINT on control-alt-del. In containers
+ * this will fail with EPERM (older) or EINVAL (newer), so
+ * ignore that. */
+ if (reboot(RB_DISABLE_CAD) < 0 && !IN_SET(errno, EPERM, EINVAL))
+ log_warning_errno(errno, "Failed to enable ctrl-alt-del handling: %m");
+
+ fd = open_terminal("/dev/tty0", O_RDWR|O_NOCTTY|O_CLOEXEC);
+ if (fd < 0) {
+ /* Support systems without virtual console */
+ if (fd != -ENOENT)
+ log_warning_errno(errno, "Failed to open /dev/tty0: %m");
+ } else {
+ /* Enable that we get SIGWINCH on kbrequest */
+ if (ioctl(fd, KDSIGACCEPT, SIGWINCH) < 0)
+ log_warning_errno(errno, "Failed to enable kbrequest handling: %m");
+ }
+
+ return 0;
+}
+
+#define RTSIG_IF_AVAILABLE(signum) (signum <= SIGRTMAX ? signum : -1)
+
+static int manager_setup_signals(Manager *m) {
+ struct sigaction sa = {
+ .sa_handler = SIG_DFL,
+ .sa_flags = SA_NOCLDSTOP|SA_RESTART,
+ };
+ sigset_t mask;
+ int r;
+
+ assert(m);
+
+ assert_se(sigaction(SIGCHLD, &sa, NULL) == 0);
+
+ /* We make liberal use of realtime signals here. On
+ * Linux/glibc we have 30 of them (with the exception of Linux
+ * on hppa, see below), between SIGRTMIN+0 ... SIGRTMIN+30
+ * (aka SIGRTMAX). */
+
+ assert_se(sigemptyset(&mask) == 0);
+ sigset_add_many(&mask,
+ SIGCHLD, /* Child died */
+ SIGTERM, /* Reexecute daemon */
+ SIGHUP, /* Reload configuration */
+ SIGUSR1, /* systemd: reconnect to D-Bus */
+ SIGUSR2, /* systemd: dump status */
+ SIGINT, /* Kernel sends us this on control-alt-del */
+ SIGWINCH, /* Kernel sends us this on kbrequest (alt-arrowup) */
+ SIGPWR, /* Some kernel drivers and upsd send us this on power failure */
+
+ SIGRTMIN+0, /* systemd: start default.target */
+ SIGRTMIN+1, /* systemd: isolate rescue.target */
+ SIGRTMIN+2, /* systemd: isolate emergency.target */
+ SIGRTMIN+3, /* systemd: start halt.target */
+ SIGRTMIN+4, /* systemd: start poweroff.target */
+ SIGRTMIN+5, /* systemd: start reboot.target */
+ SIGRTMIN+6, /* systemd: start kexec.target */
+
+ /* ... space for more special targets ... */
+
+ SIGRTMIN+13, /* systemd: Immediate halt */
+ SIGRTMIN+14, /* systemd: Immediate poweroff */
+ SIGRTMIN+15, /* systemd: Immediate reboot */
+ SIGRTMIN+16, /* systemd: Immediate kexec */
+
+ /* ... space for more immediate system state changes ... */
+
+ SIGRTMIN+20, /* systemd: enable status messages */
+ SIGRTMIN+21, /* systemd: disable status messages */
+ SIGRTMIN+22, /* systemd: set log level to LOG_DEBUG */
+ SIGRTMIN+23, /* systemd: set log level to LOG_INFO */
+ SIGRTMIN+24, /* systemd: Immediate exit (--user only) */
+ SIGRTMIN+25, /* systemd: reexecute manager */
+
+ /* Apparently Linux on hppa had fewer RT signals until v3.18,
+ * SIGRTMAX was SIGRTMIN+25, and then SIGRTMIN was lowered,
+ * see commit v3.17-7614-g1f25df2eff.
+ *
+ * We cannot unconditionally make use of those signals here,
+ * so let's use a runtime check. Since these commands are
+ * accessible by different means and only really a safety
+ * net, the missing functionality on hppa shouldn't matter.
+ */
+
+ RTSIG_IF_AVAILABLE(SIGRTMIN+26), /* systemd: set log target to journal-or-kmsg */
+ RTSIG_IF_AVAILABLE(SIGRTMIN+27), /* systemd: set log target to console */
+ RTSIG_IF_AVAILABLE(SIGRTMIN+28), /* systemd: set log target to kmsg */
+ RTSIG_IF_AVAILABLE(SIGRTMIN+29), /* systemd: set log target to syslog-or-kmsg (obsolete) */
+
+ /* ... one free signal here SIGRTMIN+30 ... */
+ -1);
+ assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
+
+ m->signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC);
+ if (m->signal_fd < 0)
+ return -errno;
+
+ r = sd_event_add_io(m->event, &m->signal_event_source, m->signal_fd, EPOLLIN, manager_dispatch_signal_fd, m);
+ if (r < 0)
+ return r;
+
+ (void) sd_event_source_set_description(m->signal_event_source, "manager-signal");
+
+ /* Process signals a bit earlier than the rest of things, but later than notify_fd processing, so that the
+ * notify processing can still figure out to which process/service a message belongs, before we reap the
+ * process. Also, process this before handling cgroup notifications, so that we always collect child exit
+ * status information before detecting that there's no process in a cgroup. */
+ r = sd_event_source_set_priority(m->signal_event_source, SD_EVENT_PRIORITY_NORMAL-6);
+ if (r < 0)
+ return r;
+
+ if (MANAGER_IS_SYSTEM(m))
+ return enable_special_signals(m);
+
+ return 0;
+}
+
+static char** sanitize_environment(char **l) {
+
+ /* Let's remove some environment variables that we need ourselves to communicate with our clients */
+ strv_env_unset_many(
+ l,
+ "CACHE_DIRECTORY",
+ "CONFIGURATION_DIRECTORY",
+ "CREDENTIALS_DIRECTORY",
+ "EXIT_CODE",
+ "EXIT_STATUS",
+ "INVOCATION_ID",
+ "JOURNAL_STREAM",
+ "LISTEN_FDNAMES",
+ "LISTEN_FDS",
+ "LISTEN_PID",
+ "LOGS_DIRECTORY",
+ "MAINPID",
+ "MANAGERPID",
+ "NOTIFY_SOCKET",
+ "PIDFILE",
+ "REMOTE_ADDR",
+ "REMOTE_PORT",
+ "RUNTIME_DIRECTORY",
+ "SERVICE_RESULT",
+ "STATE_DIRECTORY",
+ "WATCHDOG_PID",
+ "WATCHDOG_USEC",
+ NULL);
+
+ /* Let's order the environment alphabetically, just to make it pretty */
+ return strv_sort(l);
+}
+
+int manager_default_environment(Manager *m) {
+ int r;
+
+ assert(m);
+
+ m->transient_environment = strv_free(m->transient_environment);
+
+ if (MANAGER_IS_SYSTEM(m)) {
+ /* The system manager always starts with a clean
+ * environment for its children. It does not import
+ * the kernel's or the parents' exported variables.
+ *
+ * The initial passed environment is untouched to keep
+ * /proc/self/environ valid; it is used for tagging
+ * the init process inside containers. */
+ m->transient_environment = strv_new("PATH=" DEFAULT_PATH);
+ if (!m->transient_environment)
+ return log_oom();
+
+ /* Import locale variables LC_*= from configuration */
+ (void) locale_setup(&m->transient_environment);
+ } else {
+ /* The user manager passes its own environment along to its children, except for $PATH. */
+ m->transient_environment = strv_copy(environ);
+ if (!m->transient_environment)
+ return log_oom();
+
+ r = strv_env_replace_strdup(&m->transient_environment, "PATH=" DEFAULT_USER_PATH);
+ if (r < 0)
+ return log_oom();
+ }
+
+ sanitize_environment(m->transient_environment);
+
+ return 0;
+}
+
+static int manager_setup_prefix(Manager *m) {
+ struct table_entry {
+ uint64_t type;
+ const char *suffix;
+ };
+
+ static const struct table_entry paths_system[_EXEC_DIRECTORY_TYPE_MAX] = {
+ [EXEC_DIRECTORY_RUNTIME] = { SD_PATH_SYSTEM_RUNTIME, NULL },
+ [EXEC_DIRECTORY_STATE] = { SD_PATH_SYSTEM_STATE_PRIVATE, NULL },
+ [EXEC_DIRECTORY_CACHE] = { SD_PATH_SYSTEM_STATE_CACHE, NULL },
+ [EXEC_DIRECTORY_LOGS] = { SD_PATH_SYSTEM_STATE_LOGS, NULL },
+ [EXEC_DIRECTORY_CONFIGURATION] = { SD_PATH_SYSTEM_CONFIGURATION, NULL },
+ };
+
+ static const struct table_entry paths_user[_EXEC_DIRECTORY_TYPE_MAX] = {
+ [EXEC_DIRECTORY_RUNTIME] = { SD_PATH_USER_RUNTIME, NULL },
+ [EXEC_DIRECTORY_STATE] = { SD_PATH_USER_CONFIGURATION, NULL },
+ [EXEC_DIRECTORY_CACHE] = { SD_PATH_USER_STATE_CACHE, NULL },
+ [EXEC_DIRECTORY_LOGS] = { SD_PATH_USER_CONFIGURATION, "log" },
+ [EXEC_DIRECTORY_CONFIGURATION] = { SD_PATH_USER_CONFIGURATION, NULL },
+ };
+
+ assert(m);
+
+ const struct table_entry *p = MANAGER_IS_SYSTEM(m) ? paths_system : paths_user;
+ int r;
+
+ for (ExecDirectoryType i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++) {
+ r = sd_path_lookup(p[i].type, p[i].suffix, &m->prefix[i]);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to lookup %s path: %m",
+ exec_directory_type_to_string(i));
+ }
+
+ return 0;
+}
+
+static void manager_free_unit_name_maps(Manager *m) {
+ m->unit_id_map = hashmap_free(m->unit_id_map);
+ m->unit_name_map = hashmap_free(m->unit_name_map);
+ m->unit_path_cache = set_free(m->unit_path_cache);
+ m->unit_cache_timestamp_hash = 0;
+}
+
+static int manager_setup_run_queue(Manager *m) {
+ int r;
+
+ assert(m);
+ assert(!m->run_queue_event_source);
+
+ r = sd_event_add_defer(m->event, &m->run_queue_event_source, manager_dispatch_run_queue, m);
+ if (r < 0)
+ return r;
+
+ r = sd_event_source_set_priority(m->run_queue_event_source, SD_EVENT_PRIORITY_IDLE);
+ if (r < 0)
+ return r;
+
+ r = sd_event_source_set_enabled(m->run_queue_event_source, SD_EVENT_OFF);
+ if (r < 0)
+ return r;
+
+ (void) sd_event_source_set_description(m->run_queue_event_source, "manager-run-queue");
+
+ return 0;
+}
+
+static int manager_setup_sigchld_event_source(Manager *m) {
+ int r;
+
+ assert(m);
+ assert(!m->sigchld_event_source);
+
+ r = sd_event_add_defer(m->event, &m->sigchld_event_source, manager_dispatch_sigchld, m);
+ if (r < 0)
+ return r;
+
+ r = sd_event_source_set_priority(m->sigchld_event_source, SD_EVENT_PRIORITY_NORMAL-7);
+ if (r < 0)
+ return r;
+
+ r = sd_event_source_set_enabled(m->sigchld_event_source, SD_EVENT_OFF);
+ if (r < 0)
+ return r;
+
+ (void) sd_event_source_set_description(m->sigchld_event_source, "manager-sigchld");
+
+ return 0;
+}
+
+static int manager_find_credentials_dirs(Manager *m) {
+ const char *e;
+ int r;
+
+ assert(m);
+
+ r = get_credentials_dir(&e);
+ if (r < 0) {
+ if (r != -ENXIO)
+ log_debug_errno(r, "Failed to determine credentials directory, ignoring: %m");
+ } else {
+ m->received_credentials_directory = strdup(e);
+ if (!m->received_credentials_directory)
+ return -ENOMEM;
+ }
+
+ r = get_encrypted_credentials_dir(&e);
+ if (r < 0) {
+ if (r != -ENXIO)
+ log_debug_errno(r, "Failed to determine encrypted credentials directory, ignoring: %m");
+ } else {
+ m->received_encrypted_credentials_directory = strdup(e);
+ if (!m->received_encrypted_credentials_directory)
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+void manager_set_switching_root(Manager *m, bool switching_root) {
+ m->switching_root = MANAGER_IS_SYSTEM(m) && switching_root;
+}
+
+int manager_new(LookupScope scope, ManagerTestRunFlags test_run_flags, Manager **_m) {
+ _cleanup_(manager_freep) Manager *m = NULL;
+ int r;
+
+ assert(_m);
+ assert(IN_SET(scope, LOOKUP_SCOPE_SYSTEM, LOOKUP_SCOPE_USER));
+
+ m = new(Manager, 1);
+ if (!m)
+ return -ENOMEM;
+
+ *m = (Manager) {
+ .unit_file_scope = scope,
+ .objective = _MANAGER_OBJECTIVE_INVALID,
+
+ .status_unit_format = STATUS_UNIT_FORMAT_DEFAULT,
+
+ .default_timer_accuracy_usec = USEC_PER_MINUTE,
+ .default_memory_accounting = MEMORY_ACCOUNTING_DEFAULT,
+ .default_tasks_accounting = true,
+ .default_tasks_max = TASKS_MAX_UNSET,
+ .default_timeout_start_usec = DEFAULT_TIMEOUT_USEC,
+ .default_timeout_stop_usec = DEFAULT_TIMEOUT_USEC,
+ .default_restart_usec = DEFAULT_RESTART_USEC,
+ .default_device_timeout_usec = DEFAULT_TIMEOUT_USEC,
+
+ .original_log_level = -1,
+ .original_log_target = _LOG_TARGET_INVALID,
+
+ .watchdog_overridden[WATCHDOG_RUNTIME] = USEC_INFINITY,
+ .watchdog_overridden[WATCHDOG_REBOOT] = USEC_INFINITY,
+ .watchdog_overridden[WATCHDOG_KEXEC] = USEC_INFINITY,
+ .watchdog_overridden[WATCHDOG_PRETIMEOUT] = USEC_INFINITY,
+
+ .show_status_overridden = _SHOW_STATUS_INVALID,
+
+ .notify_fd = -1,
+ .cgroups_agent_fd = -1,
+ .signal_fd = -1,
+ .user_lookup_fds = { -1, -1 },
+ .private_listen_fd = -1,
+ .dev_autofs_fd = -1,
+ .cgroup_inotify_fd = -1,
+ .pin_cgroupfs_fd = -1,
+ .ask_password_inotify_fd = -1,
+ .idle_pipe = { -1, -1, -1, -1},
+
+ /* start as id #1, so that we can leave #0 around as "null-like" value */
+ .current_job_id = 1,
+
+ .have_ask_password = -EINVAL, /* we don't know */
+ .first_boot = -1,
+ .test_run_flags = test_run_flags,
+
+ .default_oom_policy = OOM_STOP,
+
+ .dump_ratelimit = {
+ .interval = 10 * USEC_PER_MINUTE,
+ .burst = 10,
+ },
+ };
+
+#if ENABLE_EFI
+ if (MANAGER_IS_SYSTEM(m) && detect_container() <= 0)
+ boot_timestamps(m->timestamps + MANAGER_TIMESTAMP_USERSPACE,
+ m->timestamps + MANAGER_TIMESTAMP_FIRMWARE,
+ m->timestamps + MANAGER_TIMESTAMP_LOADER);
+#endif
+
+ /* Prepare log fields we can use for structured logging */
+ if (MANAGER_IS_SYSTEM(m)) {
+ m->unit_log_field = "UNIT=";
+ m->unit_log_format_string = "UNIT=%s";
+
+ m->invocation_log_field = "INVOCATION_ID=";
+ m->invocation_log_format_string = "INVOCATION_ID=%s";
+ } else {
+ m->unit_log_field = "USER_UNIT=";
+ m->unit_log_format_string = "USER_UNIT=%s";
+
+ m->invocation_log_field = "USER_INVOCATION_ID=";
+ m->invocation_log_format_string = "USER_INVOCATION_ID=%s";
+ }
+
+ /* Reboot immediately if the user hits C-A-D more often than 7x per 2s */
+ m->ctrl_alt_del_ratelimit = (RateLimit) { .interval = 2 * USEC_PER_SEC, .burst = 7 };
+
+ r = manager_default_environment(m);
+ if (r < 0)
+ return r;
+
+ r = hashmap_ensure_allocated(&m->units, &string_hash_ops);
+ if (r < 0)
+ return r;
+
+ r = hashmap_ensure_allocated(&m->cgroup_unit, &path_hash_ops);
+ if (r < 0)
+ return r;
+
+ r = hashmap_ensure_allocated(&m->watch_bus, &string_hash_ops);
+ if (r < 0)
+ return r;
+
+ r = prioq_ensure_allocated(&m->run_queue, compare_job_priority);
+ if (r < 0)
+ return r;
+
+ r = manager_setup_prefix(m);
+ if (r < 0)
+ return r;
+
+ r = manager_find_credentials_dirs(m);
+ if (r < 0)
+ return r;
+
+ r = sd_event_default(&m->event);
+ if (r < 0)
+ return r;
+
+ r = manager_setup_run_queue(m);
+ if (r < 0)
+ return r;
+
+ if (FLAGS_SET(test_run_flags, MANAGER_TEST_RUN_MINIMAL)) {
+ m->cgroup_root = strdup("");
+ if (!m->cgroup_root)
+ return -ENOMEM;
+ } else {
+ r = manager_setup_signals(m);
+ if (r < 0)
+ return r;
+
+ r = manager_setup_cgroup(m);
+ if (r < 0)
+ return r;
+
+ r = manager_setup_time_change(m);
+ if (r < 0)
+ return r;
+
+ r = manager_read_timezone_stat(m);
+ if (r < 0)
+ return r;
+
+ (void) manager_setup_timezone_change(m);
+
+ r = manager_setup_sigchld_event_source(m);
+ if (r < 0)
+ return r;
+
+#if HAVE_LIBBPF
+ if (MANAGER_IS_SYSTEM(m) && lsm_bpf_supported(/* initialize = */ true)) {
+ r = lsm_bpf_setup(m);
+ if (r < 0)
+ log_warning_errno(r, "Failed to setup LSM BPF, ignoring: %m");
+ }
+#endif
+ }
+
+ if (test_run_flags == 0) {
+ if (MANAGER_IS_SYSTEM(m))
+ r = mkdir_label("/run/systemd/units", 0755);
+ else {
+ _cleanup_free_ char *units_path = NULL;
+ r = xdg_user_runtime_dir(&units_path, "/systemd/units");
+ if (r < 0)
+ return r;
+ r = mkdir_p_label(units_path, 0755);
+ }
+
+ if (r < 0 && r != -EEXIST)
+ return r;
+ }
+
+ m->taint_usr =
+ !in_initrd() &&
+ dir_is_empty("/usr", /* ignore_hidden_or_backup= */ false) > 0;
+
+ /* Note that we do not set up the notify fd here. We do that after deserialization,
+ * since they might have gotten serialized across the reexec. */
+
+ *_m = TAKE_PTR(m);
+
+ return 0;
+}
+
+static int manager_setup_notify(Manager *m) {
+ int r;
+
+ if (MANAGER_IS_TEST_RUN(m))
+ return 0;
+
+ if (m->notify_fd < 0) {
+ _cleanup_close_ int fd = -1;
+ union sockaddr_union sa;
+ socklen_t sa_len;
+
+ /* First free all secondary fields */
+ m->notify_socket = mfree(m->notify_socket);
+ m->notify_event_source = sd_event_source_disable_unref(m->notify_event_source);
+
+ fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
+ if (fd < 0)
+ return log_error_errno(errno, "Failed to allocate notification socket: %m");
+
+ fd_increase_rxbuf(fd, NOTIFY_RCVBUF_SIZE);
+
+ m->notify_socket = path_join(m->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/notify");
+ if (!m->notify_socket)
+ return log_oom();
+
+ r = sockaddr_un_set_path(&sa.un, m->notify_socket);
+ if (r < 0)
+ return log_error_errno(r, "Notify socket '%s' not valid for AF_UNIX socket address, refusing.",
+ m->notify_socket);
+ sa_len = r;
+
+ (void) mkdir_parents_label(m->notify_socket, 0755);
+ (void) sockaddr_un_unlink(&sa.un);
+
+ r = mac_selinux_bind(fd, &sa.sa, sa_len);
+ if (r < 0)
+ return log_error_errno(r, "bind(%s) failed: %m", m->notify_socket);
+
+ r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
+ if (r < 0)
+ return log_error_errno(r, "SO_PASSCRED failed: %m");
+
+ m->notify_fd = TAKE_FD(fd);
+
+ log_debug("Using notification socket %s", m->notify_socket);
+ }
+
+ if (!m->notify_event_source) {
+ r = sd_event_add_io(m->event, &m->notify_event_source, m->notify_fd, EPOLLIN, manager_dispatch_notify_fd, m);
+ if (r < 0)
+ return log_error_errno(r, "Failed to allocate notify event source: %m");
+
+ /* Process notification messages a bit earlier than SIGCHLD, so that we can still identify to which
+ * service an exit message belongs. */
+ r = sd_event_source_set_priority(m->notify_event_source, SD_EVENT_PRIORITY_NORMAL-8);
+ if (r < 0)
+ return log_error_errno(r, "Failed to set priority of notify event source: %m");
+
+ (void) sd_event_source_set_description(m->notify_event_source, "manager-notify");
+ }
+
+ return 0;
+}
+
+static int manager_setup_cgroups_agent(Manager *m) {
+
+ static const union sockaddr_union sa = {
+ .un.sun_family = AF_UNIX,
+ .un.sun_path = "/run/systemd/cgroups-agent",
+ };
+ int r;
+
+ /* This creates a listening socket we receive cgroups agent messages on. We do not use D-Bus for delivering
+ * these messages from the cgroups agent binary to PID 1, as the cgroups agent binary is very short-living, and
+ * each instance of it needs a new D-Bus connection. Since D-Bus connections are SOCK_STREAM/AF_UNIX, on
+ * overloaded systems the backlog of the D-Bus socket becomes relevant, as not more than the configured number
+ * of D-Bus connections may be queued until the kernel will start dropping further incoming connections,
+ * possibly resulting in lost cgroups agent messages. To avoid this, we'll use a private SOCK_DGRAM/AF_UNIX
+ * socket, where no backlog is relevant as communication may take place without an actual connect() cycle, and
+ * we thus won't lose messages.
+ *
+ * Note that PID 1 will forward the agent message to system bus, so that the user systemd instance may listen
+ * to it. The system instance hence listens on this special socket, but the user instances listen on the system
+ * bus for these messages. */
+
+ if (MANAGER_IS_TEST_RUN(m))
+ return 0;
+
+ if (!MANAGER_IS_SYSTEM(m))
+ return 0;
+
+ r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine whether unified cgroups hierarchy is used: %m");
+ if (r > 0) /* We don't need this anymore on the unified hierarchy */
+ return 0;
+
+ if (m->cgroups_agent_fd < 0) {
+ _cleanup_close_ int fd = -1;
+
+ /* First free all secondary fields */
+ m->cgroups_agent_event_source = sd_event_source_disable_unref(m->cgroups_agent_event_source);
+
+ fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
+ if (fd < 0)
+ return log_error_errno(errno, "Failed to allocate cgroups agent socket: %m");
+
+ fd_increase_rxbuf(fd, CGROUPS_AGENT_RCVBUF_SIZE);
+
+ (void) sockaddr_un_unlink(&sa.un);
+
+ /* Only allow root to connect to this socket */
+ RUN_WITH_UMASK(0077)
+ r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
+ if (r < 0)
+ return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path);
+
+ m->cgroups_agent_fd = TAKE_FD(fd);
+ }
+
+ if (!m->cgroups_agent_event_source) {
+ r = sd_event_add_io(m->event, &m->cgroups_agent_event_source, m->cgroups_agent_fd, EPOLLIN, manager_dispatch_cgroups_agent_fd, m);
+ if (r < 0)
+ return log_error_errno(r, "Failed to allocate cgroups agent event source: %m");
+
+ /* Process cgroups notifications early. Note that when the agent notification is received
+ * we'll just enqueue the unit in the cgroup empty queue, hence pick a high priority than
+ * that. Also see handling of cgroup inotify for the unified cgroup stuff. */
+ r = sd_event_source_set_priority(m->cgroups_agent_event_source, SD_EVENT_PRIORITY_NORMAL-9);
+ if (r < 0)
+ return log_error_errno(r, "Failed to set priority of cgroups agent event source: %m");
+
+ (void) sd_event_source_set_description(m->cgroups_agent_event_source, "manager-cgroups-agent");
+ }
+
+ return 0;
+}
+
+static int manager_setup_user_lookup_fd(Manager *m) {
+ int r;
+
+ assert(m);
+
+ /* Set up the socket pair used for passing UID/GID resolution results from forked off processes to PID
+ * 1. Background: we can't do name lookups (NSS) from PID 1, since it might involve IPC and thus activation,
+ * and we might hence deadlock on ourselves. Hence we do all user/group lookups asynchronously from the forked
+ * off processes right before executing the binaries to start. In order to be able to clean up any IPC objects
+ * created by a unit (see RemoveIPC=) we need to know in PID 1 the used UID/GID of the executed processes,
+ * hence we establish this communication channel so that forked off processes can pass their UID/GID
+ * information back to PID 1. The forked off processes send their resolved UID/GID to PID 1 in a simple
+ * datagram, along with their unit name, so that we can share one communication socket pair among all units for
+ * this purpose.
+ *
+ * You might wonder why we need a communication channel for this that is independent of the usual notification
+ * socket scheme (i.e. $NOTIFY_SOCKET). The primary difference is about trust: data sent via the $NOTIFY_SOCKET
+ * channel is only accepted if it originates from the right unit and if reception was enabled for it. The user
+ * lookup socket OTOH is only accessible by PID 1 and its children until they exec(), and always available.
+ *
+ * Note that this function is called under two circumstances: when we first initialize (in which case we
+ * allocate both the socket pair and the event source to listen on it), and when we deserialize after a reload
+ * (in which case the socket pair already exists but we still need to allocate the event source for it). */
+
+ if (m->user_lookup_fds[0] < 0) {
+
+ /* Free all secondary fields */
+ safe_close_pair(m->user_lookup_fds);
+ m->user_lookup_event_source = sd_event_source_disable_unref(m->user_lookup_event_source);
+
+ if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, m->user_lookup_fds) < 0)
+ return log_error_errno(errno, "Failed to allocate user lookup socket: %m");
+
+ (void) fd_increase_rxbuf(m->user_lookup_fds[0], NOTIFY_RCVBUF_SIZE);
+ }
+
+ if (!m->user_lookup_event_source) {
+ r = sd_event_add_io(m->event, &m->user_lookup_event_source, m->user_lookup_fds[0], EPOLLIN, manager_dispatch_user_lookup_fd, m);
+ if (r < 0)
+ return log_error_errno(errno, "Failed to allocate user lookup event source: %m");
+
+ /* Process even earlier than the notify event source, so that we always know first about valid UID/GID
+ * resolutions */
+ r = sd_event_source_set_priority(m->user_lookup_event_source, SD_EVENT_PRIORITY_NORMAL-11);
+ if (r < 0)
+ return log_error_errno(errno, "Failed to set priority of user lookup event source: %m");
+
+ (void) sd_event_source_set_description(m->user_lookup_event_source, "user-lookup");
+ }
+
+ return 0;
+}
+
+static unsigned manager_dispatch_cleanup_queue(Manager *m) {
+ Unit *u;
+ unsigned n = 0;
+
+ assert(m);
+
+ while ((u = m->cleanup_queue)) {
+ assert(u->in_cleanup_queue);
+
+ unit_free(u);
+ n++;
+ }
+
+ return n;
+}
+
+enum {
+ GC_OFFSET_IN_PATH, /* This one is on the path we were traveling */
+ GC_OFFSET_UNSURE, /* No clue */
+ GC_OFFSET_GOOD, /* We still need this unit */
+ GC_OFFSET_BAD, /* We don't need this unit anymore */
+ _GC_OFFSET_MAX
+};
+
+static void unit_gc_mark_good(Unit *u, unsigned gc_marker) {
+ Unit *other;
+
+ u->gc_marker = gc_marker + GC_OFFSET_GOOD;
+
+ /* Recursively mark referenced units as GOOD as well */
+ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_REFERENCES)
+ if (other->gc_marker == gc_marker + GC_OFFSET_UNSURE)
+ unit_gc_mark_good(other, gc_marker);
+}
+
+static void unit_gc_sweep(Unit *u, unsigned gc_marker) {
+ Unit *other;
+ bool is_bad;
+
+ assert(u);
+
+ if (IN_SET(u->gc_marker - gc_marker,
+ GC_OFFSET_GOOD, GC_OFFSET_BAD, GC_OFFSET_UNSURE, GC_OFFSET_IN_PATH))
+ return;
+
+ if (u->in_cleanup_queue)
+ goto bad;
+
+ if (!unit_may_gc(u))
+ goto good;
+
+ u->gc_marker = gc_marker + GC_OFFSET_IN_PATH;
+
+ is_bad = true;
+
+ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_REFERENCED_BY) {
+ unit_gc_sweep(other, gc_marker);
+
+ if (other->gc_marker == gc_marker + GC_OFFSET_GOOD)
+ goto good;
+
+ if (other->gc_marker != gc_marker + GC_OFFSET_BAD)
+ is_bad = false;
+ }
+
+ LIST_FOREACH(refs_by_target, ref, u->refs_by_target) {
+ unit_gc_sweep(ref->source, gc_marker);
+
+ if (ref->source->gc_marker == gc_marker + GC_OFFSET_GOOD)
+ goto good;
+
+ if (ref->source->gc_marker != gc_marker + GC_OFFSET_BAD)
+ is_bad = false;
+ }
+
+ if (is_bad)
+ goto bad;
+
+ /* We were unable to find anything out about this entry, so
+ * let's investigate it later */
+ u->gc_marker = gc_marker + GC_OFFSET_UNSURE;
+ unit_add_to_gc_queue(u);
+ return;
+
+bad:
+ /* We definitely know that this one is not useful anymore, so
+ * let's mark it for deletion */
+ u->gc_marker = gc_marker + GC_OFFSET_BAD;
+ unit_add_to_cleanup_queue(u);
+ return;
+
+good:
+ unit_gc_mark_good(u, gc_marker);
+}
+
+static unsigned manager_dispatch_gc_unit_queue(Manager *m) {
+ unsigned n = 0, gc_marker;
+ Unit *u;
+
+ assert(m);
+
+ /* log_debug("Running GC..."); */
+
+ m->gc_marker += _GC_OFFSET_MAX;
+ if (m->gc_marker + _GC_OFFSET_MAX <= _GC_OFFSET_MAX)
+ m->gc_marker = 1;
+
+ gc_marker = m->gc_marker;
+
+ while ((u = m->gc_unit_queue)) {
+ assert(u->in_gc_queue);
+
+ unit_gc_sweep(u, gc_marker);
+
+ LIST_REMOVE(gc_queue, m->gc_unit_queue, u);
+ u->in_gc_queue = false;
+
+ n++;
+
+ if (IN_SET(u->gc_marker - gc_marker,
+ GC_OFFSET_BAD, GC_OFFSET_UNSURE)) {
+ if (u->id)
+ log_unit_debug(u, "Collecting.");
+ u->gc_marker = gc_marker + GC_OFFSET_BAD;
+ unit_add_to_cleanup_queue(u);
+ }
+ }
+
+ return n;
+}
+
+static unsigned manager_dispatch_gc_job_queue(Manager *m) {
+ unsigned n = 0;
+ Job *j;
+
+ assert(m);
+
+ while ((j = m->gc_job_queue)) {
+ assert(j->in_gc_queue);
+
+ LIST_REMOVE(gc_queue, m->gc_job_queue, j);
+ j->in_gc_queue = false;
+
+ n++;
+
+ if (!job_may_gc(j))
+ continue;
+
+ log_unit_debug(j->unit, "Collecting job.");
+ (void) job_finish_and_invalidate(j, JOB_COLLECTED, false, false);
+ }
+
+ return n;
+}
+
+static int manager_ratelimit_requeue(sd_event_source *s, uint64_t usec, void *userdata) {
+ Unit *u = userdata;
+
+ assert(u);
+ assert(s == u->auto_start_stop_event_source);
+
+ u->auto_start_stop_event_source = sd_event_source_unref(u->auto_start_stop_event_source);
+
+ /* Re-queue to all queues, if the rate limit hit we might have been throttled on any of them. */
+ unit_submit_to_stop_when_unneeded_queue(u);
+ unit_submit_to_start_when_upheld_queue(u);
+ unit_submit_to_stop_when_bound_queue(u);
+
+ return 0;
+}
+
+static int manager_ratelimit_check_and_queue(Unit *u) {
+ int r;
+
+ assert(u);
+
+ if (ratelimit_below(&u->auto_start_stop_ratelimit))
+ return 1;
+
+ /* Already queued, no need to requeue */
+ if (u->auto_start_stop_event_source)
+ return 0;
+
+ r = sd_event_add_time(
+ u->manager->event,
+ &u->auto_start_stop_event_source,
+ CLOCK_MONOTONIC,
+ ratelimit_end(&u->auto_start_stop_ratelimit),
+ 0,
+ manager_ratelimit_requeue,
+ u);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to queue timer on event loop: %m");
+
+ return 0;
+}
+
+static unsigned manager_dispatch_stop_when_unneeded_queue(Manager *m) {
+ unsigned n = 0;
+ Unit *u;
+ int r;
+
+ assert(m);
+
+ while ((u = m->stop_when_unneeded_queue)) {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+
+ assert(u->in_stop_when_unneeded_queue);
+ LIST_REMOVE(stop_when_unneeded_queue, m->stop_when_unneeded_queue, u);
+ u->in_stop_when_unneeded_queue = false;
+
+ n++;
+
+ if (!unit_is_unneeded(u))
+ continue;
+
+ log_unit_debug(u, "Unit is not needed anymore.");
+
+ /* If stopping a unit fails continuously we might enter a stop loop here, hence stop acting on the
+ * service being unnecessary after a while. */
+
+ r = manager_ratelimit_check_and_queue(u);
+ if (r <= 0) {
+ log_unit_warning(u,
+ "Unit not needed anymore, but not stopping since we tried this too often recently.%s",
+ r == 0 ? " Will retry later." : "");
+ continue;
+ }
+
+ /* Ok, nobody needs us anymore. Sniff. Then let's commit suicide */
+ r = manager_add_job(u->manager, JOB_STOP, u, JOB_FAIL, NULL, &error, NULL);
+ if (r < 0)
+ log_unit_warning_errno(u, r, "Failed to enqueue stop job, ignoring: %s", bus_error_message(&error, r));
+ }
+
+ return n;
+}
+
+static unsigned manager_dispatch_start_when_upheld_queue(Manager *m) {
+ unsigned n = 0;
+ Unit *u;
+ int r;
+
+ assert(m);
+
+ while ((u = m->start_when_upheld_queue)) {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ Unit *culprit = NULL;
+
+ assert(u->in_start_when_upheld_queue);
+ LIST_REMOVE(start_when_upheld_queue, m->start_when_upheld_queue, u);
+ u->in_start_when_upheld_queue = false;
+
+ n++;
+
+ if (!unit_is_upheld_by_active(u, &culprit))
+ continue;
+
+ log_unit_debug(u, "Unit is started because upheld by active unit %s.", culprit->id);
+
+ /* If stopping a unit fails continuously we might enter a stop loop here, hence stop acting on the
+ * service being unnecessary after a while. */
+
+ r = manager_ratelimit_check_and_queue(u);
+ if (r <= 0) {
+ log_unit_warning(u,
+ "Unit needs to be started because active unit %s upholds it, but not starting since we tried this too often recently.%s",
+ culprit->id,
+ r == 0 ? " Will retry later." : "");
+ continue;
+ }
+
+ r = manager_add_job(u->manager, JOB_START, u, JOB_FAIL, NULL, &error, NULL);
+ if (r < 0)
+ log_unit_warning_errno(u, r, "Failed to enqueue start job, ignoring: %s", bus_error_message(&error, r));
+ }
+
+ return n;
+}
+
+static unsigned manager_dispatch_stop_when_bound_queue(Manager *m) {
+ unsigned n = 0;
+ Unit *u;
+ int r;
+
+ assert(m);
+
+ while ((u = m->stop_when_bound_queue)) {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ Unit *culprit = NULL;
+
+ assert(u->in_stop_when_bound_queue);
+ LIST_REMOVE(stop_when_bound_queue, m->stop_when_bound_queue, u);
+ u->in_stop_when_bound_queue = false;
+
+ n++;
+
+ if (!unit_is_bound_by_inactive(u, &culprit))
+ continue;
+
+ log_unit_debug(u, "Unit is stopped because bound to inactive unit %s.", culprit->id);
+
+ /* If stopping a unit fails continuously we might enter a stop loop here, hence stop acting on the
+ * service being unnecessary after a while. */
+
+ r = manager_ratelimit_check_and_queue(u);
+ if (r <= 0) {
+ log_unit_warning(u,
+ "Unit needs to be stopped because it is bound to inactive unit %s it, but not stopping since we tried this too often recently.%s",
+ culprit->id,
+ r == 0 ? " Will retry later." : "");
+ continue;
+ }
+
+ r = manager_add_job(u->manager, JOB_STOP, u, JOB_REPLACE, NULL, &error, NULL);
+ if (r < 0)
+ log_unit_warning_errno(u, r, "Failed to enqueue stop job, ignoring: %s", bus_error_message(&error, r));
+ }
+
+ return n;
+}
+
+static void manager_clear_jobs_and_units(Manager *m) {
+ Unit *u;
+
+ assert(m);
+
+ while ((u = hashmap_first(m->units)))
+ unit_free(u);
+
+ manager_dispatch_cleanup_queue(m);
+
+ assert(!m->load_queue);
+ assert(prioq_isempty(m->run_queue));
+ assert(!m->dbus_unit_queue);
+ assert(!m->dbus_job_queue);
+ assert(!m->cleanup_queue);
+ assert(!m->gc_unit_queue);
+ assert(!m->gc_job_queue);
+ assert(!m->cgroup_realize_queue);
+ assert(!m->cgroup_empty_queue);
+ assert(!m->cgroup_oom_queue);
+ assert(!m->target_deps_queue);
+ assert(!m->stop_when_unneeded_queue);
+ assert(!m->start_when_upheld_queue);
+ assert(!m->stop_when_bound_queue);
+
+ assert(hashmap_isempty(m->jobs));
+ assert(hashmap_isempty(m->units));
+
+ m->n_on_console = 0;
+ m->n_running_jobs = 0;
+ m->n_installed_jobs = 0;
+ m->n_failed_jobs = 0;
+}
+
+Manager* manager_free(Manager *m) {
+ if (!m)
+ return NULL;
+
+ manager_clear_jobs_and_units(m);
+
+ for (UnitType c = 0; c < _UNIT_TYPE_MAX; c++)
+ if (unit_vtable[c]->shutdown)
+ unit_vtable[c]->shutdown(m);
+
+ /* Keep the cgroup hierarchy in place except when we know we are going down for good */
+ manager_shutdown_cgroup(m, IN_SET(m->objective, MANAGER_EXIT, MANAGER_REBOOT, MANAGER_POWEROFF, MANAGER_HALT, MANAGER_KEXEC));
+
+ lookup_paths_flush_generator(&m->lookup_paths);
+
+ bus_done(m);
+ manager_varlink_done(m);
+
+ exec_runtime_vacuum(m);
+ hashmap_free(m->exec_runtime_by_id);
+
+ dynamic_user_vacuum(m, false);
+ hashmap_free(m->dynamic_users);
+
+ hashmap_free(m->units);
+ hashmap_free(m->units_by_invocation_id);
+ hashmap_free(m->jobs);
+ hashmap_free(m->watch_pids);
+ hashmap_free(m->watch_bus);
+
+ prioq_free(m->run_queue);
+
+ set_free(m->startup_units);
+ set_free(m->failed_units);
+
+ sd_event_source_unref(m->signal_event_source);
+ sd_event_source_unref(m->sigchld_event_source);
+ sd_event_source_unref(m->notify_event_source);
+ sd_event_source_unref(m->cgroups_agent_event_source);
+ sd_event_source_unref(m->time_change_event_source);
+ sd_event_source_unref(m->timezone_change_event_source);
+ sd_event_source_unref(m->jobs_in_progress_event_source);
+ sd_event_source_unref(m->run_queue_event_source);
+ sd_event_source_unref(m->user_lookup_event_source);
+
+ safe_close(m->signal_fd);
+ safe_close(m->notify_fd);
+ safe_close(m->cgroups_agent_fd);
+ safe_close_pair(m->user_lookup_fds);
+
+ manager_close_ask_password(m);
+
+ manager_close_idle_pipe(m);
+
+ sd_event_unref(m->event);
+
+ free(m->notify_socket);
+
+ lookup_paths_free(&m->lookup_paths);
+ strv_free(m->transient_environment);
+ strv_free(m->client_environment);
+
+ hashmap_free(m->cgroup_unit);
+ manager_free_unit_name_maps(m);
+
+ free(m->switch_root);
+ free(m->switch_root_init);
+
+ free(m->default_smack_process_label);
+
+ rlimit_free_all(m->rlimit);
+
+ assert(hashmap_isempty(m->units_requiring_mounts_for));
+ hashmap_free(m->units_requiring_mounts_for);
+
+ hashmap_free(m->uid_refs);
+ hashmap_free(m->gid_refs);
+
+ for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++)
+ m->prefix[dt] = mfree(m->prefix[dt]);
+ free(m->received_credentials_directory);
+ free(m->received_encrypted_credentials_directory);
+
+ free(m->watchdog_pretimeout_governor);
+ free(m->watchdog_pretimeout_governor_overridden);
+
+#if BPF_FRAMEWORK
+ lsm_bpf_destroy(m->restrict_fs);
+#endif
+
+ return mfree(m);
+}
+
+static void manager_enumerate_perpetual(Manager *m) {
+ assert(m);
+
+ if (FLAGS_SET(m->test_run_flags, MANAGER_TEST_RUN_MINIMAL))
+ return;
+
+ /* Let's ask every type to load all units from disk/kernel that it might know */
+ for (UnitType c = 0; c < _UNIT_TYPE_MAX; c++) {
+ if (!unit_type_supported(c)) {
+ log_debug("Unit type .%s is not supported on this system.", unit_type_to_string(c));
+ continue;
+ }
+
+ if (unit_vtable[c]->enumerate_perpetual)
+ unit_vtable[c]->enumerate_perpetual(m);
+ }
+}
+
+static void manager_enumerate(Manager *m) {
+ assert(m);
+
+ if (FLAGS_SET(m->test_run_flags, MANAGER_TEST_RUN_MINIMAL))
+ return;
+
+ /* Let's ask every type to load all units from disk/kernel that it might know */
+ for (UnitType c = 0; c < _UNIT_TYPE_MAX; c++) {
+ if (!unit_type_supported(c)) {
+ log_debug("Unit type .%s is not supported on this system.", unit_type_to_string(c));
+ continue;
+ }
+
+ if (unit_vtable[c]->enumerate)
+ unit_vtable[c]->enumerate(m);
+ }
+
+ manager_dispatch_load_queue(m);
+}
+
+static void manager_coldplug(Manager *m) {
+ Unit *u;
+ char *k;
+ int r;
+
+ assert(m);
+
+ log_debug("Invoking unit coldplug() handlers%s", special_glyph(SPECIAL_GLYPH_ELLIPSIS));
+
+ /* Let's place the units back into their deserialized state */
+ HASHMAP_FOREACH_KEY(u, k, m->units) {
+
+ /* ignore aliases */
+ if (u->id != k)
+ continue;
+
+ r = unit_coldplug(u);
+ if (r < 0)
+ log_warning_errno(r, "We couldn't coldplug %s, proceeding anyway: %m", u->id);
+ }
+}
+
+static void manager_catchup(Manager *m) {
+ Unit *u;
+ char *k;
+
+ assert(m);
+
+ log_debug("Invoking unit catchup() handlers%s", special_glyph(SPECIAL_GLYPH_ELLIPSIS));
+
+ /* Let's catch up on any state changes that happened while we were reloading/reexecing */
+ HASHMAP_FOREACH_KEY(u, k, m->units) {
+
+ /* ignore aliases */
+ if (u->id != k)
+ continue;
+
+ unit_catchup(u);
+ }
+}
+
+static void manager_distribute_fds(Manager *m, FDSet *fds) {
+ Unit *u;
+
+ assert(m);
+
+ HASHMAP_FOREACH(u, m->units) {
+
+ if (fdset_size(fds) <= 0)
+ break;
+
+ if (!UNIT_VTABLE(u)->distribute_fds)
+ continue;
+
+ UNIT_VTABLE(u)->distribute_fds(u, fds);
+ }
+}
+
+static bool manager_dbus_is_running(Manager *m, bool deserialized) {
+ Unit *u;
+
+ assert(m);
+
+ /* This checks whether the dbus instance we are supposed to expose our APIs on is up. We check both the socket
+ * and the service unit. If the 'deserialized' parameter is true we'll check the deserialized state of the unit
+ * rather than the current one. */
+
+ if (MANAGER_IS_TEST_RUN(m))
+ return false;
+
+ u = manager_get_unit(m, SPECIAL_DBUS_SOCKET);
+ if (!u)
+ return false;
+ if ((deserialized ? SOCKET(u)->deserialized_state : SOCKET(u)->state) != SOCKET_RUNNING)
+ return false;
+
+ u = manager_get_unit(m, SPECIAL_DBUS_SERVICE);
+ if (!u)
+ return false;
+ if (!IN_SET((deserialized ? SERVICE(u)->deserialized_state : SERVICE(u)->state), SERVICE_RUNNING, SERVICE_RELOAD))
+ return false;
+
+ return true;
+}
+
+static void manager_setup_bus(Manager *m) {
+ assert(m);
+
+ /* Let's set up our private bus connection now, unconditionally */
+ (void) bus_init_private(m);
+
+ /* If we are in --user mode also connect to the system bus now */
+ if (MANAGER_IS_USER(m))
+ (void) bus_init_system(m);
+
+ /* Let's connect to the bus now, but only if the unit is supposed to be up */
+ if (manager_dbus_is_running(m, MANAGER_IS_RELOADING(m))) {
+ (void) bus_init_api(m);
+
+ if (MANAGER_IS_SYSTEM(m))
+ (void) bus_init_system(m);
+ }
+}
+
+static void manager_preset_all(Manager *m) {
+ int r;
+
+ assert(m);
+
+ if (m->first_boot <= 0)
+ return;
+
+ if (!MANAGER_IS_SYSTEM(m))
+ return;
+
+ if (MANAGER_IS_TEST_RUN(m))
+ return;
+
+ /* If this is the first boot, and we are in the host system, then preset everything */
+ UnitFilePresetMode mode =
+ ENABLE_FIRST_BOOT_FULL_PRESET ? UNIT_FILE_PRESET_FULL : UNIT_FILE_PRESET_ENABLE_ONLY;
+
+ r = unit_file_preset_all(LOOKUP_SCOPE_SYSTEM, 0, NULL, mode, NULL, 0);
+ if (r < 0)
+ log_full_errno(r == -EEXIST ? LOG_NOTICE : LOG_WARNING, r,
+ "Failed to populate /etc with preset unit settings, ignoring: %m");
+ else
+ log_info("Populated /etc with preset unit settings.");
+}
+
+static void manager_ready(Manager *m) {
+ assert(m);
+
+ /* After having loaded everything, do the final round of catching up with what might have changed */
+
+ m->objective = MANAGER_OK; /* Tell everyone we are up now */
+
+ /* It might be safe to log to the journal now and connect to dbus */
+ manager_recheck_journal(m);
+ manager_recheck_dbus(m);
+
+ /* Let's finally catch up with any changes that took place while we were reloading/reexecing */
+ manager_catchup(m);
+
+ /* Create a file which will indicate when the manager started loading units the last time. */
+ if (MANAGER_IS_SYSTEM(m))
+ (void) touch_file("/run/systemd/systemd-units-load", false,
+ m->timestamps[MANAGER_TIMESTAMP_UNITS_LOAD].realtime ?: now(CLOCK_REALTIME),
+ UID_INVALID, GID_INVALID, 0444);
+}
+
+Manager* manager_reloading_start(Manager *m) {
+ m->n_reloading++;
+ dual_timestamp_get(m->timestamps + MANAGER_TIMESTAMP_UNITS_LOAD);
+ return m;
+}
+
+void manager_reloading_stopp(Manager **m) {
+ if (*m) {
+ assert((*m)->n_reloading > 0);
+ (*m)->n_reloading--;
+ }
+}
+
+int manager_startup(Manager *m, FILE *serialization, FDSet *fds, const char *root) {
+ int r;
+
+ assert(m);
+
+ /* If we are running in test mode, we still want to run the generators,
+ * but we should not touch the real generator directories. */
+ r = lookup_paths_init_or_warn(&m->lookup_paths, m->unit_file_scope,
+ MANAGER_IS_TEST_RUN(m) ? LOOKUP_PATHS_TEMPORARY_GENERATED : 0,
+ root);
+ if (r < 0)
+ return r;
+
+ dual_timestamp_get(m->timestamps + manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_GENERATORS_START));
+ r = manager_run_environment_generators(m);
+ if (r >= 0)
+ r = manager_run_generators(m);
+ dual_timestamp_get(m->timestamps + manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_GENERATORS_FINISH));
+ if (r < 0)
+ return r;
+
+ manager_preset_all(m);
+
+ lookup_paths_log(&m->lookup_paths);
+
+ {
+ /* This block is (optionally) done with the reloading counter bumped */
+ _unused_ _cleanup_(manager_reloading_stopp) Manager *reloading = NULL;
+
+ /* Make sure we don't have a left-over from a previous run */
+ if (!serialization)
+ (void) rm_rf(m->lookup_paths.transient, 0);
+
+ /* If we will deserialize make sure that during enumeration this is already known, so we increase the
+ * counter here already */
+ if (serialization)
+ reloading = manager_reloading_start(m);
+
+ /* First, enumerate what we can from all config files */
+ dual_timestamp_get(m->timestamps + manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_UNITS_LOAD_START));
+ manager_enumerate_perpetual(m);
+ manager_enumerate(m);
+ dual_timestamp_get(m->timestamps + manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_UNITS_LOAD_FINISH));
+
+ /* Second, deserialize if there is something to deserialize */
+ if (serialization) {
+ r = manager_deserialize(m, serialization, fds);
+ if (r < 0)
+ return log_error_errno(r, "Deserialization failed: %m");
+ }
+
+ /* Any fds left? Find some unit which wants them. This is useful to allow container managers to pass
+ * some file descriptors to us pre-initialized. This enables socket-based activation of entire
+ * containers. */
+ manager_distribute_fds(m, fds);
+
+ /* We might have deserialized the notify fd, but if we didn't then let's create the bus now */
+ r = manager_setup_notify(m);
+ if (r < 0)
+ /* No sense to continue without notifications, our children would fail anyway. */
+ return r;
+
+ r = manager_setup_cgroups_agent(m);
+ if (r < 0)
+ /* Likewise, no sense to continue without empty cgroup notifications. */
+ return r;
+
+ r = manager_setup_user_lookup_fd(m);
+ if (r < 0)
+ /* This shouldn't fail, except if things are really broken. */
+ return r;
+
+ /* Connect to the bus if we are good for it */
+ manager_setup_bus(m);
+
+ /* Now that we are connected to all possible buses, let's deserialize who is tracking us. */
+ r = bus_track_coldplug(m, &m->subscribed, false, m->deserialized_subscribed);
+ if (r < 0)
+ log_warning_errno(r, "Failed to deserialized tracked clients, ignoring: %m");
+ m->deserialized_subscribed = strv_free(m->deserialized_subscribed);
+
+ r = manager_varlink_init(m);
+ if (r < 0)
+ log_warning_errno(r, "Failed to set up Varlink, ignoring: %m");
+
+ /* Third, fire things up! */
+ manager_coldplug(m);
+
+ /* Clean up runtime objects */
+ manager_vacuum(m);
+
+ if (serialization)
+ /* Let's wait for the UnitNew/JobNew messages being sent, before we notify that the
+ * reload is finished */
+ m->send_reloading_done = true;
+ }
+
+ manager_ready(m);
+
+ manager_set_switching_root(m, false);
+
+ return 0;
+}
+
+int manager_add_job(
+ Manager *m,
+ JobType type,
+ Unit *unit,
+ JobMode mode,
+ Set *affected_jobs,
+ sd_bus_error *error,
+ Job **ret) {
+
+ Transaction *tr;
+ int r;
+
+ assert(m);
+ assert(type < _JOB_TYPE_MAX);
+ assert(unit);
+ assert(mode < _JOB_MODE_MAX);
+
+ if (mode == JOB_ISOLATE && type != JOB_START)
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Isolate is only valid for start.");
+
+ if (mode == JOB_ISOLATE && !unit->allow_isolate)
+ return sd_bus_error_set(error, BUS_ERROR_NO_ISOLATION, "Operation refused, unit may not be isolated.");
+
+ if (mode == JOB_TRIGGERING && type != JOB_STOP)
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "--job-mode=triggering is only valid for stop.");
+
+ log_unit_debug(unit, "Trying to enqueue job %s/%s/%s", unit->id, job_type_to_string(type), job_mode_to_string(mode));
+
+ type = job_type_collapse(type, unit);
+
+ tr = transaction_new(mode == JOB_REPLACE_IRREVERSIBLY);
+ if (!tr)
+ return -ENOMEM;
+
+ r = transaction_add_job_and_dependencies(tr, type, unit, NULL, true, false,
+ IN_SET(mode, JOB_IGNORE_DEPENDENCIES, JOB_IGNORE_REQUIREMENTS),
+ mode == JOB_IGNORE_DEPENDENCIES, error);
+ if (r < 0)
+ goto tr_abort;
+
+ if (mode == JOB_ISOLATE) {
+ r = transaction_add_isolate_jobs(tr, m);
+ if (r < 0)
+ goto tr_abort;
+ }
+
+ if (mode == JOB_TRIGGERING) {
+ r = transaction_add_triggering_jobs(tr, unit);
+ if (r < 0)
+ goto tr_abort;
+ }
+
+ r = transaction_activate(tr, m, mode, affected_jobs, error);
+ if (r < 0)
+ goto tr_abort;
+
+ log_unit_debug(unit,
+ "Enqueued job %s/%s as %u", unit->id,
+ job_type_to_string(type), (unsigned) tr->anchor_job->id);
+
+ if (ret)
+ *ret = tr->anchor_job;
+
+ transaction_free(tr);
+ return 0;
+
+tr_abort:
+ transaction_abort(tr);
+ transaction_free(tr);
+ return r;
+}
+
+int manager_add_job_by_name(Manager *m, JobType type, const char *name, JobMode mode, Set *affected_jobs, sd_bus_error *e, Job **ret) {
+ Unit *unit = NULL; /* just to appease gcc, initialization is not really necessary */
+ int r;
+
+ assert(m);
+ assert(type < _JOB_TYPE_MAX);
+ assert(name);
+ assert(mode < _JOB_MODE_MAX);
+
+ r = manager_load_unit(m, name, NULL, NULL, &unit);
+ if (r < 0)
+ return r;
+ assert(unit);
+
+ return manager_add_job(m, type, unit, mode, affected_jobs, e, ret);
+}
+
+int manager_add_job_by_name_and_warn(Manager *m, JobType type, const char *name, JobMode mode, Set *affected_jobs, Job **ret) {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ int r;
+
+ assert(m);
+ assert(type < _JOB_TYPE_MAX);
+ assert(name);
+ assert(mode < _JOB_MODE_MAX);
+
+ r = manager_add_job_by_name(m, type, name, mode, affected_jobs, &error, ret);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to enqueue %s job for %s: %s", job_mode_to_string(mode), name, bus_error_message(&error, r));
+
+ return r;
+}
+
+int manager_propagate_reload(Manager *m, Unit *unit, JobMode mode, sd_bus_error *e) {
+ int r;
+ Transaction *tr;
+
+ assert(m);
+ assert(unit);
+ assert(mode < _JOB_MODE_MAX);
+ assert(mode != JOB_ISOLATE); /* Isolate is only valid for start */
+
+ tr = transaction_new(mode == JOB_REPLACE_IRREVERSIBLY);
+ if (!tr)
+ return -ENOMEM;
+
+ /* We need an anchor job */
+ r = transaction_add_job_and_dependencies(tr, JOB_NOP, unit, NULL, false, false, true, true, e);
+ if (r < 0)
+ goto tr_abort;
+
+ /* Failure in adding individual dependencies is ignored, so this always succeeds. */
+ transaction_add_propagate_reload_jobs(tr, unit, tr->anchor_job, mode == JOB_IGNORE_DEPENDENCIES, e);
+
+ r = transaction_activate(tr, m, mode, NULL, e);
+ if (r < 0)
+ goto tr_abort;
+
+ transaction_free(tr);
+ return 0;
+
+tr_abort:
+ transaction_abort(tr);
+ transaction_free(tr);
+ return r;
+}
+
+Job *manager_get_job(Manager *m, uint32_t id) {
+ assert(m);
+
+ return hashmap_get(m->jobs, UINT32_TO_PTR(id));
+}
+
+Unit *manager_get_unit(Manager *m, const char *name) {
+ assert(m);
+ assert(name);
+
+ return hashmap_get(m->units, name);
+}
+
+static int manager_dispatch_target_deps_queue(Manager *m) {
+ Unit *u;
+ int r = 0;
+
+ assert(m);
+
+ while ((u = m->target_deps_queue)) {
+ _cleanup_free_ Unit **targets = NULL;
+ int n_targets;
+
+ assert(u->in_target_deps_queue);
+
+ LIST_REMOVE(target_deps_queue, u->manager->target_deps_queue, u);
+ u->in_target_deps_queue = false;
+
+ /* Take an "atomic" snapshot of dependencies here, as the call below will likely modify the
+ * dependencies, and we can't have it that hash tables we iterate through are modified while
+ * we are iterating through them. */
+ n_targets = unit_get_dependency_array(u, UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES, &targets);
+ if (n_targets < 0)
+ return n_targets;
+
+ for (int i = 0; i < n_targets; i++) {
+ r = unit_add_default_target_dependency(u, targets[i]);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ return r;
+}
+
+unsigned manager_dispatch_load_queue(Manager *m) {
+ Unit *u;
+ unsigned n = 0;
+
+ assert(m);
+
+ /* Make sure we are not run recursively */
+ if (m->dispatching_load_queue)
+ return 0;
+
+ m->dispatching_load_queue = true;
+
+ /* Dispatches the load queue. Takes a unit from the queue and
+ * tries to load its data until the queue is empty */
+
+ while ((u = m->load_queue)) {
+ assert(u->in_load_queue);
+
+ unit_load(u);
+ n++;
+ }
+
+ m->dispatching_load_queue = false;
+
+ /* Dispatch the units waiting for their target dependencies to be added now, as all targets that we know about
+ * should be loaded and have aliases resolved */
+ (void) manager_dispatch_target_deps_queue(m);
+
+ return n;
+}
+
+bool manager_unit_cache_should_retry_load(Unit *u) {
+ assert(u);
+
+ /* Automatic reloading from disk only applies to units which were not found sometime in the past, and
+ * the not-found stub is kept pinned in the unit graph by dependencies. For units that were
+ * previously loaded, we don't do automatic reloading, and daemon-reload is necessary to update. */
+ if (u->load_state != UNIT_NOT_FOUND)
+ return false;
+
+ /* The cache has been updated since the last time we tried to load the unit. There might be new
+ * fragment paths to read. */
+ if (u->manager->unit_cache_timestamp_hash != u->fragment_not_found_timestamp_hash)
+ return true;
+
+ /* The cache needs to be updated because there are modifications on disk. */
+ return !lookup_paths_timestamp_hash_same(&u->manager->lookup_paths, u->manager->unit_cache_timestamp_hash, NULL);
+}
+
+int manager_load_unit_prepare(
+ Manager *m,
+ const char *name,
+ const char *path,
+ sd_bus_error *e,
+ Unit **ret) {
+
+ _cleanup_(unit_freep) Unit *cleanup_unit = NULL;
+ int r;
+
+ assert(m);
+ assert(ret);
+
+ /* This will prepare the unit for loading, but not actually load anything from disk. */
+
+ if (path && !path_is_absolute(path))
+ return sd_bus_error_setf(e, SD_BUS_ERROR_INVALID_ARGS, "Path %s is not absolute.", path);
+
+ if (!name) {
+ /* 'name' and 'path' must not both be null. Check here 'path' using assert_se() to
+ * workaround a bug in gcc that generates a -Wnonnull warning when calling basename(),
+ * but this cannot be possible in any code path (See #6119). */
+ assert_se(path);
+ name = basename(path);
+ }
+
+ UnitType t = unit_name_to_type(name);
+
+ if (t == _UNIT_TYPE_INVALID || !unit_name_is_valid(name, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE)) {
+ if (unit_name_is_valid(name, UNIT_NAME_TEMPLATE))
+ return sd_bus_error_setf(e, SD_BUS_ERROR_INVALID_ARGS, "Unit name %s is missing the instance name.", name);
+
+ return sd_bus_error_setf(e, SD_BUS_ERROR_INVALID_ARGS, "Unit name %s is not valid.", name);
+ }
+
+ Unit *unit = manager_get_unit(m, name);
+ if (unit) {
+ /* The time-based cache allows to start new units without daemon-reload,
+ * but if they are already referenced (because of dependencies or ordering)
+ * then we have to force a load of the fragment. As an optimization, check
+ * first if anything in the usual paths was modified since the last time
+ * the cache was loaded. Also check if the last time an attempt to load the
+ * unit was made was before the most recent cache refresh, so that we know
+ * we need to try again — even if the cache is current, it might have been
+ * updated in a different context before we had a chance to retry loading
+ * this particular unit. */
+ if (manager_unit_cache_should_retry_load(unit))
+ unit->load_state = UNIT_STUB;
+ else {
+ *ret = unit;
+ return 0; /* The unit was already loaded */
+ }
+ } else {
+ unit = cleanup_unit = unit_new(m, unit_vtable[t]->object_size);
+ if (!unit)
+ return -ENOMEM;
+ }
+
+ if (path) {
+ r = free_and_strdup(&unit->fragment_path, path);
+ if (r < 0)
+ return r;
+ }
+
+ r = unit_add_name(unit, name);
+ if (r < 0)
+ return r;
+
+ unit_add_to_load_queue(unit);
+ unit_add_to_dbus_queue(unit);
+ unit_add_to_gc_queue(unit);
+
+ *ret = unit;
+ TAKE_PTR(cleanup_unit);
+
+ return 1; /* The unit was added the load queue */
+}
+
+int manager_load_unit(
+ Manager *m,
+ const char *name,
+ const char *path,
+ sd_bus_error *e,
+ Unit **ret) {
+ int r;
+
+ assert(m);
+ assert(ret);
+
+ /* This will load the unit config, but not actually start any services or anything. */
+
+ r = manager_load_unit_prepare(m, name, path, e, ret);
+ if (r <= 0)
+ return r;
+
+ /* Unit was newly loaded */
+ manager_dispatch_load_queue(m);
+ *ret = unit_follow_merge(*ret);
+ return 0;
+}
+
+int manager_load_startable_unit_or_warn(
+ Manager *m,
+ const char *name,
+ const char *path,
+ Unit **ret) {
+
+ /* Load a unit, make sure it loaded fully and is not masked. */
+
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ Unit *unit;
+ int r;
+
+ r = manager_load_unit(m, name, path, &error, &unit);
+ if (r < 0)
+ return log_error_errno(r, "Failed to load %s %s: %s",
+ name ? "unit" : "unit file", name ?: path,
+ bus_error_message(&error, r));
+
+ r = bus_unit_validate_load_state(unit, &error);
+ if (r < 0)
+ return log_error_errno(r, "%s", bus_error_message(&error, r));
+
+ *ret = unit;
+ return 0;
+}
+
+void manager_clear_jobs(Manager *m) {
+ Job *j;
+
+ assert(m);
+
+ while ((j = hashmap_first(m->jobs)))
+ /* No need to recurse. We're cancelling all jobs. */
+ job_finish_and_invalidate(j, JOB_CANCELED, false, false);
+}
+
+void manager_unwatch_pid(Manager *m, pid_t pid) {
+ assert(m);
+
+ /* First let's drop the unit keyed as "pid". */
+ (void) hashmap_remove(m->watch_pids, PID_TO_PTR(pid));
+
+ /* Then, let's also drop the array keyed by -pid. */
+ free(hashmap_remove(m->watch_pids, PID_TO_PTR(-pid)));
+}
+
+static int manager_dispatch_run_queue(sd_event_source *source, void *userdata) {
+ Manager *m = ASSERT_PTR(userdata);
+ Job *j;
+
+ assert(source);
+
+ while ((j = prioq_peek(m->run_queue))) {
+ assert(j->installed);
+ assert(j->in_run_queue);
+
+ (void) job_run_and_invalidate(j);
+ }
+
+ if (m->n_running_jobs > 0)
+ manager_watch_jobs_in_progress(m);
+
+ if (m->n_on_console > 0)
+ manager_watch_idle_pipe(m);
+
+ return 1;
+}
+
+void manager_trigger_run_queue(Manager *m) {
+ int r;
+
+ assert(m);
+
+ r = sd_event_source_set_enabled(
+ m->run_queue_event_source,
+ prioq_isempty(m->run_queue) ? SD_EVENT_OFF : SD_EVENT_ONESHOT);
+ if (r < 0)
+ log_warning_errno(r, "Failed to enable job run queue event source, ignoring: %m");
+}
+
+static unsigned manager_dispatch_dbus_queue(Manager *m) {
+ unsigned n = 0, budget;
+ Unit *u;
+ Job *j;
+
+ assert(m);
+
+ /* When we are reloading, let's not wait with generating signals, since we need to exit the manager as quickly
+ * as we can. There's no point in throttling generation of signals in that case. */
+ if (MANAGER_IS_RELOADING(m) || m->send_reloading_done || m->pending_reload_message)
+ budget = UINT_MAX; /* infinite budget in this case */
+ else {
+ /* Anything to do at all? */
+ if (!m->dbus_unit_queue && !m->dbus_job_queue)
+ return 0;
+
+ /* Do we have overly many messages queued at the moment? If so, let's not enqueue more on top, let's
+ * sit this cycle out, and process things in a later cycle when the queues got a bit emptier. */
+ if (manager_bus_n_queued_write(m) > MANAGER_BUS_BUSY_THRESHOLD)
+ return 0;
+
+ /* Only process a certain number of units/jobs per event loop iteration. Even if the bus queue wasn't
+ * overly full before this call we shouldn't increase it in size too wildly in one step, and we
+ * shouldn't monopolize CPU time with generating these messages. Note the difference in counting of
+ * this "budget" and the "threshold" above: the "budget" is decreased only once per generated message,
+ * regardless how many buses/direct connections it is enqueued on, while the "threshold" is applied to
+ * each queued instance of bus message, i.e. if the same message is enqueued to five buses/direct
+ * connections it will be counted five times. This difference in counting ("references"
+ * vs. "instances") is primarily a result of the fact that it's easier to implement it this way,
+ * however it also reflects the thinking that the "threshold" should put a limit on used queue memory,
+ * i.e. space, while the "budget" should put a limit on time. Also note that the "threshold" is
+ * currently chosen much higher than the "budget". */
+ budget = MANAGER_BUS_MESSAGE_BUDGET;
+ }
+
+ while (budget != 0 && (u = m->dbus_unit_queue)) {
+
+ assert(u->in_dbus_queue);
+
+ bus_unit_send_change_signal(u);
+ n++;
+
+ if (budget != UINT_MAX)
+ budget--;
+ }
+
+ while (budget != 0 && (j = m->dbus_job_queue)) {
+ assert(j->in_dbus_queue);
+
+ bus_job_send_change_signal(j);
+ n++;
+
+ if (budget != UINT_MAX)
+ budget--;
+ }
+
+ if (m->send_reloading_done) {
+ m->send_reloading_done = false;
+ bus_manager_send_reloading(m, false);
+ n++;
+ }
+
+ if (m->pending_reload_message) {
+ bus_send_pending_reload_message(m);
+ n++;
+ }
+
+ return n;
+}
+
+static int manager_dispatch_cgroups_agent_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+ Manager *m = userdata;
+ char buf[PATH_MAX];
+ ssize_t n;
+
+ n = recv(fd, buf, sizeof(buf), 0);
+ if (n < 0)
+ return log_error_errno(errno, "Failed to read cgroups agent message: %m");
+ if (n == 0) {
+ log_error("Got zero-length cgroups agent message, ignoring.");
+ return 0;
+ }
+ if ((size_t) n >= sizeof(buf)) {
+ log_error("Got overly long cgroups agent message, ignoring.");
+ return 0;
+ }
+
+ if (memchr(buf, 0, n)) {
+ log_error("Got cgroups agent message with embedded NUL byte, ignoring.");
+ return 0;
+ }
+ buf[n] = 0;
+
+ manager_notify_cgroup_empty(m, buf);
+ (void) bus_forward_agent_released(m, buf);
+
+ return 0;
+}
+
+static bool manager_process_barrier_fd(char * const *tags, FDSet *fds) {
+
+ /* nothing else must be sent when using BARRIER=1 */
+ if (strv_contains(tags, "BARRIER=1")) {
+ if (strv_length(tags) == 1) {
+ if (fdset_size(fds) != 1)
+ log_warning("Got incorrect number of fds with BARRIER=1, closing them.");
+ } else
+ log_warning("Extra notification messages sent with BARRIER=1, ignoring everything.");
+
+ /* Drop the message if BARRIER=1 was found */
+ return true;
+ }
+
+ return false;
+}
+
+static void manager_invoke_notify_message(
+ Manager *m,
+ Unit *u,
+ const struct ucred *ucred,
+ char * const *tags,
+ FDSet *fds) {
+
+ assert(m);
+ assert(u);
+ assert(ucred);
+ assert(tags);
+
+ if (u->notifygen == m->notifygen) /* Already invoked on this same unit in this same iteration? */
+ return;
+ u->notifygen = m->notifygen;
+
+ if (UNIT_VTABLE(u)->notify_message)
+ UNIT_VTABLE(u)->notify_message(u, ucred, tags, fds);
+
+ else if (DEBUG_LOGGING) {
+ _cleanup_free_ char *buf = NULL, *x = NULL, *y = NULL;
+
+ buf = strv_join(tags, ", ");
+ if (buf)
+ x = ellipsize(buf, 20, 90);
+ if (x)
+ y = cescape(x);
+
+ log_unit_debug(u, "Got notification message \"%s\", ignoring.", strnull(y));
+ }
+}
+
+static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+
+ _cleanup_fdset_free_ FDSet *fds = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ char buf[NOTIFY_BUFFER_MAX+1];
+ struct iovec iovec = {
+ .iov_base = buf,
+ .iov_len = sizeof(buf)-1,
+ };
+ CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) +
+ CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control;
+ struct msghdr msghdr = {
+ .msg_iov = &iovec,
+ .msg_iovlen = 1,
+ .msg_control = &control,
+ .msg_controllen = sizeof(control),
+ };
+
+ struct cmsghdr *cmsg;
+ struct ucred *ucred = NULL;
+ _cleanup_free_ Unit **array_copy = NULL;
+ _cleanup_strv_free_ char **tags = NULL;
+ Unit *u1, *u2, **array;
+ int r, *fd_array = NULL;
+ size_t n_fds = 0;
+ bool found = false;
+ ssize_t n;
+
+ assert(m->notify_fd == fd);
+
+ if (revents != EPOLLIN) {
+ log_warning("Got unexpected poll event for notify fd.");
+ return 0;
+ }
+
+ n = recvmsg_safe(m->notify_fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC|MSG_TRUNC);
+ if (n < 0) {
+ if (ERRNO_IS_TRANSIENT(n))
+ return 0; /* Spurious wakeup, try again */
+ if (n == -EXFULL) {
+ log_warning("Got message with truncated control data (too many fds sent?), ignoring.");
+ return 0;
+ }
+ /* If this is any other, real error, then let's stop processing this socket. This of course
+ * means we won't take notification messages anymore, but that's still better than busy
+ * looping around this: being woken up over and over again but being unable to actually read
+ * the message off the socket. */
+ return log_error_errno(n, "Failed to receive notification message: %m");
+ }
+
+ CMSG_FOREACH(cmsg, &msghdr) {
+ if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
+
+ assert(!fd_array);
+ fd_array = (int*) CMSG_DATA(cmsg);
+ n_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
+
+ } else if (cmsg->cmsg_level == SOL_SOCKET &&
+ cmsg->cmsg_type == SCM_CREDENTIALS &&
+ cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
+
+ assert(!ucred);
+ ucred = (struct ucred*) CMSG_DATA(cmsg);
+ }
+ }
+
+ if (n_fds > 0) {
+ assert(fd_array);
+
+ r = fdset_new_array(&fds, fd_array, n_fds);
+ if (r < 0) {
+ close_many(fd_array, n_fds);
+ log_oom();
+ return 0;
+ }
+ }
+
+ if (!ucred || !pid_is_valid(ucred->pid)) {
+ log_warning("Received notify message without valid credentials. Ignoring.");
+ return 0;
+ }
+
+ if ((size_t) n >= sizeof(buf) || (msghdr.msg_flags & MSG_TRUNC)) {
+ log_warning("Received notify message exceeded maximum size. Ignoring.");
+ return 0;
+ }
+
+ /* As extra safety check, let's make sure the string we get doesn't contain embedded NUL bytes.
+ * We permit one trailing NUL byte in the message, but don't expect it. */
+ if (n > 1 && memchr(buf, 0, n-1)) {
+ log_warning("Received notify message with embedded NUL bytes. Ignoring.");
+ return 0;
+ }
+
+ /* Make sure it's NUL-terminated, then parse it to obtain the tags list. */
+ buf[n] = 0;
+ tags = strv_split_newlines(buf);
+ if (!tags) {
+ log_oom();
+ return 0;
+ }
+
+ /* Possibly a barrier fd, let's see. */
+ if (manager_process_barrier_fd(tags, fds))
+ return 0;
+
+ /* Increase the generation counter used for filtering out duplicate unit invocations. */
+ m->notifygen++;
+
+ /* Notify every unit that might be interested, which might be multiple. */
+ u1 = manager_get_unit_by_pid_cgroup(m, ucred->pid);
+ u2 = hashmap_get(m->watch_pids, PID_TO_PTR(ucred->pid));
+ array = hashmap_get(m->watch_pids, PID_TO_PTR(-ucred->pid));
+ if (array) {
+ size_t k = 0;
+
+ while (array[k])
+ k++;
+
+ array_copy = newdup(Unit*, array, k+1);
+ if (!array_copy)
+ log_oom();
+ }
+ /* And now invoke the per-unit callbacks. Note that manager_invoke_notify_message() will handle
+ * duplicate units make sure we only invoke each unit's handler once. */
+ if (u1) {
+ manager_invoke_notify_message(m, u1, ucred, tags, fds);
+ found = true;
+ }
+ if (u2) {
+ manager_invoke_notify_message(m, u2, ucred, tags, fds);
+ found = true;
+ }
+ if (array_copy)
+ for (size_t i = 0; array_copy[i]; i++) {
+ manager_invoke_notify_message(m, array_copy[i], ucred, tags, fds);
+ found = true;
+ }
+
+ if (!found)
+ log_warning("Cannot find unit for notify message of PID "PID_FMT", ignoring.", ucred->pid);
+
+ if (fdset_size(fds) > 0)
+ log_warning("Got extra auxiliary fds with notification message, closing them.");
+
+ return 0;
+}
+
+static void manager_invoke_sigchld_event(
+ Manager *m,
+ Unit *u,
+ const siginfo_t *si) {
+
+ assert(m);
+ assert(u);
+ assert(si);
+
+ /* Already invoked the handler of this unit in this iteration? Then don't process this again */
+ if (u->sigchldgen == m->sigchldgen)
+ return;
+ u->sigchldgen = m->sigchldgen;
+
+ log_unit_debug(u, "Child "PID_FMT" belongs to %s.", si->si_pid, u->id);
+ unit_unwatch_pid(u, si->si_pid);
+
+ if (UNIT_VTABLE(u)->sigchld_event)
+ UNIT_VTABLE(u)->sigchld_event(u, si->si_pid, si->si_code, si->si_status);
+}
+
+static int manager_dispatch_sigchld(sd_event_source *source, void *userdata) {
+ Manager *m = ASSERT_PTR(userdata);
+ siginfo_t si = {};
+ int r;
+
+ assert(source);
+
+ /* First we call waitid() for a PID and do not reap the zombie. That way we can still access
+ * /proc/$PID for it while it is a zombie. */
+
+ if (waitid(P_ALL, 0, &si, WEXITED|WNOHANG|WNOWAIT) < 0) {
+
+ if (errno != ECHILD)
+ log_error_errno(errno, "Failed to peek for child with waitid(), ignoring: %m");
+
+ goto turn_off;
+ }
+
+ if (si.si_pid <= 0)
+ goto turn_off;
+
+ if (IN_SET(si.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED)) {
+ _cleanup_free_ Unit **array_copy = NULL;
+ _cleanup_free_ char *name = NULL;
+ Unit *u1, *u2, **array;
+
+ (void) get_process_comm(si.si_pid, &name);
+
+ log_debug("Child "PID_FMT" (%s) died (code=%s, status=%i/%s)",
+ si.si_pid, strna(name),
+ sigchld_code_to_string(si.si_code),
+ si.si_status,
+ strna(si.si_code == CLD_EXITED
+ ? exit_status_to_string(si.si_status, EXIT_STATUS_FULL)
+ : signal_to_string(si.si_status)));
+
+ /* Increase the generation counter used for filtering out duplicate unit invocations */
+ m->sigchldgen++;
+
+ /* And now figure out the unit this belongs to, it might be multiple... */
+ u1 = manager_get_unit_by_pid_cgroup(m, si.si_pid);
+ u2 = hashmap_get(m->watch_pids, PID_TO_PTR(si.si_pid));
+ array = hashmap_get(m->watch_pids, PID_TO_PTR(-si.si_pid));
+ if (array) {
+ size_t n = 0;
+
+ /* Count how many entries the array has */
+ while (array[n])
+ n++;
+
+ /* Make a copy of the array so that we don't trip up on the array changing beneath us */
+ array_copy = newdup(Unit*, array, n+1);
+ if (!array_copy)
+ log_oom();
+ }
+
+ /* Finally, execute them all. Note that u1, u2 and the array might contain duplicates, but
+ * that's fine, manager_invoke_sigchld_event() will ensure we only invoke the handlers once for
+ * each iteration. */
+ if (u1) {
+ /* We check for oom condition, in case we got SIGCHLD before the oom notification.
+ * We only do this for the cgroup the PID belonged to. */
+ (void) unit_check_oom(u1);
+
+ /* We check if systemd-oomd performed a kill so that we log and notify appropriately */
+ (void) unit_check_oomd_kill(u1);
+
+ manager_invoke_sigchld_event(m, u1, &si);
+ }
+ if (u2)
+ manager_invoke_sigchld_event(m, u2, &si);
+ if (array_copy)
+ for (size_t i = 0; array_copy[i]; i++)
+ manager_invoke_sigchld_event(m, array_copy[i], &si);
+ }
+
+ /* And now, we actually reap the zombie. */
+ if (waitid(P_PID, si.si_pid, &si, WEXITED) < 0) {
+ log_error_errno(errno, "Failed to dequeue child, ignoring: %m");
+ return 0;
+ }
+
+ return 0;
+
+turn_off:
+ /* All children processed for now, turn off event source */
+
+ r = sd_event_source_set_enabled(m->sigchld_event_source, SD_EVENT_OFF);
+ if (r < 0)
+ return log_error_errno(r, "Failed to disable SIGCHLD event source: %m");
+
+ return 0;
+}
+
+static void manager_start_special(Manager *m, const char *name, JobMode mode) {
+ Job *job;
+
+ if (manager_add_job_by_name_and_warn(m, JOB_START, name, mode, NULL, &job) < 0)
+ return;
+
+ const char *s = unit_status_string(job->unit, NULL);
+
+ log_info("Activating special unit %s...", s);
+
+ sd_notifyf(false,
+ "STATUS=Activating special unit %s...", s);
+ m->status_ready = false;
+}
+
+static void manager_handle_ctrl_alt_del(Manager *m) {
+ /* If the user presses C-A-D more than
+ * 7 times within 2s, we reboot/shutdown immediately,
+ * unless it was disabled in system.conf */
+
+ if (ratelimit_below(&m->ctrl_alt_del_ratelimit) || m->cad_burst_action == EMERGENCY_ACTION_NONE)
+ manager_start_special(m, SPECIAL_CTRL_ALT_DEL_TARGET, JOB_REPLACE_IRREVERSIBLY);
+ else
+ emergency_action(m, m->cad_burst_action, EMERGENCY_ACTION_WARN, NULL, -1,
+ "Ctrl-Alt-Del was pressed more than 7 times within 2s");
+}
+
+static int manager_dispatch_signal_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+ Manager *m = ASSERT_PTR(userdata);
+ ssize_t n;
+ struct signalfd_siginfo sfsi;
+ int r;
+
+ assert(m->signal_fd == fd);
+
+ if (revents != EPOLLIN) {
+ log_warning("Got unexpected events from signal file descriptor.");
+ return 0;
+ }
+
+ n = read(m->signal_fd, &sfsi, sizeof(sfsi));
+ if (n < 0) {
+ if (ERRNO_IS_TRANSIENT(errno))
+ return 0;
+
+ /* We return an error here, which will kill this handler,
+ * to avoid a busy loop on read error. */
+ return log_error_errno(errno, "Reading from signal fd failed: %m");
+ }
+ if (n != sizeof(sfsi)) {
+ log_warning("Truncated read from signal fd (%zi bytes), ignoring!", n);
+ return 0;
+ }
+
+ log_received_signal(sfsi.ssi_signo == SIGCHLD ||
+ (sfsi.ssi_signo == SIGTERM && MANAGER_IS_USER(m))
+ ? LOG_DEBUG : LOG_INFO,
+ &sfsi);
+
+ switch (sfsi.ssi_signo) {
+
+ case SIGCHLD:
+ r = sd_event_source_set_enabled(m->sigchld_event_source, SD_EVENT_ON);
+ if (r < 0)
+ log_warning_errno(r, "Failed to enable SIGCHLD event source, ignoring: %m");
+
+ break;
+
+ case SIGTERM:
+ if (MANAGER_IS_SYSTEM(m)) {
+ /* This is for compatibility with the original sysvinit */
+ if (verify_run_space_and_log("Refusing to reexecute") < 0)
+ break;
+
+ m->objective = MANAGER_REEXECUTE;
+ break;
+ }
+
+ _fallthrough_;
+ case SIGINT:
+ if (MANAGER_IS_SYSTEM(m))
+ manager_handle_ctrl_alt_del(m);
+ else
+ manager_start_special(m, SPECIAL_EXIT_TARGET, JOB_REPLACE_IRREVERSIBLY);
+ break;
+
+ case SIGWINCH:
+ /* This is a nop on non-init */
+ if (MANAGER_IS_SYSTEM(m))
+ manager_start_special(m, SPECIAL_KBREQUEST_TARGET, JOB_REPLACE);
+
+ break;
+
+ case SIGPWR:
+ /* This is a nop on non-init */
+ if (MANAGER_IS_SYSTEM(m))
+ manager_start_special(m, SPECIAL_SIGPWR_TARGET, JOB_REPLACE);
+
+ break;
+
+ case SIGUSR1:
+ if (manager_dbus_is_running(m, false)) {
+ log_info("Trying to reconnect to bus...");
+
+ (void) bus_init_api(m);
+
+ if (MANAGER_IS_SYSTEM(m))
+ (void) bus_init_system(m);
+ } else
+ manager_start_special(m, SPECIAL_DBUS_SERVICE, JOB_REPLACE);
+
+ break;
+
+ case SIGUSR2: {
+ _cleanup_free_ char *dump = NULL;
+
+ r = manager_get_dump_string(m, /* patterns= */ NULL, &dump);
+ if (r < 0) {
+ log_warning_errno(errno, "Failed to acquire manager dump: %m");
+ break;
+ }
+
+ log_dump(LOG_INFO, dump);
+ break;
+ }
+
+ case SIGHUP:
+ if (verify_run_space_and_log("Refusing to reload") < 0)
+ break;
+
+ m->objective = MANAGER_RELOAD;
+ break;
+
+ default: {
+
+ /* Starting SIGRTMIN+0 */
+ static const struct {
+ const char *target;
+ JobMode mode;
+ } target_table[] = {
+ [0] = { SPECIAL_DEFAULT_TARGET, JOB_ISOLATE },
+ [1] = { SPECIAL_RESCUE_TARGET, JOB_ISOLATE },
+ [2] = { SPECIAL_EMERGENCY_TARGET, JOB_ISOLATE },
+ [3] = { SPECIAL_HALT_TARGET, JOB_REPLACE_IRREVERSIBLY },
+ [4] = { SPECIAL_POWEROFF_TARGET, JOB_REPLACE_IRREVERSIBLY },
+ [5] = { SPECIAL_REBOOT_TARGET, JOB_REPLACE_IRREVERSIBLY },
+ [6] = { SPECIAL_KEXEC_TARGET, JOB_REPLACE_IRREVERSIBLY },
+ };
+
+ /* Starting SIGRTMIN+13, so that target halt and system halt are 10 apart */
+ static const ManagerObjective objective_table[] = {
+ [0] = MANAGER_HALT,
+ [1] = MANAGER_POWEROFF,
+ [2] = MANAGER_REBOOT,
+ [3] = MANAGER_KEXEC,
+ };
+
+ if ((int) sfsi.ssi_signo >= SIGRTMIN+0 &&
+ (int) sfsi.ssi_signo < SIGRTMIN+(int) ELEMENTSOF(target_table)) {
+ int idx = (int) sfsi.ssi_signo - SIGRTMIN;
+ manager_start_special(m, target_table[idx].target, target_table[idx].mode);
+ break;
+ }
+
+ if ((int) sfsi.ssi_signo >= SIGRTMIN+13 &&
+ (int) sfsi.ssi_signo < SIGRTMIN+13+(int) ELEMENTSOF(objective_table)) {
+ m->objective = objective_table[sfsi.ssi_signo - SIGRTMIN - 13];
+ break;
+ }
+
+ switch (sfsi.ssi_signo - SIGRTMIN) {
+
+ case 20:
+ manager_override_show_status(m, SHOW_STATUS_YES, "signal");
+ break;
+
+ case 21:
+ manager_override_show_status(m, SHOW_STATUS_NO, "signal");
+ break;
+
+ case 22:
+ manager_override_log_level(m, LOG_DEBUG);
+ break;
+
+ case 23:
+ manager_restore_original_log_level(m);
+ break;
+
+ case 24:
+ if (MANAGER_IS_USER(m)) {
+ m->objective = MANAGER_EXIT;
+ return 0;
+ }
+
+ /* This is a nop on init */
+ break;
+
+ case 25:
+ m->objective = MANAGER_REEXECUTE;
+ break;
+
+ case 26:
+ case 29: /* compatibility: used to be mapped to LOG_TARGET_SYSLOG_OR_KMSG */
+ manager_restore_original_log_target(m);
+ break;
+
+ case 27:
+ manager_override_log_target(m, LOG_TARGET_CONSOLE);
+ break;
+
+ case 28:
+ manager_override_log_target(m, LOG_TARGET_KMSG);
+ break;
+
+ default:
+ log_warning("Got unhandled signal <%s>.", signal_to_string(sfsi.ssi_signo));
+ }
+ }}
+
+ return 0;
+}
+
+static int manager_dispatch_time_change_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+ Manager *m = ASSERT_PTR(userdata);
+ Unit *u;
+
+ log_struct(LOG_DEBUG,
+ "MESSAGE_ID=" SD_MESSAGE_TIME_CHANGE_STR,
+ LOG_MESSAGE("Time has been changed"));
+
+ /* Restart the watch */
+ (void) manager_setup_time_change(m);
+
+ HASHMAP_FOREACH(u, m->units)
+ if (UNIT_VTABLE(u)->time_change)
+ UNIT_VTABLE(u)->time_change(u);
+
+ return 0;
+}
+
+static int manager_dispatch_timezone_change(
+ sd_event_source *source,
+ const struct inotify_event *e,
+ void *userdata) {
+
+ Manager *m = ASSERT_PTR(userdata);
+ int changed;
+ Unit *u;
+
+ log_debug("inotify event for /etc/localtime");
+
+ changed = manager_read_timezone_stat(m);
+ if (changed <= 0)
+ return changed;
+
+ /* Something changed, restart the watch, to ensure we watch the new /etc/localtime if it changed */
+ (void) manager_setup_timezone_change(m);
+
+ /* Read the new timezone */
+ tzset();
+
+ log_debug("Timezone has been changed (now: %s).", tzname[daylight]);
+
+ HASHMAP_FOREACH(u, m->units)
+ if (UNIT_VTABLE(u)->timezone_change)
+ UNIT_VTABLE(u)->timezone_change(u);
+
+ return 0;
+}
+
+static int manager_dispatch_idle_pipe_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+ Manager *m = ASSERT_PTR(userdata);
+
+ assert(m->idle_pipe[2] == fd);
+
+ /* There's at least one Type=idle child that just gave up on us waiting for the boot process to
+ * complete. Let's now turn off any further console output if there's at least one service that needs
+ * console access, so that from now on our own output should not spill into that service's output
+ * anymore. After all, we support Type=idle only to beautify console output and it generally is set
+ * on services that want to own the console exclusively without our interference. */
+ m->no_console_output = m->n_on_console > 0;
+
+ /* Acknowledge the child's request, and let all all other children know too that they shouldn't wait
+ * any longer by closing the pipes towards them, which is what they are waiting for. */
+ manager_close_idle_pipe(m);
+
+ return 0;
+}
+
+static int manager_dispatch_jobs_in_progress(sd_event_source *source, usec_t usec, void *userdata) {
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(source);
+
+ manager_print_jobs_in_progress(m);
+
+ r = sd_event_source_set_time_relative(source, JOBS_IN_PROGRESS_PERIOD_USEC);
+ if (r < 0)
+ return r;
+
+ return sd_event_source_set_enabled(source, SD_EVENT_ONESHOT);
+}
+
+int manager_loop(Manager *m) {
+ RateLimit rl = { .interval = 1*USEC_PER_SEC, .burst = 50000 };
+ int r;
+
+ assert(m);
+ assert(m->objective == MANAGER_OK); /* Ensure manager_startup() has been called */
+
+ manager_check_finished(m);
+
+ /* There might still be some zombies hanging around from before we were exec()'ed. Let's reap them. */
+ r = sd_event_source_set_enabled(m->sigchld_event_source, SD_EVENT_ON);
+ if (r < 0)
+ return log_error_errno(r, "Failed to enable SIGCHLD event source: %m");
+
+ while (m->objective == MANAGER_OK) {
+
+ (void) watchdog_ping();
+
+ if (!ratelimit_below(&rl)) {
+ /* Yay, something is going seriously wrong, pause a little */
+ log_warning("Looping too fast. Throttling execution a little.");
+ sleep(1);
+ }
+
+ if (manager_dispatch_load_queue(m) > 0)
+ continue;
+
+ if (manager_dispatch_gc_job_queue(m) > 0)
+ continue;
+
+ if (manager_dispatch_gc_unit_queue(m) > 0)
+ continue;
+
+ if (manager_dispatch_cleanup_queue(m) > 0)
+ continue;
+
+ if (manager_dispatch_cgroup_realize_queue(m) > 0)
+ continue;
+
+ if (manager_dispatch_start_when_upheld_queue(m) > 0)
+ continue;
+
+ if (manager_dispatch_stop_when_bound_queue(m) > 0)
+ continue;
+
+ if (manager_dispatch_stop_when_unneeded_queue(m) > 0)
+ continue;
+
+ if (manager_dispatch_dbus_queue(m) > 0)
+ continue;
+
+ /* Sleep for watchdog runtime wait time */
+ r = sd_event_run(m->event, watchdog_runtime_wait());
+ if (r < 0)
+ return log_error_errno(r, "Failed to run event loop: %m");
+ }
+
+ return m->objective;
+}
+
+int manager_load_unit_from_dbus_path(Manager *m, const char *s, sd_bus_error *e, Unit **_u) {
+ _cleanup_free_ char *n = NULL;
+ sd_id128_t invocation_id;
+ Unit *u;
+ int r;
+
+ assert(m);
+ assert(s);
+ assert(_u);
+
+ r = unit_name_from_dbus_path(s, &n);
+ if (r < 0)
+ return r;
+
+ /* Permit addressing units by invocation ID: if the passed bus path is suffixed by a 128bit ID then
+ * we use it as invocation ID. */
+ r = sd_id128_from_string(n, &invocation_id);
+ if (r >= 0) {
+ u = hashmap_get(m->units_by_invocation_id, &invocation_id);
+ if (u) {
+ *_u = u;
+ return 0;
+ }
+
+ return sd_bus_error_setf(e, BUS_ERROR_NO_UNIT_FOR_INVOCATION_ID,
+ "No unit with the specified invocation ID " SD_ID128_FORMAT_STR " known.",
+ SD_ID128_FORMAT_VAL(invocation_id));
+ }
+
+ /* If this didn't work, we check if this is a unit name */
+ if (!unit_name_is_valid(n, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE)) {
+ _cleanup_free_ char *nn = NULL;
+
+ nn = cescape(n);
+ return sd_bus_error_setf(e, SD_BUS_ERROR_INVALID_ARGS,
+ "Unit name %s is neither a valid invocation ID nor unit name.", strnull(nn));
+ }
+
+ r = manager_load_unit(m, n, NULL, e, &u);
+ if (r < 0)
+ return r;
+
+ *_u = u;
+ return 0;
+}
+
+int manager_get_job_from_dbus_path(Manager *m, const char *s, Job **_j) {
+ const char *p;
+ unsigned id;
+ Job *j;
+ int r;
+
+ assert(m);
+ assert(s);
+ assert(_j);
+
+ p = startswith(s, "/org/freedesktop/systemd1/job/");
+ if (!p)
+ return -EINVAL;
+
+ r = safe_atou(p, &id);
+ if (r < 0)
+ return r;
+
+ j = manager_get_job(m, id);
+ if (!j)
+ return -ENOENT;
+
+ *_j = j;
+
+ return 0;
+}
+
+void manager_send_unit_audit(Manager *m, Unit *u, int type, bool success) {
+
+#if HAVE_AUDIT
+ _cleanup_free_ char *p = NULL;
+ const char *msg;
+ int audit_fd, r;
+
+ if (!MANAGER_IS_SYSTEM(m))
+ return;
+
+ audit_fd = get_audit_fd();
+ if (audit_fd < 0)
+ return;
+
+ /* Don't generate audit events if the service was already
+ * started and we're just deserializing */
+ if (MANAGER_IS_RELOADING(m))
+ return;
+
+ if (u->type != UNIT_SERVICE)
+ return;
+
+ r = unit_name_to_prefix_and_instance(u->id, &p);
+ if (r < 0) {
+ log_error_errno(r, "Failed to extract prefix and instance of unit name: %m");
+ return;
+ }
+
+ msg = strjoina("unit=", p);
+ if (audit_log_user_comm_message(audit_fd, type, msg, "systemd", NULL, NULL, NULL, success) < 0) {
+ if (errno == EPERM)
+ /* We aren't allowed to send audit messages?
+ * Then let's not retry again. */
+ close_audit_fd();
+ else
+ log_warning_errno(errno, "Failed to send audit message: %m");
+ }
+#endif
+
+}
+
+void manager_send_unit_plymouth(Manager *m, Unit *u) {
+ static const union sockaddr_union sa = PLYMOUTH_SOCKET;
+ _cleanup_free_ char *message = NULL;
+ _cleanup_close_ int fd = -1;
+ int n = 0;
+
+ /* Don't generate plymouth events if the service was already
+ * started and we're just deserializing */
+ if (MANAGER_IS_RELOADING(m))
+ return;
+
+ if (!MANAGER_IS_SYSTEM(m))
+ return;
+
+ if (detect_container() > 0)
+ return;
+
+ if (!IN_SET(u->type, UNIT_SERVICE, UNIT_MOUNT, UNIT_SWAP))
+ return;
+
+ /* We set SOCK_NONBLOCK here so that we rather drop the
+ * message then wait for plymouth */
+ fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
+ if (fd < 0) {
+ log_error_errno(errno, "socket() failed: %m");
+ return;
+ }
+
+ if (connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0) {
+ if (!IN_SET(errno, EAGAIN, ENOENT) && !ERRNO_IS_DISCONNECT(errno))
+ log_error_errno(errno, "connect() failed: %m");
+ return;
+ }
+
+ if (asprintf(&message, "U\002%c%s%n", (int) (strlen(u->id) + 1), u->id, &n) < 0)
+ return (void) log_oom();
+
+ errno = 0;
+ if (write(fd, message, n + 1) != n + 1)
+ if (!IN_SET(errno, EAGAIN, ENOENT) && !ERRNO_IS_DISCONNECT(errno))
+ log_error_errno(errno, "Failed to write Plymouth message: %m");
+}
+
+usec_t manager_get_watchdog(Manager *m, WatchdogType t) {
+ assert(m);
+
+ if (MANAGER_IS_USER(m))
+ return USEC_INFINITY;
+
+ if (timestamp_is_set(m->watchdog_overridden[t]))
+ return m->watchdog_overridden[t];
+
+ return m->watchdog[t];
+}
+
+void manager_set_watchdog(Manager *m, WatchdogType t, usec_t timeout) {
+
+ assert(m);
+
+ if (MANAGER_IS_USER(m))
+ return;
+
+ if (m->watchdog[t] == timeout)
+ return;
+
+ if (t == WATCHDOG_RUNTIME) {
+ if (!timestamp_is_set(m->watchdog_overridden[WATCHDOG_RUNTIME]))
+ (void) watchdog_setup(timeout);
+ } else if (t == WATCHDOG_PRETIMEOUT)
+ if (m->watchdog_overridden[WATCHDOG_PRETIMEOUT] == USEC_INFINITY)
+ (void) watchdog_setup_pretimeout(timeout);
+
+ m->watchdog[t] = timeout;
+}
+
+void manager_override_watchdog(Manager *m, WatchdogType t, usec_t timeout) {
+
+ assert(m);
+
+ if (MANAGER_IS_USER(m))
+ return;
+
+ if (m->watchdog_overridden[t] == timeout)
+ return;
+
+ if (t == WATCHDOG_RUNTIME) {
+ usec_t usec = timestamp_is_set(timeout) ? timeout : m->watchdog[t];
+
+ (void) watchdog_setup(usec);
+ } else if (t == WATCHDOG_PRETIMEOUT)
+ (void) watchdog_setup_pretimeout(timeout);
+
+ m->watchdog_overridden[t] = timeout;
+}
+
+int manager_set_watchdog_pretimeout_governor(Manager *m, const char *governor) {
+ _cleanup_free_ char *p = NULL;
+ int r;
+
+ assert(m);
+
+ if (MANAGER_IS_USER(m))
+ return 0;
+
+ if (streq_ptr(m->watchdog_pretimeout_governor, governor))
+ return 0;
+
+ p = strdup(governor);
+ if (!p)
+ return -ENOMEM;
+
+ r = watchdog_setup_pretimeout_governor(governor);
+ if (r < 0)
+ return r;
+
+ return free_and_replace(m->watchdog_pretimeout_governor, p);
+}
+
+int manager_override_watchdog_pretimeout_governor(Manager *m, const char *governor) {
+ _cleanup_free_ char *p = NULL;
+ int r;
+
+ assert(m);
+
+ if (MANAGER_IS_USER(m))
+ return 0;
+
+ if (streq_ptr(m->watchdog_pretimeout_governor_overridden, governor))
+ return 0;
+
+ p = strdup(governor);
+ if (!p)
+ return -ENOMEM;
+
+ r = watchdog_setup_pretimeout_governor(governor);
+ if (r < 0)
+ return r;
+
+ return free_and_replace(m->watchdog_pretimeout_governor_overridden, p);
+}
+
+int manager_reload(Manager *m) {
+ _unused_ _cleanup_(manager_reloading_stopp) Manager *reloading = NULL;
+ _cleanup_fdset_free_ FDSet *fds = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+ int r;
+
+ assert(m);
+
+ r = manager_open_serialization(m, &f);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create serialization file: %m");
+
+ fds = fdset_new();
+ if (!fds)
+ return log_oom();
+
+ /* We are officially in reload mode from here on. */
+ reloading = manager_reloading_start(m);
+
+ r = manager_serialize(m, f, fds, false);
+ if (r < 0)
+ return r;
+
+ if (fseeko(f, 0, SEEK_SET) < 0)
+ return log_error_errno(errno, "Failed to seek to beginning of serialization: %m");
+
+ /* 💀 This is the point of no return, from here on there is no way back. 💀 */
+ reloading = NULL;
+
+ bus_manager_send_reloading(m, true);
+
+ /* Start by flushing out all jobs and units, all generated units, all runtime environments, all dynamic users
+ * and everything else that is worth flushing out. We'll get it all back from the serialization — if we need
+ * it. */
+
+ manager_clear_jobs_and_units(m);
+ lookup_paths_flush_generator(&m->lookup_paths);
+ lookup_paths_free(&m->lookup_paths);
+ exec_runtime_vacuum(m);
+ dynamic_user_vacuum(m, false);
+ m->uid_refs = hashmap_free(m->uid_refs);
+ m->gid_refs = hashmap_free(m->gid_refs);
+
+ r = lookup_paths_init_or_warn(&m->lookup_paths, m->unit_file_scope, 0, NULL);
+ if (r < 0)
+ return r;
+
+ (void) manager_run_environment_generators(m);
+ (void) manager_run_generators(m);
+
+ lookup_paths_log(&m->lookup_paths);
+
+ /* We flushed out generated files, for which we don't watch mtime, so we should flush the old map. */
+ manager_free_unit_name_maps(m);
+
+ /* First, enumerate what we can from kernel and suchlike */
+ manager_enumerate_perpetual(m);
+ manager_enumerate(m);
+
+ /* Second, deserialize our stored data */
+ r = manager_deserialize(m, f, fds);
+ if (r < 0)
+ log_warning_errno(r, "Deserialization failed, proceeding anyway: %m");
+
+ /* We don't need the serialization anymore */
+ f = safe_fclose(f);
+
+ /* Re-register notify_fd as event source, and set up other sockets/communication channels we might need */
+ (void) manager_setup_notify(m);
+ (void) manager_setup_cgroups_agent(m);
+ (void) manager_setup_user_lookup_fd(m);
+
+ /* Third, fire things up! */
+ manager_coldplug(m);
+
+ /* Clean up runtime objects no longer referenced */
+ manager_vacuum(m);
+
+ /* Clean up deserialized tracked clients */
+ m->deserialized_subscribed = strv_free(m->deserialized_subscribed);
+
+ /* Consider the reload process complete now. */
+ assert(m->n_reloading > 0);
+ m->n_reloading--;
+
+ manager_ready(m);
+
+ m->send_reloading_done = true;
+ return 0;
+}
+
+void manager_reset_failed(Manager *m) {
+ Unit *u;
+
+ assert(m);
+
+ HASHMAP_FOREACH(u, m->units)
+ unit_reset_failed(u);
+}
+
+bool manager_unit_inactive_or_pending(Manager *m, const char *name) {
+ Unit *u;
+
+ assert(m);
+ assert(name);
+
+ /* Returns true if the unit is inactive or going down */
+ u = manager_get_unit(m, name);
+ if (!u)
+ return true;
+
+ return unit_inactive_or_pending(u);
+}
+
+static void log_taint_string(Manager *m) {
+ _cleanup_free_ char *taint = NULL;
+
+ assert(m);
+
+ if (MANAGER_IS_USER(m) || m->taint_logged)
+ return;
+
+ m->taint_logged = true; /* only check for taint once */
+
+ taint = manager_taint_string(m);
+ if (isempty(taint))
+ return;
+
+ log_struct(LOG_NOTICE,
+ LOG_MESSAGE("System is tainted: %s", taint),
+ "TAINT=%s", taint,
+ "MESSAGE_ID=" SD_MESSAGE_TAINTED_STR);
+}
+
+static void manager_notify_finished(Manager *m) {
+ usec_t firmware_usec, loader_usec, kernel_usec, initrd_usec, userspace_usec, total_usec;
+
+ if (MANAGER_IS_TEST_RUN(m))
+ return;
+
+ if (MANAGER_IS_SYSTEM(m) && detect_container() <= 0) {
+ char buf[FORMAT_TIMESPAN_MAX + STRLEN(" (firmware) + ") + FORMAT_TIMESPAN_MAX + STRLEN(" (loader) + ")]
+ = {};
+ char *p = buf;
+ size_t size = sizeof buf;
+
+ /* Note that MANAGER_TIMESTAMP_KERNEL's monotonic value is always at 0, and
+ * MANAGER_TIMESTAMP_FIRMWARE's and MANAGER_TIMESTAMP_LOADER's monotonic value should be considered
+ * negative values. */
+
+ firmware_usec = m->timestamps[MANAGER_TIMESTAMP_FIRMWARE].monotonic - m->timestamps[MANAGER_TIMESTAMP_LOADER].monotonic;
+ loader_usec = m->timestamps[MANAGER_TIMESTAMP_LOADER].monotonic - m->timestamps[MANAGER_TIMESTAMP_KERNEL].monotonic;
+ userspace_usec = m->timestamps[MANAGER_TIMESTAMP_FINISH].monotonic - m->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic;
+ total_usec = m->timestamps[MANAGER_TIMESTAMP_FIRMWARE].monotonic + m->timestamps[MANAGER_TIMESTAMP_FINISH].monotonic;
+
+ if (firmware_usec > 0)
+ size = strpcpyf(&p, size, "%s (firmware) + ", FORMAT_TIMESPAN(firmware_usec, USEC_PER_MSEC));
+ if (loader_usec > 0)
+ size = strpcpyf(&p, size, "%s (loader) + ", FORMAT_TIMESPAN(loader_usec, USEC_PER_MSEC));
+
+ if (dual_timestamp_is_set(&m->timestamps[MANAGER_TIMESTAMP_INITRD])) {
+
+ /* The initrd case on bare-metal */
+ kernel_usec = m->timestamps[MANAGER_TIMESTAMP_INITRD].monotonic - m->timestamps[MANAGER_TIMESTAMP_KERNEL].monotonic;
+ initrd_usec = m->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic - m->timestamps[MANAGER_TIMESTAMP_INITRD].monotonic;
+
+ log_struct(LOG_INFO,
+ "MESSAGE_ID=" SD_MESSAGE_STARTUP_FINISHED_STR,
+ "KERNEL_USEC="USEC_FMT, kernel_usec,
+ "INITRD_USEC="USEC_FMT, initrd_usec,
+ "USERSPACE_USEC="USEC_FMT, userspace_usec,
+ LOG_MESSAGE("Startup finished in %s%s (kernel) + %s (initrd) + %s (userspace) = %s.",
+ buf,
+ FORMAT_TIMESPAN(kernel_usec, USEC_PER_MSEC),
+ FORMAT_TIMESPAN(initrd_usec, USEC_PER_MSEC),
+ FORMAT_TIMESPAN(userspace_usec, USEC_PER_MSEC),
+ FORMAT_TIMESPAN(total_usec, USEC_PER_MSEC)));
+ } else {
+ /* The initrd-less case on bare-metal */
+
+ kernel_usec = m->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic - m->timestamps[MANAGER_TIMESTAMP_KERNEL].monotonic;
+ initrd_usec = 0;
+
+ log_struct(LOG_INFO,
+ "MESSAGE_ID=" SD_MESSAGE_STARTUP_FINISHED_STR,
+ "KERNEL_USEC="USEC_FMT, kernel_usec,
+ "USERSPACE_USEC="USEC_FMT, userspace_usec,
+ LOG_MESSAGE("Startup finished in %s%s (kernel) + %s (userspace) = %s.",
+ buf,
+ FORMAT_TIMESPAN(kernel_usec, USEC_PER_MSEC),
+ FORMAT_TIMESPAN(userspace_usec, USEC_PER_MSEC),
+ FORMAT_TIMESPAN(total_usec, USEC_PER_MSEC)));
+ }
+ } else {
+ /* The container and --user case */
+ firmware_usec = loader_usec = initrd_usec = kernel_usec = 0;
+ total_usec = userspace_usec = m->timestamps[MANAGER_TIMESTAMP_FINISH].monotonic - m->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic;
+
+ log_struct(LOG_INFO,
+ "MESSAGE_ID=" SD_MESSAGE_USER_STARTUP_FINISHED_STR,
+ "USERSPACE_USEC="USEC_FMT, userspace_usec,
+ LOG_MESSAGE("Startup finished in %s.",
+ FORMAT_TIMESPAN(total_usec, USEC_PER_MSEC)));
+ }
+
+ bus_manager_send_finished(m, firmware_usec, loader_usec, kernel_usec, initrd_usec, userspace_usec, total_usec);
+
+ log_taint_string(m);
+}
+
+static void user_manager_send_ready(Manager *m) {
+ int r;
+
+ assert(m);
+
+ /* We send READY=1 on reaching basic.target only when running in --user mode. */
+ if (!MANAGER_IS_USER(m) || m->ready_sent)
+ return;
+
+ r = sd_notify(false,
+ "READY=1\n"
+ "STATUS=Reached " SPECIAL_BASIC_TARGET ".");
+ if (r < 0)
+ log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
+
+ m->ready_sent = true;
+ m->status_ready = false;
+}
+
+static void manager_send_ready(Manager *m) {
+ int r;
+
+ if (m->ready_sent && m->status_ready)
+ /* Skip the notification if nothing changed. */
+ return;
+
+ r = sd_notify(false,
+ "READY=1\n"
+ "STATUS=Ready.");
+ if (r < 0)
+ log_full_errno(m->ready_sent ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to send readiness notification, ignoring: %m");
+
+ m->ready_sent = m->status_ready = true;
+}
+
+static void manager_check_basic_target(Manager *m) {
+ Unit *u;
+
+ assert(m);
+
+ /* Small shortcut */
+ if (m->ready_sent && m->taint_logged)
+ return;
+
+ u = manager_get_unit(m, SPECIAL_BASIC_TARGET);
+ if (!u || !UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u)))
+ return;
+
+ /* For user managers, send out READY=1 as soon as we reach basic.target */
+ user_manager_send_ready(m);
+
+ /* Log the taint string as soon as we reach basic.target */
+ log_taint_string(m);
+}
+
+void manager_check_finished(Manager *m) {
+ assert(m);
+
+ if (MANAGER_IS_RELOADING(m))
+ return;
+
+ /* Verify that we have entered the event loop already, and not left it again. */
+ if (!MANAGER_IS_RUNNING(m))
+ return;
+
+ manager_check_basic_target(m);
+
+ if (hashmap_size(m->jobs) > 0) {
+ if (m->jobs_in_progress_event_source)
+ /* Ignore any failure, this is only for feedback */
+ (void) sd_event_source_set_time(m->jobs_in_progress_event_source,
+ manager_watch_jobs_next_time(m));
+ return;
+ }
+
+ /* The jobs hashmap tends to grow a lot during boot, and then it's not reused until shutdown. Let's
+ kill the hashmap if it is relatively large. */
+ if (hashmap_buckets(m->jobs) > hashmap_size(m->units) / 10)
+ m->jobs = hashmap_free(m->jobs);
+
+ manager_send_ready(m);
+
+ /* Notify Type=idle units that we are done now */
+ manager_close_idle_pipe(m);
+
+ if (MANAGER_IS_FINISHED(m))
+ return;
+
+ manager_flip_auto_status(m, false, "boot finished");
+
+ /* Turn off confirm spawn now */
+ m->confirm_spawn = NULL;
+
+ /* No need to update ask password status when we're going non-interactive */
+ manager_close_ask_password(m);
+
+ /* This is no longer the first boot */
+ manager_set_first_boot(m, false);
+
+ dual_timestamp_get(m->timestamps + MANAGER_TIMESTAMP_FINISH);
+
+ manager_notify_finished(m);
+
+ manager_invalidate_startup_units(m);
+}
+
+static bool generator_path_any(const char* const* paths) {
+ bool found = false;
+
+ /* Optimize by skipping the whole process by not creating output directories
+ * if no generators are found. */
+ STRV_FOREACH(path, paths)
+ if (access(*path, F_OK) == 0)
+ found = true;
+ else if (errno != ENOENT)
+ log_warning_errno(errno, "Failed to open generator directory %s: %m", *path);
+
+ return found;
+}
+
+static int manager_run_environment_generators(Manager *m) {
+ char **tmp = NULL; /* this is only used in the forked process, no cleanup here */
+ _cleanup_strv_free_ char **paths = NULL;
+ void* args[] = {
+ [STDOUT_GENERATE] = &tmp,
+ [STDOUT_COLLECT] = &tmp,
+ [STDOUT_CONSUME] = &m->transient_environment,
+ };
+ int r;
+
+ if (MANAGER_IS_TEST_RUN(m) && !(m->test_run_flags & MANAGER_TEST_RUN_ENV_GENERATORS))
+ return 0;
+
+ paths = env_generator_binary_paths(MANAGER_IS_SYSTEM(m));
+ if (!paths)
+ return log_oom();
+
+ if (!generator_path_any((const char* const*) paths))
+ return 0;
+
+ RUN_WITH_UMASK(0022)
+ r = execute_directories((const char* const*) paths, DEFAULT_TIMEOUT_USEC, gather_environment,
+ args, NULL, m->transient_environment,
+ EXEC_DIR_PARALLEL | EXEC_DIR_IGNORE_ERRORS | EXEC_DIR_SET_SYSTEMD_EXEC_PID);
+ return r;
+}
+
+static int build_generator_environment(Manager *m, char ***ret) {
+ _cleanup_strv_free_ char **nl = NULL;
+ Virtualization v;
+ int r;
+
+ assert(m);
+ assert(ret);
+
+ /* Generators oftentimes want to know some basic facts about the environment they run in, in order to
+ * adjust generated units to that. Let's pass down some bits of information that are easy for us to
+ * determine (but a bit harder for generator scripts to determine), as environment variables. */
+
+ nl = strv_copy(m->transient_environment);
+ if (!nl)
+ return -ENOMEM;
+
+ r = strv_env_assign(&nl, "SYSTEMD_SCOPE", MANAGER_IS_SYSTEM(m) ? "system" : "user");
+ if (r < 0)
+ return r;
+
+ if (MANAGER_IS_SYSTEM(m)) {
+ /* Note that $SYSTEMD_IN_INITRD may be used to override the initrd detection in much of our
+ * codebase. This is hence more than purely informational. It will shortcut detection of the
+ * initrd state if generators invoke our own tools. But that's OK, as it would come to the
+ * same results (hopefully). */
+ r = strv_env_assign(&nl, "SYSTEMD_IN_INITRD", one_zero(in_initrd()));
+ if (r < 0)
+ return r;
+
+ if (m->first_boot >= 0) {
+ r = strv_env_assign(&nl, "SYSTEMD_FIRST_BOOT", one_zero(m->first_boot));
+ if (r < 0)
+ return r;
+ }
+ }
+
+ v = detect_virtualization();
+ if (v < 0)
+ log_debug_errno(v, "Failed to detect virtualization, ignoring: %m");
+ else if (v > 0) {
+ const char *s;
+
+ s = strjoina(VIRTUALIZATION_IS_VM(v) ? "vm:" :
+ VIRTUALIZATION_IS_CONTAINER(v) ? "container:" : ":",
+ virtualization_to_string(v));
+
+ r = strv_env_assign(&nl, "SYSTEMD_VIRTUALIZATION", s);
+ if (r < 0)
+ return r;
+ }
+
+ r = strv_env_assign(&nl, "SYSTEMD_ARCHITECTURE", architecture_to_string(uname_architecture()));
+ if (r < 0)
+ return r;
+
+ *ret = TAKE_PTR(nl);
+ return 0;
+}
+
+static int manager_run_generators(Manager *m) {
+ _cleanup_strv_free_ char **paths = NULL, **ge = NULL;
+ int r;
+
+ assert(m);
+
+ if (MANAGER_IS_TEST_RUN(m) && !(m->test_run_flags & MANAGER_TEST_RUN_GENERATORS))
+ return 0;
+
+ paths = generator_binary_paths(m->unit_file_scope);
+ if (!paths)
+ return log_oom();
+
+ if (!generator_path_any((const char* const*) paths))
+ return 0;
+
+ r = lookup_paths_mkdir_generator(&m->lookup_paths);
+ if (r < 0) {
+ log_error_errno(r, "Failed to create generator directories: %m");
+ goto finish;
+ }
+
+ const char *argv[] = {
+ NULL, /* Leave this empty, execute_directory() will fill something in */
+ m->lookup_paths.generator,
+ m->lookup_paths.generator_early,
+ m->lookup_paths.generator_late,
+ NULL,
+ };
+
+ r = build_generator_environment(m, &ge);
+ if (r < 0) {
+ log_error_errno(r, "Failed to build generator environment: %m");
+ goto finish;
+ }
+
+ RUN_WITH_UMASK(0022)
+ (void) execute_directories(
+ (const char* const*) paths,
+ DEFAULT_TIMEOUT_USEC,
+ /* callbacks= */ NULL, /* callback_args= */ NULL,
+ (char**) argv,
+ ge,
+ EXEC_DIR_PARALLEL | EXEC_DIR_IGNORE_ERRORS | EXEC_DIR_SET_SYSTEMD_EXEC_PID);
+
+ r = 0;
+
+finish:
+ lookup_paths_trim_generator(&m->lookup_paths);
+ return r;
+}
+
+int manager_transient_environment_add(Manager *m, char **plus) {
+ char **a;
+
+ assert(m);
+
+ if (strv_isempty(plus))
+ return 0;
+
+ a = strv_env_merge(m->transient_environment, plus);
+ if (!a)
+ return log_oom();
+
+ sanitize_environment(a);
+
+ return strv_free_and_replace(m->transient_environment, a);
+}
+
+int manager_client_environment_modify(
+ Manager *m,
+ char **minus,
+ char **plus) {
+
+ char **a = NULL, **b = NULL, **l;
+
+ assert(m);
+
+ if (strv_isempty(minus) && strv_isempty(plus))
+ return 0;
+
+ l = m->client_environment;
+
+ if (!strv_isempty(minus)) {
+ a = strv_env_delete(l, 1, minus);
+ if (!a)
+ return -ENOMEM;
+
+ l = a;
+ }
+
+ if (!strv_isempty(plus)) {
+ b = strv_env_merge(l, plus);
+ if (!b) {
+ strv_free(a);
+ return -ENOMEM;
+ }
+
+ l = b;
+ }
+
+ if (m->client_environment != l)
+ strv_free(m->client_environment);
+
+ if (a != l)
+ strv_free(a);
+ if (b != l)
+ strv_free(b);
+
+ m->client_environment = sanitize_environment(l);
+ return 0;
+}
+
+int manager_get_effective_environment(Manager *m, char ***ret) {
+ char **l;
+
+ assert(m);
+ assert(ret);
+
+ l = strv_env_merge(m->transient_environment, m->client_environment);
+ if (!l)
+ return -ENOMEM;
+
+ *ret = l;
+ return 0;
+}
+
+int manager_set_default_smack_process_label(Manager *m, const char *label) {
+ assert(m);
+
+#ifdef SMACK_DEFAULT_PROCESS_LABEL
+ if (!label)
+ return free_and_strdup(&m->default_smack_process_label, SMACK_DEFAULT_PROCESS_LABEL);
+#endif
+ if (streq_ptr(label, "/"))
+ return free_and_strdup(&m->default_smack_process_label, NULL);
+
+ return free_and_strdup(&m->default_smack_process_label, label);
+}
+
+int manager_set_default_rlimits(Manager *m, struct rlimit **default_rlimit) {
+ assert(m);
+
+ for (unsigned i = 0; i < _RLIMIT_MAX; i++) {
+ m->rlimit[i] = mfree(m->rlimit[i]);
+
+ if (!default_rlimit[i])
+ continue;
+
+ m->rlimit[i] = newdup(struct rlimit, default_rlimit[i], 1);
+ if (!m->rlimit[i])
+ return log_oom();
+ }
+
+ return 0;
+}
+
+void manager_recheck_dbus(Manager *m) {
+ assert(m);
+
+ /* Connects to the bus if the dbus service and socket are running. If we are running in user mode
+ * this is all it does. In system mode we'll also connect to the system bus (which will most likely
+ * just reuse the connection of the API bus). That's because the system bus after all runs as service
+ * of the system instance, while in the user instance we can assume it's already there. */
+
+ if (MANAGER_IS_RELOADING(m))
+ return; /* don't check while we are reloading… */
+
+ if (manager_dbus_is_running(m, false)) {
+ (void) bus_init_api(m);
+
+ if (MANAGER_IS_SYSTEM(m))
+ (void) bus_init_system(m);
+ } else {
+ (void) bus_done_api(m);
+
+ if (MANAGER_IS_SYSTEM(m))
+ (void) bus_done_system(m);
+ }
+}
+
+static bool manager_journal_is_running(Manager *m) {
+ Unit *u;
+
+ assert(m);
+
+ if (MANAGER_IS_TEST_RUN(m))
+ return false;
+
+ /* If we are the user manager we can safely assume that the journal is up */
+ if (!MANAGER_IS_SYSTEM(m))
+ return true;
+
+ /* Check that the socket is not only up, but in RUNNING state */
+ u = manager_get_unit(m, SPECIAL_JOURNALD_SOCKET);
+ if (!u)
+ return false;
+ if (SOCKET(u)->state != SOCKET_RUNNING)
+ return false;
+
+ /* Similar, check if the daemon itself is fully up, too */
+ u = manager_get_unit(m, SPECIAL_JOURNALD_SERVICE);
+ if (!u)
+ return false;
+ if (!IN_SET(SERVICE(u)->state, SERVICE_RELOAD, SERVICE_RUNNING))
+ return false;
+
+ return true;
+}
+
+void disable_printk_ratelimit(void) {
+ /* Disable kernel's printk ratelimit.
+ *
+ * Logging to /dev/kmsg is most useful during early boot and shutdown, where normal logging
+ * mechanisms are not available. The semantics of this sysctl are such that any kernel command-line
+ * setting takes precedence. */
+ int r;
+
+ r = sysctl_write("kernel/printk_devkmsg", "on");
+ if (r < 0)
+ log_debug_errno(r, "Failed to set sysctl kernel.printk_devkmsg=on: %m");
+}
+
+void manager_recheck_journal(Manager *m) {
+
+ assert(m);
+
+ /* Don't bother with this unless we are in the special situation of being PID 1 */
+ if (getpid_cached() != 1)
+ return;
+
+ /* Don't check this while we are reloading, things might still change */
+ if (MANAGER_IS_RELOADING(m))
+ return;
+
+ /* The journal is fully and entirely up? If so, let's permit logging to it, if that's configured. If
+ * the journal is down, don't ever log to it, otherwise we might end up deadlocking ourselves as we
+ * might trigger an activation ourselves we can't fulfill. */
+ log_set_prohibit_ipc(!manager_journal_is_running(m));
+ log_open();
+}
+
+static ShowStatus manager_get_show_status(Manager *m) {
+ assert(m);
+
+ if (MANAGER_IS_USER(m))
+ return _SHOW_STATUS_INVALID;
+
+ if (m->show_status_overridden != _SHOW_STATUS_INVALID)
+ return m->show_status_overridden;
+
+ return m->show_status;
+}
+
+bool manager_get_show_status_on(Manager *m) {
+ assert(m);
+
+ return show_status_on(manager_get_show_status(m));
+}
+
+static void set_show_status_marker(bool b) {
+ if (b)
+ (void) touch("/run/systemd/show-status");
+ else
+ (void) unlink("/run/systemd/show-status");
+}
+
+void manager_set_show_status(Manager *m, ShowStatus mode, const char *reason) {
+ assert(m);
+ assert(reason);
+ assert(mode >= 0 && mode < _SHOW_STATUS_MAX);
+
+ if (MANAGER_IS_USER(m))
+ return;
+
+ if (mode == m->show_status)
+ return;
+
+ if (m->show_status_overridden == _SHOW_STATUS_INVALID) {
+ bool enabled;
+
+ enabled = show_status_on(mode);
+ log_debug("%s (%s) showing of status (%s).",
+ enabled ? "Enabling" : "Disabling",
+ strna(show_status_to_string(mode)),
+ reason);
+
+ set_show_status_marker(enabled);
+ }
+
+ m->show_status = mode;
+}
+
+void manager_override_show_status(Manager *m, ShowStatus mode, const char *reason) {
+ assert(m);
+ assert(mode < _SHOW_STATUS_MAX);
+
+ if (MANAGER_IS_USER(m))
+ return;
+
+ if (mode == m->show_status_overridden)
+ return;
+
+ m->show_status_overridden = mode;
+
+ if (mode == _SHOW_STATUS_INVALID)
+ mode = m->show_status;
+
+ log_debug("%s (%s) showing of status (%s).",
+ m->show_status_overridden != _SHOW_STATUS_INVALID ? "Overriding" : "Restoring",
+ strna(show_status_to_string(mode)),
+ reason);
+
+ set_show_status_marker(show_status_on(mode));
+}
+
+const char *manager_get_confirm_spawn(Manager *m) {
+ static int last_errno = 0;
+ struct stat st;
+ int r;
+
+ assert(m);
+
+ /* Here's the deal: we want to test the validity of the console but don't want
+ * PID1 to go through the whole console process which might block. But we also
+ * want to warn the user only once if something is wrong with the console so we
+ * cannot do the sanity checks after spawning our children. So here we simply do
+ * really basic tests to hopefully trap common errors.
+ *
+ * If the console suddenly disappear at the time our children will really it
+ * then they will simply fail to acquire it and a positive answer will be
+ * assumed. New children will fall back to /dev/console though.
+ *
+ * Note: TTYs are devices that can come and go any time, and frequently aren't
+ * available yet during early boot (consider a USB rs232 dongle...). If for any
+ * reason the configured console is not ready, we fall back to the default
+ * console. */
+
+ if (!m->confirm_spawn || path_equal(m->confirm_spawn, "/dev/console"))
+ return m->confirm_spawn;
+
+ if (stat(m->confirm_spawn, &st) < 0) {
+ r = -errno;
+ goto fail;
+ }
+
+ if (!S_ISCHR(st.st_mode)) {
+ r = -ENOTTY;
+ goto fail;
+ }
+
+ last_errno = 0;
+ return m->confirm_spawn;
+
+fail:
+ if (last_errno != r)
+ last_errno = log_warning_errno(r, "Failed to open %s, using default console: %m", m->confirm_spawn);
+
+ return "/dev/console";
+}
+
+void manager_set_first_boot(Manager *m, bool b) {
+ assert(m);
+
+ if (!MANAGER_IS_SYSTEM(m))
+ return;
+
+ if (m->first_boot != (int) b) {
+ if (b)
+ (void) touch("/run/systemd/first-boot");
+ else
+ (void) unlink("/run/systemd/first-boot");
+ }
+
+ m->first_boot = b;
+}
+
+void manager_disable_confirm_spawn(void) {
+ (void) touch("/run/systemd/confirm_spawn_disabled");
+}
+
+bool manager_is_confirm_spawn_disabled(Manager *m) {
+ if (!m->confirm_spawn)
+ return true;
+
+ return access("/run/systemd/confirm_spawn_disabled", F_OK) >= 0;
+}
+
+static bool manager_should_show_status(Manager *m, StatusType type) {
+ assert(m);
+
+ if (!MANAGER_IS_SYSTEM(m))
+ return false;
+
+ if (m->no_console_output)
+ return false;
+
+ if (!IN_SET(manager_state(m), MANAGER_INITIALIZING, MANAGER_STARTING, MANAGER_STOPPING))
+ return false;
+
+ /* If we cannot find out the status properly, just proceed. */
+ if (type != STATUS_TYPE_EMERGENCY && manager_check_ask_password(m) > 0)
+ return false;
+
+ if (type == STATUS_TYPE_NOTICE && m->show_status != SHOW_STATUS_NO)
+ return true;
+
+ return manager_get_show_status_on(m);
+}
+
+void manager_status_printf(Manager *m, StatusType type, const char *status, const char *format, ...) {
+ va_list ap;
+
+ /* If m is NULL, assume we're after shutdown and let the messages through. */
+
+ if (m && !manager_should_show_status(m, type))
+ return;
+
+ /* XXX We should totally drop the check for ephemeral here
+ * and thus effectively make 'Type=idle' pointless. */
+ if (type == STATUS_TYPE_EPHEMERAL && m && m->n_on_console > 0)
+ return;
+
+ va_start(ap, format);
+ status_vprintf(status, SHOW_STATUS_ELLIPSIZE|(type == STATUS_TYPE_EPHEMERAL ? SHOW_STATUS_EPHEMERAL : 0), format, ap);
+ va_end(ap);
+}
+
+Set* manager_get_units_requiring_mounts_for(Manager *m, const char *path) {
+ assert(m);
+ assert(path);
+
+ if (path_equal(path, "/"))
+ path = "";
+
+ return hashmap_get(m->units_requiring_mounts_for, path);
+}
+
+int manager_update_failed_units(Manager *m, Unit *u, bool failed) {
+ unsigned size;
+ int r;
+
+ assert(m);
+ assert(u->manager == m);
+
+ size = set_size(m->failed_units);
+
+ if (failed) {
+ r = set_ensure_put(&m->failed_units, NULL, u);
+ if (r < 0)
+ return log_oom();
+ } else
+ (void) set_remove(m->failed_units, u);
+
+ if (set_size(m->failed_units) != size)
+ bus_manager_send_change_signal(m);
+
+ return 0;
+}
+
+ManagerState manager_state(Manager *m) {
+ Unit *u;
+
+ assert(m);
+
+ /* Is the special shutdown target active or queued? If so, we are in shutdown state */
+ u = manager_get_unit(m, SPECIAL_SHUTDOWN_TARGET);
+ if (u && unit_active_or_pending(u))
+ return MANAGER_STOPPING;
+
+ /* Did we ever finish booting? If not then we are still starting up */
+ if (!MANAGER_IS_FINISHED(m)) {
+
+ u = manager_get_unit(m, SPECIAL_BASIC_TARGET);
+ if (!u || !UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u)))
+ return MANAGER_INITIALIZING;
+
+ return MANAGER_STARTING;
+ }
+
+ if (MANAGER_IS_SYSTEM(m)) {
+ /* Are the rescue or emergency targets active or queued? If so we are in maintenance state */
+ u = manager_get_unit(m, SPECIAL_RESCUE_TARGET);
+ if (u && unit_active_or_pending(u))
+ return MANAGER_MAINTENANCE;
+
+ u = manager_get_unit(m, SPECIAL_EMERGENCY_TARGET);
+ if (u && unit_active_or_pending(u))
+ return MANAGER_MAINTENANCE;
+ }
+
+ /* Are there any failed units? If so, we are in degraded mode */
+ if (set_size(m->failed_units) > 0)
+ return MANAGER_DEGRADED;
+
+ return MANAGER_RUNNING;
+}
+
+static void manager_unref_uid_internal(
+ Hashmap *uid_refs,
+ uid_t uid,
+ bool destroy_now,
+ int (*_clean_ipc)(uid_t uid)) {
+
+ uint32_t c, n;
+
+ assert(uid_is_valid(uid));
+ assert(_clean_ipc);
+
+ /* A generic implementation, covering both manager_unref_uid() and manager_unref_gid(), under the
+ * assumption that uid_t and gid_t are actually defined the same way, with the same validity rules.
+ *
+ * We store a hashmap where the key is the UID/GID and the value is a 32bit reference counter, whose
+ * highest bit is used as flag for marking UIDs/GIDs whose IPC objects to remove when the last
+ * reference to the UID/GID is dropped. The flag is set to on, once at least one reference from a
+ * unit where RemoveIPC= is set is added on a UID/GID. It is reset when the UID's/GID's reference
+ * counter drops to 0 again. */
+
+ assert_cc(sizeof(uid_t) == sizeof(gid_t));
+ assert_cc(UID_INVALID == (uid_t) GID_INVALID);
+
+ if (uid == 0) /* We don't keep track of root, and will never destroy it */
+ return;
+
+ c = PTR_TO_UINT32(hashmap_get(uid_refs, UID_TO_PTR(uid)));
+
+ n = c & ~DESTROY_IPC_FLAG;
+ assert(n > 0);
+ n--;
+
+ if (destroy_now && n == 0) {
+ hashmap_remove(uid_refs, UID_TO_PTR(uid));
+
+ if (c & DESTROY_IPC_FLAG) {
+ log_debug("%s " UID_FMT " is no longer referenced, cleaning up its IPC.",
+ _clean_ipc == clean_ipc_by_uid ? "UID" : "GID",
+ uid);
+ (void) _clean_ipc(uid);
+ }
+ } else {
+ c = n | (c & DESTROY_IPC_FLAG);
+ assert_se(hashmap_update(uid_refs, UID_TO_PTR(uid), UINT32_TO_PTR(c)) >= 0);
+ }
+}
+
+void manager_unref_uid(Manager *m, uid_t uid, bool destroy_now) {
+ manager_unref_uid_internal(m->uid_refs, uid, destroy_now, clean_ipc_by_uid);
+}
+
+void manager_unref_gid(Manager *m, gid_t gid, bool destroy_now) {
+ manager_unref_uid_internal(m->gid_refs, (uid_t) gid, destroy_now, clean_ipc_by_gid);
+}
+
+static int manager_ref_uid_internal(
+ Hashmap **uid_refs,
+ uid_t uid,
+ bool clean_ipc) {
+
+ uint32_t c, n;
+ int r;
+
+ assert(uid_refs);
+ assert(uid_is_valid(uid));
+
+ /* A generic implementation, covering both manager_ref_uid() and manager_ref_gid(), under the
+ * assumption that uid_t and gid_t are actually defined the same way, with the same validity
+ * rules. */
+
+ assert_cc(sizeof(uid_t) == sizeof(gid_t));
+ assert_cc(UID_INVALID == (uid_t) GID_INVALID);
+
+ if (uid == 0) /* We don't keep track of root, and will never destroy it */
+ return 0;
+
+ r = hashmap_ensure_allocated(uid_refs, &trivial_hash_ops);
+ if (r < 0)
+ return r;
+
+ c = PTR_TO_UINT32(hashmap_get(*uid_refs, UID_TO_PTR(uid)));
+
+ n = c & ~DESTROY_IPC_FLAG;
+ n++;
+
+ if (n & DESTROY_IPC_FLAG) /* check for overflow */
+ return -EOVERFLOW;
+
+ c = n | (c & DESTROY_IPC_FLAG) | (clean_ipc ? DESTROY_IPC_FLAG : 0);
+
+ return hashmap_replace(*uid_refs, UID_TO_PTR(uid), UINT32_TO_PTR(c));
+}
+
+int manager_ref_uid(Manager *m, uid_t uid, bool clean_ipc) {
+ return manager_ref_uid_internal(&m->uid_refs, uid, clean_ipc);
+}
+
+int manager_ref_gid(Manager *m, gid_t gid, bool clean_ipc) {
+ return manager_ref_uid_internal(&m->gid_refs, (uid_t) gid, clean_ipc);
+}
+
+static void manager_vacuum_uid_refs_internal(
+ Hashmap *uid_refs,
+ int (*_clean_ipc)(uid_t uid)) {
+
+ void *p, *k;
+
+ assert(_clean_ipc);
+
+ HASHMAP_FOREACH_KEY(p, k, uid_refs) {
+ uint32_t c, n;
+ uid_t uid;
+
+ uid = PTR_TO_UID(k);
+ c = PTR_TO_UINT32(p);
+
+ n = c & ~DESTROY_IPC_FLAG;
+ if (n > 0)
+ continue;
+
+ if (c & DESTROY_IPC_FLAG) {
+ log_debug("Found unreferenced %s " UID_FMT " after reload/reexec. Cleaning up.",
+ _clean_ipc == clean_ipc_by_uid ? "UID" : "GID",
+ uid);
+ (void) _clean_ipc(uid);
+ }
+
+ assert_se(hashmap_remove(uid_refs, k) == p);
+ }
+}
+
+static void manager_vacuum_uid_refs(Manager *m) {
+ manager_vacuum_uid_refs_internal(m->uid_refs, clean_ipc_by_uid);
+}
+
+static void manager_vacuum_gid_refs(Manager *m) {
+ manager_vacuum_uid_refs_internal(m->gid_refs, clean_ipc_by_gid);
+}
+
+static void manager_vacuum(Manager *m) {
+ assert(m);
+
+ /* Release any dynamic users no longer referenced */
+ dynamic_user_vacuum(m, true);
+
+ /* Release any references to UIDs/GIDs no longer referenced, and destroy any IPC owned by them */
+ manager_vacuum_uid_refs(m);
+ manager_vacuum_gid_refs(m);
+
+ /* Release any runtimes no longer referenced */
+ exec_runtime_vacuum(m);
+}
+
+int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+ struct buffer {
+ uid_t uid;
+ gid_t gid;
+ char unit_name[UNIT_NAME_MAX+1];
+ } _packed_ buffer;
+
+ Manager *m = userdata;
+ ssize_t l;
+ size_t n;
+ Unit *u;
+
+ assert_se(source);
+ assert_se(m);
+
+ /* Invoked whenever a child process succeeded resolving its user/group to use and sent us the
+ * resulting UID/GID in a datagram. We parse the datagram here and pass it off to the unit, so that
+ * it can add a reference to the UID/GID so that it can destroy the UID/GID's IPC objects when the
+ * reference counter drops to 0. */
+
+ l = recv(fd, &buffer, sizeof(buffer), MSG_DONTWAIT);
+ if (l < 0) {
+ if (ERRNO_IS_TRANSIENT(errno))
+ return 0;
+
+ return log_error_errno(errno, "Failed to read from user lookup fd: %m");
+ }
+
+ if ((size_t) l <= offsetof(struct buffer, unit_name)) {
+ log_warning("Received too short user lookup message, ignoring.");
+ return 0;
+ }
+
+ if ((size_t) l > offsetof(struct buffer, unit_name) + UNIT_NAME_MAX) {
+ log_warning("Received too long user lookup message, ignoring.");
+ return 0;
+ }
+
+ if (!uid_is_valid(buffer.uid) && !gid_is_valid(buffer.gid)) {
+ log_warning("Got user lookup message with invalid UID/GID pair, ignoring.");
+ return 0;
+ }
+
+ n = (size_t) l - offsetof(struct buffer, unit_name);
+ if (memchr(buffer.unit_name, 0, n)) {
+ log_warning("Received lookup message with embedded NUL character, ignoring.");
+ return 0;
+ }
+
+ buffer.unit_name[n] = 0;
+ u = manager_get_unit(m, buffer.unit_name);
+ if (!u) {
+ log_debug("Got user lookup message but unit doesn't exist, ignoring.");
+ return 0;
+ }
+
+ log_unit_debug(u, "User lookup succeeded: uid=" UID_FMT " gid=" GID_FMT, buffer.uid, buffer.gid);
+
+ unit_notify_user_lookup(u, buffer.uid, buffer.gid);
+ return 0;
+}
+
+static int short_uid_range(const char *path) {
+ _cleanup_(uid_range_freep) UidRange *p = NULL;
+ int r;
+
+ assert(path);
+
+ /* Taint systemd if we the UID range assigned to this environment doesn't at least cover 0…65534,
+ * i.e. from root to nobody. */
+
+ r = uid_range_load_userns(&p, path);
+ if (r < 0) {
+ if (ERRNO_IS_NOT_SUPPORTED(r))
+ return false;
+ return log_debug_errno(r, "Failed to load %s: %m", path);
+ }
+
+ return !uid_range_covers(p, 0, 65535);
+}
+
+char* manager_taint_string(const Manager *m) {
+ /* Returns a "taint string", e.g. "local-hwclock:var-run-bad". Only things that are detected at
+ * runtime should be tagged here. For stuff that is known during compilation, emit a warning in the
+ * configuration phase. */
+
+ assert(m);
+
+ const char* stage[13] = {};
+ size_t n = 0;
+
+ if (m->taint_usr)
+ stage[n++] = "split-usr";
+
+ _cleanup_free_ char *usrbin = NULL;
+ if (readlink_malloc("/bin", &usrbin) < 0 || !PATH_IN_SET(usrbin, "usr/bin", "/usr/bin"))
+ stage[n++] = "unmerged-usr";
+
+ if (access("/proc/cgroups", F_OK) < 0)
+ stage[n++] = "cgroups-missing";
+
+ if (cg_all_unified() == 0)
+ stage[n++] = "cgroupsv1";
+
+ if (clock_is_localtime(NULL) > 0)
+ stage[n++] = "local-hwclock";
+
+ if (os_release_support_ended(NULL, true) > 0)
+ stage[n++] = "support-ended";
+
+ _cleanup_free_ char *destination = NULL;
+ if (readlink_malloc("/var/run", &destination) < 0 ||
+ !PATH_IN_SET(destination, "../run", "/run"))
+ stage[n++] = "var-run-bad";
+
+ _cleanup_free_ char *overflowuid = NULL, *overflowgid = NULL;
+ if (read_one_line_file("/proc/sys/kernel/overflowuid", &overflowuid) >= 0 &&
+ !streq(overflowuid, "65534"))
+ stage[n++] = "overflowuid-not-65534";
+ if (read_one_line_file("/proc/sys/kernel/overflowgid", &overflowgid) >= 0 &&
+ !streq(overflowgid, "65534"))
+ stage[n++] = "overflowgid-not-65534";
+
+ struct utsname uts;
+ assert_se(uname(&uts) >= 0);
+ if (strverscmp_improved(uts.release, KERNEL_BASELINE_VERSION) < 0)
+ stage[n++] = "old-kernel";
+
+ if (short_uid_range("/proc/self/uid_map") > 0)
+ stage[n++] = "short-uid-range";
+ if (short_uid_range("/proc/self/gid_map") > 0)
+ stage[n++] = "short-gid-range";
+
+ assert(n < ELEMENTSOF(stage) - 1); /* One extra for NULL terminator */
+
+ return strv_join((char**) stage, ":");
+}
+
+void manager_ref_console(Manager *m) {
+ assert(m);
+
+ m->n_on_console++;
+}
+
+void manager_unref_console(Manager *m) {
+
+ assert(m->n_on_console > 0);
+ m->n_on_console--;
+
+ if (m->n_on_console == 0)
+ m->no_console_output = false; /* unset no_console_output flag, since the console is definitely free now */
+}
+
+void manager_override_log_level(Manager *m, int level) {
+ _cleanup_free_ char *s = NULL;
+ assert(m);
+
+ if (!m->log_level_overridden) {
+ m->original_log_level = log_get_max_level();
+ m->log_level_overridden = true;
+ }
+
+ (void) log_level_to_string_alloc(level, &s);
+ log_info("Setting log level to %s.", strna(s));
+
+ log_set_max_level(level);
+}
+
+void manager_restore_original_log_level(Manager *m) {
+ _cleanup_free_ char *s = NULL;
+ assert(m);
+
+ if (!m->log_level_overridden)
+ return;
+
+ (void) log_level_to_string_alloc(m->original_log_level, &s);
+ log_info("Restoring log level to original (%s).", strna(s));
+
+ log_set_max_level(m->original_log_level);
+ m->log_level_overridden = false;
+}
+
+void manager_override_log_target(Manager *m, LogTarget target) {
+ assert(m);
+
+ if (!m->log_target_overridden) {
+ m->original_log_target = log_get_target();
+ m->log_target_overridden = true;
+ }
+
+ log_info("Setting log target to %s.", log_target_to_string(target));
+ log_set_target(target);
+}
+
+void manager_restore_original_log_target(Manager *m) {
+ assert(m);
+
+ if (!m->log_target_overridden)
+ return;
+
+ log_info("Restoring log target to original %s.", log_target_to_string(m->original_log_target));
+
+ log_set_target(m->original_log_target);
+ m->log_target_overridden = false;
+}
+
+ManagerTimestamp manager_timestamp_initrd_mangle(ManagerTimestamp s) {
+ if (in_initrd() &&
+ s >= MANAGER_TIMESTAMP_SECURITY_START &&
+ s <= MANAGER_TIMESTAMP_UNITS_LOAD_FINISH)
+ return s - MANAGER_TIMESTAMP_SECURITY_START + MANAGER_TIMESTAMP_INITRD_SECURITY_START;
+ return s;
+}
+
+static const char *const manager_state_table[_MANAGER_STATE_MAX] = {
+ [MANAGER_INITIALIZING] = "initializing",
+ [MANAGER_STARTING] = "starting",
+ [MANAGER_RUNNING] = "running",
+ [MANAGER_DEGRADED] = "degraded",
+ [MANAGER_MAINTENANCE] = "maintenance",
+ [MANAGER_STOPPING] = "stopping",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(manager_state, ManagerState);
+
+static const char *const manager_timestamp_table[_MANAGER_TIMESTAMP_MAX] = {
+ [MANAGER_TIMESTAMP_FIRMWARE] = "firmware",
+ [MANAGER_TIMESTAMP_LOADER] = "loader",
+ [MANAGER_TIMESTAMP_KERNEL] = "kernel",
+ [MANAGER_TIMESTAMP_INITRD] = "initrd",
+ [MANAGER_TIMESTAMP_USERSPACE] = "userspace",
+ [MANAGER_TIMESTAMP_FINISH] = "finish",
+ [MANAGER_TIMESTAMP_SECURITY_START] = "security-start",
+ [MANAGER_TIMESTAMP_SECURITY_FINISH] = "security-finish",
+ [MANAGER_TIMESTAMP_GENERATORS_START] = "generators-start",
+ [MANAGER_TIMESTAMP_GENERATORS_FINISH] = "generators-finish",
+ [MANAGER_TIMESTAMP_UNITS_LOAD_START] = "units-load-start",
+ [MANAGER_TIMESTAMP_UNITS_LOAD_FINISH] = "units-load-finish",
+ [MANAGER_TIMESTAMP_UNITS_LOAD] = "units-load",
+ [MANAGER_TIMESTAMP_INITRD_SECURITY_START] = "initrd-security-start",
+ [MANAGER_TIMESTAMP_INITRD_SECURITY_FINISH] = "initrd-security-finish",
+ [MANAGER_TIMESTAMP_INITRD_GENERATORS_START] = "initrd-generators-start",
+ [MANAGER_TIMESTAMP_INITRD_GENERATORS_FINISH] = "initrd-generators-finish",
+ [MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_START] = "initrd-units-load-start",
+ [MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_FINISH] = "initrd-units-load-finish",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(manager_timestamp, ManagerTimestamp);
+
+static const char* const oom_policy_table[_OOM_POLICY_MAX] = {
+ [OOM_CONTINUE] = "continue",
+ [OOM_STOP] = "stop",
+ [OOM_KILL] = "kill",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(oom_policy, OOMPolicy);
diff --git a/src/core/manager.h b/src/core/manager.h
new file mode 100644
index 0000000..3a817b1
--- /dev/null
+++ b/src/core/manager.h
@@ -0,0 +1,596 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "sd-bus.h"
+#include "sd-device.h"
+#include "sd-event.h"
+
+#include "cgroup-util.h"
+#include "cgroup.h"
+#include "fdset.h"
+#include "hashmap.h"
+#include "list.h"
+#include "prioq.h"
+#include "ratelimit.h"
+#include "varlink.h"
+
+struct libmnt_monitor;
+typedef struct Unit Unit;
+
+/* Enforce upper limit how many names we allow */
+#define MANAGER_MAX_NAMES 131072 /* 128K */
+
+typedef struct Manager Manager;
+
+/* An externally visible state. We don't actually maintain this as state variable, but derive it from various fields
+ * when requested */
+typedef enum ManagerState {
+ MANAGER_INITIALIZING,
+ MANAGER_STARTING,
+ MANAGER_RUNNING,
+ MANAGER_DEGRADED,
+ MANAGER_MAINTENANCE,
+ MANAGER_STOPPING,
+ _MANAGER_STATE_MAX,
+ _MANAGER_STATE_INVALID = -EINVAL,
+} ManagerState;
+
+typedef enum ManagerObjective {
+ MANAGER_OK,
+ MANAGER_EXIT,
+ MANAGER_RELOAD,
+ MANAGER_REEXECUTE,
+ MANAGER_REBOOT,
+ MANAGER_POWEROFF,
+ MANAGER_HALT,
+ MANAGER_KEXEC,
+ MANAGER_SWITCH_ROOT,
+ _MANAGER_OBJECTIVE_MAX,
+ _MANAGER_OBJECTIVE_INVALID = -EINVAL,
+} ManagerObjective;
+
+typedef enum StatusType {
+ STATUS_TYPE_EPHEMERAL,
+ STATUS_TYPE_NORMAL,
+ STATUS_TYPE_NOTICE,
+ STATUS_TYPE_EMERGENCY,
+} StatusType;
+
+typedef enum OOMPolicy {
+ OOM_CONTINUE, /* The kernel or systemd-oomd kills the process it wants to kill, and that's it */
+ OOM_STOP, /* The kernel or systemd-oomd kills the process it wants to kill, and we stop the unit */
+ OOM_KILL, /* The kernel or systemd-oomd kills the process it wants to kill, and all others in the unit, and we stop the unit */
+ _OOM_POLICY_MAX,
+ _OOM_POLICY_INVALID = -EINVAL,
+} OOMPolicy;
+
+/* Notes:
+ * 1. TIMESTAMP_FIRMWARE, TIMESTAMP_LOADER, TIMESTAMP_KERNEL, TIMESTAMP_INITRD,
+ * TIMESTAMP_SECURITY_START, and TIMESTAMP_SECURITY_FINISH are set only when
+ * the manager is system and not running under container environment.
+ *
+ * 2. The monotonic timestamp of TIMESTAMP_KERNEL is always zero.
+ *
+ * 3. The realtime timestamp of TIMESTAMP_KERNEL will be unset if the system does not
+ * have RTC.
+ *
+ * 4. TIMESTAMP_FIRMWARE and TIMESTAMP_LOADER will be unset if the system does not
+ * have RTC, or systemd is built without EFI support.
+ *
+ * 5. The monotonic timestamps of TIMESTAMP_FIRMWARE and TIMESTAMP_LOADER are stored as
+ * negative of the actual value.
+ *
+ * 6. TIMESTAMP_USERSPACE is the timestamp of when the manager was started.
+ *
+ * 7. TIMESTAMP_INITRD_* are set only when the system is booted with an initrd.
+ */
+
+typedef enum ManagerTimestamp {
+ MANAGER_TIMESTAMP_FIRMWARE,
+ MANAGER_TIMESTAMP_LOADER,
+ MANAGER_TIMESTAMP_KERNEL,
+ MANAGER_TIMESTAMP_INITRD,
+ MANAGER_TIMESTAMP_USERSPACE,
+ MANAGER_TIMESTAMP_FINISH,
+
+ MANAGER_TIMESTAMP_SECURITY_START,
+ MANAGER_TIMESTAMP_SECURITY_FINISH,
+ MANAGER_TIMESTAMP_GENERATORS_START,
+ MANAGER_TIMESTAMP_GENERATORS_FINISH,
+ MANAGER_TIMESTAMP_UNITS_LOAD_START,
+ MANAGER_TIMESTAMP_UNITS_LOAD_FINISH,
+ MANAGER_TIMESTAMP_UNITS_LOAD,
+
+ MANAGER_TIMESTAMP_INITRD_SECURITY_START,
+ MANAGER_TIMESTAMP_INITRD_SECURITY_FINISH,
+ MANAGER_TIMESTAMP_INITRD_GENERATORS_START,
+ MANAGER_TIMESTAMP_INITRD_GENERATORS_FINISH,
+ MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_START,
+ MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_FINISH,
+ _MANAGER_TIMESTAMP_MAX,
+ _MANAGER_TIMESTAMP_INVALID = -EINVAL,
+} ManagerTimestamp;
+
+typedef enum WatchdogType {
+ WATCHDOG_RUNTIME,
+ WATCHDOG_REBOOT,
+ WATCHDOG_KEXEC,
+ WATCHDOG_PRETIMEOUT,
+ _WATCHDOG_TYPE_MAX,
+} WatchdogType;
+
+#include "execute.h"
+#include "job.h"
+#include "path-lookup.h"
+#include "show-status.h"
+#include "unit-name.h"
+
+typedef enum ManagerTestRunFlags {
+ MANAGER_TEST_NORMAL = 0, /* run normally */
+ MANAGER_TEST_RUN_MINIMAL = 1 << 0, /* create basic data structures */
+ MANAGER_TEST_RUN_BASIC = 1 << 1, /* interact with the environment */
+ MANAGER_TEST_RUN_ENV_GENERATORS = 1 << 2, /* also run env generators */
+ MANAGER_TEST_RUN_GENERATORS = 1 << 3, /* also run unit generators */
+ MANAGER_TEST_RUN_IGNORE_DEPENDENCIES = 1 << 4, /* run while ignoring dependencies */
+ MANAGER_TEST_FULL = MANAGER_TEST_RUN_BASIC | MANAGER_TEST_RUN_ENV_GENERATORS | MANAGER_TEST_RUN_GENERATORS,
+} ManagerTestRunFlags;
+
+assert_cc((MANAGER_TEST_FULL & UINT8_MAX) == MANAGER_TEST_FULL);
+
+struct Manager {
+ /* Note that the set of units we know of is allowed to be
+ * inconsistent. However the subset of it that is loaded may
+ * not, and the list of jobs may neither. */
+
+ /* Active jobs and units */
+ Hashmap *units; /* name string => Unit object n:1 */
+ Hashmap *units_by_invocation_id;
+ Hashmap *jobs; /* job id => Job object 1:1 */
+
+ /* To make it easy to iterate through the units of a specific
+ * type we maintain a per type linked list */
+ LIST_HEAD(Unit, units_by_type[_UNIT_TYPE_MAX]);
+
+ /* Units that need to be loaded */
+ LIST_HEAD(Unit, load_queue); /* this is actually more a stack than a queue, but uh. */
+
+ /* Jobs that need to be run */
+ struct Prioq *run_queue;
+
+ /* Units and jobs that have not yet been announced via
+ * D-Bus. When something about a job changes it is added here
+ * if it is not in there yet. This allows easy coalescing of
+ * D-Bus change signals. */
+ LIST_HEAD(Unit, dbus_unit_queue);
+ LIST_HEAD(Job, dbus_job_queue);
+
+ /* Units to remove */
+ LIST_HEAD(Unit, cleanup_queue);
+
+ /* Units and jobs to check when doing GC */
+ LIST_HEAD(Unit, gc_unit_queue);
+ LIST_HEAD(Job, gc_job_queue);
+
+ /* Units that should be realized */
+ LIST_HEAD(Unit, cgroup_realize_queue);
+
+ /* Units whose cgroup ran empty */
+ LIST_HEAD(Unit, cgroup_empty_queue);
+
+ /* Units whose memory.event fired */
+ LIST_HEAD(Unit, cgroup_oom_queue);
+
+ /* Target units whose default target dependencies haven't been set yet */
+ LIST_HEAD(Unit, target_deps_queue);
+
+ /* Units that might be subject to StopWhenUnneeded= clean-up */
+ LIST_HEAD(Unit, stop_when_unneeded_queue);
+
+ /* Units which are upheld by another other which we might need to act on */
+ LIST_HEAD(Unit, start_when_upheld_queue);
+
+ /* Units that have BindsTo= another unit, and might need to be shutdown because the bound unit is not active. */
+ LIST_HEAD(Unit, stop_when_bound_queue);
+
+ sd_event *event;
+
+ /* This maps PIDs we care about to units that are interested in. We allow multiple units to be interested in
+ * the same PID and multiple PIDs to be relevant to the same unit. Since in most cases only a single unit will
+ * be interested in the same PID we use a somewhat special encoding here: the first unit interested in a PID is
+ * stored directly in the hashmap, keyed by the PID unmodified. If there are other units interested too they'll
+ * be stored in a NULL-terminated array, and keyed by the negative PID. This is safe as pid_t is signed and
+ * negative PIDs are not used for regular processes but process groups, which we don't care about in this
+ * context, but this allows us to use the negative range for our own purposes. */
+ Hashmap *watch_pids; /* pid => unit as well as -pid => array of units */
+
+ /* A set contains all units which cgroup should be refreshed after startup */
+ Set *startup_units;
+
+ /* A set which contains all currently failed units */
+ Set *failed_units;
+
+ sd_event_source *run_queue_event_source;
+
+ char *notify_socket;
+ int notify_fd;
+ sd_event_source *notify_event_source;
+
+ int cgroups_agent_fd;
+ sd_event_source *cgroups_agent_event_source;
+
+ int signal_fd;
+ sd_event_source *signal_event_source;
+
+ sd_event_source *sigchld_event_source;
+
+ sd_event_source *time_change_event_source;
+
+ sd_event_source *timezone_change_event_source;
+
+ sd_event_source *jobs_in_progress_event_source;
+
+ int user_lookup_fds[2];
+ sd_event_source *user_lookup_event_source;
+
+ LookupScope unit_file_scope;
+ LookupPaths lookup_paths;
+ Hashmap *unit_id_map;
+ Hashmap *unit_name_map;
+ Set *unit_path_cache;
+ uint64_t unit_cache_timestamp_hash;
+
+ char **transient_environment; /* The environment, as determined from config files, kernel cmdline and environment generators */
+ char **client_environment; /* Environment variables created by clients through the bus API */
+
+ usec_t watchdog[_WATCHDOG_TYPE_MAX];
+ usec_t watchdog_overridden[_WATCHDOG_TYPE_MAX];
+ char *watchdog_pretimeout_governor;
+ char *watchdog_pretimeout_governor_overridden;
+
+ dual_timestamp timestamps[_MANAGER_TIMESTAMP_MAX];
+
+ /* Data specific to the device subsystem */
+ sd_device_monitor *device_monitor;
+ Hashmap *devices_by_sysfs;
+
+ /* Data specific to the mount subsystem */
+ struct libmnt_monitor *mount_monitor;
+ sd_event_source *mount_event_source;
+
+ /* Data specific to the swap filesystem */
+ FILE *proc_swaps;
+ sd_event_source *swap_event_source;
+ Hashmap *swaps_by_devnode;
+
+ /* Data specific to the D-Bus subsystem */
+ sd_bus *api_bus, *system_bus;
+ Set *private_buses;
+ int private_listen_fd;
+ sd_event_source *private_listen_event_source;
+
+ /* Contains all the clients that are subscribed to signals via
+ the API bus. Note that private bus connections are always
+ considered subscribes, since they last for very short only,
+ and it is much simpler that way. */
+ sd_bus_track *subscribed;
+ char **deserialized_subscribed;
+
+ /* This is used during reloading: before the reload we queue
+ * the reply message here, and afterwards we send it */
+ sd_bus_message *pending_reload_message;
+
+ Hashmap *watch_bus; /* D-Bus names => Unit object n:1 */
+
+ bool send_reloading_done;
+
+ uint32_t current_job_id;
+ uint32_t default_unit_job_id;
+
+ /* Data specific to the Automount subsystem */
+ int dev_autofs_fd;
+
+ /* Data specific to the cgroup subsystem */
+ Hashmap *cgroup_unit;
+ CGroupMask cgroup_supported;
+ char *cgroup_root;
+
+ /* Notifications from cgroups, when the unified hierarchy is used is done via inotify. */
+ int cgroup_inotify_fd;
+ sd_event_source *cgroup_inotify_event_source;
+
+ /* Maps for finding the unit for each inotify watch descriptor for the cgroup.events and
+ * memory.events cgroupv2 attributes. */
+ Hashmap *cgroup_control_inotify_wd_unit;
+ Hashmap *cgroup_memory_inotify_wd_unit;
+
+ /* A defer event for handling cgroup empty events and processing them after SIGCHLD in all cases. */
+ sd_event_source *cgroup_empty_event_source;
+ sd_event_source *cgroup_oom_event_source;
+
+ /* Make sure the user cannot accidentally unmount our cgroup
+ * file system */
+ int pin_cgroupfs_fd;
+
+ unsigned gc_marker;
+
+ /* The stat() data the last time we saw /etc/localtime */
+ usec_t etc_localtime_mtime;
+ bool etc_localtime_accessible;
+
+ ManagerObjective objective;
+
+ /* Flags */
+ bool dispatching_load_queue;
+
+ bool taint_usr;
+
+ /* Have we already sent out the READY=1 notification? */
+ bool ready_sent;
+
+ /* Was the last status sent "STATUS=Ready."? */
+ bool status_ready;
+
+ /* Have we already printed the taint line if necessary? */
+ bool taint_logged;
+
+ /* Have we ever changed the "kernel.pid_max" sysctl? */
+ bool sysctl_pid_max_changed;
+
+ ManagerTestRunFlags test_run_flags;
+
+ /* If non-zero, exit with the following value when the systemd
+ * process terminate. Useful for containers: systemd-nspawn could get
+ * the return value. */
+ uint8_t return_value;
+
+ ShowStatus show_status;
+ ShowStatus show_status_overridden;
+ StatusUnitFormat status_unit_format;
+ char *confirm_spawn;
+ bool no_console_output;
+ bool service_watchdogs;
+
+ ExecOutput default_std_output, default_std_error;
+
+ usec_t default_restart_usec, default_timeout_start_usec, default_timeout_stop_usec;
+ usec_t default_device_timeout_usec;
+ usec_t default_timeout_abort_usec;
+ bool default_timeout_abort_set;
+
+ usec_t default_start_limit_interval;
+ unsigned default_start_limit_burst;
+
+ bool default_cpu_accounting;
+ bool default_memory_accounting;
+ bool default_io_accounting;
+ bool default_blockio_accounting;
+ bool default_tasks_accounting;
+ bool default_ip_accounting;
+
+ TasksMax default_tasks_max;
+ usec_t default_timer_accuracy_usec;
+
+ OOMPolicy default_oom_policy;
+ int default_oom_score_adjust;
+ bool default_oom_score_adjust_set;
+
+ int original_log_level;
+ LogTarget original_log_target;
+ bool log_level_overridden;
+ bool log_target_overridden;
+
+ struct rlimit *rlimit[_RLIMIT_MAX];
+
+ /* non-zero if we are reloading or reexecuting, */
+ int n_reloading;
+
+ unsigned n_installed_jobs;
+ unsigned n_failed_jobs;
+
+ /* Jobs in progress watching */
+ unsigned n_running_jobs;
+ unsigned n_on_console;
+ unsigned jobs_in_progress_iteration;
+
+ /* Do we have any outstanding password prompts? */
+ int have_ask_password;
+ int ask_password_inotify_fd;
+ sd_event_source *ask_password_event_source;
+
+ /* Type=idle pipes */
+ int idle_pipe[4];
+ sd_event_source *idle_pipe_event_source;
+
+ char *switch_root;
+ char *switch_root_init;
+
+ /* This is true before and after switching root. */
+ bool switching_root;
+
+ /* This maps all possible path prefixes to the units needing
+ * them. It's a hashmap with a path string as key and a Set as
+ * value where Unit objects are contained. */
+ Hashmap *units_requiring_mounts_for;
+
+ /* Used for processing polkit authorization responses */
+ Hashmap *polkit_registry;
+
+ /* Dynamic users/groups, indexed by their name */
+ Hashmap *dynamic_users;
+
+ /* Keep track of all UIDs and GIDs any of our services currently use. This is useful for the RemoveIPC= logic. */
+ Hashmap *uid_refs;
+ Hashmap *gid_refs;
+
+ /* ExecRuntime, indexed by their owner unit id */
+ Hashmap *exec_runtime_by_id;
+
+ /* When the user hits C-A-D more than 7 times per 2s, do something immediately... */
+ RateLimit ctrl_alt_del_ratelimit;
+ EmergencyAction cad_burst_action;
+
+ const char *unit_log_field;
+ const char *unit_log_format_string;
+
+ const char *invocation_log_field;
+ const char *invocation_log_format_string;
+
+ int first_boot; /* tri-state */
+
+ /* Prefixes of e.g. RuntimeDirectory= */
+ char *prefix[_EXEC_DIRECTORY_TYPE_MAX];
+ char *received_credentials_directory;
+ char *received_encrypted_credentials_directory;
+
+ /* Used in the SIGCHLD and sd_notify() message invocation logic to avoid that we dispatch the same event
+ * multiple times on the same unit. */
+ unsigned sigchldgen;
+ unsigned notifygen;
+
+ VarlinkServer *varlink_server;
+ /* When we're a system manager, this object manages the subscription from systemd-oomd to PID1 that's
+ * used to report changes in ManagedOOM settings (systemd server - oomd client). When
+ * we're a user manager, this object manages the client connection from the user manager to
+ * systemd-oomd to report changes in ManagedOOM settings (systemd client - oomd server). */
+ Varlink *managed_oom_varlink;
+
+ /* Reference to RestrictFileSystems= BPF program */
+ struct restrict_fs_bpf *restrict_fs;
+
+ char *default_smack_process_label;
+
+ /* Dump*() are slow, so always rate limit them to 10 per 10 minutes */
+ RateLimit dump_ratelimit;
+};
+
+static inline usec_t manager_default_timeout_abort_usec(Manager *m) {
+ assert(m);
+ return m->default_timeout_abort_set ? m->default_timeout_abort_usec : m->default_timeout_stop_usec;
+}
+
+#define MANAGER_IS_SYSTEM(m) ((m)->unit_file_scope == LOOKUP_SCOPE_SYSTEM)
+#define MANAGER_IS_USER(m) ((m)->unit_file_scope != LOOKUP_SCOPE_SYSTEM)
+
+#define MANAGER_IS_RELOADING(m) ((m)->n_reloading > 0)
+
+#define MANAGER_IS_FINISHED(m) (dual_timestamp_is_set((m)->timestamps + MANAGER_TIMESTAMP_FINISH))
+
+/* The objective is set to OK as soon as we enter the main loop, and set otherwise as soon as we are done with it */
+#define MANAGER_IS_RUNNING(m) ((m)->objective == MANAGER_OK)
+
+#define MANAGER_IS_SWITCHING_ROOT(m) ((m)->switching_root)
+
+#define MANAGER_IS_TEST_RUN(m) ((m)->test_run_flags != 0)
+
+int manager_new(LookupScope scope, ManagerTestRunFlags test_run_flags, Manager **m);
+Manager* manager_free(Manager *m);
+DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free);
+
+int manager_startup(Manager *m, FILE *serialization, FDSet *fds, const char *root);
+
+Job *manager_get_job(Manager *m, uint32_t id);
+Unit *manager_get_unit(Manager *m, const char *name);
+
+int manager_get_job_from_dbus_path(Manager *m, const char *s, Job **_j);
+
+bool manager_unit_cache_should_retry_load(Unit *u);
+int manager_load_unit_prepare(Manager *m, const char *name, const char *path, sd_bus_error *e, Unit **ret);
+int manager_load_unit(Manager *m, const char *name, const char *path, sd_bus_error *e, Unit **ret);
+int manager_load_startable_unit_or_warn(Manager *m, const char *name, const char *path, Unit **ret);
+int manager_load_unit_from_dbus_path(Manager *m, const char *s, sd_bus_error *e, Unit **_u);
+
+int manager_add_job(Manager *m, JobType type, Unit *unit, JobMode mode, Set *affected_jobs, sd_bus_error *e, Job **_ret);
+int manager_add_job_by_name(Manager *m, JobType type, const char *name, JobMode mode, Set *affected_jobs, sd_bus_error *e, Job **_ret);
+int manager_add_job_by_name_and_warn(Manager *m, JobType type, const char *name, JobMode mode, Set *affected_jobs, Job **ret);
+int manager_propagate_reload(Manager *m, Unit *unit, JobMode mode, sd_bus_error *e);
+
+void manager_clear_jobs(Manager *m);
+
+void manager_unwatch_pid(Manager *m, pid_t pid);
+
+unsigned manager_dispatch_load_queue(Manager *m);
+
+int manager_default_environment(Manager *m);
+int manager_transient_environment_add(Manager *m, char **plus);
+int manager_client_environment_modify(Manager *m, char **minus, char **plus);
+int manager_get_effective_environment(Manager *m, char ***ret);
+
+int manager_set_default_smack_process_label(Manager *m, const char *label);
+
+int manager_set_default_rlimits(Manager *m, struct rlimit **default_rlimit);
+
+void manager_trigger_run_queue(Manager *m);
+
+int manager_loop(Manager *m);
+
+int manager_reload(Manager *m);
+Manager* manager_reloading_start(Manager *m);
+void manager_reloading_stopp(Manager **m);
+
+void manager_reset_failed(Manager *m);
+
+void manager_send_unit_audit(Manager *m, Unit *u, int type, bool success);
+void manager_send_unit_plymouth(Manager *m, Unit *u);
+
+bool manager_unit_inactive_or_pending(Manager *m, const char *name);
+
+void manager_check_finished(Manager *m);
+
+void disable_printk_ratelimit(void);
+void manager_recheck_dbus(Manager *m);
+void manager_recheck_journal(Manager *m);
+
+bool manager_get_show_status_on(Manager *m);
+void manager_set_show_status(Manager *m, ShowStatus mode, const char *reason);
+void manager_override_show_status(Manager *m, ShowStatus mode, const char *reason);
+
+void manager_set_first_boot(Manager *m, bool b);
+void manager_set_switching_root(Manager *m, bool switching_root);
+
+void manager_status_printf(Manager *m, StatusType type, const char *status, const char *format, ...) _printf_(4,5);
+
+Set *manager_get_units_requiring_mounts_for(Manager *m, const char *path);
+
+ManagerState manager_state(Manager *m);
+
+int manager_update_failed_units(Manager *m, Unit *u, bool failed);
+
+void manager_unref_uid(Manager *m, uid_t uid, bool destroy_now);
+int manager_ref_uid(Manager *m, uid_t uid, bool clean_ipc);
+
+void manager_unref_gid(Manager *m, gid_t gid, bool destroy_now);
+int manager_ref_gid(Manager *m, gid_t gid, bool clean_ipc);
+
+char* manager_taint_string(const Manager *m);
+
+void manager_ref_console(Manager *m);
+void manager_unref_console(Manager *m);
+
+void manager_override_log_level(Manager *m, int level);
+void manager_restore_original_log_level(Manager *m);
+
+void manager_override_log_target(Manager *m, LogTarget target);
+void manager_restore_original_log_target(Manager *m);
+
+const char *manager_state_to_string(ManagerState m) _const_;
+ManagerState manager_state_from_string(const char *s) _pure_;
+
+const char *manager_get_confirm_spawn(Manager *m);
+bool manager_is_confirm_spawn_disabled(Manager *m);
+void manager_disable_confirm_spawn(void);
+
+const char *manager_timestamp_to_string(ManagerTimestamp m) _const_;
+ManagerTimestamp manager_timestamp_from_string(const char *s) _pure_;
+ManagerTimestamp manager_timestamp_initrd_mangle(ManagerTimestamp s);
+
+usec_t manager_get_watchdog(Manager *m, WatchdogType t);
+void manager_set_watchdog(Manager *m, WatchdogType t, usec_t timeout);
+void manager_override_watchdog(Manager *m, WatchdogType t, usec_t timeout);
+int manager_set_watchdog_pretimeout_governor(Manager *m, const char *governor);
+int manager_override_watchdog_pretimeout_governor(Manager *m, const char *governor);
+
+const char* oom_policy_to_string(OOMPolicy i) _const_;
+OOMPolicy oom_policy_from_string(const char *s) _pure_;
diff --git a/src/core/meson.build b/src/core/meson.build
new file mode 100644
index 0000000..6fc9089
--- /dev/null
+++ b/src/core/meson.build
@@ -0,0 +1,255 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+libcore_sources = files(
+ 'apparmor-setup.c',
+ 'apparmor-setup.h',
+ 'audit-fd.c',
+ 'audit-fd.h',
+ 'automount.c',
+ 'automount.h',
+ 'bpf-devices.c',
+ 'bpf-devices.h',
+ 'bpf-firewall.c',
+ 'bpf-firewall.h',
+ 'bpf-foreign.c',
+ 'bpf-foreign.h',
+ 'bpf-lsm.c',
+ 'bpf-lsm.h',
+ 'bpf-socket-bind.c',
+ 'bpf-socket-bind.h',
+ 'cgroup.c',
+ 'cgroup.h',
+ 'core-varlink.c',
+ 'core-varlink.h',
+ 'dbus-automount.c',
+ 'dbus-automount.h',
+ 'dbus-cgroup.c',
+ 'dbus-cgroup.h',
+ 'dbus-device.c',
+ 'dbus-device.h',
+ 'dbus-execute.c',
+ 'dbus-execute.h',
+ 'dbus-job.c',
+ 'dbus-job.h',
+ 'dbus-kill.c',
+ 'dbus-kill.h',
+ 'dbus-manager.c',
+ 'dbus-manager.h',
+ 'dbus-mount.c',
+ 'dbus-mount.h',
+ 'dbus-path.c',
+ 'dbus-path.h',
+ 'dbus-scope.c',
+ 'dbus-scope.h',
+ 'dbus-service.c',
+ 'dbus-service.h',
+ 'dbus-slice.c',
+ 'dbus-slice.h',
+ 'dbus-socket.c',
+ 'dbus-socket.h',
+ 'dbus-swap.c',
+ 'dbus-swap.h',
+ 'dbus-target.c',
+ 'dbus-target.h',
+ 'dbus-timer.c',
+ 'dbus-timer.h',
+ 'dbus-unit.c',
+ 'dbus-unit.h',
+ 'dbus-util.c',
+ 'dbus-util.h',
+ 'dbus.c',
+ 'dbus.h',
+ 'device.c',
+ 'device.h',
+ 'dynamic-user.c',
+ 'dynamic-user.h',
+ 'efi-random.c',
+ 'efi-random.h',
+ 'emergency-action.c',
+ 'emergency-action.h',
+ 'execute.c',
+ 'execute.h',
+ 'generator-setup.c',
+ 'generator-setup.h',
+ 'ima-setup.c',
+ 'ima-setup.h',
+ 'import-creds.c',
+ 'import-creds.h',
+ 'job.c',
+ 'job.h',
+ 'kill.c',
+ 'kill.h',
+ 'kmod-setup.c',
+ 'kmod-setup.h',
+ 'load-dropin.c',
+ 'load-dropin.h',
+ 'load-fragment.c',
+ 'load-fragment.h',
+ 'manager-dump.c',
+ 'manager-dump.h',
+ 'manager-serialize.c',
+ 'manager-serialize.h',
+ 'manager.c',
+ 'manager.h',
+ 'mount.c',
+ 'mount.h',
+ 'namespace.c',
+ 'namespace.h',
+ 'path.c',
+ 'path.h',
+ 'restrict-ifaces.c',
+ 'restrict-ifaces.h',
+ 'scope.c',
+ 'scope.h',
+ 'selinux-access.c',
+ 'selinux-access.h',
+ 'selinux-setup.c',
+ 'selinux-setup.h',
+ 'service.c',
+ 'service.h',
+ 'show-status.c',
+ 'show-status.h',
+ 'slice.c',
+ 'slice.h',
+ 'smack-setup.c',
+ 'smack-setup.h',
+ 'socket.c',
+ 'socket.h',
+ 'swap.c',
+ 'swap.h',
+ 'target.c',
+ 'target.h',
+ 'timer.c',
+ 'timer.h',
+ 'transaction.c',
+ 'transaction.h',
+ 'unit-dependency-atom.c',
+ 'unit-dependency-atom.h',
+ 'unit-printf.c',
+ 'unit-printf.h',
+ 'unit-serialize.c',
+ 'unit-serialize.h',
+ 'unit.c',
+ 'unit.h',
+)
+
+if conf.get('BPF_FRAMEWORK') == 1
+ libcore_sources += files(
+ 'bpf-util.c',
+ 'bpf-util.h',
+ )
+endif
+
+subdir('bpf')
+
+subdir('bpf/socket_bind')
+if conf.get('BPF_FRAMEWORK') == 1
+ libcore_sources += [socket_bind_skel_h]
+ subdir('bpf/restrict_fs')
+ libcore_sources += [restrict_fs_skel_h]
+endif
+
+subdir('bpf/restrict_ifaces')
+if conf.get('BPF_FRAMEWORK') == 1
+ libcore_sources += [restrict_ifaces_skel_h]
+endif
+
+load_fragment_gperf_gperf = custom_target(
+ 'load-fragment-gperf.gperf',
+ input : 'load-fragment-gperf.gperf.in',
+ output: 'load-fragment-gperf.gperf',
+ command : [jinja2_cmdline, '@INPUT@', '@OUTPUT@'])
+
+load_fragment_gperf_c = custom_target(
+ 'load-fragment-gperf.c',
+ input : load_fragment_gperf_gperf,
+ output : 'load-fragment-gperf.c',
+ command : [gperf, '@INPUT@', '--output-file', '@OUTPUT@'])
+
+awkscript = 'load-fragment-gperf-nulstr.awk'
+load_fragment_gperf_nulstr_c = custom_target(
+ 'load-fragment-gperf-nulstr.c',
+ input : [awkscript, load_fragment_gperf_gperf],
+ output : 'load-fragment-gperf-nulstr.c',
+ command : [awk, '-f', '@INPUT0@', '@INPUT1@'],
+ capture : true)
+
+libcore_name = 'systemd-core-@0@'.format(shared_lib_tag)
+
+libcore = shared_library(
+ libcore_name,
+ libcore_sources,
+ load_fragment_gperf_c,
+ load_fragment_gperf_nulstr_c,
+ include_directories : includes,
+ c_args : ['-fvisibility=default'],
+ link_args : ['-shared',
+ '-Wl,--version-script=' + libshared_sym_path],
+ link_with : libshared,
+ dependencies : [versiondep,
+ threads,
+ libdl,
+ librt,
+ libseccomp,
+ libpam,
+ libaudit,
+ libkmod,
+ libapparmor,
+ libselinux,
+ libmount,
+ libblkid,
+ libacl],
+ install : true,
+ install_dir : rootpkglibdir)
+
+core_includes = [includes, include_directories('.')]
+
+systemd_sources = files(
+ 'main.c',
+ 'main.h',
+ 'crash-handler.c',
+ 'crash-handler.h',
+)
+
+in_files = [['system.conf', pkgsysconfdir],
+ ['user.conf', pkgsysconfdir],
+ ['systemd.pc', pkgconfigdatadir],
+ ['org.freedesktop.systemd1.policy', polkitpolicydir]]
+
+foreach item : in_files
+ file = item[0]
+ dir = item[1]
+
+ custom_target(
+ file,
+ input : file + '.in',
+ output: file,
+ command : [jinja2_cmdline, '@INPUT@', '@OUTPUT@'],
+ install : (dir == pkgsysconfdir) ? install_sysconfdir_samples : (dir != 'no'),
+ install_dir : dir)
+endforeach
+
+install_data('org.freedesktop.systemd1.conf',
+ install_dir : dbuspolicydir)
+install_data('org.freedesktop.systemd1.service',
+ install_dir : dbussystemservicedir)
+
+meson.add_install_script('sh', '-c', mkdir_p.format(systemshutdowndir))
+meson.add_install_script('sh', '-c', mkdir_p.format(systemsleepdir))
+meson.add_install_script('sh', '-c', mkdir_p.format(systemgeneratordir))
+meson.add_install_script('sh', '-c', mkdir_p.format(usergeneratordir))
+
+if install_sysconfdir
+ meson.add_install_script('sh', '-c', mkdir_p.format(pkgsysconfdir / 'system'))
+ meson.add_install_script('sh', '-c', mkdir_p.format(pkgsysconfdir / 'user'))
+ meson.add_install_script('sh', '-c', mkdir_p.format(sysconfdir / 'xdg/systemd'))
+endif
+
+############################################################
+
+fuzzers += [
+ [files('fuzz-unit-file.c'),
+ [libcore,
+ libshared],
+ [libmount]],
+]
diff --git a/src/core/mount.c b/src/core/mount.c
new file mode 100644
index 0000000..82ff3a7
--- /dev/null
+++ b/src/core/mount.c
@@ -0,0 +1,2344 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <sys/epoll.h>
+
+#include "sd-messages.h"
+
+#include "alloc-util.h"
+#include "dbus-mount.h"
+#include "dbus-unit.h"
+#include "device.h"
+#include "exit-status.h"
+#include "format-util.h"
+#include "fs-util.h"
+#include "fstab-util.h"
+#include "libmount-util.h"
+#include "log.h"
+#include "manager.h"
+#include "mkdir-label.h"
+#include "mount-setup.h"
+#include "mount.h"
+#include "mountpoint-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "serialize.h"
+#include "special.h"
+#include "stat-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit-name.h"
+#include "unit.h"
+
+#define RETRY_UMOUNT_MAX 32
+
+static const UnitActiveState state_translation_table[_MOUNT_STATE_MAX] = {
+ [MOUNT_DEAD] = UNIT_INACTIVE,
+ [MOUNT_MOUNTING] = UNIT_ACTIVATING,
+ [MOUNT_MOUNTING_DONE] = UNIT_ACTIVATING,
+ [MOUNT_MOUNTED] = UNIT_ACTIVE,
+ [MOUNT_REMOUNTING] = UNIT_RELOADING,
+ [MOUNT_UNMOUNTING] = UNIT_DEACTIVATING,
+ [MOUNT_REMOUNTING_SIGTERM] = UNIT_RELOADING,
+ [MOUNT_REMOUNTING_SIGKILL] = UNIT_RELOADING,
+ [MOUNT_UNMOUNTING_SIGTERM] = UNIT_DEACTIVATING,
+ [MOUNT_UNMOUNTING_SIGKILL] = UNIT_DEACTIVATING,
+ [MOUNT_FAILED] = UNIT_FAILED,
+ [MOUNT_CLEANING] = UNIT_MAINTENANCE,
+};
+
+static int mount_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata);
+static int mount_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static void mount_enter_dead(Mount *m, MountResult f);
+static void mount_enter_mounted(Mount *m, MountResult f);
+static void mount_cycle_clear(Mount *m);
+static int mount_process_proc_self_mountinfo(Manager *m);
+
+static bool MOUNT_STATE_WITH_PROCESS(MountState state) {
+ return IN_SET(state,
+ MOUNT_MOUNTING,
+ MOUNT_MOUNTING_DONE,
+ MOUNT_REMOUNTING,
+ MOUNT_REMOUNTING_SIGTERM,
+ MOUNT_REMOUNTING_SIGKILL,
+ MOUNT_UNMOUNTING,
+ MOUNT_UNMOUNTING_SIGTERM,
+ MOUNT_UNMOUNTING_SIGKILL,
+ MOUNT_CLEANING);
+}
+
+static MountParameters* get_mount_parameters_fragment(Mount *m) {
+ assert(m);
+
+ if (m->from_fragment)
+ return &m->parameters_fragment;
+
+ return NULL;
+}
+
+static MountParameters* get_mount_parameters(Mount *m) {
+ assert(m);
+
+ if (m->from_proc_self_mountinfo)
+ return &m->parameters_proc_self_mountinfo;
+
+ return get_mount_parameters_fragment(m);
+}
+
+static bool mount_is_network(const MountParameters *p) {
+ assert(p);
+
+ if (fstab_test_option(p->options, "_netdev\0"))
+ return true;
+
+ if (p->fstype && fstype_is_network(p->fstype))
+ return true;
+
+ return false;
+}
+
+static bool mount_is_nofail(const Mount *m) {
+ assert(m);
+
+ if (!m->from_fragment)
+ return false;
+
+ return fstab_test_yes_no_option(m->parameters_fragment.options, "nofail\0" "fail\0");
+}
+
+static bool mount_is_loop(const MountParameters *p) {
+ assert(p);
+
+ if (fstab_test_option(p->options, "loop\0"))
+ return true;
+
+ return false;
+}
+
+static bool mount_is_bind(const MountParameters *p) {
+ assert(p);
+
+ if (fstab_test_option(p->options, "bind\0" "rbind\0"))
+ return true;
+
+ if (p->fstype && STR_IN_SET(p->fstype, "bind", "rbind"))
+ return true;
+
+ return false;
+}
+
+static bool mount_is_bound_to_device(Mount *m) {
+ const MountParameters *p;
+
+ assert(m);
+
+ /* Determines whether to place a Requires= or BindsTo= dependency on the backing device unit. We do
+ * this by checking for the x-systemd.device-bound mount option. Iff it is set we use BindsTo=,
+ * otherwise Requires=. But note that we might combine the latter with StopPropagatedFrom=, see
+ * below. */
+
+ p = get_mount_parameters(m);
+ if (!p)
+ return false;
+
+ return fstab_test_option(p->options, "x-systemd.device-bound\0");
+}
+
+static bool mount_propagate_stop(Mount *m) {
+ assert(m);
+
+ if (mount_is_bound_to_device(m)) /* If we are using BindsTo= the stop propagation is implicit, no need to bother */
+ return false;
+
+ return m->from_fragment; /* let's propagate stop whenever this is an explicitly configured unit,
+ * otherwise let's not bother. */
+}
+
+static bool mount_needs_quota(const MountParameters *p) {
+ assert(p);
+
+ if (p->fstype && !fstype_needs_quota(p->fstype))
+ return false;
+
+ if (mount_is_bind(p))
+ return false;
+
+ return fstab_test_option(p->options,
+ "usrquota\0" "grpquota\0" "quota\0" "usrjquota\0" "grpjquota\0");
+}
+
+static void mount_init(Unit *u) {
+ Mount *m = MOUNT(u);
+
+ assert(m);
+ assert(u);
+ assert(u->load_state == UNIT_STUB);
+
+ m->timeout_usec = u->manager->default_timeout_start_usec;
+
+ m->exec_context.std_output = u->manager->default_std_output;
+ m->exec_context.std_error = u->manager->default_std_error;
+
+ m->directory_mode = 0755;
+
+ /* We need to make sure that /usr/bin/mount is always called
+ * in the same process group as us, so that the autofs kernel
+ * side doesn't send us another mount request while we are
+ * already trying to comply its last one. */
+ m->exec_context.same_pgrp = true;
+
+ m->control_command_id = _MOUNT_EXEC_COMMAND_INVALID;
+
+ u->ignore_on_isolate = true;
+}
+
+static int mount_arm_timer(Mount *m, usec_t usec) {
+ int r;
+
+ assert(m);
+
+ if (usec == USEC_INFINITY)
+ return sd_event_source_set_enabled(m->timer_event_source, SD_EVENT_OFF);
+
+ if (m->timer_event_source) {
+ r = sd_event_source_set_time(m->timer_event_source, usec);
+ if (r < 0)
+ return r;
+
+ return sd_event_source_set_enabled(m->timer_event_source, SD_EVENT_ONESHOT);
+ }
+
+ r = sd_event_add_time(
+ UNIT(m)->manager->event,
+ &m->timer_event_source,
+ CLOCK_MONOTONIC,
+ usec, 0,
+ mount_dispatch_timer, m);
+ if (r < 0)
+ return r;
+
+ (void) sd_event_source_set_description(m->timer_event_source, "mount-timer");
+
+ return 0;
+}
+
+static void mount_unwatch_control_pid(Mount *m) {
+ assert(m);
+
+ if (m->control_pid <= 0)
+ return;
+
+ unit_unwatch_pid(UNIT(m), TAKE_PID(m->control_pid));
+}
+
+static void mount_parameters_done(MountParameters *p) {
+ assert(p);
+
+ p->what = mfree(p->what);
+ p->options = mfree(p->options);
+ p->fstype = mfree(p->fstype);
+}
+
+static void mount_done(Unit *u) {
+ Mount *m = MOUNT(u);
+
+ assert(m);
+
+ m->where = mfree(m->where);
+
+ mount_parameters_done(&m->parameters_proc_self_mountinfo);
+ mount_parameters_done(&m->parameters_fragment);
+
+ m->exec_runtime = exec_runtime_unref(m->exec_runtime, false);
+ exec_command_done_array(m->exec_command, _MOUNT_EXEC_COMMAND_MAX);
+ m->control_command = NULL;
+
+ dynamic_creds_unref(&m->dynamic_creds);
+
+ mount_unwatch_control_pid(m);
+
+ m->timer_event_source = sd_event_source_disable_unref(m->timer_event_source);
+}
+
+static int update_parameters_proc_self_mountinfo(
+ Mount *m,
+ const char *what,
+ const char *options,
+ const char *fstype) {
+
+ MountParameters *p;
+ int r, q, w;
+
+ p = &m->parameters_proc_self_mountinfo;
+
+ r = free_and_strdup(&p->what, what);
+ if (r < 0)
+ return r;
+
+ q = free_and_strdup(&p->options, options);
+ if (q < 0)
+ return q;
+
+ w = free_and_strdup(&p->fstype, fstype);
+ if (w < 0)
+ return w;
+
+ return r > 0 || q > 0 || w > 0;
+}
+
+static int mount_add_mount_dependencies(Mount *m) {
+ MountParameters *pm;
+ Unit *other;
+ Set *s;
+ int r;
+
+ assert(m);
+
+ if (!path_equal(m->where, "/")) {
+ _cleanup_free_ char *parent = NULL;
+
+ /* Adds in links to other mount points that might lie further up in the hierarchy */
+
+ r = path_extract_directory(m->where, &parent);
+ if (r < 0)
+ return r;
+
+ r = unit_require_mounts_for(UNIT(m), parent, UNIT_DEPENDENCY_IMPLICIT);
+ if (r < 0)
+ return r;
+ }
+
+ /* Adds in dependencies to other mount points that might be needed for the source path (if this is a bind mount
+ * or a loop mount) to be available. */
+ pm = get_mount_parameters_fragment(m);
+ if (pm && pm->what &&
+ path_is_absolute(pm->what) &&
+ (mount_is_bind(pm) || mount_is_loop(pm) || !mount_is_network(pm))) {
+
+ r = unit_require_mounts_for(UNIT(m), pm->what, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+ }
+
+ /* Adds in dependencies to other units that use this path or paths further down in the hierarchy */
+ s = manager_get_units_requiring_mounts_for(UNIT(m)->manager, m->where);
+ SET_FOREACH(other, s) {
+
+ if (other->load_state != UNIT_LOADED)
+ continue;
+
+ if (other == UNIT(m))
+ continue;
+
+ r = unit_add_dependency(other, UNIT_AFTER, UNIT(m), true, UNIT_DEPENDENCY_PATH);
+ if (r < 0)
+ return r;
+
+ if (UNIT(m)->fragment_path) {
+ /* If we have fragment configuration, then make this dependency required */
+ r = unit_add_dependency(other, UNIT_REQUIRES, UNIT(m), true, UNIT_DEPENDENCY_PATH);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ return 0;
+}
+
+static int mount_add_device_dependencies(Mount *m) {
+ UnitDependencyMask mask;
+ MountParameters *p;
+ UnitDependency dep;
+ int r;
+
+ assert(m);
+
+ log_unit_trace(UNIT(m), "Processing implicit device dependencies");
+
+ p = get_mount_parameters(m);
+ if (!p) {
+ log_unit_trace(UNIT(m), "Missing mount parameters, skipping implicit device dependencies");
+ return 0;
+ }
+
+ if (!p->what) {
+ log_unit_trace(UNIT(m), "Missing mount source, skipping implicit device dependencies");
+ return 0;
+ }
+
+ if (mount_is_bind(p)) {
+ log_unit_trace(UNIT(m), "Mount unit is a bind mount, skipping implicit device dependencies");
+ return 0;
+ }
+
+ if (!is_device_path(p->what)) {
+ log_unit_trace(UNIT(m), "Mount source is not a device path, skipping implicit device dependencies");
+ return 0;
+ }
+
+ /* /dev/root is a really weird thing, it's not a real device, but just a path the kernel exports for
+ * the root file system specified on the kernel command line. Ignore it here. */
+ if (PATH_IN_SET(p->what, "/dev/root", "/dev/nfs")) {
+ log_unit_trace(UNIT(m), "Mount source is in /dev/root or /dev/nfs, skipping implicit device dependencies");
+ return 0;
+ }
+
+ if (path_equal(m->where, "/")) {
+ log_unit_trace(UNIT(m), "Mount destination is '/', skipping implicit device dependencies");
+ return 0;
+ }
+
+ /* Mount units from /proc/self/mountinfo are not bound to devices by default since they're subject to
+ * races when mounts are established by other tools with different backing devices than what we
+ * maintain. The user can still force this to be a BindsTo= dependency with an appropriate option (or
+ * udev property) so the mount units are automatically stopped when the device disappears
+ * suddenly. */
+ dep = mount_is_bound_to_device(m) ? UNIT_BINDS_TO : UNIT_REQUIRES;
+
+ /* We always use 'what' from /proc/self/mountinfo if mounted */
+ mask = m->from_proc_self_mountinfo ? UNIT_DEPENDENCY_MOUNTINFO : UNIT_DEPENDENCY_MOUNT_FILE;
+
+ r = unit_add_node_dependency(UNIT(m), p->what, dep, mask);
+ if (r < 0)
+ return r;
+ if (r > 0)
+ log_unit_trace(UNIT(m), "Added %s dependency on %s", unit_dependency_to_string(dep), p->what);
+
+ if (mount_propagate_stop(m)) {
+ r = unit_add_node_dependency(UNIT(m), p->what, UNIT_STOP_PROPAGATED_FROM, mask);
+ if (r < 0)
+ return r;
+ if (r > 0)
+ log_unit_trace(UNIT(m), "Added %s dependency on %s",
+ unit_dependency_to_string(UNIT_STOP_PROPAGATED_FROM), p->what);
+ }
+
+ r = unit_add_blockdev_dependency(UNIT(m), p->what, mask);
+ if (r > 0)
+ log_unit_trace(UNIT(m), "Added %s dependency on %s", unit_dependency_to_string(UNIT_AFTER), p->what);
+
+ return 0;
+}
+
+static int mount_add_quota_dependencies(Mount *m) {
+ MountParameters *p;
+ int r;
+
+ assert(m);
+
+ if (!MANAGER_IS_SYSTEM(UNIT(m)->manager))
+ return 0;
+
+ p = get_mount_parameters_fragment(m);
+ if (!p)
+ return 0;
+
+ if (!mount_needs_quota(p))
+ return 0;
+
+ r = unit_add_two_dependencies_by_name(UNIT(m), UNIT_BEFORE, UNIT_WANTS, SPECIAL_QUOTACHECK_SERVICE,
+ /* add_reference= */ true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+
+ r = unit_add_two_dependencies_by_name(UNIT(m), UNIT_BEFORE, UNIT_WANTS, SPECIAL_QUOTAON_SERVICE,
+ /* add_reference= */true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+static bool mount_is_extrinsic(Unit *u) {
+ MountParameters *p;
+ Mount *m = MOUNT(u);
+ assert(m);
+
+ /* Returns true for all units that are "magic" and should be excluded from the usual
+ * start-up and shutdown dependencies. We call them "extrinsic" here, as they are generally
+ * mounted outside of the systemd dependency logic. We shouldn't attempt to manage them
+ * ourselves but it's fine if the user operates on them with us. */
+
+ /* We only automatically manage mounts if we are in system mode */
+ if (MANAGER_IS_USER(u->manager))
+ return true;
+
+ p = get_mount_parameters(m);
+ if (p && fstab_is_extrinsic(m->where, p->options))
+ return true;
+
+ return false;
+}
+
+static int mount_add_default_ordering_dependencies(Mount *m, MountParameters *p, UnitDependencyMask mask) {
+ const char *after, *before, *e;
+ int r;
+
+ assert(m);
+
+ e = path_startswith(m->where, "/sysroot");
+ if (e && in_initrd()) {
+ /* All mounts under /sysroot need to happen later, at initrd-fs.target time. IOW,
+ * it's not technically part of the basic initrd filesystem itself, and so
+ * shouldn't inherit the default Before=local-fs.target dependency. However,
+ * these mounts still need to start after local-fs-pre.target, as a sync point
+ * for things like systemd-hibernate-resume@.service that should start before
+ * any mounts. */
+
+ after = SPECIAL_LOCAL_FS_PRE_TARGET;
+ before = isempty(e) ? SPECIAL_INITRD_ROOT_FS_TARGET : SPECIAL_INITRD_FS_TARGET;
+
+ } else if (in_initrd() && path_startswith(m->where, "/sysusr/usr")) {
+ after = SPECIAL_LOCAL_FS_PRE_TARGET;
+ before = SPECIAL_INITRD_USR_FS_TARGET;
+
+ } else if (mount_is_network(p)) {
+ after = SPECIAL_REMOTE_FS_PRE_TARGET;
+ before = SPECIAL_REMOTE_FS_TARGET;
+
+ } else {
+ after = SPECIAL_LOCAL_FS_PRE_TARGET;
+ before = SPECIAL_LOCAL_FS_TARGET;
+ }
+
+ if (!mount_is_nofail(m)) {
+ r = unit_add_dependency_by_name(UNIT(m), UNIT_BEFORE, before, /* add_reference= */ true, mask);
+ if (r < 0)
+ return r;
+ }
+
+ r = unit_add_dependency_by_name(UNIT(m), UNIT_AFTER, after, /* add_reference= */ true, mask);
+ if (r < 0)
+ return r;
+
+ return unit_add_two_dependencies_by_name(UNIT(m), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_UMOUNT_TARGET,
+ /* add_reference= */ true, mask);
+}
+
+static int mount_add_default_dependencies(Mount *m) {
+ UnitDependencyMask mask;
+ MountParameters *p;
+ int r;
+
+ assert(m);
+
+ if (!UNIT(m)->default_dependencies)
+ return 0;
+
+ /* We do not add any default dependencies to /, /usr or /run/initramfs/, since they are
+ * guaranteed to stay mounted the whole time, since our system is on it. Also, don't
+ * bother with anything mounted below virtual file systems, it's also going to be virtual,
+ * and hence not worth the effort. */
+ if (mount_is_extrinsic(UNIT(m)))
+ return 0;
+
+ p = get_mount_parameters(m);
+ if (!p)
+ return 0;
+
+ mask = m->from_proc_self_mountinfo ? UNIT_DEPENDENCY_MOUNTINFO : UNIT_DEPENDENCY_MOUNT_FILE;
+
+ r = mount_add_default_ordering_dependencies(m, p, mask);
+ if (r < 0)
+ return r;
+
+ if (mount_is_network(p)) {
+ /* We order ourselves after network.target. This is primarily useful at shutdown:
+ * services that take down the network should order themselves before
+ * network.target, so that they are shut down only after this mount unit is
+ * stopped. */
+
+ r = unit_add_dependency_by_name(UNIT(m), UNIT_AFTER, SPECIAL_NETWORK_TARGET,
+ /* add_reference= */ true, mask);
+ if (r < 0)
+ return r;
+
+ /* We pull in network-online.target, and order ourselves after it. This is useful
+ * at start-up to actively pull in tools that want to be started before we start
+ * mounting network file systems, and whose purpose it is to delay this until the
+ * network is "up". */
+
+ r = unit_add_two_dependencies_by_name(UNIT(m), UNIT_WANTS, UNIT_AFTER, SPECIAL_NETWORK_ONLINE_TARGET,
+ /* add_reference= */ true, mask);
+ if (r < 0)
+ return r;
+ }
+
+ /* If this is a tmpfs mount then we have to unmount it before we try to deactivate swaps */
+ if (streq_ptr(p->fstype, "tmpfs")) {
+ r = unit_add_dependency_by_name(UNIT(m), UNIT_AFTER, SPECIAL_SWAP_TARGET,
+ /* add_reference= */ true, mask);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+static int mount_verify(Mount *m) {
+ _cleanup_free_ char *e = NULL;
+ MountParameters *p;
+ int r;
+
+ assert(m);
+ assert(UNIT(m)->load_state == UNIT_LOADED);
+
+ if (!m->from_fragment && !m->from_proc_self_mountinfo && !UNIT(m)->perpetual)
+ return -ENOENT;
+
+ r = unit_name_from_path(m->where, ".mount", &e);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(m), r, "Failed to generate unit name from mount path: %m");
+
+ if (!unit_has_name(UNIT(m), e))
+ return log_unit_error_errno(UNIT(m), SYNTHETIC_ERRNO(ENOEXEC), "Where= setting doesn't match unit name. Refusing.");
+
+ if (mount_point_is_api(m->where) || mount_point_ignore(m->where))
+ return log_unit_error_errno(UNIT(m), SYNTHETIC_ERRNO(ENOEXEC), "Cannot create mount unit for API file system %s. Refusing.", m->where);
+
+ p = get_mount_parameters_fragment(m);
+ if (p && !p->what && !UNIT(m)->perpetual)
+ return log_unit_error_errno(UNIT(m), SYNTHETIC_ERRNO(ENOEXEC),
+ "What= setting is missing. Refusing.");
+
+ if (m->exec_context.pam_name && m->kill_context.kill_mode != KILL_CONTROL_GROUP)
+ return log_unit_error_errno(UNIT(m), SYNTHETIC_ERRNO(ENOEXEC), "Unit has PAM enabled. Kill mode must be set to control-group'. Refusing.");
+
+ return 0;
+}
+
+static int mount_add_non_exec_dependencies(Mount *m) {
+ int r;
+
+ assert(m);
+
+ /* We may be called due to this mount appearing in /proc/self/mountinfo, hence we clear all existing
+ * dependencies that were initialized from the unit file but whose final value really depends on the
+ * content of /proc/self/mountinfo. Some (such as m->where) might have become stale now. */
+ unit_remove_dependencies(UNIT(m), UNIT_DEPENDENCY_MOUNTINFO | UNIT_DEPENDENCY_MOUNT_FILE);
+
+ if (!m->where)
+ return 0;
+
+ /* Adds in all dependencies directly responsible for ordering the mount, as opposed to dependencies
+ * resulting from the ExecContext and such. */
+
+ r = mount_add_device_dependencies(m);
+ if (r < 0)
+ return r;
+
+ r = mount_add_mount_dependencies(m);
+ if (r < 0)
+ return r;
+
+ r = mount_add_quota_dependencies(m);
+ if (r < 0)
+ return r;
+
+ r = mount_add_default_dependencies(m);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+static int mount_add_extras(Mount *m) {
+ Unit *u = UNIT(m);
+ int r;
+
+ assert(m);
+
+ /* Note: this call might be called after we already have been loaded once (and even when it has already been
+ * activated), in case data from /proc/self/mountinfo has changed. This means all code here needs to be ready
+ * to run with an already set up unit. */
+
+ if (u->fragment_path)
+ m->from_fragment = true;
+
+ if (!m->where) {
+ r = unit_name_to_path(u->id, &m->where);
+ if (r == -ENAMETOOLONG)
+ log_unit_error_errno(u, r, "Failed to derive mount point path from unit name, because unit name is hashed. "
+ "Set \"Where=\" in the unit file explicitly.");
+ if (r < 0)
+ return r;
+ }
+
+ path_simplify(m->where);
+
+ if (!u->description) {
+ r = unit_set_description(u, m->where);
+ if (r < 0)
+ return r;
+ }
+
+ r = unit_patch_contexts(u);
+ if (r < 0)
+ return r;
+
+ r = unit_add_exec_dependencies(u, &m->exec_context);
+ if (r < 0)
+ return r;
+
+ r = unit_set_default_slice(u);
+ if (r < 0)
+ return r;
+
+ r = mount_add_non_exec_dependencies(m);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+static void mount_load_root_mount(Unit *u) {
+ assert(u);
+
+ if (!unit_has_name(u, SPECIAL_ROOT_MOUNT))
+ return;
+
+ u->perpetual = true;
+ u->default_dependencies = false;
+
+ /* The stdio/kmsg bridge socket is on /, in order to avoid a dep loop, don't use kmsg logging for -.mount */
+ MOUNT(u)->exec_context.std_output = EXEC_OUTPUT_NULL;
+ MOUNT(u)->exec_context.std_input = EXEC_INPUT_NULL;
+
+ if (!u->description)
+ u->description = strdup("Root Mount");
+}
+
+static int mount_load(Unit *u) {
+ Mount *m = MOUNT(u);
+ int r, q = 0;
+
+ assert(m);
+ assert(u);
+ assert(u->load_state == UNIT_STUB);
+
+ mount_load_root_mount(u);
+
+ bool fragment_optional = m->from_proc_self_mountinfo || u->perpetual;
+ r = unit_load_fragment_and_dropin(u, !fragment_optional);
+
+ /* Add in some extras. Note we do this in all cases (even if we failed to load the unit) when announced by the
+ * kernel, because we need some things to be set up no matter what when the kernel establishes a mount and thus
+ * we need to update the state in our unit to track it. After all, consider that we don't allow changing the
+ * 'slice' field for a unit once it is active. */
+ if (u->load_state == UNIT_LOADED || m->from_proc_self_mountinfo || u->perpetual)
+ q = mount_add_extras(m);
+
+ if (r < 0)
+ return r;
+ if (q < 0)
+ return q;
+ if (u->load_state != UNIT_LOADED)
+ return 0;
+
+ return mount_verify(m);
+}
+
+static void mount_set_state(Mount *m, MountState state) {
+ MountState old_state;
+ assert(m);
+
+ if (m->state != state)
+ bus_unit_send_pending_change_signal(UNIT(m), false);
+
+ old_state = m->state;
+ m->state = state;
+
+ if (!MOUNT_STATE_WITH_PROCESS(state)) {
+ m->timer_event_source = sd_event_source_disable_unref(m->timer_event_source);
+ mount_unwatch_control_pid(m);
+ m->control_command = NULL;
+ m->control_command_id = _MOUNT_EXEC_COMMAND_INVALID;
+ }
+
+ if (state != old_state)
+ log_unit_debug(UNIT(m), "Changed %s -> %s", mount_state_to_string(old_state), mount_state_to_string(state));
+
+ unit_notify(UNIT(m), state_translation_table[old_state], state_translation_table[state],
+ m->reload_result == MOUNT_SUCCESS ? 0 : UNIT_NOTIFY_RELOAD_FAILURE);
+}
+
+static int mount_coldplug(Unit *u) {
+ Mount *m = MOUNT(u);
+ int r;
+
+ assert(m);
+ assert(m->state == MOUNT_DEAD);
+
+ if (m->deserialized_state == m->state)
+ return 0;
+
+ if (m->control_pid > 0 &&
+ pid_is_unwaited(m->control_pid) &&
+ MOUNT_STATE_WITH_PROCESS(m->deserialized_state)) {
+
+ r = unit_watch_pid(UNIT(m), m->control_pid, false);
+ if (r < 0)
+ return r;
+
+ r = mount_arm_timer(m, usec_add(u->state_change_timestamp.monotonic, m->timeout_usec));
+ if (r < 0)
+ return r;
+ }
+
+ if (!IN_SET(m->deserialized_state, MOUNT_DEAD, MOUNT_FAILED)) {
+ (void) unit_setup_dynamic_creds(u);
+ (void) unit_setup_exec_runtime(u);
+ }
+
+ mount_set_state(m, m->deserialized_state);
+ return 0;
+}
+
+static void mount_catchup(Unit *u) {
+ Mount *m = MOUNT(ASSERT_PTR(u));
+
+ assert(m);
+
+ /* Adjust the deserialized state. See comments in mount_process_proc_self_mountinfo(). */
+ if (m->from_proc_self_mountinfo)
+ switch (m->state) {
+ case MOUNT_DEAD:
+ case MOUNT_FAILED:
+ assert(m->control_pid == 0);
+ (void) unit_acquire_invocation_id(u);
+ mount_cycle_clear(m);
+ mount_enter_mounted(m, MOUNT_SUCCESS);
+ break;
+ case MOUNT_MOUNTING:
+ assert(m->control_pid > 0);
+ mount_set_state(m, MOUNT_MOUNTING_DONE);
+ break;
+ default:
+ break;
+ }
+ else
+ switch (m->state) {
+ case MOUNT_MOUNTING_DONE:
+ assert(m->control_pid > 0);
+ mount_set_state(m, MOUNT_MOUNTING);
+ break;
+ case MOUNT_MOUNTED:
+ assert(m->control_pid == 0);
+ mount_enter_dead(m, MOUNT_SUCCESS);
+ break;
+ default:
+ break;
+ }
+}
+
+static void mount_dump(Unit *u, FILE *f, const char *prefix) {
+ Mount *m = MOUNT(u);
+ MountParameters *p;
+
+ assert(m);
+ assert(f);
+
+ p = get_mount_parameters(m);
+
+ fprintf(f,
+ "%sMount State: %s\n"
+ "%sResult: %s\n"
+ "%sClean Result: %s\n"
+ "%sWhere: %s\n"
+ "%sWhat: %s\n"
+ "%sFile System Type: %s\n"
+ "%sOptions: %s\n"
+ "%sFrom /proc/self/mountinfo: %s\n"
+ "%sFrom fragment: %s\n"
+ "%sExtrinsic: %s\n"
+ "%sDirectoryMode: %04o\n"
+ "%sSloppyOptions: %s\n"
+ "%sLazyUnmount: %s\n"
+ "%sForceUnmount: %s\n"
+ "%sReadWriteOnly: %s\n"
+ "%sTimeoutSec: %s\n",
+ prefix, mount_state_to_string(m->state),
+ prefix, mount_result_to_string(m->result),
+ prefix, mount_result_to_string(m->clean_result),
+ prefix, m->where,
+ prefix, p ? strna(p->what) : "n/a",
+ prefix, p ? strna(p->fstype) : "n/a",
+ prefix, p ? strna(p->options) : "n/a",
+ prefix, yes_no(m->from_proc_self_mountinfo),
+ prefix, yes_no(m->from_fragment),
+ prefix, yes_no(mount_is_extrinsic(u)),
+ prefix, m->directory_mode,
+ prefix, yes_no(m->sloppy_options),
+ prefix, yes_no(m->lazy_unmount),
+ prefix, yes_no(m->force_unmount),
+ prefix, yes_no(m->read_write_only),
+ prefix, FORMAT_TIMESPAN(m->timeout_usec, USEC_PER_SEC));
+
+ if (m->control_pid > 0)
+ fprintf(f,
+ "%sControl PID: "PID_FMT"\n",
+ prefix, m->control_pid);
+
+ exec_context_dump(&m->exec_context, f, prefix);
+ kill_context_dump(&m->kill_context, f, prefix);
+ cgroup_context_dump(UNIT(m), f, prefix);
+}
+
+static int mount_spawn(Mount *m, ExecCommand *c, pid_t *_pid) {
+
+ _cleanup_(exec_params_clear) ExecParameters exec_params = {
+ .flags = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN,
+ .stdin_fd = -1,
+ .stdout_fd = -1,
+ .stderr_fd = -1,
+ .exec_fd = -1,
+ };
+ pid_t pid;
+ int r;
+
+ assert(m);
+ assert(c);
+ assert(_pid);
+
+ r = unit_prepare_exec(UNIT(m));
+ if (r < 0)
+ return r;
+
+ r = mount_arm_timer(m, usec_add(now(CLOCK_MONOTONIC), m->timeout_usec));
+ if (r < 0)
+ return r;
+
+ r = unit_set_exec_params(UNIT(m), &exec_params);
+ if (r < 0)
+ return r;
+
+ r = exec_spawn(UNIT(m),
+ c,
+ &m->exec_context,
+ &exec_params,
+ m->exec_runtime,
+ &m->dynamic_creds,
+ &pid);
+ if (r < 0)
+ return r;
+
+ r = unit_watch_pid(UNIT(m), pid, true);
+ if (r < 0)
+ return r;
+
+ *_pid = pid;
+
+ return 0;
+}
+
+static void mount_enter_dead(Mount *m, MountResult f) {
+ assert(m);
+
+ if (m->result == MOUNT_SUCCESS)
+ m->result = f;
+
+ unit_log_result(UNIT(m), m->result == MOUNT_SUCCESS, mount_result_to_string(m->result));
+ unit_warn_leftover_processes(UNIT(m), unit_log_leftover_process_stop);
+
+ mount_set_state(m, m->result != MOUNT_SUCCESS ? MOUNT_FAILED : MOUNT_DEAD);
+
+ m->exec_runtime = exec_runtime_unref(m->exec_runtime, true);
+
+ unit_destroy_runtime_data(UNIT(m), &m->exec_context);
+
+ unit_unref_uid_gid(UNIT(m), true);
+
+ dynamic_creds_destroy(&m->dynamic_creds);
+
+ /* Any dependencies based on /proc/self/mountinfo are now stale. Let's re-generate dependencies from
+ * .mount unit. */
+ (void) mount_add_non_exec_dependencies(m);
+}
+
+static void mount_enter_mounted(Mount *m, MountResult f) {
+ assert(m);
+
+ if (m->result == MOUNT_SUCCESS)
+ m->result = f;
+
+ mount_set_state(m, MOUNT_MOUNTED);
+}
+
+static void mount_enter_dead_or_mounted(Mount *m, MountResult f) {
+ assert(m);
+
+ /* Enter DEAD or MOUNTED state, depending on what the kernel currently says about the mount point. We use this
+ * whenever we executed an operation, so that our internal state reflects what the kernel says again, after all
+ * ultimately we just mirror the kernel's internal state on this. */
+
+ if (m->from_proc_self_mountinfo)
+ mount_enter_mounted(m, f);
+ else
+ mount_enter_dead(m, f);
+}
+
+static int state_to_kill_operation(MountState state) {
+ switch (state) {
+
+ case MOUNT_REMOUNTING_SIGTERM:
+ return KILL_RESTART;
+
+ case MOUNT_UNMOUNTING_SIGTERM:
+ return KILL_TERMINATE;
+
+ case MOUNT_REMOUNTING_SIGKILL:
+ case MOUNT_UNMOUNTING_SIGKILL:
+ return KILL_KILL;
+
+ default:
+ return _KILL_OPERATION_INVALID;
+ }
+}
+
+static void mount_enter_signal(Mount *m, MountState state, MountResult f) {
+ int r;
+
+ assert(m);
+
+ if (m->result == MOUNT_SUCCESS)
+ m->result = f;
+
+ r = unit_kill_context(
+ UNIT(m),
+ &m->kill_context,
+ state_to_kill_operation(state),
+ -1,
+ m->control_pid,
+ false);
+ if (r < 0)
+ goto fail;
+
+ if (r > 0) {
+ r = mount_arm_timer(m, usec_add(now(CLOCK_MONOTONIC), m->timeout_usec));
+ if (r < 0)
+ goto fail;
+
+ mount_set_state(m, state);
+ } else if (state == MOUNT_REMOUNTING_SIGTERM && m->kill_context.send_sigkill)
+ mount_enter_signal(m, MOUNT_REMOUNTING_SIGKILL, MOUNT_SUCCESS);
+ else if (IN_SET(state, MOUNT_REMOUNTING_SIGTERM, MOUNT_REMOUNTING_SIGKILL))
+ mount_enter_mounted(m, MOUNT_SUCCESS);
+ else if (state == MOUNT_UNMOUNTING_SIGTERM && m->kill_context.send_sigkill)
+ mount_enter_signal(m, MOUNT_UNMOUNTING_SIGKILL, MOUNT_SUCCESS);
+ else
+ mount_enter_dead_or_mounted(m, MOUNT_SUCCESS);
+
+ return;
+
+fail:
+ log_unit_warning_errno(UNIT(m), r, "Failed to kill processes: %m");
+ mount_enter_dead_or_mounted(m, MOUNT_FAILURE_RESOURCES);
+}
+
+static void mount_enter_unmounting(Mount *m) {
+ int r;
+
+ assert(m);
+
+ /* Start counting our attempts */
+ if (!IN_SET(m->state,
+ MOUNT_UNMOUNTING,
+ MOUNT_UNMOUNTING_SIGTERM,
+ MOUNT_UNMOUNTING_SIGKILL))
+ m->n_retry_umount = 0;
+
+ m->control_command_id = MOUNT_EXEC_UNMOUNT;
+ m->control_command = m->exec_command + MOUNT_EXEC_UNMOUNT;
+
+ r = exec_command_set(m->control_command, UMOUNT_PATH, m->where, "-c", NULL);
+ if (r >= 0 && m->lazy_unmount)
+ r = exec_command_append(m->control_command, "-l", NULL);
+ if (r >= 0 && m->force_unmount)
+ r = exec_command_append(m->control_command, "-f", NULL);
+ if (r < 0)
+ goto fail;
+
+ mount_unwatch_control_pid(m);
+
+ r = mount_spawn(m, m->control_command, &m->control_pid);
+ if (r < 0)
+ goto fail;
+
+ mount_set_state(m, MOUNT_UNMOUNTING);
+
+ return;
+
+fail:
+ log_unit_warning_errno(UNIT(m), r, "Failed to run 'umount' task: %m");
+ mount_enter_dead_or_mounted(m, MOUNT_FAILURE_RESOURCES);
+}
+
+static void mount_enter_mounting(Mount *m) {
+ int r;
+ MountParameters *p;
+ bool source_is_dir = true;
+
+ assert(m);
+
+ r = unit_fail_if_noncanonical(UNIT(m), m->where);
+ if (r < 0)
+ goto fail;
+
+ p = get_mount_parameters_fragment(m);
+ if (p && mount_is_bind(p)) {
+ r = is_dir(p->what, /* follow = */ true);
+ if (r < 0 && r != -ENOENT)
+ log_unit_info_errno(UNIT(m), r, "Failed to determine type of bind mount source '%s', ignoring: %m", p->what);
+ else if (r == 0)
+ source_is_dir = false;
+ }
+
+ if (source_is_dir)
+ r = mkdir_p_label(m->where, m->directory_mode);
+ else
+ r = touch_file(m->where, /* parents = */ true, USEC_INFINITY, UID_INVALID, GID_INVALID, MODE_INVALID);
+ if (r < 0 && r != -EEXIST)
+ log_unit_warning_errno(UNIT(m), r, "Failed to create mount point '%s', ignoring: %m", m->where);
+
+ if (source_is_dir)
+ unit_warn_if_dir_nonempty(UNIT(m), m->where);
+ unit_warn_leftover_processes(UNIT(m), unit_log_leftover_process_start);
+
+ m->control_command_id = MOUNT_EXEC_MOUNT;
+ m->control_command = m->exec_command + MOUNT_EXEC_MOUNT;
+
+ /* Create the source directory for bind-mounts if needed */
+ if (p && mount_is_bind(p)) {
+ r = mkdir_p_label(p->what, m->directory_mode);
+ /* mkdir_p_label() can return -EEXIST if the target path exists and is not a directory - which is
+ * totally OK, in case the user wants us to overmount a non-directory inode. Also -EROFS can be
+ * returned on read-only filesystem. Moreover, -EACCES (and also maybe -EPERM?) may be returned
+ * when the path is on NFS. See issue #24120. All such errors will be logged in the debug level. */
+ if (r < 0 && r != -EEXIST)
+ log_unit_full_errno(UNIT(m),
+ (r == -EROFS || ERRNO_IS_PRIVILEGE(r)) ? LOG_DEBUG : LOG_WARNING,
+ r, "Failed to make bind mount source '%s', ignoring: %m", p->what);
+ }
+
+ if (p) {
+ _cleanup_free_ char *opts = NULL;
+
+ r = fstab_filter_options(p->options, "nofail\0" "noauto\0" "auto\0", NULL, NULL, NULL, &opts);
+ if (r < 0)
+ goto fail;
+
+ r = exec_command_set(m->control_command, MOUNT_PATH, p->what, m->where, NULL);
+ if (r >= 0 && m->sloppy_options)
+ r = exec_command_append(m->control_command, "-s", NULL);
+ if (r >= 0 && m->read_write_only)
+ r = exec_command_append(m->control_command, "-w", NULL);
+ if (r >= 0 && p->fstype)
+ r = exec_command_append(m->control_command, "-t", p->fstype, NULL);
+ if (r >= 0 && !isempty(opts))
+ r = exec_command_append(m->control_command, "-o", opts, NULL);
+ } else
+ r = -ENOENT;
+ if (r < 0)
+ goto fail;
+
+ mount_unwatch_control_pid(m);
+
+ r = mount_spawn(m, m->control_command, &m->control_pid);
+ if (r < 0)
+ goto fail;
+
+ mount_set_state(m, MOUNT_MOUNTING);
+
+ return;
+
+fail:
+ log_unit_warning_errno(UNIT(m), r, "Failed to run 'mount' task: %m");
+ mount_enter_dead_or_mounted(m, MOUNT_FAILURE_RESOURCES);
+}
+
+static void mount_set_reload_result(Mount *m, MountResult result) {
+ assert(m);
+
+ /* Only store the first error we encounter */
+ if (m->reload_result != MOUNT_SUCCESS)
+ return;
+
+ m->reload_result = result;
+}
+
+static void mount_enter_remounting(Mount *m) {
+ int r;
+ MountParameters *p;
+
+ assert(m);
+
+ /* Reset reload result when we are about to start a new remount operation */
+ m->reload_result = MOUNT_SUCCESS;
+
+ m->control_command_id = MOUNT_EXEC_REMOUNT;
+ m->control_command = m->exec_command + MOUNT_EXEC_REMOUNT;
+
+ p = get_mount_parameters_fragment(m);
+ if (p) {
+ const char *o;
+
+ if (p->options)
+ o = strjoina("remount,", p->options);
+ else
+ o = "remount";
+
+ r = exec_command_set(m->control_command, MOUNT_PATH,
+ p->what, m->where,
+ "-o", o, NULL);
+ if (r >= 0 && m->sloppy_options)
+ r = exec_command_append(m->control_command, "-s", NULL);
+ if (r >= 0 && m->read_write_only)
+ r = exec_command_append(m->control_command, "-w", NULL);
+ if (r >= 0 && p->fstype)
+ r = exec_command_append(m->control_command, "-t", p->fstype, NULL);
+ } else
+ r = -ENOENT;
+ if (r < 0)
+ goto fail;
+
+ mount_unwatch_control_pid(m);
+
+ r = mount_spawn(m, m->control_command, &m->control_pid);
+ if (r < 0)
+ goto fail;
+
+ mount_set_state(m, MOUNT_REMOUNTING);
+
+ return;
+
+fail:
+ log_unit_warning_errno(UNIT(m), r, "Failed to run 'remount' task: %m");
+ mount_set_reload_result(m, MOUNT_FAILURE_RESOURCES);
+ mount_enter_dead_or_mounted(m, MOUNT_SUCCESS);
+}
+
+static void mount_cycle_clear(Mount *m) {
+ assert(m);
+
+ /* Clear all state we shall forget for this new cycle */
+
+ m->result = MOUNT_SUCCESS;
+ m->reload_result = MOUNT_SUCCESS;
+ exec_command_reset_status_array(m->exec_command, _MOUNT_EXEC_COMMAND_MAX);
+ UNIT(m)->reset_accounting = true;
+}
+
+static int mount_start(Unit *u) {
+ Mount *m = MOUNT(u);
+ int r;
+
+ assert(m);
+
+ /* We cannot fulfill this request right now, try again later
+ * please! */
+ if (IN_SET(m->state,
+ MOUNT_UNMOUNTING,
+ MOUNT_UNMOUNTING_SIGTERM,
+ MOUNT_UNMOUNTING_SIGKILL,
+ MOUNT_CLEANING))
+ return -EAGAIN;
+
+ /* Already on it! */
+ if (IN_SET(m->state, MOUNT_MOUNTING, MOUNT_MOUNTING_DONE))
+ return 0;
+
+ assert(IN_SET(m->state, MOUNT_DEAD, MOUNT_FAILED));
+
+ r = unit_acquire_invocation_id(u);
+ if (r < 0)
+ return r;
+
+ mount_cycle_clear(m);
+ mount_enter_mounting(m);
+
+ return 1;
+}
+
+static int mount_stop(Unit *u) {
+ Mount *m = MOUNT(u);
+
+ assert(m);
+
+ switch (m->state) {
+
+ case MOUNT_UNMOUNTING:
+ case MOUNT_UNMOUNTING_SIGKILL:
+ case MOUNT_UNMOUNTING_SIGTERM:
+ /* Already on it */
+ return 0;
+
+ case MOUNT_MOUNTING:
+ case MOUNT_MOUNTING_DONE:
+ case MOUNT_REMOUNTING:
+ /* If we are still waiting for /bin/mount, we go directly into kill mode. */
+ mount_enter_signal(m, MOUNT_UNMOUNTING_SIGTERM, MOUNT_SUCCESS);
+ return 0;
+
+ case MOUNT_REMOUNTING_SIGTERM:
+ /* If we are already waiting for a hung remount, convert this to the matching unmounting state */
+ mount_set_state(m, MOUNT_UNMOUNTING_SIGTERM);
+ return 0;
+
+ case MOUNT_REMOUNTING_SIGKILL:
+ /* as above */
+ mount_set_state(m, MOUNT_UNMOUNTING_SIGKILL);
+ return 0;
+
+ case MOUNT_MOUNTED:
+ mount_enter_unmounting(m);
+ return 1;
+
+ case MOUNT_CLEANING:
+ /* If we are currently cleaning, then abort it, brutally. */
+ mount_enter_signal(m, MOUNT_UNMOUNTING_SIGKILL, MOUNT_SUCCESS);
+ return 0;
+
+ default:
+ assert_not_reached();
+ }
+}
+
+static int mount_reload(Unit *u) {
+ Mount *m = MOUNT(u);
+
+ assert(m);
+ assert(m->state == MOUNT_MOUNTED);
+
+ mount_enter_remounting(m);
+
+ return 1;
+}
+
+static int mount_serialize(Unit *u, FILE *f, FDSet *fds) {
+ Mount *m = MOUNT(u);
+
+ assert(m);
+ assert(f);
+ assert(fds);
+
+ (void) serialize_item(f, "state", mount_state_to_string(m->state));
+ (void) serialize_item(f, "result", mount_result_to_string(m->result));
+ (void) serialize_item(f, "reload-result", mount_result_to_string(m->reload_result));
+ (void) serialize_item_format(f, "n-retry-umount", "%u", m->n_retry_umount);
+
+ if (m->control_pid > 0)
+ (void) serialize_item_format(f, "control-pid", PID_FMT, m->control_pid);
+
+ if (m->control_command_id >= 0)
+ (void) serialize_item(f, "control-command", mount_exec_command_to_string(m->control_command_id));
+
+ return 0;
+}
+
+static int mount_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
+ Mount *m = MOUNT(u);
+ int r;
+
+ assert(m);
+ assert(u);
+ assert(key);
+ assert(value);
+ assert(fds);
+
+ if (streq(key, "state")) {
+ MountState state;
+
+ state = mount_state_from_string(value);
+ if (state < 0)
+ log_unit_debug_errno(u, state, "Failed to parse state value: %s", value);
+ else
+ m->deserialized_state = state;
+
+ } else if (streq(key, "result")) {
+ MountResult f;
+
+ f = mount_result_from_string(value);
+ if (f < 0)
+ log_unit_debug_errno(u, f, "Failed to parse result value: %s", value);
+ else if (f != MOUNT_SUCCESS)
+ m->result = f;
+
+ } else if (streq(key, "reload-result")) {
+ MountResult f;
+
+ f = mount_result_from_string(value);
+ if (f < 0)
+ log_unit_debug_errno(u, f, "Failed to parse reload result value: %s", value);
+ else if (f != MOUNT_SUCCESS)
+ m->reload_result = f;
+
+ } else if (streq(key, "n-retry-umount")) {
+
+ r = safe_atou(value, &m->n_retry_umount);
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to parse n-retry-umount value: %s", value);
+
+ } else if (streq(key, "control-pid")) {
+
+ r = parse_pid(value, &m->control_pid);
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to parse control-pid value: %s", value);
+
+ } else if (streq(key, "control-command")) {
+ MountExecCommand id;
+
+ id = mount_exec_command_from_string(value);
+ if (id < 0)
+ log_unit_debug_errno(u, id, "Failed to parse exec-command value: %s", value);
+ else {
+ m->control_command_id = id;
+ m->control_command = m->exec_command + id;
+ }
+ } else
+ log_unit_debug(u, "Unknown serialization key: %s", key);
+
+ return 0;
+}
+
+_pure_ static UnitActiveState mount_active_state(Unit *u) {
+ assert(u);
+
+ return state_translation_table[MOUNT(u)->state];
+}
+
+_pure_ static const char *mount_sub_state_to_string(Unit *u) {
+ assert(u);
+
+ return mount_state_to_string(MOUNT(u)->state);
+}
+
+_pure_ static bool mount_may_gc(Unit *u) {
+ Mount *m = MOUNT(u);
+
+ assert(m);
+
+ if (m->from_proc_self_mountinfo)
+ return false;
+
+ return true;
+}
+
+static void mount_sigchld_event(Unit *u, pid_t pid, int code, int status) {
+ Mount *m = MOUNT(u);
+ MountResult f;
+
+ assert(m);
+ assert(pid >= 0);
+
+ if (pid != m->control_pid)
+ return;
+
+ /* So here's the thing, we really want to know before /usr/bin/mount or /usr/bin/umount exit whether
+ * they established/remove a mount. This is important when mounting, but even more so when unmounting
+ * since we need to deal with nested mounts and otherwise cannot safely determine whether to repeat
+ * the unmounts. In theory, the kernel fires /proc/self/mountinfo changes off before returning from
+ * the mount() or umount() syscalls, and thus we should see the changes to the proc file before we
+ * process the waitid() for the /usr/bin/(u)mount processes. However, this is unfortunately racy: we
+ * have to waitid() for processes using P_ALL (since we need to reap unexpected children that got
+ * reparented to PID 1), but when using P_ALL we might end up reaping processes that terminated just
+ * instants ago, i.e. already after our last event loop iteration (i.e. after the last point we might
+ * have noticed /proc/self/mountinfo events via epoll). This means event loop priorities for
+ * processing SIGCHLD vs. /proc/self/mountinfo IO events are not as relevant as we want. To fix that
+ * race, let's explicitly scan /proc/self/mountinfo before we start processing /usr/bin/(u)mount
+ * dying. It's ugly, but it makes our ordering systematic again, and makes sure we always see
+ * /proc/self/mountinfo changes before our mount/umount exits. */
+ (void) mount_process_proc_self_mountinfo(u->manager);
+
+ m->control_pid = 0;
+
+ if (is_clean_exit(code, status, EXIT_CLEAN_COMMAND, NULL))
+ f = MOUNT_SUCCESS;
+ else if (code == CLD_EXITED)
+ f = MOUNT_FAILURE_EXIT_CODE;
+ else if (code == CLD_KILLED)
+ f = MOUNT_FAILURE_SIGNAL;
+ else if (code == CLD_DUMPED)
+ f = MOUNT_FAILURE_CORE_DUMP;
+ else
+ assert_not_reached();
+
+ if (IN_SET(m->state, MOUNT_REMOUNTING, MOUNT_REMOUNTING_SIGKILL, MOUNT_REMOUNTING_SIGTERM))
+ mount_set_reload_result(m, f);
+ else if (m->result == MOUNT_SUCCESS)
+ m->result = f;
+
+ if (m->control_command) {
+ exec_status_exit(&m->control_command->exec_status, &m->exec_context, pid, code, status);
+
+ m->control_command = NULL;
+ m->control_command_id = _MOUNT_EXEC_COMMAND_INVALID;
+ }
+
+ unit_log_process_exit(
+ u,
+ "Mount process",
+ mount_exec_command_to_string(m->control_command_id),
+ f == MOUNT_SUCCESS,
+ code, status);
+
+ /* Note that due to the io event priority logic, we can be sure the new mountinfo is loaded
+ * before we process the SIGCHLD for the mount command. */
+
+ switch (m->state) {
+
+ case MOUNT_MOUNTING:
+ /* Our mount point has not appeared in mountinfo. Something went wrong. */
+
+ if (f == MOUNT_SUCCESS) {
+ /* Either /bin/mount has an unexpected definition of success,
+ * or someone raced us and we lost. */
+ log_unit_warning(UNIT(m), "Mount process finished, but there is no mount.");
+ f = MOUNT_FAILURE_PROTOCOL;
+ }
+ mount_enter_dead(m, f);
+ break;
+
+ case MOUNT_MOUNTING_DONE:
+ mount_enter_mounted(m, f);
+ break;
+
+ case MOUNT_REMOUNTING:
+ case MOUNT_REMOUNTING_SIGTERM:
+ case MOUNT_REMOUNTING_SIGKILL:
+ mount_enter_dead_or_mounted(m, MOUNT_SUCCESS);
+ break;
+
+ case MOUNT_UNMOUNTING:
+
+ if (f == MOUNT_SUCCESS && m->from_proc_self_mountinfo) {
+
+ /* Still a mount point? If so, let's try again. Most likely there were multiple mount points
+ * stacked on top of each other. We might exceed the timeout specified by the user overall,
+ * but we will stop as soon as any one umount times out. */
+
+ if (m->n_retry_umount < RETRY_UMOUNT_MAX) {
+ log_unit_debug(u, "Mount still present, trying again.");
+ m->n_retry_umount++;
+ mount_enter_unmounting(m);
+ } else {
+ log_unit_warning(u, "Mount still present after %u attempts to unmount, giving up.", m->n_retry_umount);
+ mount_enter_mounted(m, f);
+ }
+ } else
+ mount_enter_dead_or_mounted(m, f);
+
+ break;
+
+ case MOUNT_UNMOUNTING_SIGKILL:
+ case MOUNT_UNMOUNTING_SIGTERM:
+ mount_enter_dead_or_mounted(m, f);
+ break;
+
+ case MOUNT_CLEANING:
+ if (m->clean_result == MOUNT_SUCCESS)
+ m->clean_result = f;
+
+ mount_enter_dead(m, MOUNT_SUCCESS);
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ /* Notify clients about changed exit status */
+ unit_add_to_dbus_queue(u);
+}
+
+static int mount_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) {
+ Mount *m = MOUNT(userdata);
+
+ assert(m);
+ assert(m->timer_event_source == source);
+
+ switch (m->state) {
+
+ case MOUNT_MOUNTING:
+ case MOUNT_MOUNTING_DONE:
+ log_unit_warning(UNIT(m), "Mounting timed out. Terminating.");
+ mount_enter_signal(m, MOUNT_UNMOUNTING_SIGTERM, MOUNT_FAILURE_TIMEOUT);
+ break;
+
+ case MOUNT_REMOUNTING:
+ log_unit_warning(UNIT(m), "Remounting timed out. Terminating remount process.");
+ mount_set_reload_result(m, MOUNT_FAILURE_TIMEOUT);
+ mount_enter_signal(m, MOUNT_REMOUNTING_SIGTERM, MOUNT_SUCCESS);
+ break;
+
+ case MOUNT_REMOUNTING_SIGTERM:
+ mount_set_reload_result(m, MOUNT_FAILURE_TIMEOUT);
+
+ if (m->kill_context.send_sigkill) {
+ log_unit_warning(UNIT(m), "Remounting timed out. Killing.");
+ mount_enter_signal(m, MOUNT_REMOUNTING_SIGKILL, MOUNT_SUCCESS);
+ } else {
+ log_unit_warning(UNIT(m), "Remounting timed out. Skipping SIGKILL. Ignoring.");
+ mount_enter_dead_or_mounted(m, MOUNT_SUCCESS);
+ }
+ break;
+
+ case MOUNT_REMOUNTING_SIGKILL:
+ mount_set_reload_result(m, MOUNT_FAILURE_TIMEOUT);
+
+ log_unit_warning(UNIT(m), "Mount process still around after SIGKILL. Ignoring.");
+ mount_enter_dead_or_mounted(m, MOUNT_SUCCESS);
+ break;
+
+ case MOUNT_UNMOUNTING:
+ log_unit_warning(UNIT(m), "Unmounting timed out. Terminating.");
+ mount_enter_signal(m, MOUNT_UNMOUNTING_SIGTERM, MOUNT_FAILURE_TIMEOUT);
+ break;
+
+ case MOUNT_UNMOUNTING_SIGTERM:
+ if (m->kill_context.send_sigkill) {
+ log_unit_warning(UNIT(m), "Mount process timed out. Killing.");
+ mount_enter_signal(m, MOUNT_UNMOUNTING_SIGKILL, MOUNT_FAILURE_TIMEOUT);
+ } else {
+ log_unit_warning(UNIT(m), "Mount process timed out. Skipping SIGKILL. Ignoring.");
+ mount_enter_dead_or_mounted(m, MOUNT_FAILURE_TIMEOUT);
+ }
+ break;
+
+ case MOUNT_UNMOUNTING_SIGKILL:
+ log_unit_warning(UNIT(m), "Mount process still around after SIGKILL. Ignoring.");
+ mount_enter_dead_or_mounted(m, MOUNT_FAILURE_TIMEOUT);
+ break;
+
+ case MOUNT_CLEANING:
+ log_unit_warning(UNIT(m), "Cleaning timed out. killing.");
+
+ if (m->clean_result == MOUNT_SUCCESS)
+ m->clean_result = MOUNT_FAILURE_TIMEOUT;
+
+ mount_enter_signal(m, MOUNT_UNMOUNTING_SIGKILL, 0);
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ return 0;
+}
+
+static int mount_setup_new_unit(
+ Manager *m,
+ const char *name,
+ const char *what,
+ const char *where,
+ const char *options,
+ const char *fstype,
+ MountProcFlags *ret_flags,
+ Unit **ret) {
+
+ _cleanup_(unit_freep) Unit *u = NULL;
+ int r;
+
+ assert(m);
+ assert(name);
+ assert(ret_flags);
+ assert(ret);
+
+ r = unit_new_for_name(m, sizeof(Mount), name, &u);
+ if (r < 0)
+ return r;
+
+ r = free_and_strdup(&u->source_path, "/proc/self/mountinfo");
+ if (r < 0)
+ return r;
+
+ r = free_and_strdup(&MOUNT(u)->where, where);
+ if (r < 0)
+ return r;
+
+ r = update_parameters_proc_self_mountinfo(MOUNT(u), what, options, fstype);
+ if (r < 0)
+ return r;
+
+ /* This unit was generated because /proc/self/mountinfo reported it. Remember this, so that by the
+ * time we load the unit file for it (and thus add in extra deps right after) we know what source to
+ * attributes the deps to. */
+ MOUNT(u)->from_proc_self_mountinfo = true;
+
+ r = mount_add_non_exec_dependencies(MOUNT(u));
+ if (r < 0)
+ return r;
+
+ /* We have only allocated the stub now, let's enqueue this unit for loading now, so that everything
+ * else is loaded in now. */
+ unit_add_to_load_queue(u);
+
+ *ret_flags = MOUNT_PROC_IS_MOUNTED | MOUNT_PROC_JUST_MOUNTED | MOUNT_PROC_JUST_CHANGED;
+ *ret = TAKE_PTR(u);
+ return 0;
+}
+
+static int mount_setup_existing_unit(
+ Unit *u,
+ const char *what,
+ const char *where,
+ const char *options,
+ const char *fstype,
+ MountProcFlags *ret_flags) {
+
+ int r;
+
+ assert(u);
+ assert(ret_flags);
+
+ if (!MOUNT(u)->where) {
+ MOUNT(u)->where = strdup(where);
+ if (!MOUNT(u)->where)
+ return -ENOMEM;
+ }
+
+ /* In case we have multiple mounts established on the same mount point, let's merge flags set already
+ * for the current unit. Note that the flags field is reset on each iteration of reading
+ * /proc/self/mountinfo, hence we know for sure anything already set here is from the current
+ * iteration and thus worthy of taking into account. */
+ MountProcFlags flags =
+ MOUNT(u)->proc_flags | MOUNT_PROC_IS_MOUNTED;
+
+ r = update_parameters_proc_self_mountinfo(MOUNT(u), what, options, fstype);
+ if (r < 0)
+ return r;
+ if (r > 0)
+ flags |= MOUNT_PROC_JUST_CHANGED;
+
+ /* There are two conditions when we consider a mount point just mounted: when we haven't seen it in
+ * /proc/self/mountinfo before or when MOUNT_MOUNTING is our current state. Why bother with the
+ * latter? Shouldn't that be covered by the former? No, during reload it is not because we might then
+ * encounter a new /proc/self/mountinfo in combination with an old mount unit state (since it stems
+ * from the serialized state), and need to catch up. Since we know that the MOUNT_MOUNTING state is
+ * reached when we wait for the mount to appear we hence can assume that if we are in it, we are
+ * actually seeing it established for the first time. */
+ if (!MOUNT(u)->from_proc_self_mountinfo || MOUNT(u)->state == MOUNT_MOUNTING)
+ flags |= MOUNT_PROC_JUST_MOUNTED;
+
+ MOUNT(u)->from_proc_self_mountinfo = true;
+
+ if (IN_SET(u->load_state, UNIT_NOT_FOUND, UNIT_BAD_SETTING, UNIT_ERROR)) {
+ /* The unit was previously not found or otherwise not loaded. Now that the unit shows up in
+ * /proc/self/mountinfo we should reconsider it this, hence set it to UNIT_LOADED. */
+ u->load_state = UNIT_LOADED;
+ u->load_error = 0;
+
+ flags |= MOUNT_PROC_JUST_CHANGED;
+ }
+
+ if (FLAGS_SET(flags, MOUNT_PROC_JUST_CHANGED)) {
+ /* If things changed, then make sure that all deps are regenerated. Let's
+ * first remove all automatic deps, and then add in the new ones. */
+ r = mount_add_non_exec_dependencies(MOUNT(u));
+ if (r < 0)
+ return r;
+ }
+
+ *ret_flags = flags;
+ return 0;
+}
+
+static int mount_setup_unit(
+ Manager *m,
+ const char *what,
+ const char *where,
+ const char *options,
+ const char *fstype,
+ bool set_flags) {
+
+ _cleanup_free_ char *e = NULL;
+ MountProcFlags flags;
+ Unit *u;
+ int r;
+
+ assert(m);
+ assert(what);
+ assert(where);
+ assert(options);
+ assert(fstype);
+
+ /* Ignore API mount points. They should never be referenced in
+ * dependencies ever. */
+ if (mount_point_is_api(where) || mount_point_ignore(where))
+ return 0;
+
+ if (streq(fstype, "autofs"))
+ return 0;
+
+ /* probably some kind of swap, ignore */
+ if (!is_path(where))
+ return 0;
+
+ r = unit_name_from_path(where, ".mount", &e);
+ if (r < 0)
+ return log_struct_errno(
+ LOG_WARNING, r,
+ "MESSAGE_ID=" SD_MESSAGE_MOUNT_POINT_PATH_NOT_SUITABLE_STR,
+ "MOUNT_POINT=%s", where,
+ LOG_MESSAGE("Failed to generate valid unit name from mount point path '%s', ignoring mount point: %m",
+ where));
+
+ u = manager_get_unit(m, e);
+ if (u)
+ r = mount_setup_existing_unit(u, what, where, options, fstype, &flags);
+ else
+ /* First time we see this mount point meaning that it's not been initiated by a mount unit
+ * but rather by the sysadmin having called mount(8) directly. */
+ r = mount_setup_new_unit(m, e, what, where, options, fstype, &flags, &u);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to set up mount unit for '%s': %m", where);
+
+ /* If the mount changed properties or state, let's notify our clients */
+ if (flags & (MOUNT_PROC_JUST_CHANGED|MOUNT_PROC_JUST_MOUNTED))
+ unit_add_to_dbus_queue(u);
+
+ if (set_flags)
+ MOUNT(u)->proc_flags = flags;
+
+ return 0;
+}
+
+static int mount_load_proc_self_mountinfo(Manager *m, bool set_flags) {
+ _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
+ _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL;
+ int r;
+
+ assert(m);
+
+ r = libmount_parse(NULL, NULL, &table, &iter);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse /proc/self/mountinfo: %m");
+
+ for (;;) {
+ struct libmnt_fs *fs;
+ const char *device, *path, *options, *fstype;
+
+ r = mnt_table_next_fs(table, iter, &fs);
+ if (r == 1)
+ break;
+ if (r < 0)
+ return log_error_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m");
+
+ device = mnt_fs_get_source(fs);
+ path = mnt_fs_get_target(fs);
+ options = mnt_fs_get_options(fs);
+ fstype = mnt_fs_get_fstype(fs);
+
+ if (!device || !path)
+ continue;
+
+ device_found_node(m, device, DEVICE_FOUND_MOUNT, DEVICE_FOUND_MOUNT);
+
+ (void) mount_setup_unit(m, device, path, options, fstype, set_flags);
+ }
+
+ return 0;
+}
+
+static void mount_shutdown(Manager *m) {
+ assert(m);
+
+ m->mount_event_source = sd_event_source_disable_unref(m->mount_event_source);
+
+ mnt_unref_monitor(m->mount_monitor);
+ m->mount_monitor = NULL;
+}
+
+static int mount_get_timeout(Unit *u, usec_t *timeout) {
+ Mount *m = MOUNT(u);
+ usec_t t;
+ int r;
+
+ assert(m);
+ assert(u);
+
+ if (!m->timer_event_source)
+ return 0;
+
+ r = sd_event_source_get_time(m->timer_event_source, &t);
+ if (r < 0)
+ return r;
+ if (t == USEC_INFINITY)
+ return 0;
+
+ *timeout = t;
+ return 1;
+}
+
+static void mount_enumerate_perpetual(Manager *m) {
+ Unit *u;
+ int r;
+
+ assert(m);
+
+ /* Whatever happens, we know for sure that the root directory is around, and cannot go away. Let's
+ * unconditionally synthesize it here and mark it as perpetual. */
+
+ u = manager_get_unit(m, SPECIAL_ROOT_MOUNT);
+ if (!u) {
+ r = unit_new_for_name(m, sizeof(Mount), SPECIAL_ROOT_MOUNT, &u);
+ if (r < 0) {
+ log_error_errno(r, "Failed to allocate the special " SPECIAL_ROOT_MOUNT " unit: %m");
+ return;
+ }
+ }
+
+ u->perpetual = true;
+ MOUNT(u)->deserialized_state = MOUNT_MOUNTED;
+
+ unit_add_to_load_queue(u);
+ unit_add_to_dbus_queue(u);
+}
+
+static bool mount_is_mounted(Mount *m) {
+ assert(m);
+
+ return UNIT(m)->perpetual || FLAGS_SET(m->proc_flags, MOUNT_PROC_IS_MOUNTED);
+}
+
+static int mount_on_ratelimit_expire(sd_event_source *s, void *userdata) {
+ Manager *m = ASSERT_PTR(userdata);
+ Job *j;
+
+ /* Let's enqueue all start jobs that were previously skipped because of active ratelimit. */
+ HASHMAP_FOREACH(j, m->jobs) {
+ if (j->unit->type != UNIT_MOUNT)
+ continue;
+
+ job_add_to_run_queue(j);
+ }
+
+ /* By entering ratelimited state we made all mount start jobs not runnable, now rate limit is over so
+ * let's make sure we dispatch them in the next iteration. */
+ manager_trigger_run_queue(m);
+
+ return 0;
+}
+
+static void mount_enumerate(Manager *m) {
+ int r;
+
+ assert(m);
+
+ mnt_init_debug(0);
+
+ if (!m->mount_monitor) {
+ unsigned mount_rate_limit_burst = 5;
+ int fd;
+
+ m->mount_monitor = mnt_new_monitor();
+ if (!m->mount_monitor) {
+ log_oom();
+ goto fail;
+ }
+
+ r = mnt_monitor_enable_kernel(m->mount_monitor, 1);
+ if (r < 0) {
+ log_error_errno(r, "Failed to enable watching of kernel mount events: %m");
+ goto fail;
+ }
+
+ r = mnt_monitor_enable_userspace(m->mount_monitor, 1, NULL);
+ if (r < 0) {
+ log_error_errno(r, "Failed to enable watching of userspace mount events: %m");
+ goto fail;
+ }
+
+ /* mnt_unref_monitor() will close the fd */
+ fd = r = mnt_monitor_get_fd(m->mount_monitor);
+ if (r < 0) {
+ log_error_errno(r, "Failed to acquire watch file descriptor: %m");
+ goto fail;
+ }
+
+ r = sd_event_add_io(m->event, &m->mount_event_source, fd, EPOLLIN, mount_dispatch_io, m);
+ if (r < 0) {
+ log_error_errno(r, "Failed to watch mount file descriptor: %m");
+ goto fail;
+ }
+
+ r = sd_event_source_set_priority(m->mount_event_source, SD_EVENT_PRIORITY_NORMAL-10);
+ if (r < 0) {
+ log_error_errno(r, "Failed to adjust mount watch priority: %m");
+ goto fail;
+ }
+
+ /* Let users override the default (5 in 1s), as it stalls the boot sequence on busy systems. */
+ const char *e = secure_getenv("SYSTEMD_DEFAULT_MOUNT_RATE_LIMIT_BURST");
+ if (e) {
+ r = safe_atou(e, &mount_rate_limit_burst);
+ if (r < 0)
+ log_debug("Invalid value in $SYSTEMD_DEFAULT_MOUNT_RATE_LIMIT_BURST, ignoring: %s", e);
+ }
+
+ r = sd_event_source_set_ratelimit(m->mount_event_source, 1 * USEC_PER_SEC, mount_rate_limit_burst);
+ if (r < 0) {
+ log_error_errno(r, "Failed to enable rate limit for mount events: %m");
+ goto fail;
+ }
+
+ r = sd_event_source_set_ratelimit_expire_callback(m->mount_event_source, mount_on_ratelimit_expire);
+ if (r < 0) {
+ log_error_errno(r, "Failed to enable rate limit for mount events: %m");
+ goto fail;
+ }
+
+ (void) sd_event_source_set_description(m->mount_event_source, "mount-monitor-dispatch");
+ }
+
+ r = mount_load_proc_self_mountinfo(m, false);
+ if (r < 0)
+ goto fail;
+
+ return;
+
+fail:
+ mount_shutdown(m);
+}
+
+static int drain_libmount(Manager *m) {
+ bool rescan = false;
+ int r;
+
+ assert(m);
+
+ /* Drain all events and verify that the event is valid.
+ *
+ * Note that libmount also monitors /run/mount mkdir if the directory does not exist yet. The mkdir
+ * may generate event which is irrelevant for us.
+ *
+ * error: r < 0; valid: r == 0, false positive: r == 1 */
+ do {
+ r = mnt_monitor_next_change(m->mount_monitor, NULL, NULL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to drain libmount events: %m");
+ if (r == 0)
+ rescan = true;
+ } while (r == 0);
+
+ return rescan;
+}
+
+static int mount_process_proc_self_mountinfo(Manager *m) {
+ _cleanup_set_free_ Set *around = NULL, *gone = NULL;
+ const char *what;
+ int r;
+
+ assert(m);
+
+ r = drain_libmount(m);
+ if (r <= 0)
+ return r;
+
+ r = mount_load_proc_self_mountinfo(m, true);
+ if (r < 0) {
+ /* Reset flags, just in case, for later calls */
+ LIST_FOREACH(units_by_type, u, m->units_by_type[UNIT_MOUNT])
+ MOUNT(u)->proc_flags = 0;
+
+ return 0;
+ }
+
+ manager_dispatch_load_queue(m);
+
+ LIST_FOREACH(units_by_type, u, m->units_by_type[UNIT_MOUNT]) {
+ Mount *mount = MOUNT(u);
+
+ if (!mount_is_mounted(mount)) {
+
+ /* A mount point is not around right now. It
+ * might be gone, or might never have
+ * existed. */
+
+ if (mount->from_proc_self_mountinfo &&
+ mount->parameters_proc_self_mountinfo.what)
+ /* Remember that this device might just have disappeared */
+ if (set_put_strdup_full(&gone, &path_hash_ops_free, mount->parameters_proc_self_mountinfo.what) < 0)
+ log_oom(); /* we don't care too much about OOM here... */
+
+ mount->from_proc_self_mountinfo = false;
+ assert_se(update_parameters_proc_self_mountinfo(mount, NULL, NULL, NULL) >= 0);
+
+ switch (mount->state) {
+
+ case MOUNT_MOUNTED:
+ /* This has just been unmounted by somebody else, follow the state change. */
+ mount_enter_dead(mount, MOUNT_SUCCESS);
+ break;
+
+ case MOUNT_MOUNTING_DONE:
+ /* The mount command may add the corresponding proc mountinfo entry and
+ * then remove it because of an internal error. E.g., fuse.sshfs seems
+ * to do that when the connection fails. See #17617. To handle such the
+ * case, let's once set the state back to mounting. Then, the unit can
+ * correctly enter the failed state later in mount_sigchld(). */
+ mount_set_state(mount, MOUNT_MOUNTING);
+ break;
+
+ default:
+ break;
+ }
+
+ } else if (mount->proc_flags & (MOUNT_PROC_JUST_MOUNTED|MOUNT_PROC_JUST_CHANGED)) {
+
+ /* A mount point was added or changed */
+
+ switch (mount->state) {
+
+ case MOUNT_DEAD:
+ case MOUNT_FAILED:
+
+ /* This has just been mounted by somebody else, follow the state change, but let's
+ * generate a new invocation ID for this implicitly and automatically. */
+ (void) unit_acquire_invocation_id(u);
+ mount_cycle_clear(mount);
+ mount_enter_mounted(mount, MOUNT_SUCCESS);
+ break;
+
+ case MOUNT_MOUNTING:
+ mount_set_state(mount, MOUNT_MOUNTING_DONE);
+ break;
+
+ default:
+ /* Nothing really changed, but let's
+ * issue an notification call
+ * nonetheless, in case somebody is
+ * waiting for this. (e.g. file system
+ * ro/rw remounts.) */
+ mount_set_state(mount, mount->state);
+ break;
+ }
+ }
+
+ if (mount_is_mounted(mount) &&
+ mount->from_proc_self_mountinfo &&
+ mount->parameters_proc_self_mountinfo.what)
+ /* Track devices currently used */
+ if (set_put_strdup_full(&around, &path_hash_ops_free, mount->parameters_proc_self_mountinfo.what) < 0)
+ log_oom();
+
+ /* Reset the flags for later calls */
+ mount->proc_flags = 0;
+ }
+
+ SET_FOREACH(what, gone) {
+ if (set_contains(around, what))
+ continue;
+
+ /* Let the device units know that the device is no longer mounted */
+ device_found_node(m, what, DEVICE_NOT_FOUND, DEVICE_FOUND_MOUNT);
+ }
+
+ return 0;
+}
+
+static int mount_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+ Manager *m = ASSERT_PTR(userdata);
+
+ assert(revents & EPOLLIN);
+
+ return mount_process_proc_self_mountinfo(m);
+}
+
+static void mount_reset_failed(Unit *u) {
+ Mount *m = MOUNT(u);
+
+ assert(m);
+
+ if (m->state == MOUNT_FAILED)
+ mount_set_state(m, MOUNT_DEAD);
+
+ m->result = MOUNT_SUCCESS;
+ m->reload_result = MOUNT_SUCCESS;
+ m->clean_result = MOUNT_SUCCESS;
+}
+
+static int mount_kill(Unit *u, KillWho who, int signo, sd_bus_error *error) {
+ Mount *m = MOUNT(u);
+
+ assert(m);
+
+ return unit_kill_common(u, who, signo, -1, m->control_pid, error);
+}
+
+static int mount_control_pid(Unit *u) {
+ Mount *m = MOUNT(u);
+
+ assert(m);
+
+ return m->control_pid;
+}
+
+static int mount_clean(Unit *u, ExecCleanMask mask) {
+ _cleanup_strv_free_ char **l = NULL;
+ Mount *m = MOUNT(u);
+ int r;
+
+ assert(m);
+ assert(mask != 0);
+
+ if (m->state != MOUNT_DEAD)
+ return -EBUSY;
+
+ r = exec_context_get_clean_directories(&m->exec_context, u->manager->prefix, mask, &l);
+ if (r < 0)
+ return r;
+
+ if (strv_isempty(l))
+ return -EUNATCH;
+
+ mount_unwatch_control_pid(m);
+ m->clean_result = MOUNT_SUCCESS;
+ m->control_command = NULL;
+ m->control_command_id = _MOUNT_EXEC_COMMAND_INVALID;
+
+ r = mount_arm_timer(m, usec_add(now(CLOCK_MONOTONIC), m->exec_context.timeout_clean_usec));
+ if (r < 0)
+ goto fail;
+
+ r = unit_fork_and_watch_rm_rf(u, l, &m->control_pid);
+ if (r < 0)
+ goto fail;
+
+ mount_set_state(m, MOUNT_CLEANING);
+
+ return 0;
+
+fail:
+ log_unit_warning_errno(u, r, "Failed to initiate cleaning: %m");
+ m->clean_result = MOUNT_FAILURE_RESOURCES;
+ m->timer_event_source = sd_event_source_disable_unref(m->timer_event_source);
+ return r;
+}
+
+static int mount_can_clean(Unit *u, ExecCleanMask *ret) {
+ Mount *m = MOUNT(u);
+
+ assert(m);
+
+ return exec_context_get_clean_mask(&m->exec_context, ret);
+}
+
+static int mount_can_start(Unit *u) {
+ Mount *m = MOUNT(u);
+ int r;
+
+ assert(m);
+
+ r = unit_test_start_limit(u);
+ if (r < 0) {
+ mount_enter_dead(m, MOUNT_FAILURE_START_LIMIT_HIT);
+ return r;
+ }
+
+ return 1;
+}
+
+static const char* const mount_exec_command_table[_MOUNT_EXEC_COMMAND_MAX] = {
+ [MOUNT_EXEC_MOUNT] = "ExecMount",
+ [MOUNT_EXEC_UNMOUNT] = "ExecUnmount",
+ [MOUNT_EXEC_REMOUNT] = "ExecRemount",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(mount_exec_command, MountExecCommand);
+
+static const char* const mount_result_table[_MOUNT_RESULT_MAX] = {
+ [MOUNT_SUCCESS] = "success",
+ [MOUNT_FAILURE_RESOURCES] = "resources",
+ [MOUNT_FAILURE_TIMEOUT] = "timeout",
+ [MOUNT_FAILURE_EXIT_CODE] = "exit-code",
+ [MOUNT_FAILURE_SIGNAL] = "signal",
+ [MOUNT_FAILURE_CORE_DUMP] = "core-dump",
+ [MOUNT_FAILURE_START_LIMIT_HIT] = "start-limit-hit",
+ [MOUNT_FAILURE_PROTOCOL] = "protocol",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(mount_result, MountResult);
+
+const UnitVTable mount_vtable = {
+ .object_size = sizeof(Mount),
+ .exec_context_offset = offsetof(Mount, exec_context),
+ .cgroup_context_offset = offsetof(Mount, cgroup_context),
+ .kill_context_offset = offsetof(Mount, kill_context),
+ .exec_runtime_offset = offsetof(Mount, exec_runtime),
+ .dynamic_creds_offset = offsetof(Mount, dynamic_creds),
+
+ .sections =
+ "Unit\0"
+ "Mount\0"
+ "Install\0",
+ .private_section = "Mount",
+
+ .can_transient = true,
+ .can_fail = true,
+ .exclude_from_switch_root_serialization = true,
+
+ .init = mount_init,
+ .load = mount_load,
+ .done = mount_done,
+
+ .coldplug = mount_coldplug,
+ .catchup = mount_catchup,
+
+ .dump = mount_dump,
+
+ .start = mount_start,
+ .stop = mount_stop,
+ .reload = mount_reload,
+
+ .kill = mount_kill,
+ .clean = mount_clean,
+ .can_clean = mount_can_clean,
+
+ .serialize = mount_serialize,
+ .deserialize_item = mount_deserialize_item,
+
+ .active_state = mount_active_state,
+ .sub_state_to_string = mount_sub_state_to_string,
+
+ .will_restart = unit_will_restart_default,
+
+ .may_gc = mount_may_gc,
+ .is_extrinsic = mount_is_extrinsic,
+
+ .sigchld_event = mount_sigchld_event,
+
+ .reset_failed = mount_reset_failed,
+
+ .control_pid = mount_control_pid,
+
+ .bus_set_property = bus_mount_set_property,
+ .bus_commit_properties = bus_mount_commit_properties,
+
+ .get_timeout = mount_get_timeout,
+
+ .enumerate_perpetual = mount_enumerate_perpetual,
+ .enumerate = mount_enumerate,
+ .shutdown = mount_shutdown,
+
+ .status_message_formats = {
+ .starting_stopping = {
+ [0] = "Mounting %s...",
+ [1] = "Unmounting %s...",
+ },
+ .finished_start_job = {
+ [JOB_DONE] = "Mounted %s.",
+ [JOB_FAILED] = "Failed to mount %s.",
+ [JOB_TIMEOUT] = "Timed out mounting %s.",
+ },
+ .finished_stop_job = {
+ [JOB_DONE] = "Unmounted %s.",
+ [JOB_FAILED] = "Failed unmounting %s.",
+ [JOB_TIMEOUT] = "Timed out unmounting %s.",
+ },
+ },
+
+ .can_start = mount_can_start,
+};
diff --git a/src/core/mount.h b/src/core/mount.h
new file mode 100644
index 0000000..1a0d9fc
--- /dev/null
+++ b/src/core/mount.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct Mount Mount;
+
+#include "kill.h"
+#include "dynamic-user.h"
+#include "unit.h"
+
+typedef enum MountExecCommand {
+ MOUNT_EXEC_MOUNT,
+ MOUNT_EXEC_UNMOUNT,
+ MOUNT_EXEC_REMOUNT,
+ _MOUNT_EXEC_COMMAND_MAX,
+ _MOUNT_EXEC_COMMAND_INVALID = -EINVAL,
+} MountExecCommand;
+
+typedef enum MountResult {
+ MOUNT_SUCCESS,
+ MOUNT_FAILURE_RESOURCES, /* a bit of a misnomer, just our catch-all error for errnos we didn't expect */
+ MOUNT_FAILURE_TIMEOUT,
+ MOUNT_FAILURE_EXIT_CODE,
+ MOUNT_FAILURE_SIGNAL,
+ MOUNT_FAILURE_CORE_DUMP,
+ MOUNT_FAILURE_START_LIMIT_HIT,
+ MOUNT_FAILURE_PROTOCOL,
+ _MOUNT_RESULT_MAX,
+ _MOUNT_RESULT_INVALID = -EINVAL,
+} MountResult;
+
+typedef struct MountParameters {
+ char *what;
+ char *options;
+ char *fstype;
+} MountParameters;
+
+/* Used while looking for mount points that vanished or got added from/to /proc/self/mountinfo */
+typedef enum MountProcFlags {
+ MOUNT_PROC_IS_MOUNTED = 1 << 0,
+ MOUNT_PROC_JUST_MOUNTED = 1 << 1,
+ MOUNT_PROC_JUST_CHANGED = 1 << 2,
+} MountProcFlags;
+
+struct Mount {
+ Unit meta;
+
+ char *where;
+
+ MountParameters parameters_proc_self_mountinfo;
+ MountParameters parameters_fragment;
+
+ bool from_proc_self_mountinfo:1;
+ bool from_fragment:1;
+
+ MountProcFlags proc_flags;
+
+ bool sloppy_options;
+
+ bool lazy_unmount;
+ bool force_unmount;
+
+ bool read_write_only;
+
+ MountResult result;
+ MountResult reload_result;
+ MountResult clean_result;
+
+ mode_t directory_mode;
+
+ usec_t timeout_usec;
+
+ ExecCommand exec_command[_MOUNT_EXEC_COMMAND_MAX];
+
+ ExecContext exec_context;
+ KillContext kill_context;
+ CGroupContext cgroup_context;
+
+ ExecRuntime *exec_runtime;
+ DynamicCreds dynamic_creds;
+
+ MountState state, deserialized_state;
+
+ ExecCommand* control_command;
+ MountExecCommand control_command_id;
+ pid_t control_pid;
+
+ sd_event_source *timer_event_source;
+
+ unsigned n_retry_umount;
+};
+
+extern const UnitVTable mount_vtable;
+
+void mount_fd_event(Manager *m, int events);
+
+const char* mount_exec_command_to_string(MountExecCommand i) _const_;
+MountExecCommand mount_exec_command_from_string(const char *s) _pure_;
+
+const char* mount_result_to_string(MountResult i) _const_;
+MountResult mount_result_from_string(const char *s) _pure_;
+
+DEFINE_CAST(MOUNT, Mount);
diff --git a/src/core/namespace.c b/src/core/namespace.c
new file mode 100644
index 0000000..6870f70
--- /dev/null
+++ b/src/core/namespace.c
@@ -0,0 +1,3018 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <linux/loop.h>
+#include <sched.h>
+#include <stdio.h>
+#include <sys/file.h>
+#include <sys/mount.h>
+#include <unistd.h>
+#if WANT_LINUX_FS_H
+#include <linux/fs.h>
+#endif
+
+#include "alloc-util.h"
+#include "base-filesystem.h"
+#include "chase-symlinks.h"
+#include "dev-setup.h"
+#include "devnum-util.h"
+#include "env-util.h"
+#include "escape.h"
+#include "extension-release.h"
+#include "fd-util.h"
+#include "format-util.h"
+#include "glyph-util.h"
+#include "label.h"
+#include "list.h"
+#include "loop-util.h"
+#include "loopback-setup.h"
+#include "missing_syscall.h"
+#include "mkdir-label.h"
+#include "mount-util.h"
+#include "mountpoint-util.h"
+#include "namespace-util.h"
+#include "namespace.h"
+#include "nsflags.h"
+#include "nulstr-util.h"
+#include "os-util.h"
+#include "path-util.h"
+#include "selinux-util.h"
+#include "socket-util.h"
+#include "sort-util.h"
+#include "stat-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "tmpfile-util.h"
+#include "umask-util.h"
+#include "user-util.h"
+
+#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC)
+
+typedef enum MountMode {
+ /* This is ordered by priority! */
+ INACCESSIBLE,
+ OVERLAY_MOUNT,
+ MOUNT_IMAGES,
+ BIND_MOUNT,
+ BIND_MOUNT_RECURSIVE,
+ PRIVATE_TMP,
+ PRIVATE_TMP_READONLY,
+ PRIVATE_DEV,
+ BIND_DEV,
+ EMPTY_DIR,
+ SYSFS,
+ PROCFS,
+ READONLY,
+ READWRITE,
+ NOEXEC,
+ EXEC,
+ TMPFS,
+ RUN,
+ EXTENSION_DIRECTORIES, /* Bind-mounted outside the root directory, and used by subsequent mounts */
+ EXTENSION_IMAGES, /* Mounted outside the root directory, and used by subsequent mounts */
+ MQUEUEFS,
+ READWRITE_IMPLICIT, /* Should have the lowest priority. */
+ _MOUNT_MODE_MAX,
+} MountMode;
+
+typedef struct MountEntry {
+ const char *path_const; /* Memory allocated on stack or static */
+ MountMode mode:5;
+ bool ignore:1; /* Ignore if path does not exist? */
+ bool has_prefix:1; /* Already is prefixed by the root dir? */
+ bool read_only:1; /* Shall this mount point be read-only? */
+ bool nosuid:1; /* Shall set MS_NOSUID on the mount itself */
+ bool noexec:1; /* Shall set MS_NOEXEC on the mount itself */
+ bool exec:1; /* Shall clear MS_NOEXEC on the mount itself */
+ bool applied:1; /* Already applied */
+ char *path_malloc; /* Use this instead of 'path_const' if we had to allocate memory */
+ const char *unprefixed_path_const; /* If the path was amended with a prefix, these will save the original */
+ char *unprefixed_path_malloc;
+ const char *source_const; /* The source path, for bind mounts or images */
+ char *source_malloc;
+ const char *options_const;/* Mount options for tmpfs */
+ char *options_malloc;
+ unsigned long flags; /* Mount flags used by EMPTY_DIR and TMPFS. Do not include MS_RDONLY here, but please use read_only. */
+ unsigned n_followed;
+ LIST_HEAD(MountOptions, image_options);
+} MountEntry;
+
+/* If MountAPIVFS= is used, let's mount /sys, /proc, /dev and /run into the it, but only as a fallback if the user hasn't mounted
+ * something there already. These mounts are hence overridden by any other explicitly configured mounts. */
+static const MountEntry apivfs_table[] = {
+ { "/proc", PROCFS, false },
+ { "/dev", BIND_DEV, false },
+ { "/sys", SYSFS, false },
+ { "/run", RUN, false, .options_const = "mode=755" TMPFS_LIMITS_RUN, .flags = MS_NOSUID|MS_NODEV|MS_STRICTATIME },
+};
+
+/* ProtectKernelTunables= option and the related filesystem APIs */
+static const MountEntry protect_kernel_tunables_proc_table[] = {
+ { "/proc/acpi", READONLY, true },
+ { "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
+ { "/proc/asound", READONLY, true },
+ { "/proc/bus", READONLY, true },
+ { "/proc/fs", READONLY, true },
+ { "/proc/irq", READONLY, true },
+ { "/proc/kallsyms", INACCESSIBLE, true },
+ { "/proc/kcore", INACCESSIBLE, true },
+ { "/proc/latency_stats", READONLY, true },
+ { "/proc/mtrr", READONLY, true },
+ { "/proc/scsi", READONLY, true },
+ { "/proc/sys", READONLY, true },
+ { "/proc/sysrq-trigger", READONLY, true },
+ { "/proc/timer_stats", READONLY, true },
+};
+
+static const MountEntry protect_kernel_tunables_sys_table[] = {
+ { "/sys", READONLY, false },
+ { "/sys/fs/bpf", READONLY, true },
+ { "/sys/fs/cgroup", READWRITE_IMPLICIT, false }, /* READONLY is set by ProtectControlGroups= option */
+ { "/sys/fs/selinux", READWRITE_IMPLICIT, true },
+ { "/sys/kernel/debug", READONLY, true },
+ { "/sys/kernel/tracing", READONLY, true },
+};
+
+/* ProtectKernelModules= option */
+static const MountEntry protect_kernel_modules_table[] = {
+#if HAVE_SPLIT_USR
+ { "/lib/modules", INACCESSIBLE, true },
+#endif
+ { "/usr/lib/modules", INACCESSIBLE, true },
+};
+
+/* ProtectKernelLogs= option */
+static const MountEntry protect_kernel_logs_proc_table[] = {
+ { "/proc/kmsg", INACCESSIBLE, true },
+};
+
+static const MountEntry protect_kernel_logs_dev_table[] = {
+ { "/dev/kmsg", INACCESSIBLE, true },
+};
+
+/*
+ * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
+ * system should be protected by ProtectSystem=
+ */
+static const MountEntry protect_home_read_only_table[] = {
+ { "/home", READONLY, true },
+ { "/run/user", READONLY, true },
+ { "/root", READONLY, true },
+};
+
+/* ProtectHome=tmpfs table */
+static const MountEntry protect_home_tmpfs_table[] = {
+ { "/home", TMPFS, true, .read_only = true, .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME },
+ { "/run/user", TMPFS, true, .read_only = true, .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME },
+ { "/root", TMPFS, true, .read_only = true, .options_const = "mode=0700" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME },
+};
+
+/* ProtectHome=yes table */
+static const MountEntry protect_home_yes_table[] = {
+ { "/home", INACCESSIBLE, true },
+ { "/run/user", INACCESSIBLE, true },
+ { "/root", INACCESSIBLE, true },
+};
+
+/* ProtectSystem=yes table */
+static const MountEntry protect_system_yes_table[] = {
+ { "/usr", READONLY, false },
+ { "/boot", READONLY, true },
+ { "/efi", READONLY, true },
+#if HAVE_SPLIT_USR
+ { "/lib", READONLY, true },
+ { "/lib64", READONLY, true },
+ { "/bin", READONLY, true },
+# if HAVE_SPLIT_BIN
+ { "/sbin", READONLY, true },
+# endif
+#endif
+};
+
+/* ProtectSystem=full includes ProtectSystem=yes */
+static const MountEntry protect_system_full_table[] = {
+ { "/usr", READONLY, false },
+ { "/boot", READONLY, true },
+ { "/efi", READONLY, true },
+ { "/etc", READONLY, false },
+#if HAVE_SPLIT_USR
+ { "/lib", READONLY, true },
+ { "/lib64", READONLY, true },
+ { "/bin", READONLY, true },
+# if HAVE_SPLIT_BIN
+ { "/sbin", READONLY, true },
+# endif
+#endif
+};
+
+/*
+ * ProtectSystem=strict table. In this strict mode, we mount everything
+ * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
+ * which are left writable, but PrivateDevices= + ProtectKernelTunables=
+ * protect those, and these options should be fully orthogonal.
+ * (And of course /home and friends are also left writable, as ProtectHome=
+ * shall manage those, orthogonally).
+ */
+static const MountEntry protect_system_strict_table[] = {
+ { "/", READONLY, false },
+ { "/proc", READWRITE_IMPLICIT, false }, /* ProtectKernelTunables= */
+ { "/sys", READWRITE_IMPLICIT, false }, /* ProtectKernelTunables= */
+ { "/dev", READWRITE_IMPLICIT, false }, /* PrivateDevices= */
+ { "/home", READWRITE_IMPLICIT, true }, /* ProtectHome= */
+ { "/run/user", READWRITE_IMPLICIT, true }, /* ProtectHome= */
+ { "/root", READWRITE_IMPLICIT, true }, /* ProtectHome= */
+};
+
+static const char * const mount_mode_table[_MOUNT_MODE_MAX] = {
+ [INACCESSIBLE] = "inaccessible",
+ [OVERLAY_MOUNT] = "overlay",
+ [BIND_MOUNT] = "bind",
+ [BIND_MOUNT_RECURSIVE] = "rbind",
+ [PRIVATE_TMP] = "private-tmp",
+ [PRIVATE_DEV] = "private-dev",
+ [BIND_DEV] = "bind-dev",
+ [EMPTY_DIR] = "empty",
+ [SYSFS] = "sysfs",
+ [PROCFS] = "procfs",
+ [READONLY] = "read-only",
+ [READWRITE] = "read-write",
+ [TMPFS] = "tmpfs",
+ [MOUNT_IMAGES] = "mount-images",
+ [READWRITE_IMPLICIT] = "rw-implicit",
+ [EXEC] = "exec",
+ [NOEXEC] = "noexec",
+ [MQUEUEFS] = "mqueuefs",
+};
+
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(mount_mode, MountMode);
+
+static const char *mount_entry_path(const MountEntry *p) {
+ assert(p);
+
+ /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
+ * otherwise the stack/static ->path field is returned. */
+
+ return p->path_malloc ?: p->path_const;
+}
+
+static const char *mount_entry_unprefixed_path(const MountEntry *p) {
+ assert(p);
+
+ /* Returns the unprefixed path (ie: before prefix_where_needed() ran), if any */
+
+ return p->unprefixed_path_malloc ?: p->unprefixed_path_const ?: mount_entry_path(p);
+}
+
+static void mount_entry_consume_prefix(MountEntry *p, char *new_path) {
+ assert(p);
+ assert(p->path_malloc || p->path_const);
+ assert(new_path);
+
+ /* Saves current path in unprefixed_ variable, and takes over new_path */
+
+ free_and_replace(p->unprefixed_path_malloc, p->path_malloc);
+ /* If we didn't have a path on the heap, then it's a static one */
+ if (!p->unprefixed_path_malloc)
+ p->unprefixed_path_const = p->path_const;
+ p->path_malloc = new_path;
+ p->has_prefix = true;
+}
+
+static bool mount_entry_read_only(const MountEntry *p) {
+ assert(p);
+
+ return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE, PRIVATE_TMP_READONLY);
+}
+
+static bool mount_entry_noexec(const MountEntry *p) {
+ assert(p);
+
+ return p->noexec || IN_SET(p->mode, NOEXEC, INACCESSIBLE, SYSFS, PROCFS);
+}
+
+static bool mount_entry_exec(const MountEntry *p) {
+ assert(p);
+
+ return p->exec || p->mode == EXEC;
+}
+
+static const char *mount_entry_source(const MountEntry *p) {
+ assert(p);
+
+ return p->source_malloc ?: p->source_const;
+}
+
+static const char *mount_entry_options(const MountEntry *p) {
+ assert(p);
+
+ return p->options_malloc ?: p->options_const;
+}
+
+static void mount_entry_done(MountEntry *p) {
+ assert(p);
+
+ p->path_malloc = mfree(p->path_malloc);
+ p->unprefixed_path_malloc = mfree(p->unprefixed_path_malloc);
+ p->source_malloc = mfree(p->source_malloc);
+ p->options_malloc = mfree(p->options_malloc);
+ p->image_options = mount_options_free_all(p->image_options);
+}
+
+static int append_access_mounts(MountEntry **p, char **strv, MountMode mode, bool forcibly_require_prefix) {
+ assert(p);
+
+ /* Adds a list of user-supplied READWRITE/READWRITE_IMPLICIT/READONLY/INACCESSIBLE entries */
+
+ STRV_FOREACH(i, strv) {
+ bool ignore = false, needs_prefix = false;
+ const char *e = *i;
+
+ /* Look for any prefixes */
+ if (startswith(e, "-")) {
+ e++;
+ ignore = true;
+ }
+ if (startswith(e, "+")) {
+ e++;
+ needs_prefix = true;
+ }
+
+ if (!path_is_absolute(e))
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Path is not absolute: %s", e);
+
+ *((*p)++) = (MountEntry) {
+ .path_const = e,
+ .mode = mode,
+ .ignore = ignore,
+ .has_prefix = !needs_prefix && !forcibly_require_prefix,
+ };
+ }
+
+ return 0;
+}
+
+static int append_empty_dir_mounts(MountEntry **p, char **strv) {
+ assert(p);
+
+ /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
+ * "/private/" boundary directories for DynamicUser=1. */
+
+ STRV_FOREACH(i, strv) {
+
+ *((*p)++) = (MountEntry) {
+ .path_const = *i,
+ .mode = EMPTY_DIR,
+ .ignore = false,
+ .read_only = true,
+ .options_const = "mode=755" TMPFS_LIMITS_EMPTY_OR_ALMOST,
+ .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
+ };
+ }
+
+ return 0;
+}
+
+static int append_bind_mounts(MountEntry **p, const BindMount *binds, size_t n) {
+ assert(p);
+
+ for (size_t i = 0; i < n; i++) {
+ const BindMount *b = binds + i;
+
+ *((*p)++) = (MountEntry) {
+ .path_const = b->destination,
+ .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT,
+ .read_only = b->read_only,
+ .nosuid = b->nosuid,
+ .source_const = b->source,
+ .ignore = b->ignore_enoent,
+ };
+ }
+
+ return 0;
+}
+
+static int append_mount_images(MountEntry **p, const MountImage *mount_images, size_t n) {
+ assert(p);
+
+ for (size_t i = 0; i < n; i++) {
+ const MountImage *m = mount_images + i;
+
+ *((*p)++) = (MountEntry) {
+ .path_const = m->destination,
+ .mode = MOUNT_IMAGES,
+ .source_const = m->source,
+ .image_options = m->mount_options,
+ .ignore = m->ignore_enoent,
+ };
+ }
+
+ return 0;
+}
+
+static int append_extensions(
+ MountEntry **p,
+ const char *root,
+ const char *extension_dir,
+ char **hierarchies,
+ const MountImage *mount_images,
+ size_t n,
+ char **extension_directories) {
+
+ _cleanup_strv_free_ char **overlays = NULL;
+ int r;
+
+ if (n == 0 && strv_isempty(extension_directories))
+ return 0;
+
+ assert(p);
+ assert(extension_dir);
+
+ /* Prepare a list of overlays, that will have as each element a string suitable for being
+ * passed as a lowerdir= parameter, so start with the hierarchy on the root.
+ * The overlays vector will have the same number of elements and will correspond to the
+ * hierarchies vector, so they can be iterated upon together. */
+ STRV_FOREACH(hierarchy, hierarchies) {
+ _cleanup_free_ char *prefixed_hierarchy = NULL;
+
+ prefixed_hierarchy = path_join(root, *hierarchy);
+ if (!prefixed_hierarchy)
+ return -ENOMEM;
+
+ r = strv_consume(&overlays, TAKE_PTR(prefixed_hierarchy));
+ if (r < 0)
+ return r;
+ }
+
+ /* First, prepare a mount for each image, but these won't be visible to the unit, instead
+ * they will be mounted in our propagate directory, and used as a source for the overlay. */
+ for (size_t i = 0; i < n; i++) {
+ _cleanup_free_ char *mount_point = NULL;
+ const MountImage *m = mount_images + i;
+
+ r = asprintf(&mount_point, "%s/%zu", extension_dir, i);
+ if (r < 0)
+ return -ENOMEM;
+
+ for (size_t j = 0; hierarchies && hierarchies[j]; ++j) {
+ _cleanup_free_ char *prefixed_hierarchy = NULL, *escaped = NULL, *lowerdir = NULL;
+
+ prefixed_hierarchy = path_join(mount_point, hierarchies[j]);
+ if (!prefixed_hierarchy)
+ return -ENOMEM;
+
+ escaped = shell_escape(prefixed_hierarchy, ",:");
+ if (!escaped)
+ return -ENOMEM;
+
+ /* Note that lowerdir= parameters are in 'reverse' order, so the
+ * top-most directory in the overlay comes first in the list. */
+ lowerdir = strjoin(escaped, ":", overlays[j]);
+ if (!lowerdir)
+ return -ENOMEM;
+
+ free_and_replace(overlays[j], lowerdir);
+ }
+
+ *((*p)++) = (MountEntry) {
+ .path_malloc = TAKE_PTR(mount_point),
+ .image_options = m->mount_options,
+ .ignore = m->ignore_enoent,
+ .source_const = m->source,
+ .mode = EXTENSION_IMAGES,
+ .has_prefix = true,
+ };
+ }
+
+ /* Secondly, extend the lowerdir= parameters with each ExtensionDirectory.
+ * Bind mount them in the same location as the ExtensionImages, so that we
+ * can check that they are valid trees (extension-release.d). */
+ STRV_FOREACH(extension_directory, extension_directories) {
+ _cleanup_free_ char *mount_point = NULL, *source = NULL;
+ const char *e = *extension_directory;
+ bool ignore_enoent = false;
+
+ /* Pick up the counter where the ExtensionImages left it. */
+ r = asprintf(&mount_point, "%s/%zu", extension_dir, n++);
+ if (r < 0)
+ return -ENOMEM;
+
+ /* Look for any prefixes */
+ if (startswith(e, "-")) {
+ e++;
+ ignore_enoent = true;
+ }
+ /* Ignore this for now */
+ if (startswith(e, "+"))
+ e++;
+
+ source = strdup(e);
+ if (!source)
+ return -ENOMEM;
+
+ for (size_t j = 0; hierarchies && hierarchies[j]; ++j) {
+ _cleanup_free_ char *prefixed_hierarchy = NULL, *escaped = NULL, *lowerdir = NULL;
+
+ prefixed_hierarchy = path_join(mount_point, hierarchies[j]);
+ if (!prefixed_hierarchy)
+ return -ENOMEM;
+
+ escaped = shell_escape(prefixed_hierarchy, ",:");
+ if (!escaped)
+ return -ENOMEM;
+
+ /* Note that lowerdir= parameters are in 'reverse' order, so the
+ * top-most directory in the overlay comes first in the list. */
+ lowerdir = strjoin(escaped, ":", overlays[j]);
+ if (!lowerdir)
+ return -ENOMEM;
+
+ free_and_replace(overlays[j], lowerdir);
+ }
+
+ *((*p)++) = (MountEntry) {
+ .path_malloc = TAKE_PTR(mount_point),
+ .source_malloc = TAKE_PTR(source),
+ .mode = EXTENSION_DIRECTORIES,
+ .ignore = ignore_enoent,
+ .has_prefix = true,
+ .read_only = true,
+ };
+ }
+
+ /* Then, for each hierarchy, prepare an overlay with the list of lowerdir= strings
+ * set up earlier. */
+ for (size_t i = 0; hierarchies && hierarchies[i]; ++i) {
+ _cleanup_free_ char *prefixed_hierarchy = NULL;
+
+ prefixed_hierarchy = path_join(root, hierarchies[i]);
+ if (!prefixed_hierarchy)
+ return -ENOMEM;
+
+ *((*p)++) = (MountEntry) {
+ .path_malloc = TAKE_PTR(prefixed_hierarchy),
+ .options_malloc = TAKE_PTR(overlays[i]),
+ .mode = OVERLAY_MOUNT,
+ .has_prefix = true,
+ .ignore = true, /* If the source image doesn't set the ignore bit it will fail earlier. */
+ };
+ }
+
+ return 0;
+}
+
+static int append_tmpfs_mounts(MountEntry **p, const TemporaryFileSystem *tmpfs, size_t n) {
+ assert(p);
+
+ for (size_t i = 0; i < n; i++) {
+ const TemporaryFileSystem *t = tmpfs + i;
+ _cleanup_free_ char *o = NULL, *str = NULL;
+ unsigned long flags;
+ bool ro = false;
+ int r;
+
+ if (!path_is_absolute(t->path))
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Path is not absolute: %s",
+ t->path);
+
+ str = strjoin("mode=0755" NESTED_TMPFS_LIMITS ",", t->options);
+ if (!str)
+ return -ENOMEM;
+
+ r = mount_option_mangle(str, MS_NODEV|MS_STRICTATIME, &flags, &o);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to parse mount option '%s': %m", str);
+
+ ro = flags & MS_RDONLY;
+ if (ro)
+ flags ^= MS_RDONLY;
+
+ *((*p)++) = (MountEntry) {
+ .path_const = t->path,
+ .mode = TMPFS,
+ .read_only = ro,
+ .options_malloc = TAKE_PTR(o),
+ .flags = flags,
+ };
+ }
+
+ return 0;
+}
+
+static int append_static_mounts(MountEntry **p, const MountEntry *mounts, size_t n, bool ignore_protect) {
+ assert(p);
+ assert(mounts);
+
+ /* Adds a list of static pre-defined entries */
+
+ for (size_t i = 0; i < n; i++)
+ *((*p)++) = (MountEntry) {
+ .path_const = mount_entry_path(mounts+i),
+ .mode = mounts[i].mode,
+ .ignore = mounts[i].ignore || ignore_protect,
+ };
+
+ return 0;
+}
+
+static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
+ assert(p);
+
+ switch (protect_home) {
+
+ case PROTECT_HOME_NO:
+ return 0;
+
+ case PROTECT_HOME_READ_ONLY:
+ return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
+
+ case PROTECT_HOME_TMPFS:
+ return append_static_mounts(p, protect_home_tmpfs_table, ELEMENTSOF(protect_home_tmpfs_table), ignore_protect);
+
+ case PROTECT_HOME_YES:
+ return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
+
+ default:
+ assert_not_reached();
+ }
+}
+
+static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
+ assert(p);
+
+ switch (protect_system) {
+
+ case PROTECT_SYSTEM_NO:
+ return 0;
+
+ case PROTECT_SYSTEM_STRICT:
+ return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
+
+ case PROTECT_SYSTEM_YES:
+ return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
+
+ case PROTECT_SYSTEM_FULL:
+ return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
+
+ default:
+ assert_not_reached();
+ }
+}
+
+static int mount_path_compare(const MountEntry *a, const MountEntry *b) {
+ int d;
+
+ /* ExtensionImages/Directories will be used by other mounts as a base, so sort them first
+ * regardless of the prefix - they are set up in the propagate directory anyway */
+ d = -CMP(a->mode == EXTENSION_IMAGES, b->mode == EXTENSION_IMAGES);
+ if (d != 0)
+ return d;
+ d = -CMP(a->mode == EXTENSION_DIRECTORIES, b->mode == EXTENSION_DIRECTORIES);
+ if (d != 0)
+ return d;
+
+ /* If the paths are not equal, then order prefixes first */
+ d = path_compare(mount_entry_path(a), mount_entry_path(b));
+ if (d != 0)
+ return d;
+
+ /* If the paths are equal, check the mode */
+ return CMP((int) a->mode, (int) b->mode);
+}
+
+static int prefix_where_needed(MountEntry *m, size_t n, const char *root_directory) {
+ /* Prefixes all paths in the bind mount table with the root directory if the entry needs that. */
+
+ assert(m || n == 0);
+
+ for (size_t i = 0; i < n; i++) {
+ char *s;
+
+ if (m[i].has_prefix)
+ continue;
+
+ s = path_join(root_directory, mount_entry_path(m+i));
+ if (!s)
+ return -ENOMEM;
+
+ mount_entry_consume_prefix(&m[i], s);
+ }
+
+ return 0;
+}
+
+static void drop_duplicates(MountEntry *m, size_t *n) {
+ MountEntry *f, *t, *previous;
+
+ assert(m);
+ assert(n);
+
+ /* Drops duplicate entries. Expects that the array is properly ordered already. */
+
+ for (f = m, t = m, previous = NULL; f < m + *n; f++) {
+
+ /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
+ * above. Note that we only drop duplicates that haven't been mounted yet. */
+ if (previous &&
+ path_equal(mount_entry_path(f), mount_entry_path(previous)) &&
+ !f->applied && !previous->applied) {
+ log_debug("%s (%s) is duplicate.", mount_entry_path(f), mount_mode_to_string(f->mode));
+ /* Propagate the flags to the remaining entry */
+ previous->read_only = previous->read_only || mount_entry_read_only(f);
+ previous->noexec = previous->noexec || mount_entry_noexec(f);
+ previous->exec = previous->exec || mount_entry_exec(f);
+ mount_entry_done(f);
+ continue;
+ }
+
+ *t = *f;
+ previous = t;
+ t++;
+ }
+
+ *n = t - m;
+}
+
+static void drop_inaccessible(MountEntry *m, size_t *n) {
+ MountEntry *f, *t;
+ const char *clear = NULL;
+
+ assert(m);
+ assert(n);
+
+ /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
+ * ordered already. */
+
+ for (f = m, t = m; f < m + *n; f++) {
+
+ /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
+ * it, as inaccessible paths really should drop the entire subtree. */
+ if (clear && path_startswith(mount_entry_path(f), clear)) {
+ log_debug("%s is masked by %s.", mount_entry_path(f), clear);
+ mount_entry_done(f);
+ continue;
+ }
+
+ clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
+
+ *t = *f;
+ t++;
+ }
+
+ *n = t - m;
+}
+
+static void drop_nop(MountEntry *m, size_t *n) {
+ MountEntry *f, *t;
+
+ assert(m);
+ assert(n);
+
+ /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
+ * list is ordered by prefixes. */
+
+ for (f = m, t = m; f < m + *n; f++) {
+
+ /* Only suppress such subtrees for READONLY, READWRITE and READWRITE_IMPLICIT entries */
+ if (IN_SET(f->mode, READONLY, READWRITE, READWRITE_IMPLICIT)) {
+ MountEntry *found = NULL;
+
+ /* Now let's find the first parent of the entry we are looking at. */
+ for (MountEntry *p = PTR_SUB1(t, m); p; p = PTR_SUB1(p, m))
+ if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
+ found = p;
+ break;
+ }
+
+ /* We found it, let's see if it's the same mode, if so, we can drop this entry */
+ if (found && found->mode == f->mode) {
+ log_debug("%s (%s) is made redundant by %s (%s)",
+ mount_entry_path(f), mount_mode_to_string(f->mode),
+ mount_entry_path(found), mount_mode_to_string(found->mode));
+ mount_entry_done(f);
+ continue;
+ }
+ }
+
+ *t = *f;
+ t++;
+ }
+
+ *n = t - m;
+}
+
+static void drop_outside_root(const char *root_directory, MountEntry *m, size_t *n) {
+ MountEntry *f, *t;
+
+ assert(m);
+ assert(n);
+
+ /* Nothing to do */
+ if (!root_directory)
+ return;
+
+ /* Drops all mounts that are outside of the root directory. */
+
+ for (f = m, t = m; f < m + *n; f++) {
+
+ /* ExtensionImages/Directories bases are opened in /run/systemd/unit-extensions on the host */
+ if (!IN_SET(f->mode, EXTENSION_IMAGES, EXTENSION_DIRECTORIES) && !path_startswith(mount_entry_path(f), root_directory)) {
+ log_debug("%s is outside of root directory.", mount_entry_path(f));
+ mount_entry_done(f);
+ continue;
+ }
+
+ *t = *f;
+ t++;
+ }
+
+ *n = t - m;
+}
+
+static int clone_device_node(
+ const char *d,
+ const char *temporary_mount,
+ bool *make_devnode) {
+
+ _cleanup_free_ char *sl = NULL;
+ const char *dn, *bn, *t;
+ struct stat st;
+ int r;
+
+ if (stat(d, &st) < 0) {
+ if (errno == ENOENT) {
+ log_debug_errno(errno, "Device node '%s' to clone does not exist, ignoring.", d);
+ return -ENXIO;
+ }
+
+ return log_debug_errno(errno, "Failed to stat() device node '%s' to clone, ignoring: %m", d);
+ }
+
+ if (!S_ISBLK(st.st_mode) &&
+ !S_ISCHR(st.st_mode))
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Device node '%s' to clone is not a device node, ignoring.",
+ d);
+
+ dn = strjoina(temporary_mount, d);
+
+ /* First, try to create device node properly */
+ if (*make_devnode) {
+ mac_selinux_create_file_prepare(d, st.st_mode);
+ r = mknod(dn, st.st_mode, st.st_rdev);
+ mac_selinux_create_file_clear();
+ if (r >= 0)
+ goto add_symlink;
+ if (errno != EPERM)
+ return log_debug_errno(errno, "mknod failed for %s: %m", d);
+
+ /* This didn't work, let's not try this again for the next iterations. */
+ *make_devnode = false;
+ }
+
+ /* We're about to fall back to bind-mounting the device node. So create a dummy bind-mount target.
+ * Do not prepare device-node SELinux label (see issue 13762) */
+ r = mknod(dn, S_IFREG, 0);
+ if (r < 0 && errno != EEXIST)
+ return log_debug_errno(errno, "mknod() fallback failed for '%s': %m", d);
+
+ /* Fallback to bind-mounting: The assumption here is that all used device nodes carry standard
+ * properties. Specifically, the devices nodes we bind-mount should either be owned by root:root or
+ * root:tty (e.g. /dev/tty, /dev/ptmx) and should not carry ACLs. */
+ r = mount_nofollow_verbose(LOG_DEBUG, d, dn, NULL, MS_BIND, NULL);
+ if (r < 0)
+ return r;
+
+add_symlink:
+ bn = path_startswith(d, "/dev/");
+ if (!bn)
+ return 0;
+
+ /* Create symlinks like /dev/char/1:9 → ../urandom */
+ if (asprintf(&sl, "%s/dev/%s/" DEVNUM_FORMAT_STR,
+ temporary_mount,
+ S_ISCHR(st.st_mode) ? "char" : "block",
+ DEVNUM_FORMAT_VAL(st.st_rdev)) < 0)
+ return log_oom();
+
+ (void) mkdir_parents(sl, 0755);
+
+ t = strjoina("../", bn);
+ if (symlink(t, sl) < 0)
+ log_debug_errno(errno, "Failed to symlink '%s' to '%s', ignoring: %m", t, sl);
+
+ return 0;
+}
+
+static int mount_private_dev(MountEntry *m) {
+ static const char devnodes[] =
+ "/dev/null\0"
+ "/dev/zero\0"
+ "/dev/full\0"
+ "/dev/random\0"
+ "/dev/urandom\0"
+ "/dev/tty\0";
+
+ char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
+ const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
+ bool can_mknod = true;
+ int r;
+
+ assert(m);
+
+ if (!mkdtemp(temporary_mount))
+ return log_debug_errno(errno, "Failed to create temporary directory '%s': %m", temporary_mount);
+
+ dev = strjoina(temporary_mount, "/dev");
+ (void) mkdir(dev, 0755);
+ r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755" TMPFS_LIMITS_PRIVATE_DEV);
+ if (r < 0)
+ goto fail;
+
+ r = label_fix_full(AT_FDCWD, dev, "/dev", 0);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to fix label of '%s' as /dev: %m", dev);
+ goto fail;
+ }
+
+ devpts = strjoina(temporary_mount, "/dev/pts");
+ (void) mkdir(devpts, 0755);
+ r = mount_nofollow_verbose(LOG_DEBUG, "/dev/pts", devpts, NULL, MS_BIND, NULL);
+ if (r < 0)
+ goto fail;
+
+ /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx.
+ * When /dev/ptmx a device node, /dev/pts/ptmx has 000 permissions making it inaccessible.
+ * Thus, in that case make a clone.
+ * In nspawn and other containers it will be a symlink, in that case make it a symlink. */
+ r = is_symlink("/dev/ptmx");
+ if (r < 0) {
+ log_debug_errno(r, "Failed to detect whether /dev/ptmx is a symlink or not: %m");
+ goto fail;
+ } else if (r > 0) {
+ devptmx = strjoina(temporary_mount, "/dev/ptmx");
+ if (symlink("pts/ptmx", devptmx) < 0) {
+ r = log_debug_errno(errno, "Failed to create a symlink '%s' to pts/ptmx: %m", devptmx);
+ goto fail;
+ }
+ } else {
+ r = clone_device_node("/dev/ptmx", temporary_mount, &can_mknod);
+ if (r < 0)
+ goto fail;
+ }
+
+ devshm = strjoina(temporary_mount, "/dev/shm");
+ (void) mkdir(devshm, 0755);
+ r = mount_nofollow_verbose(LOG_DEBUG, "/dev/shm", devshm, NULL, MS_BIND, NULL);
+ if (r < 0)
+ goto fail;
+
+ devmqueue = strjoina(temporary_mount, "/dev/mqueue");
+ (void) mkdir(devmqueue, 0755);
+ (void) mount_nofollow_verbose(LOG_DEBUG, "/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
+
+ devhugepages = strjoina(temporary_mount, "/dev/hugepages");
+ (void) mkdir(devhugepages, 0755);
+ (void) mount_nofollow_verbose(LOG_DEBUG, "/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
+
+ devlog = strjoina(temporary_mount, "/dev/log");
+ if (symlink("/run/systemd/journal/dev-log", devlog) < 0)
+ log_debug_errno(errno, "Failed to create a symlink '%s' to /run/systemd/journal/dev-log, ignoring: %m", devlog);
+
+ NULSTR_FOREACH(d, devnodes) {
+ r = clone_device_node(d, temporary_mount, &can_mknod);
+ /* ENXIO means the *source* is not a device file, skip creation in that case */
+ if (r < 0 && r != -ENXIO)
+ goto fail;
+ }
+
+ r = dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
+ if (r < 0)
+ log_debug_errno(r, "Failed to set up basic device tree at '%s', ignoring: %m", temporary_mount);
+
+ /* Create the /dev directory if missing. It is more likely to be missing when the service is started
+ * with RootDirectory. This is consistent with mount units creating the mount points when missing. */
+ (void) mkdir_p_label(mount_entry_path(m), 0755);
+
+ /* Unmount everything in old /dev */
+ r = umount_recursive(mount_entry_path(m), 0);
+ if (r < 0)
+ log_debug_errno(r, "Failed to unmount directories below '%s', ignoring: %m", mount_entry_path(m));
+
+ r = mount_nofollow_verbose(LOG_DEBUG, dev, mount_entry_path(m), NULL, MS_MOVE, NULL);
+ if (r < 0)
+ goto fail;
+
+ (void) rmdir(dev);
+ (void) rmdir(temporary_mount);
+
+ return 0;
+
+fail:
+ if (devpts)
+ (void) umount_verbose(LOG_DEBUG, devpts, UMOUNT_NOFOLLOW);
+
+ if (devshm)
+ (void) umount_verbose(LOG_DEBUG, devshm, UMOUNT_NOFOLLOW);
+
+ if (devhugepages)
+ (void) umount_verbose(LOG_DEBUG, devhugepages, UMOUNT_NOFOLLOW);
+
+ if (devmqueue)
+ (void) umount_verbose(LOG_DEBUG, devmqueue, UMOUNT_NOFOLLOW);
+
+ (void) umount_verbose(LOG_DEBUG, dev, UMOUNT_NOFOLLOW);
+ (void) rmdir(dev);
+ (void) rmdir(temporary_mount);
+
+ return r;
+}
+
+static int mount_bind_dev(const MountEntry *m) {
+ int r;
+
+ assert(m);
+
+ /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the
+ * service's /dev. This is only used when RootDirectory= is set. */
+
+ (void) mkdir_p_label(mount_entry_path(m), 0755);
+
+ r = path_is_mount_point(mount_entry_path(m), NULL, 0);
+ if (r < 0)
+ return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
+ if (r > 0) /* make this a NOP if /dev is already a mount point */
+ return 0;
+
+ r = mount_nofollow_verbose(LOG_DEBUG, "/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL);
+ if (r < 0)
+ return r;
+
+ return 1;
+}
+
+static int mount_sysfs(const MountEntry *m) {
+ int r;
+
+ assert(m);
+
+ (void) mkdir_p_label(mount_entry_path(m), 0755);
+
+ r = path_is_mount_point(mount_entry_path(m), NULL, 0);
+ if (r < 0)
+ return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
+ if (r > 0) /* make this a NOP if /sys is already a mount point */
+ return 0;
+
+ /* Bind mount the host's version so that we get all child mounts of it, too. */
+ r = mount_nofollow_verbose(LOG_DEBUG, "/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL);
+ if (r < 0)
+ return r;
+
+ return 1;
+}
+
+static bool mount_option_supported(const char *fstype, const char *key, const char *value) {
+ _cleanup_close_ int fd = -1;
+ int r;
+
+ /* This function assumes support by default. Only if the fsconfig() call fails with -EINVAL/-EOPNOTSUPP
+ * will it report that the option/value is not supported. */
+
+ fd = fsopen(fstype, FSOPEN_CLOEXEC);
+ if (fd < 0) {
+ if (errno != ENOSYS)
+ log_debug_errno(errno, "Failed to open superblock context for '%s': %m", fstype);
+ return true; /* If fsopen() fails for whatever reason, assume the value is supported. */
+ }
+
+ r = fsconfig(fd, FSCONFIG_SET_STRING, key, value, 0);
+ if (r < 0 && !IN_SET(errno, EINVAL, EOPNOTSUPP, ENOSYS))
+ log_debug_errno(errno, "Failed to set '%s=%s' on '%s' superblock context: %m", key, value, fstype);
+
+ return r >= 0 || !IN_SET(errno, EINVAL, EOPNOTSUPP);
+}
+
+static int mount_procfs(const MountEntry *m, const NamespaceInfo *ns_info) {
+ _cleanup_free_ char *opts = NULL;
+ const char *entry_path;
+ int r, n;
+
+ assert(m);
+ assert(ns_info);
+
+ if (ns_info->protect_proc != PROTECT_PROC_DEFAULT ||
+ ns_info->proc_subset != PROC_SUBSET_ALL) {
+
+ /* Starting with kernel 5.8 procfs' hidepid= logic is truly per-instance (previously it
+ * pretended to be per-instance but actually was per-namespace), hence let's make use of it
+ * if requested. To make sure this logic succeeds only on kernels where hidepid= is
+ * per-instance, we'll exclusively use the textual value for hidepid=, since support was
+ * added in the same commit: if it's supported it is thus also per-instance. */
+
+ const char *hpv = ns_info->protect_proc == PROTECT_PROC_DEFAULT ?
+ "off" :
+ protect_proc_to_string(ns_info->protect_proc);
+
+ /* hidepid= support was added in 5.8, so we can use fsconfig()/fsopen() (which were added in
+ * 5.2) to check if hidepid= is supported. This avoids a noisy dmesg log by the kernel when
+ * trying to use hidepid= on systems where it isn't supported. The same applies for subset=.
+ * fsopen()/fsconfig() was also backported on some distros which allows us to detect
+ * hidepid=/subset= support in even more scenarios. */
+
+ if (mount_option_supported("proc", "hidepid", hpv)) {
+ opts = strjoin("hidepid=", hpv);
+ if (!opts)
+ return -ENOMEM;
+ }
+
+ if (ns_info->proc_subset == PROC_SUBSET_PID && mount_option_supported("proc", "subset", "pid"))
+ if (!strextend_with_separator(&opts, ",", "subset=pid"))
+ return -ENOMEM;
+ }
+
+ entry_path = mount_entry_path(m);
+ (void) mkdir_p_label(entry_path, 0755);
+
+ /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in
+ * one. i.e we don't reuse existing mounts here under any condition, we want a new instance owned by
+ * our user namespace and with our hidepid= settings applied. Hence, let's get rid of everything
+ * mounted on /proc/ first. */
+
+ n = umount_recursive(entry_path, 0);
+
+ r = mount_nofollow_verbose(LOG_DEBUG, "proc", entry_path, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
+ if (r == -EINVAL && opts)
+ /* If this failed with EINVAL then this likely means the textual hidepid= stuff is
+ * not supported by the kernel, and thus the per-instance hidepid= neither, which
+ * means we really don't want to use it, since it would affect our host's /proc
+ * mount. Hence let's gracefully fallback to a classic, unrestricted version. */
+ r = mount_nofollow_verbose(LOG_DEBUG, "proc", entry_path, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
+ if (r == -EPERM) {
+ /* When we do not have enough privileges to mount /proc, fallback to use existing /proc. */
+
+ if (n > 0)
+ /* /proc or some of sub-mounts are umounted in the above. Refuse incomplete tree.
+ * Propagate the original error code returned by mount() in the above. */
+ return -EPERM;
+
+ r = path_is_mount_point(entry_path, NULL, 0);
+ if (r < 0)
+ return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
+ if (r == 0) {
+ /* We lack permissions to mount a new instance of /proc, and it is not already
+ * mounted. But we can access the host's, so as a final fallback bind-mount it to
+ * the destination, as most likely we are inside a user manager in an unprivileged
+ * user namespace. */
+ r = mount_nofollow_verbose(LOG_DEBUG, "/proc", entry_path, NULL, MS_BIND|MS_REC, NULL);
+ if (r < 0)
+ return -EPERM;
+ }
+ } else if (r < 0)
+ return r;
+
+ return 1;
+}
+
+static int mount_tmpfs(const MountEntry *m) {
+ const char *entry_path, *inner_path;
+ int r;
+
+ assert(m);
+
+ entry_path = mount_entry_path(m);
+ inner_path = mount_entry_unprefixed_path(m);
+
+ /* First, get rid of everything that is below if there is anything. Then, overmount with our new
+ * tmpfs */
+
+ (void) mkdir_p_label(entry_path, 0755);
+ (void) umount_recursive(entry_path, 0);
+
+ r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", entry_path, "tmpfs", m->flags, mount_entry_options(m));
+ if (r < 0)
+ return r;
+
+ r = label_fix_full(AT_FDCWD, entry_path, inner_path, 0);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to fix label of '%s' as '%s': %m", entry_path, inner_path);
+
+ return 1;
+}
+
+static int mount_run(const MountEntry *m) {
+ int r;
+
+ assert(m);
+
+ r = path_is_mount_point(mount_entry_path(m), NULL, 0);
+ if (r < 0 && r != -ENOENT)
+ return log_debug_errno(r, "Unable to determine whether /run is already mounted: %m");
+ if (r > 0) /* make this a NOP if /run is already a mount point */
+ return 0;
+
+ return mount_tmpfs(m);
+}
+
+static int mount_mqueuefs(const MountEntry *m) {
+ int r;
+ const char *entry_path;
+
+ assert(m);
+
+ entry_path = mount_entry_path(m);
+
+ (void) mkdir_p_label(entry_path, 0755);
+ (void) umount_recursive(entry_path, 0);
+
+ r = mount_nofollow_verbose(LOG_DEBUG, "mqueue", entry_path, "mqueue", m->flags, mount_entry_options(m));
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+static int mount_image(const MountEntry *m, const char *root_directory) {
+
+ _cleanup_free_ char *host_os_release_id = NULL, *host_os_release_version_id = NULL,
+ *host_os_release_sysext_level = NULL;
+ int r;
+
+ assert(m);
+
+ if (m->mode == EXTENSION_IMAGES) {
+ r = parse_os_release(
+ empty_to_root(root_directory),
+ "ID", &host_os_release_id,
+ "VERSION_ID", &host_os_release_version_id,
+ "SYSEXT_LEVEL", &host_os_release_sysext_level,
+ NULL);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to acquire 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory));
+ if (isempty(host_os_release_id))
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "'ID' field not found or empty in 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory));
+ }
+
+ r = verity_dissect_and_mount(
+ /* src_fd= */ -1, mount_entry_source(m), mount_entry_path(m), m->image_options,
+ host_os_release_id, host_os_release_version_id, host_os_release_sysext_level, NULL);
+ if (r == -ENOENT && m->ignore)
+ return 0;
+ if (r == -ESTALE && host_os_release_id)
+ return log_error_errno(r,
+ "Failed to mount image %s, extension-release metadata does not match the lower layer's: ID=%s%s%s%s%s",
+ mount_entry_source(m),
+ host_os_release_id,
+ host_os_release_version_id ? " VERSION_ID=" : "",
+ strempty(host_os_release_version_id),
+ host_os_release_sysext_level ? " SYSEXT_LEVEL=" : "",
+ strempty(host_os_release_sysext_level));
+ if (r < 0)
+ return log_debug_errno(r, "Failed to mount image %s on %s: %m", mount_entry_source(m), mount_entry_path(m));
+
+ return 1;
+}
+
+static int mount_overlay(const MountEntry *m) {
+ const char *options;
+ int r;
+
+ assert(m);
+
+ options = strjoina("lowerdir=", mount_entry_options(m));
+
+ (void) mkdir_p_label(mount_entry_path(m), 0755);
+
+ r = mount_nofollow_verbose(LOG_DEBUG, "overlay", mount_entry_path(m), "overlay", MS_RDONLY, options);
+ if (r == -ENOENT && m->ignore)
+ return 0;
+ if (r < 0)
+ return r;
+
+ return 1;
+}
+
+static int follow_symlink(
+ const char *root_directory,
+ MountEntry *m) {
+
+ _cleanup_free_ char *target = NULL;
+ int r;
+
+ /* Let's chase symlinks, but only one step at a time. That's because depending where the symlink points we
+ * might need to change the order in which we mount stuff. Hence: let's normalize piecemeal, and do one step at
+ * a time by specifying CHASE_STEP. This function returns 0 if we resolved one step, and > 0 if we reached the
+ * end and already have a fully normalized name. */
+
+ r = chase_symlinks(mount_entry_path(m), root_directory, CHASE_STEP|CHASE_NONEXISTENT, &target, NULL);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to chase symlinks '%s': %m", mount_entry_path(m));
+ if (r > 0) /* Reached the end, nothing more to resolve */
+ return 1;
+
+ if (m->n_followed >= CHASE_SYMLINKS_MAX) /* put a boundary on things */
+ return log_debug_errno(SYNTHETIC_ERRNO(ELOOP),
+ "Symlink loop on '%s'.",
+ mount_entry_path(m));
+
+ log_debug("Followed mount entry path symlink %s %s %s.",
+ mount_entry_path(m), special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), target);
+
+ mount_entry_consume_prefix(m, TAKE_PTR(target));
+
+ m->n_followed ++;
+
+ return 0;
+}
+
+static int apply_one_mount(
+ const char *root_directory,
+ MountEntry *m,
+ const NamespaceInfo *ns_info) {
+
+ _cleanup_free_ char *inaccessible = NULL;
+ bool rbind = true, make = false;
+ const char *what;
+ int r;
+
+ assert(m);
+ assert(ns_info);
+
+ log_debug("Applying namespace mount on %s", mount_entry_path(m));
+
+ switch (m->mode) {
+
+ case INACCESSIBLE: {
+ _cleanup_free_ char *tmp = NULL;
+ const char *runtime_dir;
+ struct stat target;
+
+ /* First, get rid of everything that is below if there
+ * is anything... Then, overmount it with an
+ * inaccessible path. */
+ (void) umount_recursive(mount_entry_path(m), 0);
+
+ if (lstat(mount_entry_path(m), &target) < 0) {
+ if (errno == ENOENT && m->ignore)
+ return 0;
+
+ return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m",
+ mount_entry_path(m));
+ }
+
+ if (geteuid() == 0)
+ runtime_dir = "/run";
+ else {
+ if (asprintf(&tmp, "/run/user/" UID_FMT, geteuid()) < 0)
+ return -ENOMEM;
+
+ runtime_dir = tmp;
+ }
+
+ r = mode_to_inaccessible_node(runtime_dir, target.st_mode, &inaccessible);
+ if (r < 0)
+ return log_debug_errno(SYNTHETIC_ERRNO(ELOOP),
+ "File type not supported for inaccessible mounts. Note that symlinks are not allowed");
+ what = inaccessible;
+ break;
+ }
+
+ case READONLY:
+ case READWRITE:
+ case READWRITE_IMPLICIT:
+ case EXEC:
+ case NOEXEC:
+ r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
+ if (r == -ENOENT && m->ignore)
+ return 0;
+ if (r < 0)
+ return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m",
+ mount_entry_path(m));
+ if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY
+ * and MS_NOEXEC bits for the mount point if needed. */
+ return 0;
+ /* This isn't a mount point yet, let's make it one. */
+ what = mount_entry_path(m);
+ break;
+
+ case EXTENSION_DIRECTORIES: {
+ _cleanup_free_ char *host_os_release_id = NULL, *host_os_release_version_id = NULL,
+ *host_os_release_sysext_level = NULL, *extension_name = NULL;
+ _cleanup_strv_free_ char **extension_release = NULL;
+
+ r = path_extract_filename(mount_entry_source(m), &extension_name);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to extract extension name from %s: %m", mount_entry_source(m));
+
+ r = parse_os_release(
+ empty_to_root(root_directory),
+ "ID", &host_os_release_id,
+ "VERSION_ID", &host_os_release_version_id,
+ "SYSEXT_LEVEL", &host_os_release_sysext_level,
+ NULL);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to acquire 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory));
+ if (isempty(host_os_release_id))
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "'ID' field not found or empty in 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory));
+
+ r = load_extension_release_pairs(mount_entry_source(m), extension_name, /* relax_extension_release_check= */ false, &extension_release);
+ if (r == -ENOENT && m->ignore)
+ return 0;
+ if (r < 0)
+ return log_debug_errno(r, "Failed to parse directory %s extension-release metadata: %m", extension_name);
+
+ r = extension_release_validate(
+ extension_name,
+ host_os_release_id,
+ host_os_release_version_id,
+ host_os_release_sysext_level,
+ /* host_sysext_scope */ NULL, /* Leave empty, we need to accept both system and portable */
+ extension_release);
+ if (r == 0)
+ return log_debug_errno(SYNTHETIC_ERRNO(ESTALE), "Directory %s extension-release metadata does not match the root's", extension_name);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to compare directory %s extension-release metadata with the root's os-release: %m", extension_name);
+
+ _fallthrough_;
+ }
+
+ case BIND_MOUNT:
+ rbind = false;
+
+ _fallthrough_;
+ case BIND_MOUNT_RECURSIVE: {
+ _cleanup_free_ char *chased = NULL;
+
+ /* Since mount() will always follow symlinks we chase the symlinks on our own first. Note
+ * that bind mount source paths are always relative to the host root, hence we pass NULL as
+ * root directory to chase_symlinks() here. */
+
+ r = chase_symlinks(mount_entry_source(m), NULL, CHASE_TRAIL_SLASH, &chased, NULL);
+ if (r == -ENOENT && m->ignore) {
+ log_debug_errno(r, "Path %s does not exist, ignoring.", mount_entry_source(m));
+ return 0;
+ }
+ if (r < 0)
+ return log_debug_errno(r, "Failed to follow symlinks on %s: %m", mount_entry_source(m));
+
+ log_debug("Followed source symlinks %s %s %s.",
+ mount_entry_source(m), special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), chased);
+
+ free_and_replace(m->source_malloc, chased);
+
+ what = mount_entry_source(m);
+ make = true;
+ break;
+ }
+
+ case EMPTY_DIR:
+ case TMPFS:
+ return mount_tmpfs(m);
+
+ case PRIVATE_TMP:
+ case PRIVATE_TMP_READONLY:
+ what = mount_entry_source(m);
+ make = true;
+ break;
+
+ case PRIVATE_DEV:
+ return mount_private_dev(m);
+
+ case BIND_DEV:
+ return mount_bind_dev(m);
+
+ case SYSFS:
+ return mount_sysfs(m);
+
+ case PROCFS:
+ return mount_procfs(m, ns_info);
+
+ case RUN:
+ return mount_run(m);
+
+ case MQUEUEFS:
+ return mount_mqueuefs(m);
+
+ case MOUNT_IMAGES:
+ return mount_image(m, NULL);
+
+ case EXTENSION_IMAGES:
+ return mount_image(m, root_directory);
+
+ case OVERLAY_MOUNT:
+ return mount_overlay(m);
+
+ default:
+ assert_not_reached();
+ }
+
+ assert(what);
+
+ r = mount_nofollow_verbose(LOG_DEBUG, what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL);
+ if (r < 0) {
+ bool try_again = false;
+
+ if (r == -ENOENT && make) {
+ int q;
+
+ /* Hmm, either the source or the destination are missing. Let's see if we can create
+ the destination, then try again. */
+
+ (void) mkdir_parents(mount_entry_path(m), 0755);
+
+ q = make_mount_point_inode_from_path(what, mount_entry_path(m), 0755);
+ if (q < 0 && q != -EEXIST)
+ log_error_errno(q, "Failed to create destination mount point node '%s': %m",
+ mount_entry_path(m));
+ else
+ try_again = true;
+ }
+
+ if (try_again)
+ r = mount_nofollow_verbose(LOG_DEBUG, what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
+ }
+
+ log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
+ return 0;
+}
+
+static int make_read_only(const MountEntry *m, char **deny_list, FILE *proc_self_mountinfo) {
+ unsigned long new_flags = 0, flags_mask = 0;
+ bool submounts;
+ int r;
+
+ assert(m);
+ assert(proc_self_mountinfo);
+
+ if (mount_entry_read_only(m) || m->mode == PRIVATE_DEV) {
+ new_flags |= MS_RDONLY;
+ flags_mask |= MS_RDONLY;
+ }
+
+ if (m->nosuid) {
+ new_flags |= MS_NOSUID;
+ flags_mask |= MS_NOSUID;
+ }
+
+ if (flags_mask == 0) /* No Change? */
+ return 0;
+
+ /* We generally apply these changes recursively, except for /dev, and the cases we know there's
+ * nothing further down. Set /dev readonly, but not submounts like /dev/shm. Also, we only set the
+ * per-mount read-only flag. We can't set it on the superblock, if we are inside a user namespace
+ * and running Linux <= 4.17. */
+ submounts =
+ mount_entry_read_only(m) &&
+ !IN_SET(m->mode, EMPTY_DIR, TMPFS);
+ if (submounts)
+ r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, deny_list, proc_self_mountinfo);
+ else
+ r = bind_remount_one_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, proc_self_mountinfo);
+
+ /* Note that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked
+ * read-only already stays this way. This improves compatibility with container managers, where we
+ * won't attempt to undo read-only mounts already applied. */
+
+ if (r == -ENOENT && m->ignore)
+ return 0;
+ if (r < 0)
+ return log_debug_errno(r, "Failed to re-mount '%s'%s: %m", mount_entry_path(m),
+ submounts ? " and its submounts" : "");
+ return 0;
+}
+
+static int make_noexec(const MountEntry *m, char **deny_list, FILE *proc_self_mountinfo) {
+ unsigned long new_flags = 0, flags_mask = 0;
+ bool submounts;
+ int r;
+
+ assert(m);
+ assert(proc_self_mountinfo);
+
+ if (mount_entry_noexec(m)) {
+ new_flags |= MS_NOEXEC;
+ flags_mask |= MS_NOEXEC;
+ } else if (mount_entry_exec(m)) {
+ new_flags &= ~MS_NOEXEC;
+ flags_mask |= MS_NOEXEC;
+ }
+
+ if (flags_mask == 0) /* No Change? */
+ return 0;
+
+ submounts = !IN_SET(m->mode, EMPTY_DIR, TMPFS);
+
+ if (submounts)
+ r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, deny_list, proc_self_mountinfo);
+ else
+ r = bind_remount_one_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, proc_self_mountinfo);
+
+ if (r == -ENOENT && m->ignore)
+ return 0;
+ if (r < 0)
+ return log_debug_errno(r, "Failed to re-mount '%s'%s: %m", mount_entry_path(m),
+ submounts ? " and its submounts" : "");
+ return 0;
+}
+
+static int make_nosuid(const MountEntry *m, FILE *proc_self_mountinfo) {
+ bool submounts;
+ int r;
+
+ assert(m);
+ assert(proc_self_mountinfo);
+
+ submounts = !IN_SET(m->mode, EMPTY_DIR, TMPFS);
+
+ if (submounts)
+ r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), MS_NOSUID, MS_NOSUID, NULL, proc_self_mountinfo);
+ else
+ r = bind_remount_one_with_mountinfo(mount_entry_path(m), MS_NOSUID, MS_NOSUID, proc_self_mountinfo);
+ if (r == -ENOENT && m->ignore)
+ return 0;
+ if (r < 0)
+ return log_debug_errno(r, "Failed to re-mount '%s'%s: %m", mount_entry_path(m),
+ submounts ? " and its submounts" : "");
+ return 0;
+}
+
+static bool namespace_info_mount_apivfs(const NamespaceInfo *ns_info) {
+ assert(ns_info);
+
+ /*
+ * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
+ * since to protect the API VFS mounts, they need to be around in the
+ * first place...
+ */
+
+ return ns_info->mount_apivfs ||
+ ns_info->protect_control_groups ||
+ ns_info->protect_kernel_tunables ||
+ ns_info->protect_proc != PROTECT_PROC_DEFAULT ||
+ ns_info->proc_subset != PROC_SUBSET_ALL;
+}
+
+static size_t namespace_calculate_mounts(
+ const NamespaceInfo *ns_info,
+ char** read_write_paths,
+ char** read_only_paths,
+ char** inaccessible_paths,
+ char** exec_paths,
+ char** no_exec_paths,
+ char** empty_directories,
+ size_t n_bind_mounts,
+ size_t n_temporary_filesystems,
+ size_t n_mount_images,
+ size_t n_extension_images,
+ size_t n_extension_directories,
+ size_t n_hierarchies,
+ const char* tmp_dir,
+ const char* var_tmp_dir,
+ const char *creds_path,
+ const char* log_namespace,
+ bool setup_propagate,
+ const char* notify_socket) {
+
+ size_t protect_home_cnt;
+ size_t protect_system_cnt =
+ (ns_info->protect_system == PROTECT_SYSTEM_STRICT ?
+ ELEMENTSOF(protect_system_strict_table) :
+ ((ns_info->protect_system == PROTECT_SYSTEM_FULL) ?
+ ELEMENTSOF(protect_system_full_table) :
+ ((ns_info->protect_system == PROTECT_SYSTEM_YES) ?
+ ELEMENTSOF(protect_system_yes_table) : 0)));
+
+ protect_home_cnt =
+ (ns_info->protect_home == PROTECT_HOME_YES ?
+ ELEMENTSOF(protect_home_yes_table) :
+ ((ns_info->protect_home == PROTECT_HOME_READ_ONLY) ?
+ ELEMENTSOF(protect_home_read_only_table) :
+ ((ns_info->protect_home == PROTECT_HOME_TMPFS) ?
+ ELEMENTSOF(protect_home_tmpfs_table) : 0)));
+
+ return !!tmp_dir + !!var_tmp_dir +
+ strv_length(read_write_paths) +
+ strv_length(read_only_paths) +
+ strv_length(inaccessible_paths) +
+ strv_length(exec_paths) +
+ strv_length(no_exec_paths) +
+ strv_length(empty_directories) +
+ n_bind_mounts +
+ n_mount_images +
+ (n_extension_images > 0 || n_extension_directories > 0 ? /* Mount each image and directory plus an overlay per hierarchy */
+ n_hierarchies + n_extension_images + n_extension_directories: 0) +
+ n_temporary_filesystems +
+ ns_info->private_dev +
+ (ns_info->protect_kernel_tunables ?
+ ELEMENTSOF(protect_kernel_tunables_proc_table) + ELEMENTSOF(protect_kernel_tunables_sys_table) : 0) +
+ (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
+ (ns_info->protect_kernel_logs ?
+ ELEMENTSOF(protect_kernel_logs_proc_table) + ELEMENTSOF(protect_kernel_logs_dev_table) : 0) +
+ (ns_info->protect_control_groups ? 1 : 0) +
+ protect_home_cnt + protect_system_cnt +
+ (ns_info->protect_hostname ? 2 : 0) +
+ (namespace_info_mount_apivfs(ns_info) ? ELEMENTSOF(apivfs_table) : 0) +
+ (creds_path ? 2 : 1) +
+ !!log_namespace +
+ setup_propagate + /* /run/systemd/incoming */
+ !!notify_socket +
+ ns_info->private_ipc; /* /dev/mqueue */
+}
+
+/* Walk all mount entries and dropping any unused mounts. This affects all
+ * mounts:
+ * - that are implicitly protected by a path that has been rendered inaccessible
+ * - whose immediate parent requests the same protection mode as the mount itself
+ * - that are outside of the relevant root directory
+ * - which are duplicates
+ */
+static void drop_unused_mounts(const char *root_directory, MountEntry *mounts, size_t *n_mounts) {
+ assert(root_directory);
+ assert(n_mounts);
+ assert(mounts || *n_mounts == 0);
+
+ typesafe_qsort(mounts, *n_mounts, mount_path_compare);
+
+ drop_duplicates(mounts, n_mounts);
+ drop_outside_root(root_directory, mounts, n_mounts);
+ drop_inaccessible(mounts, n_mounts);
+ drop_nop(mounts, n_mounts);
+}
+
+static int create_symlinks_from_tuples(const char *root, char **strv_symlinks) {
+ int r;
+
+ STRV_FOREACH_PAIR(src, dst, strv_symlinks) {
+ _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
+
+ src_abs = path_join(root, *src);
+ dst_abs = path_join(root, *dst);
+ if (!src_abs || !dst_abs)
+ return -ENOMEM;
+
+ r = mkdir_parents_label(dst_abs, 0755);
+ if (r < 0)
+ return log_debug_errno(
+ r,
+ "Failed to create parent directory for symlink '%s': %m",
+ dst_abs);
+
+ r = symlink_idempotent(src_abs, dst_abs, true);
+ if (r < 0)
+ return log_debug_errno(
+ r,
+ "Failed to create symlink from '%s' to '%s': %m",
+ src_abs,
+ dst_abs);
+ }
+
+ return 0;
+}
+
+static int apply_mounts(
+ const char *root,
+ const NamespaceInfo *ns_info,
+ MountEntry *mounts,
+ size_t *n_mounts,
+ char **exec_dir_symlinks,
+ char **error_path) {
+
+ _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
+ _cleanup_free_ char **deny_list = NULL;
+ int r;
+
+ if (n_mounts == 0) /* Shortcut: nothing to do */
+ return 0;
+
+ assert(root);
+ assert(mounts);
+ assert(n_mounts);
+
+ /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of
+ * /proc. For example, this is the case with the option: 'InaccessiblePaths=/proc'. */
+ proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
+ if (!proc_self_mountinfo) {
+ r = -errno;
+
+ if (error_path)
+ *error_path = strdup("/proc/self/mountinfo");
+
+ return log_debug_errno(r, "Failed to open /proc/self/mountinfo: %m");
+ }
+
+ /* First round, establish all mounts we need */
+ for (;;) {
+ bool again = false;
+
+ for (MountEntry *m = mounts; m < mounts + *n_mounts; ++m) {
+
+ if (m->applied)
+ continue;
+
+ /* ExtensionImages/Directories are first opened in the propagate directory, not in the root_directory */
+ r = follow_symlink(!IN_SET(m->mode, EXTENSION_IMAGES, EXTENSION_DIRECTORIES) ? root : NULL, m);
+ if (r < 0) {
+ if (error_path && mount_entry_path(m))
+ *error_path = strdup(mount_entry_path(m));
+ return r;
+ }
+ if (r == 0) {
+ /* We hit a symlinked mount point. The entry got rewritten and might
+ * point to a very different place now. Let's normalize the changed
+ * list, and start from the beginning. After all to mount the entry
+ * at the new location we might need some other mounts first */
+ again = true;
+ break;
+ }
+
+ r = apply_one_mount(root, m, ns_info);
+ if (r < 0) {
+ if (error_path && mount_entry_path(m))
+ *error_path = strdup(mount_entry_path(m));
+ return r;
+ }
+
+ m->applied = true;
+ }
+
+ if (!again)
+ break;
+
+ drop_unused_mounts(root, mounts, n_mounts);
+ }
+
+ /* Now that all filesystems have been set up, but before the
+ * read-only switches are flipped, create the exec dirs symlinks.
+ * Note that when /var/lib is not empty/tmpfs, these symlinks will already
+ * exist, which means this will be a no-op. */
+ r = create_symlinks_from_tuples(root, exec_dir_symlinks);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to set up ExecDirectories symlinks inside mount namespace: %m");
+
+ /* Create a deny list we can pass to bind_mount_recursive() */
+ deny_list = new(char*, (*n_mounts)+1);
+ if (!deny_list)
+ return -ENOMEM;
+ for (size_t j = 0; j < *n_mounts; j++)
+ deny_list[j] = (char*) mount_entry_path(mounts+j);
+ deny_list[*n_mounts] = NULL;
+
+ /* Second round, flip the ro bits if necessary. */
+ for (MountEntry *m = mounts; m < mounts + *n_mounts; ++m) {
+ r = make_read_only(m, deny_list, proc_self_mountinfo);
+ if (r < 0) {
+ if (error_path && mount_entry_path(m))
+ *error_path = strdup(mount_entry_path(m));
+ return r;
+ }
+ }
+
+ /* Third round, flip the noexec bits with a simplified deny list. */
+ for (size_t j = 0; j < *n_mounts; j++)
+ if (IN_SET((mounts+j)->mode, EXEC, NOEXEC))
+ deny_list[j] = (char*) mount_entry_path(mounts+j);
+ deny_list[*n_mounts] = NULL;
+
+ for (MountEntry *m = mounts; m < mounts + *n_mounts; ++m) {
+ r = make_noexec(m, deny_list, proc_self_mountinfo);
+ if (r < 0) {
+ if (error_path && mount_entry_path(m))
+ *error_path = strdup(mount_entry_path(m));
+ return r;
+ }
+ }
+
+ /* Fourth round, flip the nosuid bits without a deny list. */
+ if (ns_info->mount_nosuid)
+ for (MountEntry *m = mounts; m < mounts + *n_mounts; ++m) {
+ r = make_nosuid(m, proc_self_mountinfo);
+ if (r < 0) {
+ if (error_path && mount_entry_path(m))
+ *error_path = strdup(mount_entry_path(m));
+ return r;
+ }
+ }
+
+ return 1;
+}
+
+static bool root_read_only(
+ char **read_only_paths,
+ ProtectSystem protect_system) {
+
+ /* Determine whether the root directory is going to be read-only given the configured settings. */
+
+ if (protect_system == PROTECT_SYSTEM_STRICT)
+ return true;
+
+ if (prefixed_path_strv_contains(read_only_paths, "/"))
+ return true;
+
+ return false;
+}
+
+static bool home_read_only(
+ char** read_only_paths,
+ char** inaccessible_paths,
+ char** empty_directories,
+ const BindMount *bind_mounts,
+ size_t n_bind_mounts,
+ const TemporaryFileSystem *temporary_filesystems,
+ size_t n_temporary_filesystems,
+ ProtectHome protect_home) {
+
+ /* Determine whether the /home directory is going to be read-only given the configured settings. Yes,
+ * this is a bit sloppy, since we don't bother checking for cases where / is affected by multiple
+ * settings. */
+
+ if (protect_home != PROTECT_HOME_NO)
+ return true;
+
+ if (prefixed_path_strv_contains(read_only_paths, "/home") ||
+ prefixed_path_strv_contains(inaccessible_paths, "/home") ||
+ prefixed_path_strv_contains(empty_directories, "/home"))
+ return true;
+
+ for (size_t i = 0; i < n_temporary_filesystems; i++)
+ if (path_equal(temporary_filesystems[i].path, "/home"))
+ return true;
+
+ /* If /home is overmounted with some dir from the host it's not writable. */
+ for (size_t i = 0; i < n_bind_mounts; i++)
+ if (path_equal(bind_mounts[i].destination, "/home"))
+ return true;
+
+ return false;
+}
+
+static int verity_settings_prepare(
+ VeritySettings *verity,
+ const char *root_image,
+ const void *root_hash,
+ size_t root_hash_size,
+ const char *root_hash_path,
+ const void *root_hash_sig,
+ size_t root_hash_sig_size,
+ const char *root_hash_sig_path,
+ const char *verity_data_path) {
+
+ int r;
+
+ assert(verity);
+
+ if (root_hash) {
+ void *d;
+
+ d = memdup(root_hash, root_hash_size);
+ if (!d)
+ return -ENOMEM;
+
+ free_and_replace(verity->root_hash, d);
+ verity->root_hash_size = root_hash_size;
+ verity->designator = PARTITION_ROOT;
+ }
+
+ if (root_hash_sig) {
+ void *d;
+
+ d = memdup(root_hash_sig, root_hash_sig_size);
+ if (!d)
+ return -ENOMEM;
+
+ free_and_replace(verity->root_hash_sig, d);
+ verity->root_hash_sig_size = root_hash_sig_size;
+ verity->designator = PARTITION_ROOT;
+ }
+
+ if (verity_data_path) {
+ r = free_and_strdup(&verity->data_path, verity_data_path);
+ if (r < 0)
+ return r;
+ }
+
+ r = verity_settings_load(
+ verity,
+ root_image,
+ root_hash_path,
+ root_hash_sig_path);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to load root hash: %m");
+
+ return 0;
+}
+
+int setup_namespace(
+ const char* root_directory,
+ const char* root_image,
+ const MountOptions *root_image_options,
+ const NamespaceInfo *ns_info,
+ char** read_write_paths,
+ char** read_only_paths,
+ char** inaccessible_paths,
+ char** exec_paths,
+ char** no_exec_paths,
+ char** empty_directories,
+ char** exec_dir_symlinks,
+ const BindMount *bind_mounts,
+ size_t n_bind_mounts,
+ const TemporaryFileSystem *temporary_filesystems,
+ size_t n_temporary_filesystems,
+ const MountImage *mount_images,
+ size_t n_mount_images,
+ const char* tmp_dir,
+ const char* var_tmp_dir,
+ const char *creds_path,
+ const char *log_namespace,
+ unsigned long mount_flags,
+ const void *root_hash,
+ size_t root_hash_size,
+ const char *root_hash_path,
+ const void *root_hash_sig,
+ size_t root_hash_sig_size,
+ const char *root_hash_sig_path,
+ const char *verity_data_path,
+ const MountImage *extension_images,
+ size_t n_extension_images,
+ char **extension_directories,
+ const char *propagate_dir,
+ const char *incoming_dir,
+ const char *extension_dir,
+ const char *notify_socket,
+ char **error_path) {
+
+ _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
+ _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
+ _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
+ _cleanup_strv_free_ char **hierarchies = NULL;
+ MountEntry *m = NULL, *mounts = NULL;
+ bool require_prefix = false, setup_propagate = false;
+ const char *root;
+ DissectImageFlags dissect_image_flags =
+ DISSECT_IMAGE_GENERIC_ROOT |
+ DISSECT_IMAGE_REQUIRE_ROOT |
+ DISSECT_IMAGE_DISCARD_ON_LOOP |
+ DISSECT_IMAGE_RELAX_VAR_CHECK |
+ DISSECT_IMAGE_FSCK |
+ DISSECT_IMAGE_USR_NO_ROOT |
+ DISSECT_IMAGE_GROWFS |
+ DISSECT_IMAGE_ADD_PARTITION_DEVICES |
+ DISSECT_IMAGE_PIN_PARTITION_DEVICES;
+ size_t n_mounts;
+ int r;
+
+ assert(ns_info);
+
+ /* Make sure that all mknod(), mkdir() calls we do are unaffected by the umask, and the access modes
+ * we configure take effect */
+ BLOCK_WITH_UMASK(0000);
+
+ if (!isempty(propagate_dir) && !isempty(incoming_dir))
+ setup_propagate = true;
+
+ if (mount_flags == 0)
+ mount_flags = MS_SHARED;
+
+ if (root_image) {
+ /* Make the whole image read-only if we can determine that we only access it in a read-only fashion. */
+ if (root_read_only(read_only_paths,
+ ns_info->protect_system) &&
+ home_read_only(read_only_paths, inaccessible_paths, empty_directories,
+ bind_mounts, n_bind_mounts, temporary_filesystems, n_temporary_filesystems,
+ ns_info->protect_home) &&
+ strv_isempty(read_write_paths))
+ dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
+
+ r = verity_settings_prepare(
+ &verity,
+ root_image,
+ root_hash, root_hash_size, root_hash_path,
+ root_hash_sig, root_hash_sig_size, root_hash_sig_path,
+ verity_data_path);
+ if (r < 0)
+ return r;
+
+ SET_FLAG(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE, verity.data_path);
+
+ r = loop_device_make_by_path(
+ root_image,
+ FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_DEVICE_READ_ONLY) ? O_RDONLY : -1 /* < 0 means writable if possible, read-only as fallback */,
+ FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
+ LOCK_SH,
+ &loop_device);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to create loop device for root image: %m");
+
+ r = dissect_loop_device(
+ loop_device,
+ &verity,
+ root_image_options,
+ dissect_image_flags,
+ &dissected_image);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to dissect image: %m");
+
+ r = dissected_image_load_verity_sig_partition(
+ dissected_image,
+ loop_device->fd,
+ &verity);
+ if (r < 0)
+ return r;
+
+ r = dissected_image_decrypt(
+ dissected_image,
+ NULL,
+ &verity,
+ dissect_image_flags);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to decrypt dissected image: %m");
+ }
+
+ if (root_directory)
+ root = root_directory;
+ else {
+ /* /run/systemd should have been created by PID 1 early on already, but in some cases, like
+ * when running tests (test-execute), it might not have been created yet so let's make sure
+ * we create it if it doesn't already exist. */
+ (void) mkdir_p_label("/run/systemd", 0755);
+
+ /* Always create the mount namespace in a temporary directory, instead of operating directly
+ * in the root. The temporary directory prevents any mounts from being potentially obscured
+ * my other mounts we already applied. We use the same mount point for all images, which is
+ * safe, since they all live in their own namespaces after all, and hence won't see each
+ * other. */
+
+ root = "/run/systemd/unit-root";
+ (void) mkdir_label(root, 0700);
+ require_prefix = true;
+ }
+
+ if (n_extension_images > 0 || !strv_isempty(extension_directories)) {
+ r = parse_env_extension_hierarchies(&hierarchies);
+ if (r < 0)
+ return r;
+ }
+
+ n_mounts = namespace_calculate_mounts(
+ ns_info,
+ read_write_paths,
+ read_only_paths,
+ inaccessible_paths,
+ exec_paths,
+ no_exec_paths,
+ empty_directories,
+ n_bind_mounts,
+ n_temporary_filesystems,
+ n_mount_images,
+ n_extension_images,
+ strv_length(extension_directories),
+ strv_length(hierarchies),
+ tmp_dir, var_tmp_dir,
+ creds_path,
+ log_namespace,
+ setup_propagate,
+ notify_socket);
+
+ if (n_mounts > 0) {
+ m = mounts = new0(MountEntry, n_mounts);
+ if (!mounts)
+ return -ENOMEM;
+
+ r = append_access_mounts(&m, read_write_paths, READWRITE, require_prefix);
+ if (r < 0)
+ goto finish;
+
+ r = append_access_mounts(&m, read_only_paths, READONLY, require_prefix);
+ if (r < 0)
+ goto finish;
+
+ r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE, require_prefix);
+ if (r < 0)
+ goto finish;
+
+ r = append_access_mounts(&m, exec_paths, EXEC, require_prefix);
+ if (r < 0)
+ goto finish;
+
+ r = append_access_mounts(&m, no_exec_paths, NOEXEC, require_prefix);
+ if (r < 0)
+ goto finish;
+
+ r = append_empty_dir_mounts(&m, empty_directories);
+ if (r < 0)
+ goto finish;
+
+ r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
+ if (r < 0)
+ goto finish;
+
+ r = append_tmpfs_mounts(&m, temporary_filesystems, n_temporary_filesystems);
+ if (r < 0)
+ goto finish;
+
+ if (tmp_dir) {
+ bool ro = streq(tmp_dir, RUN_SYSTEMD_EMPTY);
+
+ *(m++) = (MountEntry) {
+ .path_const = "/tmp",
+ .mode = ro ? PRIVATE_TMP_READONLY : PRIVATE_TMP,
+ .source_const = tmp_dir,
+ };
+ }
+
+ if (var_tmp_dir) {
+ bool ro = streq(var_tmp_dir, RUN_SYSTEMD_EMPTY);
+
+ *(m++) = (MountEntry) {
+ .path_const = "/var/tmp",
+ .mode = ro ? PRIVATE_TMP_READONLY : PRIVATE_TMP,
+ .source_const = var_tmp_dir,
+ };
+ }
+
+ r = append_mount_images(&m, mount_images, n_mount_images);
+ if (r < 0)
+ goto finish;
+
+ r = append_extensions(&m, root, extension_dir, hierarchies, extension_images, n_extension_images, extension_directories);
+ if (r < 0)
+ goto finish;
+
+ if (ns_info->private_dev)
+ *(m++) = (MountEntry) {
+ .path_const = "/dev",
+ .mode = PRIVATE_DEV,
+ .flags = DEV_MOUNT_OPTIONS,
+ };
+
+ /* In case /proc is successfully mounted with pid tree subset only (ProcSubset=pid), the
+ protective mounts to non-pid /proc paths would fail. But the pid only option may have
+ failed gracefully, so let's try the mounts but it's not fatal if they don't succeed. */
+ bool ignore_protect_proc = ns_info->ignore_protect_paths || ns_info->proc_subset == PROC_SUBSET_PID;
+ if (ns_info->protect_kernel_tunables) {
+ r = append_static_mounts(&m,
+ protect_kernel_tunables_proc_table,
+ ELEMENTSOF(protect_kernel_tunables_proc_table),
+ ignore_protect_proc);
+ if (r < 0)
+ goto finish;
+
+ r = append_static_mounts(&m,
+ protect_kernel_tunables_sys_table,
+ ELEMENTSOF(protect_kernel_tunables_sys_table),
+ ns_info->ignore_protect_paths);
+ if (r < 0)
+ goto finish;
+ }
+
+ if (ns_info->protect_kernel_modules) {
+ r = append_static_mounts(&m,
+ protect_kernel_modules_table,
+ ELEMENTSOF(protect_kernel_modules_table),
+ ns_info->ignore_protect_paths);
+ if (r < 0)
+ goto finish;
+ }
+
+ if (ns_info->protect_kernel_logs) {
+ r = append_static_mounts(&m,
+ protect_kernel_logs_proc_table,
+ ELEMENTSOF(protect_kernel_logs_proc_table),
+ ignore_protect_proc);
+ if (r < 0)
+ goto finish;
+
+ r = append_static_mounts(&m,
+ protect_kernel_logs_dev_table,
+ ELEMENTSOF(protect_kernel_logs_dev_table),
+ ns_info->ignore_protect_paths);
+ if (r < 0)
+ goto finish;
+ }
+
+ if (ns_info->protect_control_groups)
+ *(m++) = (MountEntry) {
+ .path_const = "/sys/fs/cgroup",
+ .mode = READONLY,
+ };
+
+ r = append_protect_home(&m, ns_info->protect_home, ns_info->ignore_protect_paths);
+ if (r < 0)
+ goto finish;
+
+ r = append_protect_system(&m, ns_info->protect_system, false);
+ if (r < 0)
+ goto finish;
+
+ if (namespace_info_mount_apivfs(ns_info)) {
+ r = append_static_mounts(&m,
+ apivfs_table,
+ ELEMENTSOF(apivfs_table),
+ ns_info->ignore_protect_paths);
+ if (r < 0)
+ goto finish;
+ }
+
+ /* Note, if proc is mounted with subset=pid then neither of the
+ * two paths will exist, i.e. they are implicitly protected by
+ * the mount option. */
+ if (ns_info->protect_hostname) {
+ *(m++) = (MountEntry) {
+ .path_const = "/proc/sys/kernel/hostname",
+ .mode = READONLY,
+ .ignore = ignore_protect_proc,
+ };
+ *(m++) = (MountEntry) {
+ .path_const = "/proc/sys/kernel/domainname",
+ .mode = READONLY,
+ .ignore = ignore_protect_proc,
+ };
+ }
+
+ if (ns_info->private_ipc)
+ *(m++) = (MountEntry) {
+ .path_const = "/dev/mqueue",
+ .mode = MQUEUEFS,
+ .flags = MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
+ };
+
+ if (creds_path) {
+ /* If our service has a credentials store configured, then bind that one in, but hide
+ * everything else. */
+
+ *(m++) = (MountEntry) {
+ .path_const = "/run/credentials",
+ .mode = TMPFS,
+ .read_only = true,
+ .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST,
+ .flags = MS_NODEV|MS_STRICTATIME|MS_NOSUID|MS_NOEXEC,
+ };
+
+ *(m++) = (MountEntry) {
+ .path_const = creds_path,
+ .mode = BIND_MOUNT,
+ .read_only = true,
+ .source_const = creds_path,
+ };
+ } else {
+ /* If our service has no credentials store configured, then make the whole
+ * credentials tree inaccessible wholesale. */
+
+ *(m++) = (MountEntry) {
+ .path_const = "/run/credentials",
+ .mode = INACCESSIBLE,
+ .ignore = true,
+ };
+ }
+
+ if (log_namespace) {
+ _cleanup_free_ char *q = NULL;
+
+ q = strjoin("/run/systemd/journal.", log_namespace);
+ if (!q) {
+ r = -ENOMEM;
+ goto finish;
+ }
+
+ *(m++) = (MountEntry) {
+ .path_const = "/run/systemd/journal",
+ .mode = BIND_MOUNT_RECURSIVE,
+ .read_only = true,
+ .source_malloc = TAKE_PTR(q),
+ };
+ }
+
+ /* Will be used to add bind mounts at runtime */
+ if (setup_propagate)
+ *(m++) = (MountEntry) {
+ .source_const = propagate_dir,
+ .path_const = incoming_dir,
+ .mode = BIND_MOUNT,
+ .read_only = true,
+ };
+
+ if (notify_socket)
+ *(m++) = (MountEntry) {
+ .path_const = notify_socket,
+ .source_const = notify_socket,
+ .mode = BIND_MOUNT,
+ .read_only = true,
+ };
+
+ assert(mounts + n_mounts == m);
+
+ /* Prepend the root directory where that's necessary */
+ r = prefix_where_needed(mounts, n_mounts, root);
+ if (r < 0)
+ goto finish;
+
+ drop_unused_mounts(root, mounts, &n_mounts);
+ }
+
+ /* All above is just preparation, figuring out what to do. Let's now actually start doing something. */
+
+ if (unshare(CLONE_NEWNS) < 0) {
+ r = log_debug_errno(errno, "Failed to unshare the mount namespace: %m");
+ if (IN_SET(r, -EACCES, -EPERM, -EOPNOTSUPP, -ENOSYS))
+ /* If the kernel doesn't support namespaces, or when there's a MAC or seccomp filter
+ * in place that doesn't allow us to create namespaces (or a missing cap), then
+ * propagate a recognizable error back, which the caller can use to detect this case
+ * (and only this) and optionally continue without namespacing applied. */
+ r = -ENOANO;
+
+ goto finish;
+ }
+
+ /* Create the source directory to allow runtime propagation of mounts */
+ if (setup_propagate)
+ (void) mkdir_p(propagate_dir, 0600);
+
+ if (n_extension_images > 0 || !strv_isempty(extension_directories))
+ /* ExtensionImages/Directories mountpoint directories will be created while parsing the
+ * mounts to create, so have the parent ready */
+ (void) mkdir_p(extension_dir, 0600);
+
+ /* Remount / as SLAVE so that nothing now mounted in the namespace
+ * shows up in the parent */
+ if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
+ r = log_debug_errno(errno, "Failed to remount '/' as SLAVE: %m");
+ goto finish;
+ }
+
+ if (root_image) {
+ /* A root image is specified, mount it to the right place */
+ r = dissected_image_mount(dissected_image, root, UID_INVALID, UID_INVALID, dissect_image_flags);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to mount root image: %m");
+ goto finish;
+ }
+
+ /* Now release the block device lock, so that udevd is free to call BLKRRPART on the device
+ * if it likes. */
+ r = loop_device_flock(loop_device, LOCK_UN);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to release lock on loopback block device: %m");
+ goto finish;
+ }
+
+ r = dissected_image_relinquish(dissected_image);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to relinquish dissected image: %m");
+ goto finish;
+ }
+
+ } else if (root_directory) {
+
+ /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
+ r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to detect that %s is a mount point or not: %m", root);
+ goto finish;
+ }
+ if (r == 0) {
+ r = mount_nofollow_verbose(LOG_DEBUG, root, root, NULL, MS_BIND|MS_REC, NULL);
+ if (r < 0)
+ goto finish;
+ }
+
+ } else {
+ /* Let's mount the main root directory to the root directory to use */
+ r = mount_nofollow_verbose(LOG_DEBUG, "/", root, NULL, MS_BIND|MS_REC, NULL);
+ if (r < 0)
+ goto finish;
+ }
+
+ /* Try to set up the new root directory before mounting anything else there. */
+ if (root_image || root_directory)
+ (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
+
+ /* Now make the magic happen */
+ r = apply_mounts(root, ns_info, mounts, &n_mounts, exec_dir_symlinks, error_path);
+ if (r < 0)
+ goto finish;
+
+ /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
+ r = mount_move_root(root);
+ if (r == -EINVAL && root_directory) {
+ /* If we are using root_directory and we don't have privileges (ie: user manager in a user
+ * namespace) and the root_directory is already a mount point in the parent namespace,
+ * MS_MOVE will fail as we don't have permission to change it (with EINVAL rather than
+ * EPERM). Attempt to bind-mount it over itself (like we do above if it's not already a
+ * mount point) and try again. */
+ r = mount_nofollow_verbose(LOG_DEBUG, root, root, NULL, MS_BIND|MS_REC, NULL);
+ if (r < 0)
+ goto finish;
+ r = mount_move_root(root);
+ }
+ if (r < 0) {
+ log_debug_errno(r, "Failed to mount root with MS_MOVE: %m");
+ goto finish;
+ }
+
+ /* Remount / as the desired mode. Note that this will not
+ * reestablish propagation from our side to the host, since
+ * what's disconnected is disconnected. */
+ if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
+ r = log_debug_errno(errno, "Failed to remount '/' with desired mount flags: %m");
+ goto finish;
+ }
+
+ /* bind_mount_in_namespace() will MS_MOVE into that directory, and that's only
+ * supported for non-shared mounts. This needs to happen after remounting / or it will fail. */
+ if (setup_propagate) {
+ r = mount(NULL, incoming_dir, NULL, MS_SLAVE, NULL);
+ if (r < 0) {
+ log_error_errno(r, "Failed to remount %s with MS_SLAVE: %m", incoming_dir);
+ goto finish;
+ }
+ }
+
+ r = 0;
+
+finish:
+ if (n_mounts > 0)
+ for (m = mounts; m < mounts + n_mounts; m++)
+ mount_entry_done(m);
+
+ free(mounts);
+
+ return r;
+}
+
+void bind_mount_free_many(BindMount *b, size_t n) {
+ assert(b || n == 0);
+
+ for (size_t i = 0; i < n; i++) {
+ free(b[i].source);
+ free(b[i].destination);
+ }
+
+ free(b);
+}
+
+int bind_mount_add(BindMount **b, size_t *n, const BindMount *item) {
+ _cleanup_free_ char *s = NULL, *d = NULL;
+ BindMount *c;
+
+ assert(b);
+ assert(n);
+ assert(item);
+
+ s = strdup(item->source);
+ if (!s)
+ return -ENOMEM;
+
+ d = strdup(item->destination);
+ if (!d)
+ return -ENOMEM;
+
+ c = reallocarray(*b, *n + 1, sizeof(BindMount));
+ if (!c)
+ return -ENOMEM;
+
+ *b = c;
+
+ c[(*n) ++] = (BindMount) {
+ .source = TAKE_PTR(s),
+ .destination = TAKE_PTR(d),
+ .read_only = item->read_only,
+ .nosuid = item->nosuid,
+ .recursive = item->recursive,
+ .ignore_enoent = item->ignore_enoent,
+ };
+
+ return 0;
+}
+
+MountImage* mount_image_free_many(MountImage *m, size_t *n) {
+ assert(n);
+ assert(m || *n == 0);
+
+ for (size_t i = 0; i < *n; i++) {
+ free(m[i].source);
+ free(m[i].destination);
+ mount_options_free_all(m[i].mount_options);
+ }
+
+ free(m);
+ *n = 0;
+ return NULL;
+}
+
+int mount_image_add(MountImage **m, size_t *n, const MountImage *item) {
+ _cleanup_free_ char *s = NULL, *d = NULL;
+ _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
+ MountImage *c;
+
+ assert(m);
+ assert(n);
+ assert(item);
+
+ s = strdup(item->source);
+ if (!s)
+ return -ENOMEM;
+
+ if (item->destination) {
+ d = strdup(item->destination);
+ if (!d)
+ return -ENOMEM;
+ }
+
+ LIST_FOREACH(mount_options, i, item->mount_options) {
+ _cleanup_(mount_options_free_allp) MountOptions *o = NULL;
+
+ o = new(MountOptions, 1);
+ if (!o)
+ return -ENOMEM;
+
+ *o = (MountOptions) {
+ .partition_designator = i->partition_designator,
+ .options = strdup(i->options),
+ };
+ if (!o->options)
+ return -ENOMEM;
+
+ LIST_APPEND(mount_options, options, TAKE_PTR(o));
+ }
+
+ c = reallocarray(*m, *n + 1, sizeof(MountImage));
+ if (!c)
+ return -ENOMEM;
+
+ *m = c;
+
+ c[(*n) ++] = (MountImage) {
+ .source = TAKE_PTR(s),
+ .destination = TAKE_PTR(d),
+ .mount_options = TAKE_PTR(options),
+ .ignore_enoent = item->ignore_enoent,
+ .type = item->type,
+ };
+
+ return 0;
+}
+
+void temporary_filesystem_free_many(TemporaryFileSystem *t, size_t n) {
+ assert(t || n == 0);
+
+ for (size_t i = 0; i < n; i++) {
+ free(t[i].path);
+ free(t[i].options);
+ }
+
+ free(t);
+}
+
+int temporary_filesystem_add(
+ TemporaryFileSystem **t,
+ size_t *n,
+ const char *path,
+ const char *options) {
+
+ _cleanup_free_ char *p = NULL, *o = NULL;
+ TemporaryFileSystem *c;
+
+ assert(t);
+ assert(n);
+ assert(path);
+
+ p = strdup(path);
+ if (!p)
+ return -ENOMEM;
+
+ if (!isempty(options)) {
+ o = strdup(options);
+ if (!o)
+ return -ENOMEM;
+ }
+
+ c = reallocarray(*t, *n + 1, sizeof(TemporaryFileSystem));
+ if (!c)
+ return -ENOMEM;
+
+ *t = c;
+
+ c[(*n) ++] = (TemporaryFileSystem) {
+ .path = TAKE_PTR(p),
+ .options = TAKE_PTR(o),
+ };
+
+ return 0;
+}
+
+static int make_tmp_prefix(const char *prefix) {
+ _cleanup_free_ char *t = NULL;
+ _cleanup_close_ int fd = -1;
+ int r;
+
+ /* Don't do anything unless we know the dir is actually missing */
+ r = access(prefix, F_OK);
+ if (r >= 0)
+ return 0;
+ if (errno != ENOENT)
+ return -errno;
+
+ RUN_WITH_UMASK(000)
+ r = mkdir_parents(prefix, 0755);
+ if (r < 0)
+ return r;
+
+ r = tempfn_random(prefix, NULL, &t);
+ if (r < 0)
+ return r;
+
+ /* umask will corrupt this access mode, but that doesn't matter, we need to call chmod() anyway for
+ * the suid bit, below. */
+ fd = open_mkdir_at(AT_FDCWD, t, O_EXCL|O_CLOEXEC, 0777);
+ if (fd < 0)
+ return fd;
+
+ r = RET_NERRNO(fchmod(fd, 01777));
+ if (r < 0) {
+ (void) rmdir(t);
+ return r;
+ }
+
+ r = RET_NERRNO(rename(t, prefix));
+ if (r < 0) {
+ (void) rmdir(t);
+ return r == -EEXIST ? 0 : r; /* it's fine if someone else created the dir by now */
+ }
+
+ return 0;
+
+}
+
+static int setup_one_tmp_dir(const char *id, const char *prefix, char **path, char **tmp_path) {
+ _cleanup_free_ char *x = NULL;
+ _cleanup_free_ char *y = NULL;
+ sd_id128_t boot_id;
+ bool rw = true;
+ int r;
+
+ assert(id);
+ assert(prefix);
+ assert(path);
+
+ /* We include the boot id in the directory so that after a
+ * reboot we can easily identify obsolete directories. */
+
+ r = sd_id128_get_boot(&boot_id);
+ if (r < 0)
+ return r;
+
+ x = strjoin(prefix, "/systemd-private-", SD_ID128_TO_STRING(boot_id), "-", id, "-XXXXXX");
+ if (!x)
+ return -ENOMEM;
+
+ r = make_tmp_prefix(prefix);
+ if (r < 0)
+ return r;
+
+ RUN_WITH_UMASK(0077)
+ if (!mkdtemp(x)) {
+ if (errno == EROFS || ERRNO_IS_DISK_SPACE(errno))
+ rw = false;
+ else
+ return -errno;
+ }
+
+ if (rw) {
+ y = strjoin(x, "/tmp");
+ if (!y)
+ return -ENOMEM;
+
+ RUN_WITH_UMASK(0000)
+ if (mkdir(y, 0777 | S_ISVTX) < 0)
+ return -errno;
+
+ r = label_fix_full(AT_FDCWD, y, prefix, 0);
+ if (r < 0)
+ return r;
+
+ if (tmp_path)
+ *tmp_path = TAKE_PTR(y);
+ } else {
+ /* Trouble: we failed to create the directory. Instead of failing, let's simulate /tmp being
+ * read-only. This way the service will get the EROFS result as if it was writing to the real
+ * file system. */
+ RUN_WITH_UMASK(0000)
+ r = mkdir_p(RUN_SYSTEMD_EMPTY, 0500);
+ if (r < 0)
+ return r;
+
+ r = free_and_strdup(&x, RUN_SYSTEMD_EMPTY);
+ if (r < 0)
+ return r;
+ }
+
+ *path = TAKE_PTR(x);
+ return 0;
+}
+
+int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
+ _cleanup_(namespace_cleanup_tmpdirp) char *a = NULL;
+ _cleanup_(rmdir_and_freep) char *a_tmp = NULL;
+ char *b;
+ int r;
+
+ assert(id);
+ assert(tmp_dir);
+ assert(var_tmp_dir);
+
+ r = setup_one_tmp_dir(id, "/tmp", &a, &a_tmp);
+ if (r < 0)
+ return r;
+
+ r = setup_one_tmp_dir(id, "/var/tmp", &b, NULL);
+ if (r < 0)
+ return r;
+
+ a_tmp = mfree(a_tmp); /* avoid rmdir */
+ *tmp_dir = TAKE_PTR(a);
+ *var_tmp_dir = TAKE_PTR(b);
+
+ return 0;
+}
+
+int setup_shareable_ns(const int ns_storage_socket[static 2], unsigned long nsflag) {
+ _cleanup_close_ int ns = -1;
+ int r, q;
+ const char *ns_name, *ns_path;
+
+ assert(ns_storage_socket);
+ assert(ns_storage_socket[0] >= 0);
+ assert(ns_storage_socket[1] >= 0);
+
+ ns_name = namespace_single_flag_to_string(nsflag);
+ assert(ns_name);
+
+ /* We use the passed socketpair as a storage buffer for our
+ * namespace reference fd. Whatever process runs this first
+ * shall create a new namespace, all others should just join
+ * it. To serialize that we use a file lock on the socket
+ * pair.
+ *
+ * It's a bit crazy, but hey, works great! */
+
+ if (lockf(ns_storage_socket[0], F_LOCK, 0) < 0)
+ return -errno;
+
+ ns = receive_one_fd(ns_storage_socket[0], MSG_DONTWAIT);
+ if (ns == -EAGAIN) {
+ /* Nothing stored yet, so let's create a new namespace. */
+
+ if (unshare(nsflag) < 0) {
+ r = -errno;
+ goto fail;
+ }
+
+ (void) loopback_setup();
+
+ ns_path = strjoina("/proc/self/ns/", ns_name);
+ ns = open(ns_path, O_RDONLY|O_CLOEXEC|O_NOCTTY);
+ if (ns < 0) {
+ r = -errno;
+ goto fail;
+ }
+
+ r = 1;
+
+ } else if (ns < 0) {
+ r = ns;
+ goto fail;
+
+ } else {
+ /* Yay, found something, so let's join the namespace */
+ if (setns(ns, nsflag) < 0) {
+ r = -errno;
+ goto fail;
+ }
+
+ r = 0;
+ }
+
+ q = send_one_fd(ns_storage_socket[1], ns, MSG_DONTWAIT);
+ if (q < 0) {
+ r = q;
+ goto fail;
+ }
+
+fail:
+ (void) lockf(ns_storage_socket[0], F_ULOCK, 0);
+ return r;
+}
+
+int open_shareable_ns_path(const int ns_storage_socket[static 2], const char *path, unsigned long nsflag) {
+ _cleanup_close_ int ns = -1;
+ int q, r;
+
+ assert(ns_storage_socket);
+ assert(ns_storage_socket[0] >= 0);
+ assert(ns_storage_socket[1] >= 0);
+ assert(path);
+
+ /* If the storage socket doesn't contain a ns fd yet, open one via the file system and store it in
+ * it. This is supposed to be called ahead of time, i.e. before setup_shareable_ns() which will
+ * allocate a new anonymous ns if needed. */
+
+ if (lockf(ns_storage_socket[0], F_LOCK, 0) < 0)
+ return -errno;
+
+ ns = receive_one_fd(ns_storage_socket[0], MSG_DONTWAIT);
+ if (ns == -EAGAIN) {
+ /* Nothing stored yet. Open the file from the file system. */
+
+ ns = open(path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
+ if (ns < 0) {
+ r = -errno;
+ goto fail;
+ }
+
+ r = fd_is_ns(ns, nsflag);
+ if (r == 0) { /* Not a ns of our type? Refuse early. */
+ r = -EINVAL;
+ goto fail;
+ }
+ if (r < 0 && r != -EUCLEAN) /* EUCLEAN: we don't know */
+ goto fail;
+
+ r = 1;
+
+ } else if (ns < 0) {
+ r = ns;
+ goto fail;
+ } else
+ r = 0; /* Already allocated */
+
+ q = send_one_fd(ns_storage_socket[1], ns, MSG_DONTWAIT);
+ if (q < 0) {
+ r = q;
+ goto fail;
+ }
+
+fail:
+ (void) lockf(ns_storage_socket[0], F_ULOCK, 0);
+ return r;
+}
+
+bool ns_type_supported(NamespaceType type) {
+ const char *t, *ns_proc;
+
+ t = namespace_type_to_string(type);
+ if (!t) /* Don't know how to translate this? Then it's not supported */
+ return false;
+
+ ns_proc = strjoina("/proc/self/ns/", t);
+ return access(ns_proc, F_OK) == 0;
+}
+
+static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
+ [PROTECT_HOME_NO] = "no",
+ [PROTECT_HOME_YES] = "yes",
+ [PROTECT_HOME_READ_ONLY] = "read-only",
+ [PROTECT_HOME_TMPFS] = "tmpfs",
+};
+
+DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_home, ProtectHome, PROTECT_HOME_YES);
+
+static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
+ [PROTECT_SYSTEM_NO] = "no",
+ [PROTECT_SYSTEM_YES] = "yes",
+ [PROTECT_SYSTEM_FULL] = "full",
+ [PROTECT_SYSTEM_STRICT] = "strict",
+};
+
+DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_system, ProtectSystem, PROTECT_SYSTEM_YES);
+
+static const char* const namespace_type_table[] = {
+ [NAMESPACE_MOUNT] = "mnt",
+ [NAMESPACE_CGROUP] = "cgroup",
+ [NAMESPACE_UTS] = "uts",
+ [NAMESPACE_IPC] = "ipc",
+ [NAMESPACE_USER] = "user",
+ [NAMESPACE_PID] = "pid",
+ [NAMESPACE_NET] = "net",
+ [NAMESPACE_TIME] = "time",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);
+
+static const char* const protect_proc_table[_PROTECT_PROC_MAX] = {
+ [PROTECT_PROC_DEFAULT] = "default",
+ [PROTECT_PROC_NOACCESS] = "noaccess",
+ [PROTECT_PROC_INVISIBLE] = "invisible",
+ [PROTECT_PROC_PTRACEABLE] = "ptraceable",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(protect_proc, ProtectProc);
+
+static const char* const proc_subset_table[_PROC_SUBSET_MAX] = {
+ [PROC_SUBSET_ALL] = "all",
+ [PROC_SUBSET_PID] = "pid",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(proc_subset, ProcSubset);
diff --git a/src/core/namespace.h b/src/core/namespace.h
new file mode 100644
index 0000000..2ba5970
--- /dev/null
+++ b/src/core/namespace.h
@@ -0,0 +1,184 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+/***
+ Copyright © 2016 Djalal Harouni
+***/
+
+typedef struct NamespaceInfo NamespaceInfo;
+typedef struct BindMount BindMount;
+typedef struct TemporaryFileSystem TemporaryFileSystem;
+typedef struct MountImage MountImage;
+
+#include <stdbool.h>
+
+#include "dissect-image.h"
+#include "fs-util.h"
+#include "macro.h"
+#include "namespace-util.h"
+#include "string-util.h"
+
+typedef enum ProtectHome {
+ PROTECT_HOME_NO,
+ PROTECT_HOME_YES,
+ PROTECT_HOME_READ_ONLY,
+ PROTECT_HOME_TMPFS,
+ _PROTECT_HOME_MAX,
+ _PROTECT_HOME_INVALID = -EINVAL,
+} ProtectHome;
+
+typedef enum ProtectSystem {
+ PROTECT_SYSTEM_NO,
+ PROTECT_SYSTEM_YES,
+ PROTECT_SYSTEM_FULL,
+ PROTECT_SYSTEM_STRICT,
+ _PROTECT_SYSTEM_MAX,
+ _PROTECT_SYSTEM_INVALID = -EINVAL,
+} ProtectSystem;
+
+typedef enum ProtectProc {
+ PROTECT_PROC_DEFAULT,
+ PROTECT_PROC_NOACCESS, /* hidepid=noaccess */
+ PROTECT_PROC_INVISIBLE, /* hidepid=invisible */
+ PROTECT_PROC_PTRACEABLE, /* hidepid=ptraceable */
+ _PROTECT_PROC_MAX,
+ _PROTECT_PROC_INVALID = -EINVAL,
+} ProtectProc;
+
+typedef enum ProcSubset {
+ PROC_SUBSET_ALL,
+ PROC_SUBSET_PID, /* subset=pid */
+ _PROC_SUBSET_MAX,
+ _PROC_SUBSET_INVALID = -EINVAL,
+} ProcSubset;
+
+struct NamespaceInfo {
+ bool ignore_protect_paths;
+ bool private_dev;
+ bool private_mounts;
+ bool protect_control_groups;
+ bool protect_kernel_tunables;
+ bool protect_kernel_modules;
+ bool protect_kernel_logs;
+ bool mount_apivfs;
+ bool protect_hostname;
+ bool private_ipc;
+ bool mount_nosuid;
+ ProtectHome protect_home;
+ ProtectSystem protect_system;
+ ProtectProc protect_proc;
+ ProcSubset proc_subset;
+};
+
+struct BindMount {
+ char *source;
+ char *destination;
+ bool read_only;
+ bool nosuid;
+ bool recursive;
+ bool ignore_enoent;
+};
+
+struct TemporaryFileSystem {
+ char *path;
+ char *options;
+};
+
+typedef enum MountImageType {
+ MOUNT_IMAGE_DISCRETE,
+ MOUNT_IMAGE_EXTENSION,
+ _MOUNT_IMAGE_TYPE_MAX,
+ _MOUNT_IMAGE_TYPE_INVALID = -EINVAL,
+} MountImageType;
+
+struct MountImage {
+ char *source;
+ char *destination; /* Unused if MountImageType == MOUNT_IMAGE_EXTENSION */
+ LIST_HEAD(MountOptions, mount_options);
+ bool ignore_enoent;
+ MountImageType type;
+};
+
+int setup_namespace(
+ const char *root_directory,
+ const char *root_image,
+ const MountOptions *root_image_options,
+ const NamespaceInfo *ns_info,
+ char **read_write_paths,
+ char **read_only_paths,
+ char **inaccessible_paths,
+ char **exec_paths,
+ char **no_exec_paths,
+ char **empty_directories,
+ char **exec_dir_symlinks,
+ const BindMount *bind_mounts,
+ size_t n_bind_mounts,
+ const TemporaryFileSystem *temporary_filesystems,
+ size_t n_temporary_filesystems,
+ const MountImage *mount_images,
+ size_t n_mount_images,
+ const char *tmp_dir,
+ const char *var_tmp_dir,
+ const char *creds_path,
+ const char *log_namespace,
+ unsigned long mount_flags,
+ const void *root_hash,
+ size_t root_hash_size,
+ const char *root_hash_path,
+ const void *root_hash_sig,
+ size_t root_hash_sig_size,
+ const char *root_hash_sig_path,
+ const char *root_verity,
+ const MountImage *extension_images,
+ size_t n_extension_images,
+ char **extension_directories,
+ const char *propagate_dir,
+ const char *incoming_dir,
+ const char *extension_dir,
+ const char *notify_socket,
+ char **error_path);
+
+#define RUN_SYSTEMD_EMPTY "/run/systemd/empty"
+
+static inline char* namespace_cleanup_tmpdir(char *p) {
+ PROTECT_ERRNO;
+ if (!streq_ptr(p, RUN_SYSTEMD_EMPTY))
+ (void) rmdir(p);
+ return mfree(p);
+}
+DEFINE_TRIVIAL_CLEANUP_FUNC(char*, namespace_cleanup_tmpdir);
+
+int setup_tmp_dirs(
+ const char *id,
+ char **tmp_dir,
+ char **var_tmp_dir);
+
+int setup_shareable_ns(const int ns_storage_socket[static 2], unsigned long nsflag);
+int open_shareable_ns_path(const int netns_storage_socket[static 2], const char *path, unsigned long nsflag);
+
+const char* protect_home_to_string(ProtectHome p) _const_;
+ProtectHome protect_home_from_string(const char *s) _pure_;
+
+const char* protect_system_to_string(ProtectSystem p) _const_;
+ProtectSystem protect_system_from_string(const char *s) _pure_;
+
+const char* protect_proc_to_string(ProtectProc i) _const_;
+ProtectProc protect_proc_from_string(const char *s) _pure_;
+
+const char* proc_subset_to_string(ProcSubset i) _const_;
+ProcSubset proc_subset_from_string(const char *s) _pure_;
+
+void bind_mount_free_many(BindMount *b, size_t n);
+int bind_mount_add(BindMount **b, size_t *n, const BindMount *item);
+
+void temporary_filesystem_free_many(TemporaryFileSystem *t, size_t n);
+int temporary_filesystem_add(TemporaryFileSystem **t, size_t *n,
+ const char *path, const char *options);
+
+MountImage* mount_image_free_many(MountImage *m, size_t *n);
+int mount_image_add(MountImage **m, size_t *n, const MountImage *item);
+
+const char* namespace_type_to_string(NamespaceType t) _const_;
+NamespaceType namespace_type_from_string(const char *s) _pure_;
+
+bool ns_type_supported(NamespaceType type);
diff --git a/src/core/org.freedesktop.systemd1.conf b/src/core/org.freedesktop.systemd1.conf
new file mode 100644
index 0000000..fe9de01
--- /dev/null
+++ b/src/core/org.freedesktop.systemd1.conf
@@ -0,0 +1,428 @@
+<?xml version="1.0"?> <!--*-nxml-*-->
+<!DOCTYPE busconfig PUBLIC "-//freedesktop//DTD D-BUS Bus Configuration 1.0//EN"
+ "https://www.freedesktop.org/standards/dbus/1.0/busconfig.dtd">
+
+<!--
+ SPDX-License-Identifier: LGPL-2.1-or-later
+
+ This file is part of systemd.
+
+ systemd is free software; you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or
+ (at your option) any later version.
+-->
+
+<busconfig>
+
+ <policy user="root">
+ <allow own="org.freedesktop.systemd1"/>
+
+ <!-- Root clients can do everything -->
+ <allow send_destination="org.freedesktop.systemd1"/>
+ <allow receive_sender="org.freedesktop.systemd1"/>
+
+ <!-- systemd may receive activator requests -->
+ <allow receive_interface="org.freedesktop.systemd1.Activator"
+ receive_member="ActivationRequest"/>
+ </policy>
+
+ <policy context="default">
+ <deny send_destination="org.freedesktop.systemd1"/>
+
+ <!-- Completely open to anyone: org.freedesktop.DBus.* interfaces -->
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.DBus.Introspectable"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.DBus.Peer"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.DBus.Properties"
+ send_member="Get"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.DBus.Properties"
+ send_member="GetAll"/>
+
+ <!-- Completely open to anyone: org.freedesktop.systemd1.Manager interface -->
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="GetUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="GetUnitByPID"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="GetUnitByInvocationID"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="GetUnitByControlGroup"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="LoadUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="GetUnitProcesses"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="GetJob"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="GetJobAfter"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="GetJobBefore"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="ListUnits"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="ListUnitsFiltered"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="ListUnitsByPatterns"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="ListUnitsByNames"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="ListJobs"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="Subscribe"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="Unsubscribe"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="Dump"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="DumpByFileDescriptor"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="DumpUnitsMatchingPatterns"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="DumpUnitsMatchingPatternsByFileDescriptor"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="ListUnitFiles"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="ListUnitFilesByPatterns"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="GetUnitFileState"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="GetDefaultTarget"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="GetUnitFileLinks"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="LookupDynamicUserByName"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="LookupDynamicUserByUID"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="GetDynamicUsers"/>
+
+ <!-- Completely open to anyone: org.freedesktop.systemd1.Unit interface -->
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Service"
+ send_member="GetProcesses"/>
+
+ <!-- Completely open to anyone: org.freedesktop.systemd1.Slice interface -->
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Slice"
+ send_member="GetProcesses"/>
+
+ <!-- Completely open to anyone: org.freedesktop.systemd1.Scope interface -->
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Scope"
+ send_member="GetProcesses"/>
+
+ <!-- Completely open to anyone: org.freedesktop.systemd1.Socket interface -->
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Socket"
+ send_member="GetProcesses"/>
+
+ <!-- Completely open to anyone: org.freedesktop.systemd1.Mount interface -->
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Mount"
+ send_member="GetProcesses"/>
+
+ <!-- Completely open to anyone: org.freedesktop.systemd1.Swap interface -->
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Swap"
+ send_member="GetProcesses"/>
+
+ <!-- Managed via polkit or other criteria: org.freedesktop.systemd1.Manager interface -->
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="StartUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="StartUnitReplace"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="StopUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="ReloadUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="RestartUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="TryRestartUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="ReloadOrRestartUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="ReloadOrTryRestartUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="BindMountUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="MountImageUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="KillUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="ResetFailedUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="SetUnitProperties"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="RefUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="UnrefUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="StartTransientUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="AttachProcessesToUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="CancelJob"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="ClearJobs"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="ResetFailed"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="Reload"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="Reexecute"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="EnableUnitFiles"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="DisableUnitFiles"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="ReenableUnitFiles"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="LinkUnitFiles"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="PresetUnitFiles"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="PresetUnitFilesWithMode"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="MaskUnitFiles"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="UnmaskUnitFiles"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="RevertUnitFiles"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="SetDefaultTarget"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="PresetAllUnitFiles"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="AddDependencyUnitFiles"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="SetShowStatus"/>
+
+ <!-- Managed via polkit or other criteria: org.freedesktop.systemd1.Job interface -->
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Job"
+ send_member="Cancel"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Job"
+ send_member="GetAfter"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Job"
+ send_member="GetBefore"/>
+
+ <!-- Managed via polkit or other criteria: org.freedesktop.systemd1.Unit interface -->
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Unit"
+ send_member="Start"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Unit"
+ send_member="Stop"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Unit"
+ send_member="Reload"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Unit"
+ send_member="Restart"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Unit"
+ send_member="TryRestart"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Unit"
+ send_member="ReloadOrRestart"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Unit"
+ send_member="ReloadOrTryRestart"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Unit"
+ send_member="Kill"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Unit"
+ send_member="ResetFailed"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Unit"
+ send_member="SetProperties"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Unit"
+ send_member="Ref"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Unit"
+ send_member="Unref"/>
+
+ <!-- Managed via polkit or other criteria: org.freedesktop.systemd1.Service interface -->
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Service"
+ send_member="AttachProcesses"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Service"
+ send_member="BindMount"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Service"
+ send_member="MountImage"/>
+
+ <!-- Managed via polkit or other criteria: org.freedesktop.systemd1.Scope interface -->
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Scope"
+ send_member="AttachProcesses"/>
+
+ <allow receive_sender="org.freedesktop.systemd1"/>
+ </policy>
+
+</busconfig>
diff --git a/src/core/org.freedesktop.systemd1.policy.in b/src/core/org.freedesktop.systemd1.policy.in
new file mode 100644
index 0000000..9e9a20f
--- /dev/null
+++ b/src/core/org.freedesktop.systemd1.policy.in
@@ -0,0 +1,83 @@
+<?xml version="1.0" encoding="UTF-8"?> <!--*-nxml-*-->
+<!DOCTYPE policyconfig PUBLIC "-//freedesktop//DTD PolicyKit Policy Configuration 1.0//EN"
+ "https://www.freedesktop.org/standards/PolicyKit/1/policyconfig.dtd">
+
+<!--
+ SPDX-License-Identifier: LGPL-2.1-or-later
+
+ This file is part of systemd.
+
+ systemd is free software; you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or
+ (at your option) any later version.
+-->
+
+<policyconfig>
+
+ <vendor>The systemd Project</vendor>
+ <vendor_url>https://systemd.io</vendor_url>
+
+ <action id="org.freedesktop.systemd1.reply-password">
+ <description gettext-domain="systemd">Send passphrase back to system</description>
+ <message gettext-domain="systemd">Authentication is required to send the entered passphrase back to the system.</message>
+ <defaults>
+ <allow_any>no</allow_any>
+ <allow_inactive>no</allow_inactive>
+ <allow_active>auth_admin_keep</allow_active>
+ </defaults>
+ <annotate key="org.freedesktop.policykit.exec.path">{{ROOTLIBEXECDIR}}/systemd-reply-password</annotate>
+ </action>
+
+ <action id="org.freedesktop.systemd1.manage-units">
+ <description gettext-domain="systemd">Manage system services or other units</description>
+ <message gettext-domain="systemd">Authentication is required to manage system services or other units.</message>
+ <defaults>
+ <allow_any>auth_admin</allow_any>
+ <allow_inactive>auth_admin</allow_inactive>
+ <allow_active>auth_admin_keep</allow_active>
+ </defaults>
+ </action>
+
+ <action id="org.freedesktop.systemd1.manage-unit-files">
+ <description gettext-domain="systemd">Manage system service or unit files</description>
+ <message gettext-domain="systemd">Authentication is required to manage system service or unit files.</message>
+ <defaults>
+ <allow_any>auth_admin</allow_any>
+ <allow_inactive>auth_admin</allow_inactive>
+ <allow_active>auth_admin_keep</allow_active>
+ </defaults>
+ <annotate key="org.freedesktop.policykit.imply">org.freedesktop.systemd1.reload-daemon org.freedesktop.systemd1.manage-units</annotate>
+ </action>
+
+ <action id="org.freedesktop.systemd1.set-environment">
+ <description gettext-domain="systemd">Set or unset system and service manager environment variables</description>
+ <message gettext-domain="systemd">Authentication is required to set or unset system and service manager environment variables.</message>
+ <defaults>
+ <allow_any>auth_admin</allow_any>
+ <allow_inactive>auth_admin</allow_inactive>
+ <allow_active>auth_admin_keep</allow_active>
+ </defaults>
+ </action>
+
+ <action id="org.freedesktop.systemd1.reload-daemon">
+ <description gettext-domain="systemd">Reload the systemd state</description>
+ <message gettext-domain="systemd">Authentication is required to reload the systemd state.</message>
+ <defaults>
+ <allow_any>auth_admin</allow_any>
+ <allow_inactive>auth_admin</allow_inactive>
+ <allow_active>auth_admin_keep</allow_active>
+ </defaults>
+ </action>
+
+ <action id="org.freedesktop.systemd1.bypass-dump-ratelimit">
+ <description gettext-domain="systemd">Dump the systemd state without rate limits</description>
+ <message gettext-domain="systemd">Authentication is required to dump the systemd state without rate limits.</message>
+ <defaults>
+ <allow_any>auth_admin</allow_any>
+ <allow_inactive>auth_admin</allow_inactive>
+ <allow_active>auth_admin_keep</allow_active>
+ </defaults>
+ </action>
+
+</policyconfig>
diff --git a/src/core/org.freedesktop.systemd1.service b/src/core/org.freedesktop.systemd1.service
new file mode 100644
index 0000000..082125f
--- /dev/null
+++ b/src/core/org.freedesktop.systemd1.service
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+#
+# This file is part of systemd.
+#
+# systemd is free software; you can redistribute it and/or modify it
+# under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or
+# (at your option) any later version.
+
+[D-BUS Service]
+Name=org.freedesktop.systemd1
+Exec=/bin/false
+User=root
diff --git a/src/core/path.c b/src/core/path.c
new file mode 100644
index 0000000..a8144c3
--- /dev/null
+++ b/src/core/path.c
@@ -0,0 +1,1072 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <sys/epoll.h>
+#include <sys/inotify.h>
+#include <unistd.h>
+
+#include "bus-error.h"
+#include "bus-util.h"
+#include "dbus-path.h"
+#include "dbus-unit.h"
+#include "escape.h"
+#include "event-util.h"
+#include "fd-util.h"
+#include "glob-util.h"
+#include "inotify-util.h"
+#include "macro.h"
+#include "mkdir-label.h"
+#include "path.h"
+#include "path-util.h"
+#include "serialize.h"
+#include "special.h"
+#include "stat-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "unit-name.h"
+#include "unit.h"
+
+static const UnitActiveState state_translation_table[_PATH_STATE_MAX] = {
+ [PATH_DEAD] = UNIT_INACTIVE,
+ [PATH_WAITING] = UNIT_ACTIVE,
+ [PATH_RUNNING] = UNIT_ACTIVE,
+ [PATH_FAILED] = UNIT_FAILED,
+};
+
+static int path_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+
+int path_spec_watch(PathSpec *s, sd_event_io_handler_t handler) {
+ static const int flags_table[_PATH_TYPE_MAX] = {
+ [PATH_EXISTS] = IN_DELETE_SELF|IN_MOVE_SELF|IN_ATTRIB,
+ [PATH_EXISTS_GLOB] = IN_DELETE_SELF|IN_MOVE_SELF|IN_ATTRIB,
+ [PATH_CHANGED] = IN_DELETE_SELF|IN_MOVE_SELF|IN_ATTRIB|IN_CLOSE_WRITE|IN_CREATE|IN_DELETE|IN_MOVED_FROM|IN_MOVED_TO,
+ [PATH_MODIFIED] = IN_DELETE_SELF|IN_MOVE_SELF|IN_ATTRIB|IN_CLOSE_WRITE|IN_CREATE|IN_DELETE|IN_MOVED_FROM|IN_MOVED_TO|IN_MODIFY,
+ [PATH_DIRECTORY_NOT_EMPTY] = IN_DELETE_SELF|IN_MOVE_SELF|IN_ATTRIB|IN_CREATE|IN_MOVED_TO,
+ };
+
+ bool exists = false;
+ char *slash, *oldslash = NULL;
+ int r;
+
+ assert(s);
+ assert(s->unit);
+ assert(handler);
+
+ path_spec_unwatch(s);
+
+ s->inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
+ if (s->inotify_fd < 0) {
+ r = log_error_errno(errno, "Failed to allocate inotify fd: %m");
+ goto fail;
+ }
+
+ r = sd_event_add_io(s->unit->manager->event, &s->event_source, s->inotify_fd, EPOLLIN, handler, s);
+ if (r < 0) {
+ log_error_errno(r, "Failed to add inotify fd to event loop: %m");
+ goto fail;
+ }
+
+ (void) sd_event_source_set_description(s->event_source, "path");
+
+ /* This function assumes the path was passed through path_simplify()! */
+ assert(!strstr(s->path, "//"));
+
+ for (slash = strchr(s->path, '/'); ; slash = strchr(slash+1, '/')) {
+ bool incomplete = false;
+ int flags, wd = -1;
+ char tmp, *cut;
+
+ if (slash) {
+ cut = slash + (slash == s->path);
+ tmp = *cut;
+ *cut = '\0';
+
+ flags = IN_MOVE_SELF | IN_DELETE_SELF | IN_ATTRIB | IN_CREATE | IN_MOVED_TO;
+ } else {
+ cut = NULL;
+ flags = flags_table[s->type];
+ }
+
+ /* If this is a symlink watch both the symlink inode and where it points to. If the inode is
+ * not a symlink both calls will install the same watch, which is redundant and doesn't
+ * hurt. */
+ for (int follow_symlink = 0; follow_symlink < 2; follow_symlink ++) {
+ uint32_t f = flags;
+
+ SET_FLAG(f, IN_DONT_FOLLOW, !follow_symlink);
+
+ wd = inotify_add_watch(s->inotify_fd, s->path, f);
+ if (wd < 0) {
+ if (IN_SET(errno, EACCES, ENOENT)) {
+ incomplete = true; /* This is an expected error, let's accept this
+ * quietly: we have an incomplete watch for
+ * now. */
+ break;
+ }
+
+ /* This second call to inotify_add_watch() should fail like the previous one
+ * and is done for logging the error in a comprehensive way. */
+ wd = inotify_add_watch_and_warn(s->inotify_fd, s->path, f);
+ if (wd < 0) {
+ if (cut)
+ *cut = tmp;
+
+ r = wd;
+ goto fail;
+ }
+
+ /* Hmm, we succeeded in adding the watch this time... let's continue. */
+ }
+ }
+
+ if (incomplete) {
+ if (cut)
+ *cut = tmp;
+
+ break;
+ }
+
+ exists = true;
+
+ /* Path exists, we don't need to watch parent too closely. */
+ if (oldslash) {
+ char *cut2 = oldslash + (oldslash == s->path);
+ char tmp2 = *cut2;
+ *cut2 = '\0';
+
+ (void) inotify_add_watch(s->inotify_fd, s->path, IN_MOVE_SELF);
+ /* Error is ignored, the worst can happen is we get spurious events. */
+
+ *cut2 = tmp2;
+ }
+
+ if (cut)
+ *cut = tmp;
+
+ if (slash)
+ oldslash = slash;
+ else {
+ /* whole path has been iterated over */
+ s->primary_wd = wd;
+ break;
+ }
+ }
+
+ if (!exists) {
+ r = log_error_errno(errno, "Failed to add watch on any of the components of %s: %m", s->path);
+ /* either EACCESS or ENOENT */
+ goto fail;
+ }
+
+ return 0;
+
+fail:
+ path_spec_unwatch(s);
+ return r;
+}
+
+void path_spec_unwatch(PathSpec *s) {
+ assert(s);
+
+ s->event_source = sd_event_source_disable_unref(s->event_source);
+ s->inotify_fd = safe_close(s->inotify_fd);
+}
+
+int path_spec_fd_event(PathSpec *s, uint32_t revents) {
+ union inotify_event_buffer buffer;
+ ssize_t l;
+
+ assert(s);
+
+ if (revents != EPOLLIN)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Got invalid poll event on inotify.");
+
+ l = read(s->inotify_fd, &buffer, sizeof(buffer));
+ if (l < 0) {
+ if (ERRNO_IS_TRANSIENT(errno))
+ return 0;
+
+ return log_error_errno(errno, "Failed to read inotify event: %m");
+ }
+
+ if (IN_SET(s->type, PATH_CHANGED, PATH_MODIFIED))
+ FOREACH_INOTIFY_EVENT_WARN(e, buffer, l)
+ if (s->primary_wd == e->wd)
+ return 1;
+
+ return 0;
+}
+
+static bool path_spec_check_good(PathSpec *s, bool initial, bool from_trigger_notify, char **ret_trigger_path) {
+ _cleanup_free_ char *trigger = NULL;
+ bool b, good = false;
+
+ assert(s);
+ assert(ret_trigger_path);
+
+ switch (s->type) {
+
+ case PATH_EXISTS:
+ good = access(s->path, F_OK) >= 0;
+ break;
+
+ case PATH_EXISTS_GLOB:
+ good = glob_first(s->path, &trigger) > 0;
+ break;
+
+ case PATH_DIRECTORY_NOT_EMPTY: {
+ int k;
+
+ k = dir_is_empty(s->path, /* ignore_hidden_or_backup= */ true);
+ good = !(IN_SET(k, -ENOENT, -ENOTDIR) || k > 0);
+ break;
+ }
+
+ case PATH_CHANGED:
+ case PATH_MODIFIED:
+ b = access(s->path, F_OK) >= 0;
+ good = !initial && !from_trigger_notify && b != s->previous_exists;
+ s->previous_exists = b;
+ break;
+
+ default:
+ ;
+ }
+
+ if (good) {
+ if (!trigger) {
+ trigger = strdup(s->path);
+ if (!trigger)
+ (void) log_oom_debug();
+ }
+ *ret_trigger_path = TAKE_PTR(trigger);
+ }
+
+ return good;
+}
+
+static void path_spec_mkdir(PathSpec *s, mode_t mode) {
+ int r;
+
+ if (IN_SET(s->type, PATH_EXISTS, PATH_EXISTS_GLOB))
+ return;
+
+ r = mkdir_p_label(s->path, mode);
+ if (r < 0)
+ log_warning_errno(r, "mkdir(%s) failed: %m", s->path);
+}
+
+static void path_spec_dump(PathSpec *s, FILE *f, const char *prefix) {
+ const char *type;
+
+ assert_se(type = path_type_to_string(s->type));
+ fprintf(f, "%s%s: %s\n", prefix, type, s->path);
+}
+
+void path_spec_done(PathSpec *s) {
+ assert(s);
+ assert(s->inotify_fd == -1);
+
+ free(s->path);
+}
+
+static void path_init(Unit *u) {
+ Path *p = PATH(u);
+
+ assert(u);
+ assert(u->load_state == UNIT_STUB);
+
+ p->directory_mode = 0755;
+
+ p->trigger_limit.interval = USEC_INFINITY;
+ p->trigger_limit.burst = UINT_MAX;
+}
+
+void path_free_specs(Path *p) {
+ PathSpec *s;
+
+ assert(p);
+
+ while ((s = p->specs)) {
+ path_spec_unwatch(s);
+ LIST_REMOVE(spec, p->specs, s);
+ path_spec_done(s);
+ free(s);
+ }
+}
+
+static void path_done(Unit *u) {
+ Path *p = PATH(u);
+
+ assert(p);
+
+ p->trigger_notify_event_source = sd_event_source_disable_unref(p->trigger_notify_event_source);
+ path_free_specs(p);
+}
+
+static int path_add_mount_dependencies(Path *p) {
+ int r;
+
+ assert(p);
+
+ LIST_FOREACH(spec, s, p->specs) {
+ r = unit_require_mounts_for(UNIT(p), s->path, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+static int path_verify(Path *p) {
+ assert(p);
+ assert(UNIT(p)->load_state == UNIT_LOADED);
+
+ if (!p->specs)
+ return log_unit_error_errno(UNIT(p), SYNTHETIC_ERRNO(ENOEXEC), "Path unit lacks path setting. Refusing.");
+
+ return 0;
+}
+
+static int path_add_default_dependencies(Path *p) {
+ int r;
+
+ assert(p);
+
+ if (!UNIT(p)->default_dependencies)
+ return 0;
+
+ r = unit_add_dependency_by_name(UNIT(p), UNIT_BEFORE, SPECIAL_PATHS_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+ if (r < 0)
+ return r;
+
+ if (MANAGER_IS_SYSTEM(UNIT(p)->manager)) {
+ r = unit_add_two_dependencies_by_name(UNIT(p), UNIT_AFTER, UNIT_REQUIRES, SPECIAL_SYSINIT_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+ if (r < 0)
+ return r;
+ }
+
+ return unit_add_two_dependencies_by_name(UNIT(p), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+}
+
+static int path_add_trigger_dependencies(Path *p) {
+ Unit *x;
+ int r;
+
+ assert(p);
+
+ if (UNIT_TRIGGER(UNIT(p)))
+ return 0;
+
+ r = unit_load_related_unit(UNIT(p), ".service", &x);
+ if (r < 0)
+ return r;
+
+ return unit_add_two_dependencies(UNIT(p), UNIT_BEFORE, UNIT_TRIGGERS, x, true, UNIT_DEPENDENCY_IMPLICIT);
+}
+
+static int path_add_extras(Path *p) {
+ int r;
+
+ assert(p);
+
+ /* To avoid getting pid1 in a busy-loop state (eg: unmet condition on associated service),
+ * set a default trigger limit if the user didn't specify any. */
+ if (p->trigger_limit.interval == USEC_INFINITY)
+ p->trigger_limit.interval = 2 * USEC_PER_SEC;
+
+ if (p->trigger_limit.burst == UINT_MAX)
+ p->trigger_limit.burst = 200;
+
+ r = path_add_trigger_dependencies(p);
+ if (r < 0)
+ return r;
+
+ r = path_add_mount_dependencies(p);
+ if (r < 0)
+ return r;
+
+ return path_add_default_dependencies(p);
+}
+
+static int path_load(Unit *u) {
+ Path *p = PATH(u);
+ int r;
+
+ assert(u);
+ assert(u->load_state == UNIT_STUB);
+
+ r = unit_load_fragment_and_dropin(u, true);
+ if (r < 0)
+ return r;
+
+ if (u->load_state != UNIT_LOADED)
+ return 0;
+
+ r = path_add_extras(p);
+ if (r < 0)
+ return r;
+
+ return path_verify(p);
+}
+
+static void path_dump(Unit *u, FILE *f, const char *prefix) {
+ Path *p = PATH(u);
+ Unit *trigger;
+
+ assert(p);
+ assert(f);
+
+ trigger = UNIT_TRIGGER(u);
+
+ fprintf(f,
+ "%sPath State: %s\n"
+ "%sResult: %s\n"
+ "%sUnit: %s\n"
+ "%sMakeDirectory: %s\n"
+ "%sDirectoryMode: %04o\n"
+ "%sTriggerLimitIntervalSec: %s\n"
+ "%sTriggerLimitBurst: %u\n",
+ prefix, path_state_to_string(p->state),
+ prefix, path_result_to_string(p->result),
+ prefix, trigger ? trigger->id : "n/a",
+ prefix, yes_no(p->make_directory),
+ prefix, p->directory_mode,
+ prefix, FORMAT_TIMESPAN(p->trigger_limit.interval, USEC_PER_SEC),
+ prefix, p->trigger_limit.burst);
+
+ LIST_FOREACH(spec, s, p->specs)
+ path_spec_dump(s, f, prefix);
+}
+
+static void path_unwatch(Path *p) {
+ assert(p);
+
+ LIST_FOREACH(spec, s, p->specs)
+ path_spec_unwatch(s);
+}
+
+static int path_watch(Path *p) {
+ int r;
+
+ assert(p);
+
+ LIST_FOREACH(spec, s, p->specs) {
+ r = path_spec_watch(s, path_dispatch_io);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+static void path_set_state(Path *p, PathState state) {
+ PathState old_state;
+ assert(p);
+
+ if (p->state != state)
+ bus_unit_send_pending_change_signal(UNIT(p), false);
+
+ old_state = p->state;
+ p->state = state;
+
+ if (!IN_SET(state, PATH_WAITING, PATH_RUNNING))
+ path_unwatch(p);
+
+ if (state != old_state)
+ log_unit_debug(UNIT(p), "Changed %s -> %s", path_state_to_string(old_state), path_state_to_string(state));
+
+ unit_notify(UNIT(p), state_translation_table[old_state], state_translation_table[state], 0);
+}
+
+static void path_enter_waiting(Path *p, bool initial, bool from_trigger_notify);
+
+static int path_coldplug(Unit *u) {
+ Path *p = PATH(u);
+
+ assert(p);
+ assert(p->state == PATH_DEAD);
+
+ if (p->deserialized_state != p->state) {
+
+ if (IN_SET(p->deserialized_state, PATH_WAITING, PATH_RUNNING))
+ path_enter_waiting(p, true, false);
+ else
+ path_set_state(p, p->deserialized_state);
+ }
+
+ return 0;
+}
+
+static void path_enter_dead(Path *p, PathResult f) {
+ assert(p);
+
+ if (p->result == PATH_SUCCESS)
+ p->result = f;
+
+ unit_log_result(UNIT(p), p->result == PATH_SUCCESS, path_result_to_string(p->result));
+ path_set_state(p, p->result != PATH_SUCCESS ? PATH_FAILED : PATH_DEAD);
+}
+
+static void path_enter_running(Path *p, char *trigger_path) {
+ _cleanup_(activation_details_unrefp) ActivationDetails *details = NULL;
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ Unit *trigger;
+ Job *job;
+ int r;
+
+ assert(p);
+
+ /* Don't start job if we are supposed to go down */
+ if (unit_stop_pending(UNIT(p)))
+ return;
+
+ if (!ratelimit_below(&p->trigger_limit)) {
+ log_unit_warning(UNIT(p), "Trigger limit hit, refusing further activation.");
+ path_enter_dead(p, PATH_FAILURE_TRIGGER_LIMIT_HIT);
+ return;
+ }
+
+ trigger = UNIT_TRIGGER(UNIT(p));
+ if (!trigger) {
+ log_unit_error(UNIT(p), "Unit to trigger vanished.");
+ path_enter_dead(p, PATH_FAILURE_RESOURCES);
+ return;
+ }
+
+ details = activation_details_new(UNIT(p));
+ if (!details) {
+ r = -ENOMEM;
+ goto fail;
+ }
+
+ r = free_and_strdup(&(ACTIVATION_DETAILS_PATH(details))->trigger_path_filename, trigger_path);
+ if (r < 0)
+ goto fail;
+
+ r = manager_add_job(UNIT(p)->manager, JOB_START, trigger, JOB_REPLACE, NULL, &error, &job);
+ if (r < 0)
+ goto fail;
+
+ job_set_activation_details(job, details);
+
+ path_set_state(p, PATH_RUNNING);
+ path_unwatch(p);
+
+ return;
+
+fail:
+ log_unit_warning(UNIT(p), "Failed to queue unit startup job: %s", bus_error_message(&error, r));
+ path_enter_dead(p, PATH_FAILURE_RESOURCES);
+}
+
+static bool path_check_good(Path *p, bool initial, bool from_trigger_notify, char **ret_trigger_path) {
+ assert(p);
+ assert(ret_trigger_path);
+
+ LIST_FOREACH(spec, s, p->specs)
+ if (path_spec_check_good(s, initial, from_trigger_notify, ret_trigger_path))
+ return true;
+
+ return false;
+}
+
+static void path_enter_waiting(Path *p, bool initial, bool from_trigger_notify) {
+ _cleanup_free_ char *trigger_path = NULL;
+ Unit *trigger;
+ int r;
+
+ if (p->trigger_notify_event_source)
+ (void) event_source_disable(p->trigger_notify_event_source);
+
+ /* If the triggered unit is already running, so are we */
+ trigger = UNIT_TRIGGER(UNIT(p));
+ if (trigger && !UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(trigger))) {
+ path_set_state(p, PATH_RUNNING);
+ path_unwatch(p);
+ return;
+ }
+
+ if (path_check_good(p, initial, from_trigger_notify, &trigger_path)) {
+ log_unit_debug(UNIT(p), "Got triggered.");
+ path_enter_running(p, trigger_path);
+ return;
+ }
+
+ r = path_watch(p);
+ if (r < 0)
+ goto fail;
+
+ /* Hmm, so now we have created inotify watches, but the file
+ * might have appeared/been removed by now, so we must
+ * recheck */
+
+ if (path_check_good(p, false, from_trigger_notify, &trigger_path)) {
+ log_unit_debug(UNIT(p), "Got triggered.");
+ path_enter_running(p, trigger_path);
+ return;
+ }
+
+ path_set_state(p, PATH_WAITING);
+ return;
+
+fail:
+ log_unit_warning_errno(UNIT(p), r, "Failed to enter waiting state: %m");
+ path_enter_dead(p, PATH_FAILURE_RESOURCES);
+}
+
+static void path_mkdir(Path *p) {
+ assert(p);
+
+ if (!p->make_directory)
+ return;
+
+ LIST_FOREACH(spec, s, p->specs)
+ path_spec_mkdir(s, p->directory_mode);
+}
+
+static int path_start(Unit *u) {
+ Path *p = PATH(u);
+ int r;
+
+ assert(p);
+ assert(IN_SET(p->state, PATH_DEAD, PATH_FAILED));
+
+ r = unit_test_trigger_loaded(u);
+ if (r < 0)
+ return r;
+
+ r = unit_acquire_invocation_id(u);
+ if (r < 0)
+ return r;
+
+ path_mkdir(p);
+
+ p->result = PATH_SUCCESS;
+ path_enter_waiting(p, true, false);
+
+ return 1;
+}
+
+static int path_stop(Unit *u) {
+ Path *p = PATH(u);
+
+ assert(p);
+ assert(IN_SET(p->state, PATH_WAITING, PATH_RUNNING));
+
+ path_enter_dead(p, PATH_SUCCESS);
+ return 1;
+}
+
+static int path_serialize(Unit *u, FILE *f, FDSet *fds) {
+ Path *p = PATH(u);
+
+ assert(u);
+ assert(f);
+ assert(fds);
+
+ (void) serialize_item(f, "state", path_state_to_string(p->state));
+ (void) serialize_item(f, "result", path_result_to_string(p->result));
+
+ LIST_FOREACH(spec, s, p->specs) {
+ const char *type;
+ _cleanup_free_ char *escaped = NULL;
+
+ escaped = cescape(s->path);
+ if (!escaped)
+ return log_oom();
+
+ assert_se(type = path_type_to_string(s->type));
+ (void) serialize_item_format(f, "path-spec", "%s %i %s",
+ type,
+ s->previous_exists,
+ escaped);
+ }
+
+ return 0;
+}
+
+static int path_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
+ Path *p = PATH(u);
+
+ assert(u);
+ assert(key);
+ assert(value);
+ assert(fds);
+
+ if (streq(key, "state")) {
+ PathState state;
+
+ state = path_state_from_string(value);
+ if (state < 0)
+ log_unit_debug(u, "Failed to parse state value: %s", value);
+ else
+ p->deserialized_state = state;
+
+ } else if (streq(key, "result")) {
+ PathResult f;
+
+ f = path_result_from_string(value);
+ if (f < 0)
+ log_unit_debug(u, "Failed to parse result value: %s", value);
+ else if (f != PATH_SUCCESS)
+ p->result = f;
+
+ } else if (streq(key, "path-spec")) {
+ int previous_exists, skip = 0;
+ _cleanup_free_ char *type_str = NULL;
+
+ if (sscanf(value, "%ms %i %n", &type_str, &previous_exists, &skip) < 2)
+ log_unit_debug(u, "Failed to parse path-spec value: %s", value);
+ else {
+ _cleanup_free_ char *unescaped = NULL;
+ ssize_t l;
+ PathType type;
+
+ type = path_type_from_string(type_str);
+ if (type < 0) {
+ log_unit_warning(u, "Unknown path type \"%s\", ignoring.", type_str);
+ return 0;
+ }
+
+ l = cunescape(value+skip, 0, &unescaped);
+ if (l < 0) {
+ log_unit_warning_errno(u, l, "Failed to unescape serialize path: %m");
+ return 0;
+ }
+
+ LIST_FOREACH(spec, s, p->specs)
+ if (s->type == type &&
+ path_equal(s->path, unescaped)) {
+
+ s->previous_exists = previous_exists;
+ break;
+ }
+ }
+
+ } else
+ log_unit_debug(u, "Unknown serialization key: %s", key);
+
+ return 0;
+}
+
+_pure_ static UnitActiveState path_active_state(Unit *u) {
+ assert(u);
+
+ return state_translation_table[PATH(u)->state];
+}
+
+_pure_ static const char *path_sub_state_to_string(Unit *u) {
+ assert(u);
+
+ return path_state_to_string(PATH(u)->state);
+}
+
+static int path_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+ PathSpec *s = userdata, *found = NULL;
+ Path *p;
+ int changed;
+
+ assert(s);
+ assert(s->unit);
+ assert(fd >= 0);
+
+ p = PATH(s->unit);
+
+ if (!IN_SET(p->state, PATH_WAITING, PATH_RUNNING))
+ return 0;
+
+ LIST_FOREACH(spec, i, p->specs)
+ if (path_spec_owns_inotify_fd(i, fd)) {
+ found = i;
+ break;
+ }
+
+ if (!found) {
+ log_error("Got event on unknown fd.");
+ goto fail;
+ }
+
+ changed = path_spec_fd_event(found, revents);
+ if (changed < 0)
+ goto fail;
+
+ if (changed)
+ path_enter_running(p, found->path);
+ else
+ path_enter_waiting(p, false, false);
+
+ return 0;
+
+fail:
+ path_enter_dead(p, PATH_FAILURE_RESOURCES);
+ return 0;
+}
+
+static void path_trigger_notify_impl(Unit *u, Unit *other, bool on_defer);
+
+static int path_trigger_notify_on_defer(sd_event_source *s, void *userdata) {
+ Path *p = ASSERT_PTR(userdata);
+ Unit *trigger;
+
+ assert(s);
+
+ trigger = UNIT_TRIGGER(UNIT(p));
+ if (!trigger) {
+ log_unit_error(UNIT(p), "Unit to trigger vanished.");
+ path_enter_dead(p, PATH_FAILURE_RESOURCES);
+ return 0;
+ }
+
+ path_trigger_notify_impl(UNIT(p), trigger, /* on_defer = */ true);
+ return 0;
+}
+
+static void path_trigger_notify_impl(Unit *u, Unit *other, bool on_defer) {
+ Path *p = PATH(u);
+ int r;
+
+ assert(u);
+ assert(other);
+
+ /* Invoked whenever the unit we trigger changes state or gains or loses a job */
+
+ /* Filter out invocations with bogus state */
+ assert(UNIT_IS_LOAD_COMPLETE(other->load_state));
+
+ /* Don't propagate state changes from the triggered unit if we are already down */
+ if (!IN_SET(p->state, PATH_WAITING, PATH_RUNNING))
+ return;
+
+ /* Propagate start limit hit state */
+ if (other->start_limit_hit) {
+ path_enter_dead(p, PATH_FAILURE_UNIT_START_LIMIT_HIT);
+ return;
+ }
+
+ /* Don't propagate anything if there's still a job queued */
+ if (other->job)
+ return;
+
+ if (p->state == PATH_RUNNING &&
+ UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(other))) {
+ if (!on_defer)
+ log_unit_debug(u, "Got notified about unit deactivation.");
+ } else if (p->state == PATH_WAITING &&
+ !UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(other))) {
+ if (!on_defer)
+ log_unit_debug(u, "Got notified about unit activation.");
+ } else
+ return;
+
+ if (on_defer) {
+ path_enter_waiting(p, /* initial = */ false, /* from_trigger_notify = */ true);
+ return;
+ }
+
+ /* Do not call path_enter_waiting() directly from path_trigger_notify(), as this may be called by
+ * job_install() -> job_finish_and_invalidate() -> unit_trigger_notify(), and path_enter_waiting()
+ * may install another job and will trigger assertion in job_install().
+ * https://github.com/systemd/systemd/issues/24577#issuecomment-1522628906
+ * Hence, first setup defer event source here, and call path_enter_waiting() slightly later. */
+ if (p->trigger_notify_event_source) {
+ r = sd_event_source_set_enabled(p->trigger_notify_event_source, SD_EVENT_ONESHOT);
+ if (r < 0) {
+ log_unit_warning_errno(u, r, "Failed to enable event source for triggering notify: %m");
+ path_enter_dead(p, PATH_FAILURE_RESOURCES);
+ return;
+ }
+ } else {
+ r = sd_event_add_defer(u->manager->event, &p->trigger_notify_event_source, path_trigger_notify_on_defer, p);
+ if (r < 0) {
+ log_unit_warning_errno(u, r, "Failed to allocate event source for triggering notify: %m");
+ path_enter_dead(p, PATH_FAILURE_RESOURCES);
+ return;
+ }
+
+ (void) sd_event_source_set_description(p->trigger_notify_event_source, "path-trigger-notify");
+ }
+}
+
+static void path_trigger_notify(Unit *u, Unit *other) {
+ path_trigger_notify_impl(u, other, /* on_defer = */ false);
+}
+
+static void path_reset_failed(Unit *u) {
+ Path *p = PATH(u);
+
+ assert(p);
+
+ if (p->state == PATH_FAILED)
+ path_set_state(p, PATH_DEAD);
+
+ p->result = PATH_SUCCESS;
+}
+
+static int path_can_start(Unit *u) {
+ Path *p = PATH(u);
+ int r;
+
+ assert(p);
+
+ r = unit_test_start_limit(u);
+ if (r < 0) {
+ path_enter_dead(p, PATH_FAILURE_START_LIMIT_HIT);
+ return r;
+ }
+
+ return 1;
+}
+
+static void activation_details_path_done(ActivationDetails *details) {
+ ActivationDetailsPath *p = ASSERT_PTR(ACTIVATION_DETAILS_PATH(details));
+
+ p->trigger_path_filename = mfree(p->trigger_path_filename);
+}
+
+static void activation_details_path_serialize(ActivationDetails *details, FILE *f) {
+ ActivationDetailsPath *p = ASSERT_PTR(ACTIVATION_DETAILS_PATH(details));
+
+ assert(f);
+
+ if (p->trigger_path_filename)
+ (void) serialize_item(f, "activation-details-path-filename", p->trigger_path_filename);
+}
+
+static int activation_details_path_deserialize(const char *key, const char *value, ActivationDetails **details) {
+ int r;
+
+ assert(key);
+ assert(value);
+
+ if (!details || !*details)
+ return -EINVAL;
+
+ ActivationDetailsPath *p = ACTIVATION_DETAILS_PATH(*details);
+ if (!p)
+ return -EINVAL;
+
+ if (!streq(key, "activation-details-path-filename"))
+ return -EINVAL;
+
+ r = free_and_strdup(&p->trigger_path_filename, value);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+static int activation_details_path_append_env(ActivationDetails *details, char ***strv) {
+ ActivationDetailsPath *p = ACTIVATION_DETAILS_PATH(details);
+ char *s;
+ int r;
+
+ assert(details);
+ assert(strv);
+ assert(p);
+
+ if (isempty(p->trigger_path_filename))
+ return 0;
+
+ s = strjoin("TRIGGER_PATH=", p->trigger_path_filename);
+ if (!s)
+ return -ENOMEM;
+
+ r = strv_consume(strv, TAKE_PTR(s));
+ if (r < 0)
+ return r;
+
+ return 1; /* Return the number of variables added to the env block */
+}
+
+static int activation_details_path_append_pair(ActivationDetails *details, char ***strv) {
+ ActivationDetailsPath *p = ACTIVATION_DETAILS_PATH(details);
+ int r;
+
+ assert(details);
+ assert(strv);
+ assert(p);
+
+ if (isempty(p->trigger_path_filename))
+ return 0;
+
+ r = strv_extend(strv, "trigger_path");
+ if (r < 0)
+ return r;
+
+ r = strv_extend(strv, p->trigger_path_filename);
+ if (r < 0)
+ return r;
+
+ return 1; /* Return the number of pairs added to the env block */
+}
+
+static const char* const path_type_table[_PATH_TYPE_MAX] = {
+ [PATH_EXISTS] = "PathExists",
+ [PATH_EXISTS_GLOB] = "PathExistsGlob",
+ [PATH_DIRECTORY_NOT_EMPTY] = "DirectoryNotEmpty",
+ [PATH_CHANGED] = "PathChanged",
+ [PATH_MODIFIED] = "PathModified",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(path_type, PathType);
+
+static const char* const path_result_table[_PATH_RESULT_MAX] = {
+ [PATH_SUCCESS] = "success",
+ [PATH_FAILURE_RESOURCES] = "resources",
+ [PATH_FAILURE_START_LIMIT_HIT] = "start-limit-hit",
+ [PATH_FAILURE_UNIT_START_LIMIT_HIT] = "unit-start-limit-hit",
+ [PATH_FAILURE_TRIGGER_LIMIT_HIT] = "trigger-limit-hit",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(path_result, PathResult);
+
+const UnitVTable path_vtable = {
+ .object_size = sizeof(Path),
+
+ .sections =
+ "Unit\0"
+ "Path\0"
+ "Install\0",
+ .private_section = "Path",
+
+ .can_transient = true,
+ .can_fail = true,
+ .can_trigger = true,
+
+ .init = path_init,
+ .done = path_done,
+ .load = path_load,
+
+ .coldplug = path_coldplug,
+
+ .dump = path_dump,
+
+ .start = path_start,
+ .stop = path_stop,
+
+ .serialize = path_serialize,
+ .deserialize_item = path_deserialize_item,
+
+ .active_state = path_active_state,
+ .sub_state_to_string = path_sub_state_to_string,
+
+ .trigger_notify = path_trigger_notify,
+
+ .reset_failed = path_reset_failed,
+
+ .bus_set_property = bus_path_set_property,
+
+ .can_start = path_can_start,
+};
+
+const ActivationDetailsVTable activation_details_path_vtable = {
+ .object_size = sizeof(ActivationDetailsPath),
+
+ .done = activation_details_path_done,
+ .serialize = activation_details_path_serialize,
+ .deserialize = activation_details_path_deserialize,
+ .append_env = activation_details_path_append_env,
+ .append_pair = activation_details_path_append_pair,
+};
diff --git a/src/core/path.h b/src/core/path.h
new file mode 100644
index 0000000..cb5b662
--- /dev/null
+++ b/src/core/path.h
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct Path Path;
+typedef struct PathSpec PathSpec;
+typedef struct ActivationDetailsPath ActivationDetailsPath;
+
+#include "unit.h"
+
+typedef enum PathType {
+ PATH_EXISTS,
+ PATH_EXISTS_GLOB,
+ PATH_DIRECTORY_NOT_EMPTY,
+ PATH_CHANGED,
+ PATH_MODIFIED,
+ _PATH_TYPE_MAX,
+ _PATH_TYPE_INVALID = -EINVAL,
+} PathType;
+
+typedef struct PathSpec {
+ Unit *unit;
+
+ char *path;
+
+ sd_event_source *event_source;
+
+ LIST_FIELDS(struct PathSpec, spec);
+
+ PathType type;
+ int inotify_fd;
+ int primary_wd;
+
+ bool previous_exists;
+} PathSpec;
+
+int path_spec_watch(PathSpec *s, sd_event_io_handler_t handler);
+void path_spec_unwatch(PathSpec *s);
+int path_spec_fd_event(PathSpec *s, uint32_t events);
+void path_spec_done(PathSpec *s);
+
+static inline bool path_spec_owns_inotify_fd(PathSpec *s, int fd) {
+ return s->inotify_fd == fd;
+}
+
+typedef enum PathResult {
+ PATH_SUCCESS,
+ PATH_FAILURE_RESOURCES,
+ PATH_FAILURE_START_LIMIT_HIT,
+ PATH_FAILURE_UNIT_START_LIMIT_HIT,
+ PATH_FAILURE_TRIGGER_LIMIT_HIT,
+ _PATH_RESULT_MAX,
+ _PATH_RESULT_INVALID = -EINVAL,
+} PathResult;
+
+struct Path {
+ Unit meta;
+
+ LIST_HEAD(PathSpec, specs);
+
+ PathState state, deserialized_state;
+
+ bool make_directory;
+ mode_t directory_mode;
+
+ PathResult result;
+
+ RateLimit trigger_limit;
+
+ sd_event_source *trigger_notify_event_source;
+};
+
+struct ActivationDetailsPath {
+ ActivationDetails meta;
+ char *trigger_path_filename;
+};
+
+void path_free_specs(Path *p);
+
+extern const UnitVTable path_vtable;
+extern const ActivationDetailsVTable activation_details_path_vtable;
+
+const char* path_type_to_string(PathType i) _const_;
+PathType path_type_from_string(const char *s) _pure_;
+
+const char* path_result_to_string(PathResult i) _const_;
+PathResult path_result_from_string(const char *s) _pure_;
+
+DEFINE_CAST(PATH, Path);
+DEFINE_ACTIVATION_DETAILS_CAST(ACTIVATION_DETAILS_PATH, ActivationDetailsPath, PATH);
diff --git a/src/core/restrict-ifaces.c b/src/core/restrict-ifaces.c
new file mode 100644
index 0000000..134f70a
--- /dev/null
+++ b/src/core/restrict-ifaces.c
@@ -0,0 +1,200 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "fd-util.h"
+#include "restrict-ifaces.h"
+#include "netlink-util.h"
+
+#if BPF_FRAMEWORK
+/* libbpf, clang and llc compile time dependencies are satisfied */
+
+#include "bpf-dlopen.h"
+#include "bpf-link.h"
+#include "bpf-util.h"
+#include "bpf/restrict_ifaces/restrict-ifaces-skel.h"
+
+static struct restrict_ifaces_bpf *restrict_ifaces_bpf_free(struct restrict_ifaces_bpf *obj) {
+ restrict_ifaces_bpf__destroy(obj);
+ return NULL;
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(struct restrict_ifaces_bpf *, restrict_ifaces_bpf_free);
+
+static int prepare_restrict_ifaces_bpf(
+ Unit* u,
+ bool is_allow_list,
+ const Set *restrict_network_interfaces,
+ struct restrict_ifaces_bpf **ret_object) {
+
+ _cleanup_(restrict_ifaces_bpf_freep) struct restrict_ifaces_bpf *obj = NULL;
+ _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
+ char *iface;
+ int r, map_fd;
+
+ assert(ret_object);
+
+ obj = restrict_ifaces_bpf__open();
+ if (!obj)
+ return log_unit_full_errno(u, u ? LOG_ERR : LOG_DEBUG, errno, "restrict-interfaces: Failed to open BPF object: %m");
+
+ r = sym_bpf_map__set_max_entries(obj->maps.sd_restrictif, MAX(set_size(restrict_network_interfaces), 1u));
+ if (r != 0)
+ return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, r,
+ "restrict-interfaces: Failed to resize BPF map '%s': %m",
+ sym_bpf_map__name(obj->maps.sd_restrictif));
+
+ obj->rodata->is_allow_list = is_allow_list;
+
+ r = restrict_ifaces_bpf__load(obj);
+ if (r != 0)
+ return log_unit_full_errno(u, u ? LOG_ERR : LOG_DEBUG, r, "restrict-interfaces: Failed to load BPF object: %m");
+
+ map_fd = sym_bpf_map__fd(obj->maps.sd_restrictif);
+
+ SET_FOREACH(iface, restrict_network_interfaces) {
+ uint8_t dummy = 0;
+ int ifindex;
+
+ ifindex = rtnl_resolve_interface(&rtnl, iface);
+ if (ifindex < 0) {
+ log_unit_warning_errno(u, ifindex,
+ "restrict-interfaces: Couldn't find index of network interface '%s', ignoring: %m",
+ iface);
+ continue;
+ }
+
+ if (sym_bpf_map_update_elem(map_fd, &ifindex, &dummy, BPF_ANY))
+ return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, errno,
+ "restrict-interfaces: Failed to update BPF map '%s' fd: %m",
+ sym_bpf_map__name(obj->maps.sd_restrictif));
+ }
+
+ *ret_object = TAKE_PTR(obj);
+ return 0;
+}
+
+int restrict_network_interfaces_supported(void) {
+ _cleanup_(restrict_ifaces_bpf_freep) struct restrict_ifaces_bpf *obj = NULL;
+ static int supported = -1;
+ int r;
+
+ if (supported >= 0)
+ return supported;
+
+ if (!cgroup_bpf_supported())
+ return (supported = false);
+
+ if (!compat_libbpf_probe_bpf_prog_type(BPF_PROG_TYPE_CGROUP_SKB, /*opts=*/NULL)) {
+ log_debug("restrict-interfaces: BPF program type cgroup_skb is not supported");
+ return (supported = false);
+ }
+
+ r = prepare_restrict_ifaces_bpf(NULL, true, NULL, &obj);
+ if (r < 0) {
+ log_debug_errno(r, "restrict-interfaces: Failed to load BPF object: %m");
+ return (supported = false);
+ }
+
+ return (supported = bpf_can_link_program(obj->progs.sd_restrictif_i));
+}
+
+static int restrict_network_interfaces_install_impl(Unit *u) {
+ _cleanup_(bpf_link_freep) struct bpf_link *egress_link = NULL, *ingress_link = NULL;
+ _cleanup_(restrict_ifaces_bpf_freep) struct restrict_ifaces_bpf *obj = NULL;
+ _cleanup_free_ char *cgroup_path = NULL;
+ _cleanup_close_ int cgroup_fd = -1;
+ CGroupContext *cc;
+ int r;
+
+ cc = unit_get_cgroup_context(u);
+ if (!cc)
+ return 0;
+
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &cgroup_path);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "restrict-interfaces: Failed to get cgroup path: %m");
+
+ if (!cc->restrict_network_interfaces)
+ return 0;
+
+ r = prepare_restrict_ifaces_bpf(u,
+ cc->restrict_network_interfaces_is_allow_list,
+ cc->restrict_network_interfaces,
+ &obj);
+ if (r < 0)
+ return r;
+
+ cgroup_fd = open(cgroup_path, O_RDONLY | O_CLOEXEC | O_DIRECTORY, 0);
+ if (cgroup_fd < 0)
+ return -errno;
+
+ ingress_link = sym_bpf_program__attach_cgroup(obj->progs.sd_restrictif_i, cgroup_fd);
+ r = sym_libbpf_get_error(ingress_link);
+ if (r != 0)
+ return log_unit_error_errno(u, r, "restrict-interfaces: Failed to create ingress cgroup link: %m");
+
+ egress_link = sym_bpf_program__attach_cgroup(obj->progs.sd_restrictif_e, cgroup_fd);
+ r = sym_libbpf_get_error(egress_link);
+ if (r != 0)
+ return log_unit_error_errno(u, r, "restrict-interfaces: Failed to create egress cgroup link: %m");
+
+ u->restrict_ifaces_ingress_bpf_link = TAKE_PTR(ingress_link);
+ u->restrict_ifaces_egress_bpf_link = TAKE_PTR(egress_link);
+
+ return 0;
+}
+
+int restrict_network_interfaces_install(Unit *u) {
+ int r = restrict_network_interfaces_install_impl(u);
+ fdset_close(u->initial_restric_ifaces_link_fds);
+ return r;
+}
+
+int serialize_restrict_network_interfaces(Unit *u, FILE *f, FDSet *fds) {
+ int r;
+
+ assert(u);
+
+ r = bpf_serialize_link(f, fds, "restrict-ifaces-bpf-fd", u->restrict_ifaces_ingress_bpf_link);
+ if (r < 0)
+ return r;
+
+ return bpf_serialize_link(f, fds, "restrict-ifaces-bpf-fd", u->restrict_ifaces_egress_bpf_link);
+}
+
+int restrict_network_interfaces_add_initial_link_fd(Unit *u, int fd) {
+ int r;
+
+ assert(u);
+
+ if (!u->initial_restric_ifaces_link_fds) {
+ u->initial_restric_ifaces_link_fds = fdset_new();
+ if (!u->initial_restric_ifaces_link_fds)
+ return log_oom();
+ }
+
+ r = fdset_put(u->initial_restric_ifaces_link_fds, fd);
+ if (r < 0)
+ return log_unit_error_errno(u, r,
+ "restrict-interfaces: Failed to put restrict-ifaces-bpf-fd %d to restored fdset: %m", fd);
+
+ return 0;
+}
+
+#else /* ! BPF_FRAMEWORK */
+int restrict_network_interfaces_supported(void) {
+ return 0;
+}
+
+int restrict_network_interfaces_install(Unit *u) {
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "restrict-interfaces: Failed to install; BPF programs built from source code are not supported: %m");
+}
+
+int serialize_restrict_network_interfaces(Unit *u, FILE *f, FDSet *fds) {
+ return 0;
+}
+
+int restrict_network_interfaces_add_initial_link_fd(Unit *u, int fd) {
+ return 0;
+}
+#endif
diff --git a/src/core/restrict-ifaces.h b/src/core/restrict-ifaces.h
new file mode 100644
index 0000000..6e7a824
--- /dev/null
+++ b/src/core/restrict-ifaces.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "fdset.h"
+#include "unit.h"
+
+typedef struct Unit Unit;
+
+int restrict_network_interfaces_supported(void);
+int restrict_network_interfaces_install(Unit *u);
+
+int serialize_restrict_network_interfaces(Unit *u, FILE *f, FDSet *fds);
+
+/* Add BPF link fd created before daemon-reload or daemon-reexec.
+ * FDs will be closed at the end of restrict_network_interfaces_install. */
+int restrict_network_interfaces_add_initial_link_fd(Unit *u, int fd);
diff --git a/src/core/scope.c b/src/core/scope.c
new file mode 100644
index 0000000..1973681
--- /dev/null
+++ b/src/core/scope.c
@@ -0,0 +1,859 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "dbus-scope.h"
+#include "dbus-unit.h"
+#include "exit-status.h"
+#include "load-dropin.h"
+#include "log.h"
+#include "process-util.h"
+#include "random-util.h"
+#include "scope.h"
+#include "serialize.h"
+#include "special.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit-name.h"
+#include "unit.h"
+#include "user-util.h"
+
+static const UnitActiveState state_translation_table[_SCOPE_STATE_MAX] = {
+ [SCOPE_DEAD] = UNIT_INACTIVE,
+ [SCOPE_START_CHOWN] = UNIT_ACTIVATING,
+ [SCOPE_RUNNING] = UNIT_ACTIVE,
+ [SCOPE_ABANDONED] = UNIT_ACTIVE,
+ [SCOPE_STOP_SIGTERM] = UNIT_DEACTIVATING,
+ [SCOPE_STOP_SIGKILL] = UNIT_DEACTIVATING,
+ [SCOPE_FAILED] = UNIT_FAILED
+};
+
+static int scope_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata);
+
+static void scope_init(Unit *u) {
+ Scope *s = SCOPE(u);
+
+ assert(u);
+ assert(u->load_state == UNIT_STUB);
+
+ s->runtime_max_usec = USEC_INFINITY;
+ s->timeout_stop_usec = u->manager->default_timeout_stop_usec;
+ u->ignore_on_isolate = true;
+ s->user = s->group = NULL;
+ s->oom_policy = _OOM_POLICY_INVALID;
+}
+
+static void scope_done(Unit *u) {
+ Scope *s = SCOPE(u);
+
+ assert(u);
+
+ s->controller = mfree(s->controller);
+ s->controller_track = sd_bus_track_unref(s->controller_track);
+
+ s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+
+ s->user = mfree(s->user);
+ s->group = mfree(s->group);
+}
+
+static usec_t scope_running_timeout(Scope *s) {
+ usec_t delta = 0;
+
+ assert(s);
+
+ if (s->runtime_rand_extra_usec != 0) {
+ delta = random_u64_range(s->runtime_rand_extra_usec);
+ log_unit_debug(UNIT(s), "Adding delta of %s sec to timeout", FORMAT_TIMESPAN(delta, USEC_PER_SEC));
+ }
+
+ return usec_add(usec_add(UNIT(s)->active_enter_timestamp.monotonic,
+ s->runtime_max_usec),
+ delta);
+}
+
+static int scope_arm_timer(Scope *s, usec_t usec) {
+ int r;
+
+ assert(s);
+
+ if (s->timer_event_source) {
+ r = sd_event_source_set_time(s->timer_event_source, usec);
+ if (r < 0)
+ return r;
+
+ return sd_event_source_set_enabled(s->timer_event_source, SD_EVENT_ONESHOT);
+ }
+
+ if (usec == USEC_INFINITY)
+ return 0;
+
+ r = sd_event_add_time(
+ UNIT(s)->manager->event,
+ &s->timer_event_source,
+ CLOCK_MONOTONIC,
+ usec, 0,
+ scope_dispatch_timer, s);
+ if (r < 0)
+ return r;
+
+ (void) sd_event_source_set_description(s->timer_event_source, "scope-timer");
+
+ return 0;
+}
+
+static void scope_set_state(Scope *s, ScopeState state) {
+ ScopeState old_state;
+ assert(s);
+
+ if (s->state != state)
+ bus_unit_send_pending_change_signal(UNIT(s), false);
+
+ old_state = s->state;
+ s->state = state;
+
+ if (!IN_SET(state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL, SCOPE_START_CHOWN, SCOPE_RUNNING))
+ s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+
+ if (IN_SET(state, SCOPE_DEAD, SCOPE_FAILED)) {
+ unit_unwatch_all_pids(UNIT(s));
+ unit_dequeue_rewatch_pids(UNIT(s));
+ }
+
+ if (state != old_state)
+ log_debug("%s changed %s -> %s", UNIT(s)->id, scope_state_to_string(old_state), scope_state_to_string(state));
+
+ unit_notify(UNIT(s), state_translation_table[old_state], state_translation_table[state], 0);
+}
+
+static int scope_add_default_dependencies(Scope *s) {
+ int r;
+
+ assert(s);
+
+ if (!UNIT(s)->default_dependencies)
+ return 0;
+
+ /* Make sure scopes are unloaded on shutdown */
+ r = unit_add_two_dependencies_by_name(
+ UNIT(s),
+ UNIT_BEFORE, UNIT_CONFLICTS,
+ SPECIAL_SHUTDOWN_TARGET, true,
+ UNIT_DEPENDENCY_DEFAULT);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+static int scope_verify(Scope *s) {
+ assert(s);
+ assert(UNIT(s)->load_state == UNIT_LOADED);
+
+ if (set_isempty(UNIT(s)->pids) &&
+ !MANAGER_IS_RELOADING(UNIT(s)->manager) &&
+ !unit_has_name(UNIT(s), SPECIAL_INIT_SCOPE))
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOENT), "Scope has no PIDs. Refusing.");
+
+ return 0;
+}
+
+static int scope_load_init_scope(Unit *u) {
+ assert(u);
+
+ if (!unit_has_name(u, SPECIAL_INIT_SCOPE))
+ return 0;
+
+ u->transient = true;
+ u->perpetual = true;
+
+ /* init.scope is a bit special, as it has to stick around forever. Because of its special semantics we
+ * synthesize it here, instead of relying on the unit file on disk. */
+
+ u->default_dependencies = false;
+
+ /* Prettify things, if we can. */
+ if (!u->description)
+ u->description = strdup("System and Service Manager");
+ if (!u->documentation)
+ (void) strv_extend(&u->documentation, "man:systemd(1)");
+
+ return 1;
+}
+
+static int scope_add_extras(Scope *s) {
+ int r;
+
+ r = unit_patch_contexts(UNIT(s));
+ if (r < 0)
+ return r;
+
+ r = unit_set_default_slice(UNIT(s));
+ if (r < 0)
+ return r;
+
+ if (s->oom_policy < 0)
+ s->oom_policy = s->cgroup_context.delegate ? OOM_CONTINUE : UNIT(s)->manager->default_oom_policy;
+
+ s->cgroup_context.memory_oom_group = s->oom_policy == OOM_KILL;
+
+ return scope_add_default_dependencies(s);
+}
+
+static int scope_load(Unit *u) {
+ Scope *s = SCOPE(u);
+ int r;
+
+ assert(s);
+ assert(u->load_state == UNIT_STUB);
+
+ if (!u->transient && !MANAGER_IS_RELOADING(u->manager))
+ /* Refuse to load non-transient scope units, but allow them while reloading. */
+ return -ENOENT;
+
+ r = scope_load_init_scope(u);
+ if (r < 0)
+ return r;
+
+ r = unit_load_fragment_and_dropin(u, false);
+ if (r < 0)
+ return r;
+
+ if (u->load_state != UNIT_LOADED)
+ return 0;
+
+ r = scope_add_extras(s);
+ if (r < 0)
+ return r;
+
+ return scope_verify(s);
+}
+
+static usec_t scope_coldplug_timeout(Scope *s) {
+ assert(s);
+
+ switch (s->deserialized_state) {
+
+ case SCOPE_RUNNING:
+ return scope_running_timeout(s);
+
+ case SCOPE_STOP_SIGKILL:
+ case SCOPE_STOP_SIGTERM:
+ return usec_add(UNIT(s)->state_change_timestamp.monotonic, s->timeout_stop_usec);
+
+ default:
+ return USEC_INFINITY;
+ }
+}
+
+static int scope_coldplug(Unit *u) {
+ Scope *s = SCOPE(u);
+ int r;
+
+ assert(s);
+ assert(s->state == SCOPE_DEAD);
+
+ if (s->deserialized_state == s->state)
+ return 0;
+
+ r = scope_arm_timer(s, scope_coldplug_timeout(s));
+ if (r < 0)
+ return r;
+
+ if (!IN_SET(s->deserialized_state, SCOPE_DEAD, SCOPE_FAILED)) {
+ if (u->pids) {
+ void *pidp;
+
+ SET_FOREACH(pidp, u->pids) {
+ r = unit_watch_pid(u, PTR_TO_PID(pidp), false);
+ if (r < 0 && r != -EEXIST)
+ return r;
+ }
+ } else
+ (void) unit_enqueue_rewatch_pids(u);
+ }
+
+ bus_scope_track_controller(s);
+
+ scope_set_state(s, s->deserialized_state);
+ return 0;
+}
+
+static void scope_dump(Unit *u, FILE *f, const char *prefix) {
+ Scope *s = SCOPE(u);
+
+ assert(s);
+ assert(f);
+
+ fprintf(f,
+ "%sScope State: %s\n"
+ "%sResult: %s\n"
+ "%sRuntimeMaxSec: %s\n"
+ "%sRuntimeRandomizedExtraSec: %s\n"
+ "%sOOMPolicy: %s\n",
+ prefix, scope_state_to_string(s->state),
+ prefix, scope_result_to_string(s->result),
+ prefix, FORMAT_TIMESPAN(s->runtime_max_usec, USEC_PER_SEC),
+ prefix, FORMAT_TIMESPAN(s->runtime_rand_extra_usec, USEC_PER_SEC),
+ prefix, oom_policy_to_string(s->oom_policy));
+
+ cgroup_context_dump(UNIT(s), f, prefix);
+ kill_context_dump(&s->kill_context, f, prefix);
+}
+
+static void scope_enter_dead(Scope *s, ScopeResult f) {
+ assert(s);
+
+ if (s->result == SCOPE_SUCCESS)
+ s->result = f;
+
+ unit_log_result(UNIT(s), s->result == SCOPE_SUCCESS, scope_result_to_string(s->result));
+ scope_set_state(s, s->result != SCOPE_SUCCESS ? SCOPE_FAILED : SCOPE_DEAD);
+}
+
+static void scope_enter_signal(Scope *s, ScopeState state, ScopeResult f) {
+ bool skip_signal = false;
+ int r;
+
+ assert(s);
+
+ if (s->result == SCOPE_SUCCESS)
+ s->result = f;
+
+ /* Before sending any signal, make sure we track all members of this cgroup */
+ (void) unit_watch_all_pids(UNIT(s));
+
+ /* Also, enqueue a job that we recheck all our PIDs a bit later, given that it's likely some processes have
+ * died now */
+ (void) unit_enqueue_rewatch_pids(UNIT(s));
+
+ /* If we have a controller set let's ask the controller nicely to terminate the scope, instead of us going
+ * directly into SIGTERM berserk mode */
+ if (state == SCOPE_STOP_SIGTERM)
+ skip_signal = bus_scope_send_request_stop(s) > 0;
+
+ if (skip_signal)
+ r = 1; /* wait */
+ else {
+ r = unit_kill_context(
+ UNIT(s),
+ &s->kill_context,
+ state != SCOPE_STOP_SIGTERM ? KILL_KILL :
+ s->was_abandoned ? KILL_TERMINATE_AND_LOG :
+ KILL_TERMINATE,
+ -1, -1, false);
+ if (r < 0)
+ goto fail;
+ }
+
+ if (r > 0) {
+ r = scope_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->timeout_stop_usec));
+ if (r < 0)
+ goto fail;
+
+ scope_set_state(s, state);
+ } else if (state == SCOPE_STOP_SIGTERM)
+ scope_enter_signal(s, SCOPE_STOP_SIGKILL, SCOPE_SUCCESS);
+ else
+ scope_enter_dead(s, SCOPE_SUCCESS);
+
+ return;
+
+fail:
+ log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m");
+
+ scope_enter_dead(s, SCOPE_FAILURE_RESOURCES);
+}
+
+static int scope_enter_start_chown(Scope *s) {
+ Unit *u = UNIT(s);
+ pid_t pid;
+ int r;
+
+ assert(s);
+ assert(s->user);
+
+ r = scope_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), u->manager->default_timeout_start_usec));
+ if (r < 0)
+ return r;
+
+ r = unit_fork_helper_process(u, "(sd-chown-cgroup)", &pid);
+ if (r < 0)
+ goto fail;
+
+ if (r == 0) {
+ uid_t uid = UID_INVALID;
+ gid_t gid = GID_INVALID;
+
+ if (!isempty(s->user)) {
+ const char *user = s->user;
+
+ r = get_user_creds(&user, &uid, &gid, NULL, NULL, 0);
+ if (r < 0) {
+ log_unit_error_errno(UNIT(s), r, "Failed to resolve user \"%s\": %m", user);
+ _exit(EXIT_USER);
+ }
+ }
+
+ if (!isempty(s->group)) {
+ const char *group = s->group;
+
+ r = get_group_creds(&group, &gid, 0);
+ if (r < 0) {
+ log_unit_error_errno(UNIT(s), r, "Failed to resolve group \"%s\": %m", group);
+ _exit(EXIT_GROUP);
+ }
+ }
+
+ r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, uid, gid);
+ if (r < 0) {
+ log_unit_error_errno(UNIT(s), r, "Failed to adjust control group access: %m");
+ _exit(EXIT_CGROUP);
+ }
+
+ _exit(EXIT_SUCCESS);
+ }
+
+ r = unit_watch_pid(UNIT(s), pid, true);
+ if (r < 0)
+ goto fail;
+
+ scope_set_state(s, SCOPE_START_CHOWN);
+
+ return 1;
+fail:
+ s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+ return r;
+}
+
+static int scope_enter_running(Scope *s) {
+ Unit *u = UNIT(s);
+ int r;
+
+ assert(s);
+
+ (void) bus_scope_track_controller(s);
+
+ r = unit_acquire_invocation_id(u);
+ if (r < 0)
+ return r;
+
+ unit_export_state_files(u);
+
+ r = unit_attach_pids_to_cgroup(u, u->pids, NULL);
+ if (r < 0) {
+ log_unit_warning_errno(u, r, "Failed to add PIDs to scope's control group: %m");
+ scope_enter_dead(s, SCOPE_FAILURE_RESOURCES);
+ return r;
+ }
+ if (r == 0) {
+ log_unit_warning(u, "No PIDs left to attach to the scope's control group, refusing.");
+ scope_enter_dead(s, SCOPE_FAILURE_RESOURCES);
+ return -ECHILD;
+ }
+ log_unit_debug(u, "%i %s added to scope's control group.", r, r == 1 ? "process" : "processes");
+
+ s->result = SCOPE_SUCCESS;
+
+ scope_set_state(s, SCOPE_RUNNING);
+
+ /* Set the maximum runtime timeout. */
+ scope_arm_timer(s, scope_running_timeout(s));
+
+ /* On unified we use proper notifications hence we can unwatch the PIDs
+ * we just attached to the scope. This can also be done on legacy as
+ * we're going to update the list of the processes we watch with the
+ * PIDs currently in the scope anyway. */
+ unit_unwatch_all_pids(u);
+
+ /* Start watching the PIDs currently in the scope (legacy hierarchy only) */
+ (void) unit_enqueue_rewatch_pids(u);
+ return 1;
+}
+
+static int scope_start(Unit *u) {
+ Scope *s = SCOPE(u);
+
+ assert(s);
+
+ if (unit_has_name(u, SPECIAL_INIT_SCOPE))
+ return -EPERM;
+
+ if (s->state == SCOPE_FAILED)
+ return -EPERM;
+
+ /* We can't fulfill this right now, please try again later */
+ if (IN_SET(s->state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL))
+ return -EAGAIN;
+
+ assert(s->state == SCOPE_DEAD);
+
+ if (!u->transient && !MANAGER_IS_RELOADING(u->manager))
+ return -ENOENT;
+
+ (void) unit_realize_cgroup(u);
+ (void) unit_reset_accounting(u);
+
+ /* We check only for User= option to keep behavior consistent with logic for service units,
+ * i.e. having 'Delegate=true Group=foo' w/o specifying User= has no effect. */
+ if (s->user && unit_cgroup_delegate(u))
+ return scope_enter_start_chown(s);
+
+ return scope_enter_running(s);
+}
+
+static int scope_stop(Unit *u) {
+ Scope *s = SCOPE(u);
+
+ assert(s);
+
+ if (IN_SET(s->state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL))
+ return 0;
+
+ assert(IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED));
+
+ scope_enter_signal(s, SCOPE_STOP_SIGTERM, SCOPE_SUCCESS);
+ return 1;
+}
+
+static void scope_reset_failed(Unit *u) {
+ Scope *s = SCOPE(u);
+
+ assert(s);
+
+ if (s->state == SCOPE_FAILED)
+ scope_set_state(s, SCOPE_DEAD);
+
+ s->result = SCOPE_SUCCESS;
+}
+
+static int scope_kill(Unit *u, KillWho who, int signo, sd_bus_error *error) {
+ return unit_kill_common(u, who, signo, -1, -1, error);
+}
+
+static int scope_get_timeout(Unit *u, usec_t *timeout) {
+ Scope *s = SCOPE(u);
+ usec_t t;
+ int r;
+
+ if (!s->timer_event_source)
+ return 0;
+
+ r = sd_event_source_get_time(s->timer_event_source, &t);
+ if (r < 0)
+ return r;
+ if (t == USEC_INFINITY)
+ return 0;
+
+ *timeout = t;
+ return 1;
+}
+
+static int scope_serialize(Unit *u, FILE *f, FDSet *fds) {
+ Scope *s = SCOPE(u);
+ void *pidp;
+
+ assert(s);
+ assert(f);
+ assert(fds);
+
+ (void) serialize_item(f, "state", scope_state_to_string(s->state));
+ (void) serialize_bool(f, "was-abandoned", s->was_abandoned);
+
+ if (s->controller)
+ (void) serialize_item(f, "controller", s->controller);
+
+ SET_FOREACH(pidp, u->pids)
+ serialize_item_format(f, "pids", PID_FMT, PTR_TO_PID(pidp));
+
+ return 0;
+}
+
+static int scope_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
+ Scope *s = SCOPE(u);
+ int r;
+
+ assert(u);
+ assert(key);
+ assert(value);
+ assert(fds);
+
+ if (streq(key, "state")) {
+ ScopeState state;
+
+ state = scope_state_from_string(value);
+ if (state < 0)
+ log_unit_debug(u, "Failed to parse state value: %s", value);
+ else
+ s->deserialized_state = state;
+
+ } else if (streq(key, "was-abandoned")) {
+ int k;
+
+ k = parse_boolean(value);
+ if (k < 0)
+ log_unit_debug(u, "Failed to parse boolean value: %s", value);
+ else
+ s->was_abandoned = k;
+ } else if (streq(key, "controller")) {
+
+ r = free_and_strdup(&s->controller, value);
+ if (r < 0)
+ return log_oom();
+
+ } else if (streq(key, "pids")) {
+ pid_t pid;
+
+ if (parse_pid(value, &pid) < 0)
+ log_unit_debug(u, "Failed to parse pids value: %s", value);
+ else {
+ r = set_ensure_put(&u->pids, NULL, PID_TO_PTR(pid));
+ if (r < 0)
+ return r;
+ }
+ } else
+ log_unit_debug(u, "Unknown serialization key: %s", key);
+
+ return 0;
+}
+
+static void scope_notify_cgroup_empty_event(Unit *u) {
+ Scope *s = SCOPE(u);
+ assert(u);
+
+ log_unit_debug(u, "cgroup is empty");
+
+ if (IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL))
+ scope_enter_dead(s, SCOPE_SUCCESS);
+
+ /* If the cgroup empty notification comes when the unit is not active, we must have failed to clean
+ * up the cgroup earlier and should do it now. */
+ if (IN_SET(s->state, SCOPE_DEAD, SCOPE_FAILED))
+ unit_prune_cgroup(u);
+}
+
+static void scope_notify_cgroup_oom_event(Unit *u, bool managed_oom) {
+ Scope *s = SCOPE(u);
+
+ if (managed_oom)
+ log_unit_debug(u, "Process(es) of control group were killed by systemd-oomd.");
+ else
+ log_unit_debug(u, "Process of control group was killed by the OOM killer.");
+
+ if (s->oom_policy == OOM_CONTINUE)
+ return;
+
+ switch (s->state) {
+
+ case SCOPE_START_CHOWN:
+ case SCOPE_RUNNING:
+ scope_enter_signal(s, SCOPE_STOP_SIGTERM, SCOPE_FAILURE_OOM_KILL);
+ break;
+
+ case SCOPE_STOP_SIGTERM:
+ scope_enter_signal(s, SCOPE_STOP_SIGKILL, SCOPE_FAILURE_OOM_KILL);
+ break;
+
+ case SCOPE_STOP_SIGKILL:
+ if (s->result == SCOPE_SUCCESS)
+ s->result = SCOPE_FAILURE_OOM_KILL;
+ break;
+ /* SCOPE_DEAD, SCOPE_ABANDONED, and SCOPE_FAILED end up in default */
+ default:
+ ;
+ }
+}
+
+static void scope_sigchld_event(Unit *u, pid_t pid, int code, int status) {
+ Scope *s = SCOPE(u);
+
+ assert(s);
+
+ if (s->state == SCOPE_START_CHOWN) {
+ if (!is_clean_exit(code, status, EXIT_CLEAN_COMMAND, NULL))
+ scope_enter_dead(s, SCOPE_FAILURE_RESOURCES);
+ else
+ scope_enter_running(s);
+ return;
+ }
+
+ /* If we get a SIGCHLD event for one of the processes we were interested in, then we look for others to
+ * watch, under the assumption that we'll sooner or later get a SIGCHLD for them, as the original
+ * process we watched was probably the parent of them, and they are hence now our children. */
+
+ (void) unit_enqueue_rewatch_pids(u);
+}
+
+static int scope_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) {
+ Scope *s = SCOPE(userdata);
+
+ assert(s);
+ assert(s->timer_event_source == source);
+
+ switch (s->state) {
+
+ case SCOPE_RUNNING:
+ log_unit_warning(UNIT(s), "Scope reached runtime time limit. Stopping.");
+ scope_enter_signal(s, SCOPE_STOP_SIGTERM, SCOPE_FAILURE_TIMEOUT);
+ break;
+
+ case SCOPE_STOP_SIGTERM:
+ if (s->kill_context.send_sigkill) {
+ log_unit_warning(UNIT(s), "Stopping timed out. Killing.");
+ scope_enter_signal(s, SCOPE_STOP_SIGKILL, SCOPE_FAILURE_TIMEOUT);
+ } else {
+ log_unit_warning(UNIT(s), "Stopping timed out. Skipping SIGKILL.");
+ scope_enter_dead(s, SCOPE_FAILURE_TIMEOUT);
+ }
+
+ break;
+
+ case SCOPE_STOP_SIGKILL:
+ log_unit_warning(UNIT(s), "Still around after SIGKILL. Ignoring.");
+ scope_enter_dead(s, SCOPE_FAILURE_TIMEOUT);
+ break;
+
+ case SCOPE_START_CHOWN:
+ log_unit_warning(UNIT(s), "User lookup timed out. Entering failed state.");
+ scope_enter_dead(s, SCOPE_FAILURE_TIMEOUT);
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ return 0;
+}
+
+int scope_abandon(Scope *s) {
+ assert(s);
+
+ if (unit_has_name(UNIT(s), SPECIAL_INIT_SCOPE))
+ return -EPERM;
+
+ if (!IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED))
+ return -ESTALE;
+
+ s->was_abandoned = true;
+
+ s->controller = mfree(s->controller);
+ s->controller_track = sd_bus_track_unref(s->controller_track);
+
+ scope_set_state(s, SCOPE_ABANDONED);
+
+ /* The client is no longer watching the remaining processes, so let's step in here, under the assumption that
+ * the remaining processes will be sooner or later reassigned to us as parent. */
+ (void) unit_enqueue_rewatch_pids(UNIT(s));
+
+ return 0;
+}
+
+_pure_ static UnitActiveState scope_active_state(Unit *u) {
+ assert(u);
+
+ return state_translation_table[SCOPE(u)->state];
+}
+
+_pure_ static const char *scope_sub_state_to_string(Unit *u) {
+ assert(u);
+
+ return scope_state_to_string(SCOPE(u)->state);
+}
+
+static void scope_enumerate_perpetual(Manager *m) {
+ Unit *u;
+ int r;
+
+ assert(m);
+
+ /* Let's unconditionally add the "init.scope" special unit
+ * that encapsulates PID 1. Note that PID 1 already is in the
+ * cgroup for this, we hence just need to allocate the object
+ * for it and that's it. */
+
+ u = manager_get_unit(m, SPECIAL_INIT_SCOPE);
+ if (!u) {
+ r = unit_new_for_name(m, sizeof(Scope), SPECIAL_INIT_SCOPE, &u);
+ if (r < 0) {
+ log_error_errno(r, "Failed to allocate the special " SPECIAL_INIT_SCOPE " unit: %m");
+ return;
+ }
+ }
+
+ u->transient = true;
+ u->perpetual = true;
+ SCOPE(u)->deserialized_state = SCOPE_RUNNING;
+
+ unit_add_to_load_queue(u);
+ unit_add_to_dbus_queue(u);
+ /* Enqueue an explicit cgroup realization here. Unlike other cgroups this one already exists and is
+ * populated (by us, after all!) already, even when we are not in a reload cycle. Hence we cannot
+ * apply the settings at creation time anymore, but let's at least apply them asynchronously. */
+ unit_add_to_cgroup_realize_queue(u);
+}
+
+static const char* const scope_result_table[_SCOPE_RESULT_MAX] = {
+ [SCOPE_SUCCESS] = "success",
+ [SCOPE_FAILURE_RESOURCES] = "resources",
+ [SCOPE_FAILURE_TIMEOUT] = "timeout",
+ [SCOPE_FAILURE_OOM_KILL] = "oom-kill",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(scope_result, ScopeResult);
+
+const UnitVTable scope_vtable = {
+ .object_size = sizeof(Scope),
+ .cgroup_context_offset = offsetof(Scope, cgroup_context),
+ .kill_context_offset = offsetof(Scope, kill_context),
+
+ .sections =
+ "Unit\0"
+ "Scope\0"
+ "Install\0",
+ .private_section = "Scope",
+
+ .can_transient = true,
+ .can_delegate = true,
+ .can_fail = true,
+ .once_only = true,
+ .can_set_managed_oom = true,
+
+ .init = scope_init,
+ .load = scope_load,
+ .done = scope_done,
+
+ .coldplug = scope_coldplug,
+
+ .dump = scope_dump,
+
+ .start = scope_start,
+ .stop = scope_stop,
+
+ .kill = scope_kill,
+
+ .freeze = unit_freeze_vtable_common,
+ .thaw = unit_thaw_vtable_common,
+
+ .get_timeout = scope_get_timeout,
+
+ .serialize = scope_serialize,
+ .deserialize_item = scope_deserialize_item,
+
+ .active_state = scope_active_state,
+ .sub_state_to_string = scope_sub_state_to_string,
+
+ .sigchld_event = scope_sigchld_event,
+
+ .reset_failed = scope_reset_failed,
+
+ .notify_cgroup_empty = scope_notify_cgroup_empty_event,
+ .notify_cgroup_oom = scope_notify_cgroup_oom_event,
+
+ .bus_set_property = bus_scope_set_property,
+ .bus_commit_properties = bus_scope_commit_properties,
+
+ .enumerate_perpetual = scope_enumerate_perpetual,
+};
diff --git a/src/core/scope.h b/src/core/scope.h
new file mode 100644
index 0000000..c9574a3
--- /dev/null
+++ b/src/core/scope.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct Scope Scope;
+
+#include "cgroup.h"
+#include "kill.h"
+#include "unit.h"
+
+typedef enum ScopeResult {
+ SCOPE_SUCCESS,
+ SCOPE_FAILURE_RESOURCES,
+ SCOPE_FAILURE_TIMEOUT,
+ SCOPE_FAILURE_OOM_KILL,
+ _SCOPE_RESULT_MAX,
+ _SCOPE_RESULT_INVALID = -EINVAL,
+} ScopeResult;
+
+struct Scope {
+ Unit meta;
+
+ CGroupContext cgroup_context;
+ KillContext kill_context;
+
+ ScopeState state, deserialized_state;
+ ScopeResult result;
+
+ usec_t runtime_max_usec;
+ usec_t runtime_rand_extra_usec;
+ usec_t timeout_stop_usec;
+
+ char *controller;
+ sd_bus_track *controller_track;
+
+ bool was_abandoned;
+
+ sd_event_source *timer_event_source;
+
+ char *user;
+ char *group;
+
+ OOMPolicy oom_policy;
+};
+
+extern const UnitVTable scope_vtable;
+
+int scope_abandon(Scope *s);
+
+const char* scope_result_to_string(ScopeResult i) _const_;
+ScopeResult scope_result_from_string(const char *s) _pure_;
+
+DEFINE_CAST(SCOPE, Scope);
diff --git a/src/core/selinux-access.c b/src/core/selinux-access.c
new file mode 100644
index 0000000..11dbf46
--- /dev/null
+++ b/src/core/selinux-access.c
@@ -0,0 +1,289 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "selinux-access.h"
+
+#if HAVE_SELINUX
+
+#include <errno.h>
+#include <selinux/avc.h>
+#include <selinux/selinux.h>
+#if HAVE_AUDIT
+#include <libaudit.h>
+#endif
+
+#include "sd-bus.h"
+
+#include "alloc-util.h"
+#include "audit-fd.h"
+#include "bus-util.h"
+#include "errno-util.h"
+#include "format-util.h"
+#include "log.h"
+#include "path-util.h"
+#include "selinux-util.h"
+#include "stdio-util.h"
+#include "strv.h"
+#include "util.h"
+
+static bool initialized = false;
+
+struct audit_info {
+ sd_bus_creds *creds;
+ const char *path;
+ const char *cmdline;
+ const char *function;
+};
+
+/*
+ Any time an access gets denied this callback will be called
+ with the audit data. We then need to just copy the audit data into the msgbuf.
+*/
+static int audit_callback(
+ void *auditdata,
+ security_class_t cls,
+ char *msgbuf,
+ size_t msgbufsize) {
+
+ const struct audit_info *audit = auditdata;
+ uid_t uid = 0, login_uid = 0;
+ gid_t gid = 0;
+ char login_uid_buf[DECIMAL_STR_MAX(uid_t) + 1] = "n/a";
+ char uid_buf[DECIMAL_STR_MAX(uid_t) + 1] = "n/a";
+ char gid_buf[DECIMAL_STR_MAX(gid_t) + 1] = "n/a";
+
+ if (sd_bus_creds_get_audit_login_uid(audit->creds, &login_uid) >= 0)
+ xsprintf(login_uid_buf, UID_FMT, login_uid);
+ if (sd_bus_creds_get_euid(audit->creds, &uid) >= 0)
+ xsprintf(uid_buf, UID_FMT, uid);
+ if (sd_bus_creds_get_egid(audit->creds, &gid) >= 0)
+ xsprintf(gid_buf, GID_FMT, gid);
+
+ (void) snprintf(msgbuf, msgbufsize,
+ "auid=%s uid=%s gid=%s%s%s%s%s%s%s%s%s%s",
+ login_uid_buf, uid_buf, gid_buf,
+ audit->path ? " path=\"" : "", strempty(audit->path), audit->path ? "\"" : "",
+ audit->cmdline ? " cmdline=\"" : "", strempty(audit->cmdline), audit->cmdline ? "\"" : "",
+ audit->function ? " function=\"" : "", strempty(audit->function), audit->function ? "\"" : "");
+
+ return 0;
+}
+
+static int callback_type_to_priority(int type) {
+ switch (type) {
+
+ case SELINUX_ERROR:
+ return LOG_ERR;
+
+ case SELINUX_WARNING:
+ return LOG_WARNING;
+
+ case SELINUX_INFO:
+ return LOG_INFO;
+
+ case SELINUX_AVC:
+ default:
+ return LOG_NOTICE;
+ }
+}
+
+/*
+ libselinux uses this callback when access gets denied or other
+ events happen. If audit is turned on, messages will be reported
+ using audit netlink, otherwise they will be logged using the usual
+ channels.
+
+ Code copied from dbus and modified.
+*/
+_printf_(2, 3) static int log_callback(int type, const char *fmt, ...) {
+ va_list ap;
+ const char *fmt2;
+
+#if HAVE_AUDIT
+ int fd;
+
+ fd = get_audit_fd();
+
+ if (fd >= 0) {
+ _cleanup_free_ char *buf = NULL;
+ int r;
+
+ va_start(ap, fmt);
+ r = vasprintf(&buf, fmt, ap);
+ va_end(ap);
+
+ if (r >= 0) {
+ if (type == SELINUX_AVC)
+ audit_log_user_avc_message(get_audit_fd(), AUDIT_USER_AVC, buf, NULL, NULL, NULL, getuid());
+ else if (type == SELINUX_ERROR)
+ audit_log_user_avc_message(get_audit_fd(), AUDIT_USER_SELINUX_ERR, buf, NULL, NULL, NULL, getuid());
+
+ return 0;
+ }
+ }
+#endif
+
+ fmt2 = strjoina("selinux: ", fmt);
+
+ va_start(ap, fmt);
+
+ DISABLE_WARNING_FORMAT_NONLITERAL;
+ log_internalv(LOG_AUTH | callback_type_to_priority(type),
+ 0, PROJECT_FILE, __LINE__, __func__,
+ fmt2, ap);
+ REENABLE_WARNING;
+ va_end(ap);
+
+ return 0;
+}
+
+static int access_init(sd_bus_error *error) {
+ int r;
+
+ if (!mac_selinux_use())
+ return 0;
+
+ if (initialized)
+ return 1;
+
+ if (avc_open(NULL, 0) != 0) {
+ r = -errno; /* Save original errno for later */
+
+ bool enforce = security_getenforce() != 0;
+ log_full_errno(enforce ? LOG_ERR : LOG_WARNING, r, "Failed to open the SELinux AVC: %m");
+
+ /* If enforcement isn't on, then let's suppress this error, and just don't do any AVC checks.
+ * The warning we printed is hence all the admin will see. */
+ if (!enforce)
+ return 0;
+
+ /* Return an access denied error based on the original errno, if we couldn't load the AVC but
+ * enforcing mode was on, or we couldn't determine whether it is one. */
+ errno = -r;
+ return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Failed to open the SELinux AVC: %m");
+ }
+
+ selinux_set_callback(SELINUX_CB_AUDIT, (union selinux_callback) { .func_audit = audit_callback });
+ selinux_set_callback(SELINUX_CB_LOG, (union selinux_callback) { .func_log = log_callback });
+
+ initialized = true;
+ return 1;
+}
+
+/*
+ This function communicates with the kernel to check whether or not it should
+ allow the access.
+ If the machine is in permissive mode it will return ok. Audit messages will
+ still be generated if the access would be denied in enforcing mode.
+*/
+int mac_selinux_access_check_internal(
+ sd_bus_message *message,
+ const char *unit_path,
+ const char *unit_context,
+ const char *permission,
+ const char *function,
+ sd_bus_error *error) {
+
+ _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+ const char *tclass, *scon, *acon;
+ _cleanup_free_ char *cl = NULL;
+ _cleanup_freecon_ char *fcon = NULL;
+ char **cmdline = NULL;
+ bool enforce;
+ int r = 0;
+
+ assert(message);
+ assert(permission);
+ assert(function);
+ assert(error);
+
+ r = access_init(error);
+ if (r <= 0)
+ return r;
+
+ /* delay call until we checked in `access_init()` if SELinux is actually enabled */
+ enforce = mac_selinux_enforcing();
+
+ r = sd_bus_query_sender_creds(
+ message,
+ SD_BUS_CREDS_PID|SD_BUS_CREDS_EUID|SD_BUS_CREDS_EGID|
+ SD_BUS_CREDS_CMDLINE|SD_BUS_CREDS_AUDIT_LOGIN_UID|
+ SD_BUS_CREDS_SELINUX_CONTEXT|
+ SD_BUS_CREDS_AUGMENT /* get more bits from /proc */,
+ &creds);
+ if (r < 0)
+ return r;
+
+ /* The SELinux context is something we really should have gotten directly from the message or sender,
+ * and not be an augmented field. If it was augmented we cannot use it for authorization, since this
+ * is racy and vulnerable. Let's add an extra check, just in case, even though this really shouldn't
+ * be possible. */
+ assert_return((sd_bus_creds_get_augmented_mask(creds) & SD_BUS_CREDS_SELINUX_CONTEXT) == 0, -EPERM);
+
+ r = sd_bus_creds_get_selinux_context(creds, &scon);
+ if (r < 0)
+ return r;
+
+ if (unit_context) {
+ /* Nice! The unit comes with a SELinux context read from the unit file */
+ acon = unit_context;
+ tclass = "service";
+ } else {
+ /* If no unit context is known, use our own */
+ if (getcon_raw(&fcon) < 0) {
+ log_warning_errno(errno, "SELinux getcon_raw() failed%s (perm=%s): %m",
+ enforce ? "" : ", ignoring",
+ permission);
+ if (!enforce)
+ return 0;
+
+ return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Failed to get current context: %m");
+ }
+ if (!fcon) {
+ if (!enforce)
+ return 0;
+
+ return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "We appear not to have any SELinux context: %m");
+ }
+
+ acon = fcon;
+ tclass = "system";
+ }
+
+ sd_bus_creds_get_cmdline(creds, &cmdline);
+ cl = strv_join(cmdline, " ");
+
+ struct audit_info audit_info = {
+ .creds = creds,
+ .path = unit_path,
+ .cmdline = cl,
+ .function = function,
+ };
+
+ r = selinux_check_access(scon, acon, tclass, permission, &audit_info);
+ if (r < 0) {
+ errno = -(r = errno_or_else(EPERM));
+
+ if (enforce)
+ sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "SELinux policy denies access: %m");
+ }
+
+ log_full_errno_zerook(LOG_DEBUG, r,
+ "SELinux access check scon=%s tcon=%s tclass=%s perm=%s state=%s function=%s path=%s cmdline=%s: %m",
+ scon, acon, tclass, permission, enforce ? "enforcing" : "permissive", function, strna(unit_path), strna(empty_to_null(cl)));
+ return enforce ? r : 0;
+}
+
+#else /* HAVE_SELINUX */
+
+int mac_selinux_access_check_internal(
+ sd_bus_message *message,
+ const char *unit_path,
+ const char *unit_label,
+ const char *permission,
+ const char *function,
+ sd_bus_error *error) {
+
+ return 0;
+}
+
+#endif /* HAVE_SELINUX */
diff --git a/src/core/selinux-access.h b/src/core/selinux-access.h
new file mode 100644
index 0000000..dc8da9e
--- /dev/null
+++ b/src/core/selinux-access.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+
+#include "manager.h"
+
+int mac_selinux_access_check_internal(sd_bus_message *message, const char *unit_path, const char *unit_label, const char *permission, const char *function, sd_bus_error *error);
+
+#define mac_selinux_access_check(message, permission, error) \
+ mac_selinux_access_check_internal((message), NULL, NULL, (permission), __func__, (error))
+
+#define mac_selinux_unit_access_check(unit, message, permission, error) \
+ mac_selinux_access_check_internal((message), (unit)->fragment_path, (unit)->access_selinux_context, (permission), __func__, (error))
diff --git a/src/core/selinux-setup.c b/src/core/selinux-setup.c
new file mode 100644
index 0000000..3f873ba
--- /dev/null
+++ b/src/core/selinux-setup.c
@@ -0,0 +1,105 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#if HAVE_SELINUX
+#include <selinux/selinux.h>
+#endif
+
+#include "log.h"
+#include "macro.h"
+#include "selinux-setup.h"
+#include "selinux-util.h"
+#include "string-util.h"
+#include "time-util.h"
+#include "util.h"
+
+#if HAVE_SELINUX
+_printf_(2,3)
+static int null_log(int type, const char *fmt, ...) {
+ return 0;
+}
+#endif
+
+int mac_selinux_setup(bool *loaded_policy) {
+
+#if HAVE_SELINUX
+ int enforce = 0;
+ usec_t before_load, after_load;
+ char *con;
+ int r;
+ bool initialized = false;
+
+ assert(loaded_policy);
+
+ /* Turn off all of SELinux' own logging, we want to do that */
+ selinux_set_callback(SELINUX_CB_LOG, (union selinux_callback) { .func_log = null_log });
+
+ /* Don't load policy in the initrd if we don't appear to have
+ * it. For the real root, we check below if we've already
+ * loaded policy, and return gracefully.
+ */
+ if (in_initrd() && access(selinux_path(), F_OK) < 0)
+ return 0;
+
+ /* Already initialized by somebody else? */
+ r = getcon_raw(&con);
+ /* getcon_raw can return 0, and still give us a NULL pointer if
+ * /proc/self/attr/current is empty. SELinux guarantees this won't
+ * happen, but that file isn't specific to SELinux, and may be provided
+ * by some other arbitrary LSM with different semantics. */
+ if (r == 0 && con) {
+ initialized = !streq(con, "kernel");
+ freecon(con);
+ }
+
+ /* Make sure we have no fds open while loading the policy and
+ * transitioning */
+ log_close();
+
+ /* Now load the policy */
+ before_load = now(CLOCK_MONOTONIC);
+ r = selinux_init_load_policy(&enforce);
+ if (r == 0) {
+ _cleanup_(mac_selinux_freep) char *label = NULL;
+
+ mac_selinux_retest();
+
+ /* Transition to the new context */
+ r = mac_selinux_get_create_label_from_exe(SYSTEMD_BINARY_PATH, &label);
+ if (r < 0 || !label) {
+ log_open();
+ log_error("Failed to compute init label, ignoring.");
+ } else {
+ r = setcon_raw(label);
+
+ log_open();
+ if (r < 0)
+ log_error("Failed to transition into init label '%s', ignoring.", label);
+ }
+
+ after_load = now(CLOCK_MONOTONIC);
+
+ log_info("Successfully loaded SELinux policy in %s.",
+ FORMAT_TIMESPAN(after_load - before_load, 0));
+
+ *loaded_policy = true;
+
+ } else {
+ log_open();
+
+ if (enforce > 0) {
+ if (!initialized)
+ return log_emergency_errno(SYNTHETIC_ERRNO(EIO),
+ "Failed to load SELinux policy.");
+
+ log_warning("Failed to load new SELinux policy. Continuing with old policy.");
+ } else
+ log_debug("Unable to load SELinux policy. Ignoring.");
+ }
+#endif
+
+ return 0;
+}
diff --git a/src/core/selinux-setup.h b/src/core/selinux-setup.h
new file mode 100644
index 0000000..cdff51d
--- /dev/null
+++ b/src/core/selinux-setup.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+int mac_selinux_setup(bool *loaded_policy);
diff --git a/src/core/service.c b/src/core/service.c
new file mode 100644
index 0000000..1d5b9ff
--- /dev/null
+++ b/src/core/service.c
@@ -0,0 +1,4825 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "sd-messages.h"
+
+#include "alloc-util.h"
+#include "async.h"
+#include "bus-error.h"
+#include "bus-kernel.h"
+#include "bus-util.h"
+#include "chase-symlinks.h"
+#include "dbus-service.h"
+#include "dbus-unit.h"
+#include "def.h"
+#include "env-util.h"
+#include "escape.h"
+#include "exit-status.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "load-dropin.h"
+#include "load-fragment.h"
+#include "log.h"
+#include "manager.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "random-util.h"
+#include "serialize.h"
+#include "service.h"
+#include "signal-util.h"
+#include "special.h"
+#include "stdio-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit-name.h"
+#include "unit.h"
+#include "utf8.h"
+#include "util.h"
+
+#define service_spawn(...) service_spawn_internal(__func__, __VA_ARGS__)
+
+static const UnitActiveState state_translation_table[_SERVICE_STATE_MAX] = {
+ [SERVICE_DEAD] = UNIT_INACTIVE,
+ [SERVICE_CONDITION] = UNIT_ACTIVATING,
+ [SERVICE_START_PRE] = UNIT_ACTIVATING,
+ [SERVICE_START] = UNIT_ACTIVATING,
+ [SERVICE_START_POST] = UNIT_ACTIVATING,
+ [SERVICE_RUNNING] = UNIT_ACTIVE,
+ [SERVICE_EXITED] = UNIT_ACTIVE,
+ [SERVICE_RELOAD] = UNIT_RELOADING,
+ [SERVICE_STOP] = UNIT_DEACTIVATING,
+ [SERVICE_STOP_WATCHDOG] = UNIT_DEACTIVATING,
+ [SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING,
+ [SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING,
+ [SERVICE_STOP_POST] = UNIT_DEACTIVATING,
+ [SERVICE_FINAL_WATCHDOG] = UNIT_DEACTIVATING,
+ [SERVICE_FINAL_SIGTERM] = UNIT_DEACTIVATING,
+ [SERVICE_FINAL_SIGKILL] = UNIT_DEACTIVATING,
+ [SERVICE_FAILED] = UNIT_FAILED,
+ [SERVICE_AUTO_RESTART] = UNIT_ACTIVATING,
+ [SERVICE_CLEANING] = UNIT_MAINTENANCE,
+};
+
+/* For Type=idle we never want to delay any other jobs, hence we
+ * consider idle jobs active as soon as we start working on them */
+static const UnitActiveState state_translation_table_idle[_SERVICE_STATE_MAX] = {
+ [SERVICE_DEAD] = UNIT_INACTIVE,
+ [SERVICE_CONDITION] = UNIT_ACTIVE,
+ [SERVICE_START_PRE] = UNIT_ACTIVE,
+ [SERVICE_START] = UNIT_ACTIVE,
+ [SERVICE_START_POST] = UNIT_ACTIVE,
+ [SERVICE_RUNNING] = UNIT_ACTIVE,
+ [SERVICE_EXITED] = UNIT_ACTIVE,
+ [SERVICE_RELOAD] = UNIT_RELOADING,
+ [SERVICE_STOP] = UNIT_DEACTIVATING,
+ [SERVICE_STOP_WATCHDOG] = UNIT_DEACTIVATING,
+ [SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING,
+ [SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING,
+ [SERVICE_STOP_POST] = UNIT_DEACTIVATING,
+ [SERVICE_FINAL_WATCHDOG] = UNIT_DEACTIVATING,
+ [SERVICE_FINAL_SIGTERM] = UNIT_DEACTIVATING,
+ [SERVICE_FINAL_SIGKILL] = UNIT_DEACTIVATING,
+ [SERVICE_FAILED] = UNIT_FAILED,
+ [SERVICE_AUTO_RESTART] = UNIT_ACTIVATING,
+ [SERVICE_CLEANING] = UNIT_MAINTENANCE,
+};
+
+static int service_dispatch_inotify_io(sd_event_source *source, int fd, uint32_t events, void *userdata);
+static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata);
+static int service_dispatch_watchdog(sd_event_source *source, usec_t usec, void *userdata);
+static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t events, void *userdata);
+
+static void service_enter_signal(Service *s, ServiceState state, ServiceResult f);
+static void service_enter_reload_by_notify(Service *s);
+
+static void service_init(Unit *u) {
+ Service *s = SERVICE(u);
+
+ assert(u);
+ assert(u->load_state == UNIT_STUB);
+
+ s->timeout_start_usec = u->manager->default_timeout_start_usec;
+ s->timeout_stop_usec = u->manager->default_timeout_stop_usec;
+ s->timeout_abort_usec = u->manager->default_timeout_abort_usec;
+ s->timeout_abort_set = u->manager->default_timeout_abort_set;
+ s->restart_usec = u->manager->default_restart_usec;
+ s->runtime_max_usec = USEC_INFINITY;
+ s->type = _SERVICE_TYPE_INVALID;
+ s->socket_fd = -1;
+ s->stdin_fd = s->stdout_fd = s->stderr_fd = -1;
+ s->guess_main_pid = true;
+
+ s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID;
+
+ s->exec_context.keyring_mode = MANAGER_IS_SYSTEM(u->manager) ?
+ EXEC_KEYRING_PRIVATE : EXEC_KEYRING_INHERIT;
+
+ s->watchdog_original_usec = USEC_INFINITY;
+
+ s->oom_policy = _OOM_POLICY_INVALID;
+}
+
+static void service_unwatch_control_pid(Service *s) {
+ assert(s);
+
+ if (s->control_pid <= 0)
+ return;
+
+ unit_unwatch_pid(UNIT(s), TAKE_PID(s->control_pid));
+}
+
+static void service_unwatch_main_pid(Service *s) {
+ assert(s);
+
+ if (s->main_pid <= 0)
+ return;
+
+ unit_unwatch_pid(UNIT(s), TAKE_PID(s->main_pid));
+}
+
+static void service_unwatch_pid_file(Service *s) {
+ if (!s->pid_file_pathspec)
+ return;
+
+ log_unit_debug(UNIT(s), "Stopping watch for PID file %s", s->pid_file_pathspec->path);
+ path_spec_unwatch(s->pid_file_pathspec);
+ path_spec_done(s->pid_file_pathspec);
+ s->pid_file_pathspec = mfree(s->pid_file_pathspec);
+}
+
+static int service_set_main_pid(Service *s, pid_t pid) {
+ assert(s);
+
+ if (pid <= 1)
+ return -EINVAL;
+
+ if (pid == getpid_cached())
+ return -EINVAL;
+
+ if (s->main_pid == pid && s->main_pid_known)
+ return 0;
+
+ if (s->main_pid != pid) {
+ service_unwatch_main_pid(s);
+ exec_status_start(&s->main_exec_status, pid);
+ }
+
+ s->main_pid = pid;
+ s->main_pid_known = true;
+ s->main_pid_alien = pid_is_my_child(pid) == 0;
+
+ if (s->main_pid_alien)
+ log_unit_warning(UNIT(s), "Supervising process "PID_FMT" which is not our child. We'll most likely not notice when it exits.", pid);
+
+ return 0;
+}
+
+void service_close_socket_fd(Service *s) {
+ assert(s);
+
+ /* Undo the effect of service_set_socket_fd(). */
+
+ s->socket_fd = asynchronous_close(s->socket_fd);
+
+ if (UNIT_ISSET(s->accept_socket)) {
+ socket_connection_unref(SOCKET(UNIT_DEREF(s->accept_socket)));
+ unit_ref_unset(&s->accept_socket);
+ }
+
+ s->socket_peer = socket_peer_unref(s->socket_peer);
+}
+
+static void service_stop_watchdog(Service *s) {
+ assert(s);
+
+ s->watchdog_event_source = sd_event_source_disable_unref(s->watchdog_event_source);
+ s->watchdog_timestamp = DUAL_TIMESTAMP_NULL;
+}
+
+static void service_start_watchdog(Service *s) {
+ usec_t watchdog_usec;
+ int r;
+
+ assert(s);
+
+ watchdog_usec = service_get_watchdog_usec(s);
+ if (!timestamp_is_set(watchdog_usec)) {
+ service_stop_watchdog(s);
+ return;
+ }
+
+ if (s->watchdog_event_source) {
+ r = sd_event_source_set_time(s->watchdog_event_source, usec_add(s->watchdog_timestamp.monotonic, watchdog_usec));
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to reset watchdog timer: %m");
+ return;
+ }
+
+ r = sd_event_source_set_enabled(s->watchdog_event_source, SD_EVENT_ONESHOT);
+ } else {
+ r = sd_event_add_time(
+ UNIT(s)->manager->event,
+ &s->watchdog_event_source,
+ CLOCK_MONOTONIC,
+ usec_add(s->watchdog_timestamp.monotonic, watchdog_usec), 0,
+ service_dispatch_watchdog, s);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to add watchdog timer: %m");
+ return;
+ }
+
+ (void) sd_event_source_set_description(s->watchdog_event_source, "service-watchdog");
+
+ /* Let's process everything else which might be a sign
+ * of living before we consider a service died. */
+ r = sd_event_source_set_priority(s->watchdog_event_source, SD_EVENT_PRIORITY_IDLE);
+ }
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "Failed to install watchdog timer: %m");
+}
+
+static void service_extend_event_source_timeout(Service *s, sd_event_source *source, usec_t extended) {
+ usec_t current;
+ int r;
+
+ assert(s);
+
+ /* Extends the specified event source timer to at least the specified time, unless it is already later
+ * anyway. */
+
+ if (!source)
+ return;
+
+ r = sd_event_source_get_time(source, &current);
+ if (r < 0) {
+ const char *desc;
+ (void) sd_event_source_get_description(s->timer_event_source, &desc);
+ log_unit_warning_errno(UNIT(s), r, "Failed to retrieve timeout time for event source '%s', ignoring: %m", strna(desc));
+ return;
+ }
+
+ if (current >= extended) /* Current timeout is already longer, ignore this. */
+ return;
+
+ r = sd_event_source_set_time(source, extended);
+ if (r < 0) {
+ const char *desc;
+ (void) sd_event_source_get_description(s->timer_event_source, &desc);
+ log_unit_warning_errno(UNIT(s), r, "Failed to set timeout time for event source '%s', ignoring %m", strna(desc));
+ }
+}
+
+static void service_extend_timeout(Service *s, usec_t extend_timeout_usec) {
+ usec_t extended;
+
+ assert(s);
+
+ if (!timestamp_is_set(extend_timeout_usec))
+ return;
+
+ extended = usec_add(now(CLOCK_MONOTONIC), extend_timeout_usec);
+
+ service_extend_event_source_timeout(s, s->timer_event_source, extended);
+ service_extend_event_source_timeout(s, s->watchdog_event_source, extended);
+}
+
+static void service_reset_watchdog(Service *s) {
+ assert(s);
+
+ dual_timestamp_get(&s->watchdog_timestamp);
+ service_start_watchdog(s);
+}
+
+static void service_override_watchdog_timeout(Service *s, usec_t watchdog_override_usec) {
+ assert(s);
+
+ s->watchdog_override_enable = true;
+ s->watchdog_override_usec = watchdog_override_usec;
+ service_reset_watchdog(s);
+
+ log_unit_debug(UNIT(s), "watchdog_usec="USEC_FMT, s->watchdog_usec);
+ log_unit_debug(UNIT(s), "watchdog_override_usec="USEC_FMT, s->watchdog_override_usec);
+}
+
+static void service_fd_store_unlink(ServiceFDStore *fs) {
+
+ if (!fs)
+ return;
+
+ if (fs->service) {
+ assert(fs->service->n_fd_store > 0);
+ LIST_REMOVE(fd_store, fs->service->fd_store, fs);
+ fs->service->n_fd_store--;
+ }
+
+ sd_event_source_disable_unref(fs->event_source);
+
+ free(fs->fdname);
+ safe_close(fs->fd);
+ free(fs);
+}
+
+static void service_release_fd_store(Service *s) {
+ assert(s);
+
+ if (s->n_keep_fd_store > 0)
+ return;
+
+ log_unit_debug(UNIT(s), "Releasing all stored fds");
+ while (s->fd_store)
+ service_fd_store_unlink(s->fd_store);
+
+ assert(s->n_fd_store == 0);
+}
+
+static void service_release_resources(Unit *u) {
+ Service *s = SERVICE(u);
+
+ assert(s);
+
+ if (!s->fd_store && s->stdin_fd < 0 && s->stdout_fd < 0 && s->stderr_fd < 0)
+ return;
+
+ log_unit_debug(u, "Releasing resources.");
+
+ s->stdin_fd = safe_close(s->stdin_fd);
+ s->stdout_fd = safe_close(s->stdout_fd);
+ s->stderr_fd = safe_close(s->stderr_fd);
+
+ service_release_fd_store(s);
+}
+
+static void service_done(Unit *u) {
+ Service *s = SERVICE(u);
+
+ assert(s);
+
+ s->pid_file = mfree(s->pid_file);
+ s->status_text = mfree(s->status_text);
+
+ s->exec_runtime = exec_runtime_unref(s->exec_runtime, false);
+ exec_command_free_array(s->exec_command, _SERVICE_EXEC_COMMAND_MAX);
+ s->control_command = NULL;
+ s->main_command = NULL;
+
+ dynamic_creds_unref(&s->dynamic_creds);
+
+ exit_status_set_free(&s->restart_prevent_status);
+ exit_status_set_free(&s->restart_force_status);
+ exit_status_set_free(&s->success_status);
+
+ /* This will leak a process, but at least no memory or any of
+ * our resources */
+ service_unwatch_main_pid(s);
+ service_unwatch_control_pid(s);
+ service_unwatch_pid_file(s);
+
+ if (s->bus_name) {
+ unit_unwatch_bus_name(u, s->bus_name);
+ s->bus_name = mfree(s->bus_name);
+ }
+
+ s->bus_name_owner = mfree(s->bus_name_owner);
+
+ s->usb_function_descriptors = mfree(s->usb_function_descriptors);
+ s->usb_function_strings = mfree(s->usb_function_strings);
+
+ service_close_socket_fd(s);
+
+ unit_ref_unset(&s->accept_socket);
+
+ service_stop_watchdog(s);
+
+ s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+ s->exec_fd_event_source = sd_event_source_disable_unref(s->exec_fd_event_source);
+
+ s->bus_name_pid_lookup_slot = sd_bus_slot_unref(s->bus_name_pid_lookup_slot);
+
+ service_release_resources(u);
+}
+
+static int on_fd_store_io(sd_event_source *e, int fd, uint32_t revents, void *userdata) {
+ ServiceFDStore *fs = ASSERT_PTR(userdata);
+
+ assert(e);
+
+ /* If we get either EPOLLHUP or EPOLLERR, it's time to remove this entry from the fd store */
+ log_unit_debug(UNIT(fs->service),
+ "Received %s on stored fd %d (%s), closing.",
+ revents & EPOLLERR ? "EPOLLERR" : "EPOLLHUP",
+ fs->fd, strna(fs->fdname));
+ service_fd_store_unlink(fs);
+ return 0;
+}
+
+static int service_add_fd_store(Service *s, int fd, const char *name, bool do_poll) {
+ ServiceFDStore *fs;
+ int r;
+
+ /* fd is always consumed if we return >= 0 */
+
+ assert(s);
+ assert(fd >= 0);
+
+ if (s->n_fd_store >= s->n_fd_store_max)
+ return -EXFULL; /* Our store is full.
+ * Use this errno rather than E[NM]FILE to distinguish from
+ * the case where systemd itself hits the file limit. */
+
+ LIST_FOREACH(fd_store, i, s->fd_store) {
+ r = same_fd(i->fd, fd);
+ if (r < 0)
+ return r;
+ if (r > 0) {
+ safe_close(fd);
+ return 0; /* fd already included */
+ }
+ }
+
+ fs = new(ServiceFDStore, 1);
+ if (!fs)
+ return -ENOMEM;
+
+ *fs = (ServiceFDStore) {
+ .fd = fd,
+ .service = s,
+ .do_poll = do_poll,
+ .fdname = strdup(name ?: "stored"),
+ };
+
+ if (!fs->fdname) {
+ free(fs);
+ return -ENOMEM;
+ }
+
+ if (do_poll) {
+ r = sd_event_add_io(UNIT(s)->manager->event, &fs->event_source, fd, 0, on_fd_store_io, fs);
+ if (r < 0 && r != -EPERM) { /* EPERM indicates fds that aren't pollable, which is OK */
+ free(fs->fdname);
+ free(fs);
+ return r;
+ } else if (r >= 0)
+ (void) sd_event_source_set_description(fs->event_source, "service-fd-store");
+ }
+
+ LIST_PREPEND(fd_store, s->fd_store, fs);
+ s->n_fd_store++;
+
+ return 1; /* fd newly stored */
+}
+
+static int service_add_fd_store_set(Service *s, FDSet *fds, const char *name, bool do_poll) {
+ int r;
+
+ assert(s);
+
+ while (fdset_size(fds) > 0) {
+ _cleanup_close_ int fd = -1;
+
+ fd = fdset_steal_first(fds);
+ if (fd < 0)
+ break;
+
+ r = service_add_fd_store(s, fd, name, do_poll);
+ if (r == -EXFULL)
+ return log_unit_warning_errno(UNIT(s), r,
+ "Cannot store more fds than FileDescriptorStoreMax=%u, closing remaining.",
+ s->n_fd_store_max);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r, "Failed to add fd to store: %m");
+ if (r > 0)
+ log_unit_debug(UNIT(s), "Added fd %i (%s) to fd store.", fd, strna(name));
+ fd = -1;
+ }
+
+ return 0;
+}
+
+static void service_remove_fd_store(Service *s, const char *name) {
+ assert(s);
+ assert(name);
+
+ LIST_FOREACH(fd_store, fs, s->fd_store) {
+ if (!streq(fs->fdname, name))
+ continue;
+
+ log_unit_debug(UNIT(s), "Got explicit request to remove fd %i (%s), closing.", fs->fd, name);
+ service_fd_store_unlink(fs);
+ }
+}
+
+static usec_t service_running_timeout(Service *s) {
+ usec_t delta = 0;
+
+ assert(s);
+
+ if (s->runtime_rand_extra_usec != 0) {
+ delta = random_u64_range(s->runtime_rand_extra_usec);
+ log_unit_debug(UNIT(s), "Adding delta of %s sec to timeout", FORMAT_TIMESPAN(delta, USEC_PER_SEC));
+ }
+
+ return usec_add(usec_add(UNIT(s)->active_enter_timestamp.monotonic,
+ s->runtime_max_usec),
+ delta);
+}
+
+static int service_arm_timer(Service *s, usec_t usec) {
+ int r;
+
+ assert(s);
+
+ if (s->timer_event_source) {
+ r = sd_event_source_set_time(s->timer_event_source, usec);
+ if (r < 0)
+ return r;
+
+ return sd_event_source_set_enabled(s->timer_event_source, SD_EVENT_ONESHOT);
+ }
+
+ if (usec == USEC_INFINITY)
+ return 0;
+
+ r = sd_event_add_time(
+ UNIT(s)->manager->event,
+ &s->timer_event_source,
+ CLOCK_MONOTONIC,
+ usec, 0,
+ service_dispatch_timer, s);
+ if (r < 0)
+ return r;
+
+ (void) sd_event_source_set_description(s->timer_event_source, "service-timer");
+
+ return 0;
+}
+
+static int service_verify(Service *s) {
+ assert(s);
+ assert(UNIT(s)->load_state == UNIT_LOADED);
+
+ for (ServiceExecCommand c = 0; c < _SERVICE_EXEC_COMMAND_MAX; c++)
+ LIST_FOREACH(command, command, s->exec_command[c]) {
+ if (!path_is_absolute(command->path) && !filename_is_valid(command->path))
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC),
+ "Service %s= binary path \"%s\" is neither a valid executable name nor an absolute path. Refusing.",
+ command->path,
+ service_exec_command_to_string(c));
+ if (strv_isempty(command->argv))
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC),
+ "Service has an empty argv in %s=. Refusing.",
+ service_exec_command_to_string(c));
+ }
+
+ if (!s->exec_command[SERVICE_EXEC_START] && !s->exec_command[SERVICE_EXEC_STOP] &&
+ UNIT(s)->success_action == EMERGENCY_ACTION_NONE)
+ /* FailureAction= only makes sense if one of the start or stop commands is specified.
+ * SuccessAction= will be executed unconditionally if no commands are specified. Hence,
+ * either a command or SuccessAction= are required. */
+
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has no ExecStart=, ExecStop=, or SuccessAction=. Refusing.");
+
+ if (s->type != SERVICE_ONESHOT && !s->exec_command[SERVICE_EXEC_START])
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has no ExecStart= setting, which is only allowed for Type=oneshot services. Refusing.");
+
+ if (!s->remain_after_exit && !s->exec_command[SERVICE_EXEC_START] && UNIT(s)->success_action == EMERGENCY_ACTION_NONE)
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has no ExecStart= and no SuccessAction= settings and does not have RemainAfterExit=yes set. Refusing.");
+
+ if (s->type != SERVICE_ONESHOT && s->exec_command[SERVICE_EXEC_START]->command_next)
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has more than one ExecStart= setting, which is only allowed for Type=oneshot services. Refusing.");
+
+ if (s->type == SERVICE_ONESHOT &&
+ !IN_SET(s->restart, SERVICE_RESTART_NO, SERVICE_RESTART_ON_FAILURE, SERVICE_RESTART_ON_ABNORMAL, SERVICE_RESTART_ON_WATCHDOG, SERVICE_RESTART_ON_ABORT))
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has Restart= set to either always or on-success, which isn't allowed for Type=oneshot services. Refusing.");
+
+ if (s->type == SERVICE_ONESHOT && !exit_status_set_is_empty(&s->restart_force_status))
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has RestartForceExitStatus= set, which isn't allowed for Type=oneshot services. Refusing.");
+
+ if (s->type == SERVICE_ONESHOT && s->exit_type == SERVICE_EXIT_CGROUP)
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has ExitType=cgroup set, which isn't allowed for Type=oneshot services. Refusing.");
+
+ if (s->type == SERVICE_DBUS && !s->bus_name)
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service is of type D-Bus but no D-Bus service name has been specified. Refusing.");
+
+ if (s->exec_context.pam_name && !IN_SET(s->kill_context.kill_mode, KILL_CONTROL_GROUP, KILL_MIXED))
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has PAM enabled. Kill mode must be set to 'control-group' or 'mixed'. Refusing.");
+
+ if (s->usb_function_descriptors && !s->usb_function_strings)
+ log_unit_warning(UNIT(s), "Service has USBFunctionDescriptors= setting, but no USBFunctionStrings=. Ignoring.");
+
+ if (!s->usb_function_descriptors && s->usb_function_strings)
+ log_unit_warning(UNIT(s), "Service has USBFunctionStrings= setting, but no USBFunctionDescriptors=. Ignoring.");
+
+ if (s->runtime_max_usec != USEC_INFINITY && s->type == SERVICE_ONESHOT)
+ log_unit_warning(UNIT(s), "RuntimeMaxSec= has no effect in combination with Type=oneshot. Ignoring.");
+
+ if (s->runtime_max_usec == USEC_INFINITY && s->runtime_rand_extra_usec != 0)
+ log_unit_warning(UNIT(s), "Service has RuntimeRandomizedExtraSec= setting, but no RuntimeMaxSec=. Ignoring.");
+
+ if (s->exit_type == SERVICE_EXIT_CGROUP && cg_unified() < CGROUP_UNIFIED_SYSTEMD)
+ log_unit_warning(UNIT(s), "Service has ExitType=cgroup set, but we are running with legacy cgroups v1, which might not work correctly. Continuing.");
+
+ return 0;
+}
+
+static int service_add_default_dependencies(Service *s) {
+ int r;
+
+ assert(s);
+
+ if (!UNIT(s)->default_dependencies)
+ return 0;
+
+ /* Add a number of automatic dependencies useful for the
+ * majority of services. */
+
+ if (MANAGER_IS_SYSTEM(UNIT(s)->manager)) {
+ /* First, pull in the really early boot stuff, and
+ * require it, so that we fail if we can't acquire
+ * it. */
+
+ r = unit_add_two_dependencies_by_name(UNIT(s), UNIT_AFTER, UNIT_REQUIRES, SPECIAL_SYSINIT_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+ if (r < 0)
+ return r;
+ } else {
+
+ /* In the --user instance there's no sysinit.target,
+ * in that case require basic.target instead. */
+
+ r = unit_add_dependency_by_name(UNIT(s), UNIT_REQUIRES, SPECIAL_BASIC_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+ if (r < 0)
+ return r;
+ }
+
+ /* Second, if the rest of the base system is in the same
+ * transaction, order us after it, but do not pull it in or
+ * even require it. */
+ r = unit_add_dependency_by_name(UNIT(s), UNIT_AFTER, SPECIAL_BASIC_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+ if (r < 0)
+ return r;
+
+ /* Third, add us in for normal shutdown. */
+ return unit_add_two_dependencies_by_name(UNIT(s), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+}
+
+static void service_fix_stdio(Service *s) {
+ assert(s);
+
+ /* Note that EXEC_INPUT_NULL and EXEC_OUTPUT_INHERIT play a special role here: they are both the
+ * default value that is subject to automatic overriding triggered by other settings and an explicit
+ * choice the user can make. We don't distinguish between these cases currently. */
+
+ if (s->exec_context.std_input == EXEC_INPUT_NULL &&
+ s->exec_context.stdin_data_size > 0)
+ s->exec_context.std_input = EXEC_INPUT_DATA;
+
+ if (IN_SET(s->exec_context.std_input,
+ EXEC_INPUT_TTY,
+ EXEC_INPUT_TTY_FORCE,
+ EXEC_INPUT_TTY_FAIL,
+ EXEC_INPUT_SOCKET,
+ EXEC_INPUT_NAMED_FD))
+ return;
+
+ /* We assume these listed inputs refer to bidirectional streams, and hence duplicating them from
+ * stdin to stdout/stderr makes sense and hence leaving EXEC_OUTPUT_INHERIT in place makes sense,
+ * too. Outputs such as regular files or sealed data memfds otoh don't really make sense to be
+ * duplicated for both input and output at the same time (since they then would cause a feedback
+ * loop), hence override EXEC_OUTPUT_INHERIT with the default stderr/stdout setting. */
+
+ if (s->exec_context.std_error == EXEC_OUTPUT_INHERIT &&
+ s->exec_context.std_output == EXEC_OUTPUT_INHERIT)
+ s->exec_context.std_error = UNIT(s)->manager->default_std_error;
+
+ if (s->exec_context.std_output == EXEC_OUTPUT_INHERIT)
+ s->exec_context.std_output = UNIT(s)->manager->default_std_output;
+}
+
+static int service_setup_bus_name(Service *s) {
+ int r;
+
+ assert(s);
+
+ /* If s->bus_name is not set, then the unit will be refused by service_verify() later. */
+ if (!s->bus_name)
+ return 0;
+
+ if (s->type == SERVICE_DBUS) {
+ r = unit_add_dependency_by_name(UNIT(s), UNIT_REQUIRES, SPECIAL_DBUS_SOCKET, true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r, "Failed to add dependency on " SPECIAL_DBUS_SOCKET ": %m");
+
+ /* We always want to be ordered against dbus.socket if both are in the transaction. */
+ r = unit_add_dependency_by_name(UNIT(s), UNIT_AFTER, SPECIAL_DBUS_SOCKET, true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r, "Failed to add dependency on " SPECIAL_DBUS_SOCKET ": %m");
+ }
+
+ r = unit_watch_bus_name(UNIT(s), s->bus_name);
+ if (r == -EEXIST)
+ return log_unit_error_errno(UNIT(s), r, "Two services allocated for the same bus name %s, refusing operation.", s->bus_name);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r, "Cannot watch bus name %s: %m", s->bus_name);
+
+ return 0;
+}
+
+static int service_add_extras(Service *s) {
+ int r;
+
+ assert(s);
+
+ if (s->type == _SERVICE_TYPE_INVALID) {
+ /* Figure out a type automatically */
+ if (s->bus_name)
+ s->type = SERVICE_DBUS;
+ else if (s->exec_command[SERVICE_EXEC_START])
+ s->type = SERVICE_SIMPLE;
+ else
+ s->type = SERVICE_ONESHOT;
+ }
+
+ /* Oneshot services have disabled start timeout by default */
+ if (s->type == SERVICE_ONESHOT && !s->start_timeout_defined)
+ s->timeout_start_usec = USEC_INFINITY;
+
+ service_fix_stdio(s);
+
+ r = unit_patch_contexts(UNIT(s));
+ if (r < 0)
+ return r;
+
+ r = unit_add_exec_dependencies(UNIT(s), &s->exec_context);
+ if (r < 0)
+ return r;
+
+ r = unit_set_default_slice(UNIT(s));
+ if (r < 0)
+ return r;
+
+ /* If the service needs the notify socket, let's enable it automatically. */
+ if (s->notify_access == NOTIFY_NONE &&
+ (s->type == SERVICE_NOTIFY || s->watchdog_usec > 0 || s->n_fd_store_max > 0))
+ s->notify_access = NOTIFY_MAIN;
+
+ /* If no OOM policy was explicitly set, then default to the configure default OOM policy. Except when
+ * delegation is on, in that case it we assume the payload knows better what to do and can process
+ * things in a more focused way. */
+ if (s->oom_policy < 0)
+ s->oom_policy = s->cgroup_context.delegate ? OOM_CONTINUE : UNIT(s)->manager->default_oom_policy;
+
+ /* Let the kernel do the killing if that's requested. */
+ s->cgroup_context.memory_oom_group = s->oom_policy == OOM_KILL;
+
+ r = service_add_default_dependencies(s);
+ if (r < 0)
+ return r;
+
+ r = service_setup_bus_name(s);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+static int service_load(Unit *u) {
+ Service *s = SERVICE(u);
+ int r;
+
+ r = unit_load_fragment_and_dropin(u, true);
+ if (r < 0)
+ return r;
+
+ if (u->load_state != UNIT_LOADED)
+ return 0;
+
+ /* This is a new unit? Then let's add in some extras */
+ r = service_add_extras(s);
+ if (r < 0)
+ return r;
+
+ return service_verify(s);
+}
+
+static void service_dump(Unit *u, FILE *f, const char *prefix) {
+ ServiceExecCommand c;
+ Service *s = SERVICE(u);
+ const char *prefix2;
+
+ assert(s);
+
+ prefix = strempty(prefix);
+ prefix2 = strjoina(prefix, "\t");
+
+ fprintf(f,
+ "%sService State: %s\n"
+ "%sResult: %s\n"
+ "%sReload Result: %s\n"
+ "%sClean Result: %s\n"
+ "%sPermissionsStartOnly: %s\n"
+ "%sRootDirectoryStartOnly: %s\n"
+ "%sRemainAfterExit: %s\n"
+ "%sGuessMainPID: %s\n"
+ "%sType: %s\n"
+ "%sRestart: %s\n"
+ "%sNotifyAccess: %s\n"
+ "%sNotifyState: %s\n"
+ "%sOOMPolicy: %s\n",
+ prefix, service_state_to_string(s->state),
+ prefix, service_result_to_string(s->result),
+ prefix, service_result_to_string(s->reload_result),
+ prefix, service_result_to_string(s->clean_result),
+ prefix, yes_no(s->permissions_start_only),
+ prefix, yes_no(s->root_directory_start_only),
+ prefix, yes_no(s->remain_after_exit),
+ prefix, yes_no(s->guess_main_pid),
+ prefix, service_type_to_string(s->type),
+ prefix, service_restart_to_string(s->restart),
+ prefix, notify_access_to_string(s->notify_access),
+ prefix, notify_state_to_string(s->notify_state),
+ prefix, oom_policy_to_string(s->oom_policy));
+
+ if (s->control_pid > 0)
+ fprintf(f,
+ "%sControl PID: "PID_FMT"\n",
+ prefix, s->control_pid);
+
+ if (s->main_pid > 0)
+ fprintf(f,
+ "%sMain PID: "PID_FMT"\n"
+ "%sMain PID Known: %s\n"
+ "%sMain PID Alien: %s\n",
+ prefix, s->main_pid,
+ prefix, yes_no(s->main_pid_known),
+ prefix, yes_no(s->main_pid_alien));
+
+ if (s->pid_file)
+ fprintf(f,
+ "%sPIDFile: %s\n",
+ prefix, s->pid_file);
+
+ if (s->bus_name)
+ fprintf(f,
+ "%sBusName: %s\n"
+ "%sBus Name Good: %s\n",
+ prefix, s->bus_name,
+ prefix, yes_no(s->bus_name_good));
+
+ if (UNIT_ISSET(s->accept_socket))
+ fprintf(f,
+ "%sAccept Socket: %s\n",
+ prefix, UNIT_DEREF(s->accept_socket)->id);
+
+ fprintf(f,
+ "%sRestartSec: %s\n"
+ "%sTimeoutStartSec: %s\n"
+ "%sTimeoutStopSec: %s\n"
+ "%sTimeoutStartFailureMode: %s\n"
+ "%sTimeoutStopFailureMode: %s\n",
+ prefix, FORMAT_TIMESPAN(s->restart_usec, USEC_PER_SEC),
+ prefix, FORMAT_TIMESPAN(s->timeout_start_usec, USEC_PER_SEC),
+ prefix, FORMAT_TIMESPAN(s->timeout_stop_usec, USEC_PER_SEC),
+ prefix, service_timeout_failure_mode_to_string(s->timeout_start_failure_mode),
+ prefix, service_timeout_failure_mode_to_string(s->timeout_stop_failure_mode));
+
+ if (s->timeout_abort_set)
+ fprintf(f,
+ "%sTimeoutAbortSec: %s\n",
+ prefix, FORMAT_TIMESPAN(s->timeout_abort_usec, USEC_PER_SEC));
+
+ fprintf(f,
+ "%sRuntimeMaxSec: %s\n"
+ "%sRuntimeRandomizedExtraSec: %s\n"
+ "%sWatchdogSec: %s\n",
+ prefix, FORMAT_TIMESPAN(s->runtime_max_usec, USEC_PER_SEC),
+ prefix, FORMAT_TIMESPAN(s->runtime_rand_extra_usec, USEC_PER_SEC),
+ prefix, FORMAT_TIMESPAN(s->watchdog_usec, USEC_PER_SEC));
+
+ kill_context_dump(&s->kill_context, f, prefix);
+ exec_context_dump(&s->exec_context, f, prefix);
+
+ for (c = 0; c < _SERVICE_EXEC_COMMAND_MAX; c++) {
+
+ if (!s->exec_command[c])
+ continue;
+
+ fprintf(f, "%s-> %s:\n",
+ prefix, service_exec_command_to_string(c));
+
+ exec_command_dump_list(s->exec_command[c], f, prefix2);
+ }
+
+ if (s->status_text)
+ fprintf(f, "%sStatus Text: %s\n",
+ prefix, s->status_text);
+
+ if (s->n_fd_store_max > 0)
+ fprintf(f,
+ "%sFile Descriptor Store Max: %u\n"
+ "%sFile Descriptor Store Current: %zu\n",
+ prefix, s->n_fd_store_max,
+ prefix, s->n_fd_store);
+
+ cgroup_context_dump(UNIT(s), f, prefix);
+}
+
+static int service_is_suitable_main_pid(Service *s, pid_t pid, int prio) {
+ Unit *owner;
+
+ assert(s);
+ assert(pid_is_valid(pid));
+
+ /* Checks whether the specified PID is suitable as main PID for this service. returns negative if not, 0 if the
+ * PID is questionnable but should be accepted if the source of configuration is trusted. > 0 if the PID is
+ * good */
+
+ if (pid == getpid_cached() || pid == 1)
+ return log_unit_full_errno(UNIT(s), prio, SYNTHETIC_ERRNO(EPERM), "New main PID "PID_FMT" is the manager, refusing.", pid);
+
+ if (pid == s->control_pid)
+ return log_unit_full_errno(UNIT(s), prio, SYNTHETIC_ERRNO(EPERM), "New main PID "PID_FMT" is the control process, refusing.", pid);
+
+ if (!pid_is_alive(pid))
+ return log_unit_full_errno(UNIT(s), prio, SYNTHETIC_ERRNO(ESRCH), "New main PID "PID_FMT" does not exist or is a zombie.", pid);
+
+ owner = manager_get_unit_by_pid(UNIT(s)->manager, pid);
+ if (owner == UNIT(s)) {
+ log_unit_debug(UNIT(s), "New main PID "PID_FMT" belongs to service, we are happy.", pid);
+ return 1; /* Yay, it's definitely a good PID */
+ }
+
+ return 0; /* Hmm it's a suspicious PID, let's accept it if configuration source is trusted */
+}
+
+static int service_load_pid_file(Service *s, bool may_warn) {
+ bool questionable_pid_file = false;
+ _cleanup_free_ char *k = NULL;
+ _cleanup_close_ int fd = -1;
+ int r, prio;
+ pid_t pid;
+
+ assert(s);
+
+ if (!s->pid_file)
+ return -ENOENT;
+
+ prio = may_warn ? LOG_INFO : LOG_DEBUG;
+
+ r = chase_symlinks(s->pid_file, NULL, CHASE_SAFE, NULL, &fd);
+ if (r == -ENOLINK) {
+ log_unit_debug_errno(UNIT(s), r,
+ "Potentially unsafe symlink chain, will now retry with relaxed checks: %s", s->pid_file);
+
+ questionable_pid_file = true;
+
+ r = chase_symlinks(s->pid_file, NULL, 0, NULL, &fd);
+ }
+ if (r < 0)
+ return log_unit_full_errno(UNIT(s), prio, r,
+ "Can't open PID file %s (yet?) after %s: %m", s->pid_file, service_state_to_string(s->state));
+
+ /* Let's read the PID file now that we chased it down. But we need to convert the O_PATH fd
+ * chase_symlinks() returned us into a proper fd first. */
+ r = read_one_line_file(FORMAT_PROC_FD_PATH(fd), &k);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r,
+ "Can't convert PID files %s O_PATH file descriptor to proper file descriptor: %m",
+ s->pid_file);
+
+ r = parse_pid(k, &pid);
+ if (r < 0)
+ return log_unit_full_errno(UNIT(s), prio, r, "Failed to parse PID from file %s: %m", s->pid_file);
+
+ if (s->main_pid_known && pid == s->main_pid)
+ return 0;
+
+ r = service_is_suitable_main_pid(s, pid, prio);
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ struct stat st;
+
+ if (questionable_pid_file)
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(EPERM),
+ "Refusing to accept PID outside of service control group, acquired through unsafe symlink chain: %s", s->pid_file);
+
+ /* Hmm, it's not clear if the new main PID is safe. Let's allow this if the PID file is owned by root */
+
+ if (fstat(fd, &st) < 0)
+ return log_unit_error_errno(UNIT(s), errno, "Failed to fstat() PID file O_PATH fd: %m");
+
+ if (st.st_uid != 0)
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(EPERM),
+ "New main PID "PID_FMT" does not belong to service, and PID file is not owned by root. Refusing.", pid);
+
+ log_unit_debug(UNIT(s), "New main PID "PID_FMT" does not belong to service, but we'll accept it since PID file is owned by root.", pid);
+ }
+
+ if (s->main_pid_known) {
+ log_unit_debug(UNIT(s), "Main PID changing: "PID_FMT" -> "PID_FMT, s->main_pid, pid);
+
+ service_unwatch_main_pid(s);
+ s->main_pid_known = false;
+ } else
+ log_unit_debug(UNIT(s), "Main PID loaded: "PID_FMT, pid);
+
+ r = service_set_main_pid(s, pid);
+ if (r < 0)
+ return r;
+
+ r = unit_watch_pid(UNIT(s), pid, false);
+ if (r < 0) /* FIXME: we need to do something here */
+ return log_unit_warning_errno(UNIT(s), r, "Failed to watch PID "PID_FMT" for service: %m", pid);
+
+ return 1;
+}
+
+static void service_search_main_pid(Service *s) {
+ pid_t pid = 0;
+ int r;
+
+ assert(s);
+
+ /* If we know it anyway, don't ever fall back to unreliable
+ * heuristics */
+ if (s->main_pid_known)
+ return;
+
+ if (!s->guess_main_pid)
+ return;
+
+ assert(s->main_pid <= 0);
+
+ if (unit_search_main_pid(UNIT(s), &pid) < 0)
+ return;
+
+ log_unit_debug(UNIT(s), "Main PID guessed: "PID_FMT, pid);
+ if (service_set_main_pid(s, pid) < 0)
+ return;
+
+ r = unit_watch_pid(UNIT(s), pid, false);
+ if (r < 0)
+ /* FIXME: we need to do something here */
+ log_unit_warning_errno(UNIT(s), r, "Failed to watch PID "PID_FMT" from: %m", pid);
+}
+
+static void service_set_state(Service *s, ServiceState state) {
+ ServiceState old_state;
+ const UnitActiveState *table;
+
+ assert(s);
+
+ if (s->state != state)
+ bus_unit_send_pending_change_signal(UNIT(s), false);
+
+ table = s->type == SERVICE_IDLE ? state_translation_table_idle : state_translation_table;
+
+ old_state = s->state;
+ s->state = state;
+
+ service_unwatch_pid_file(s);
+
+ if (!IN_SET(state,
+ SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
+ SERVICE_RUNNING,
+ SERVICE_RELOAD,
+ SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
+ SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL,
+ SERVICE_AUTO_RESTART,
+ SERVICE_CLEANING))
+ s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+
+ if (!IN_SET(state,
+ SERVICE_START, SERVICE_START_POST,
+ SERVICE_RUNNING, SERVICE_RELOAD,
+ SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
+ SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) {
+ service_unwatch_main_pid(s);
+ s->main_command = NULL;
+ }
+
+ if (!IN_SET(state,
+ SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
+ SERVICE_RELOAD,
+ SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
+ SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL,
+ SERVICE_CLEANING)) {
+ service_unwatch_control_pid(s);
+ s->control_command = NULL;
+ s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID;
+ }
+
+ if (IN_SET(state, SERVICE_DEAD, SERVICE_FAILED, SERVICE_AUTO_RESTART)) {
+ unit_unwatch_all_pids(UNIT(s));
+ unit_dequeue_rewatch_pids(UNIT(s));
+ }
+
+ if (!IN_SET(state,
+ SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
+ SERVICE_RUNNING, SERVICE_RELOAD,
+ SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
+ SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL) &&
+ !(state == SERVICE_DEAD && UNIT(s)->job))
+ service_close_socket_fd(s);
+
+ if (state != SERVICE_START)
+ s->exec_fd_event_source = sd_event_source_disable_unref(s->exec_fd_event_source);
+
+ if (!IN_SET(state, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD))
+ service_stop_watchdog(s);
+
+ /* For the inactive states unit_notify() will trim the cgroup,
+ * but for exit we have to do that ourselves... */
+ if (state == SERVICE_EXITED && !MANAGER_IS_RELOADING(UNIT(s)->manager))
+ unit_prune_cgroup(UNIT(s));
+
+ if (old_state != state)
+ log_unit_debug(UNIT(s), "Changed %s -> %s", service_state_to_string(old_state), service_state_to_string(state));
+
+ unit_notify(UNIT(s), table[old_state], table[state],
+ (s->reload_result == SERVICE_SUCCESS ? 0 : UNIT_NOTIFY_RELOAD_FAILURE) |
+ (s->will_auto_restart ? UNIT_NOTIFY_WILL_AUTO_RESTART : 0));
+}
+
+static usec_t service_coldplug_timeout(Service *s) {
+ assert(s);
+
+ switch (s->deserialized_state) {
+
+ case SERVICE_CONDITION:
+ case SERVICE_START_PRE:
+ case SERVICE_START:
+ case SERVICE_START_POST:
+ case SERVICE_RELOAD:
+ return usec_add(UNIT(s)->state_change_timestamp.monotonic, s->timeout_start_usec);
+
+ case SERVICE_RUNNING:
+ return service_running_timeout(s);
+
+ case SERVICE_STOP:
+ case SERVICE_STOP_SIGTERM:
+ case SERVICE_STOP_SIGKILL:
+ case SERVICE_STOP_POST:
+ case SERVICE_FINAL_SIGTERM:
+ case SERVICE_FINAL_SIGKILL:
+ return usec_add(UNIT(s)->state_change_timestamp.monotonic, s->timeout_stop_usec);
+
+ case SERVICE_STOP_WATCHDOG:
+ case SERVICE_FINAL_WATCHDOG:
+ return usec_add(UNIT(s)->state_change_timestamp.monotonic, service_timeout_abort_usec(s));
+
+ case SERVICE_AUTO_RESTART:
+ return usec_add(UNIT(s)->inactive_enter_timestamp.monotonic, s->restart_usec);
+
+ case SERVICE_CLEANING:
+ return usec_add(UNIT(s)->state_change_timestamp.monotonic, s->exec_context.timeout_clean_usec);
+
+ default:
+ return USEC_INFINITY;
+ }
+}
+
+static int service_coldplug(Unit *u) {
+ Service *s = SERVICE(u);
+ int r;
+
+ assert(s);
+ assert(s->state == SERVICE_DEAD);
+
+ if (s->deserialized_state == s->state)
+ return 0;
+
+ r = service_arm_timer(s, service_coldplug_timeout(s));
+ if (r < 0)
+ return r;
+
+ if (s->main_pid > 0 &&
+ pid_is_unwaited(s->main_pid) &&
+ (IN_SET(s->deserialized_state,
+ SERVICE_START, SERVICE_START_POST,
+ SERVICE_RUNNING, SERVICE_RELOAD,
+ SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
+ SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL))) {
+ r = unit_watch_pid(UNIT(s), s->main_pid, false);
+ if (r < 0)
+ return r;
+ }
+
+ if (s->control_pid > 0 &&
+ pid_is_unwaited(s->control_pid) &&
+ IN_SET(s->deserialized_state,
+ SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
+ SERVICE_RELOAD,
+ SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
+ SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL,
+ SERVICE_CLEANING)) {
+ r = unit_watch_pid(UNIT(s), s->control_pid, false);
+ if (r < 0)
+ return r;
+ }
+
+ if (!IN_SET(s->deserialized_state, SERVICE_DEAD, SERVICE_FAILED, SERVICE_AUTO_RESTART, SERVICE_CLEANING)) {
+ (void) unit_enqueue_rewatch_pids(u);
+ (void) unit_setup_dynamic_creds(u);
+ (void) unit_setup_exec_runtime(u);
+ }
+
+ if (IN_SET(s->deserialized_state, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD))
+ service_start_watchdog(s);
+
+ if (UNIT_ISSET(s->accept_socket)) {
+ Socket* socket = SOCKET(UNIT_DEREF(s->accept_socket));
+
+ if (socket->max_connections_per_source > 0) {
+ SocketPeer *peer;
+
+ /* Make a best-effort attempt at bumping the connection count */
+ if (socket_acquire_peer(socket, s->socket_fd, &peer) > 0) {
+ socket_peer_unref(s->socket_peer);
+ s->socket_peer = peer;
+ }
+ }
+ }
+
+ service_set_state(s, s->deserialized_state);
+ return 0;
+}
+
+static int service_collect_fds(
+ Service *s,
+ int **fds,
+ char ***fd_names,
+ size_t *n_socket_fds,
+ size_t *n_storage_fds) {
+
+ _cleanup_strv_free_ char **rfd_names = NULL;
+ _cleanup_free_ int *rfds = NULL;
+ size_t rn_socket_fds = 0, rn_storage_fds = 0;
+ int r;
+
+ assert(s);
+ assert(fds);
+ assert(fd_names);
+ assert(n_socket_fds);
+ assert(n_storage_fds);
+
+ if (s->socket_fd >= 0) {
+
+ /* Pass the per-connection socket */
+
+ rfds = new(int, 1);
+ if (!rfds)
+ return -ENOMEM;
+ rfds[0] = s->socket_fd;
+
+ rfd_names = strv_new("connection");
+ if (!rfd_names)
+ return -ENOMEM;
+
+ rn_socket_fds = 1;
+ } else {
+ Unit *u;
+
+ /* Pass all our configured sockets for singleton services */
+
+ UNIT_FOREACH_DEPENDENCY(u, UNIT(s), UNIT_ATOM_TRIGGERED_BY) {
+ _cleanup_free_ int *cfds = NULL;
+ Socket *sock;
+ int cn_fds;
+
+ if (u->type != UNIT_SOCKET)
+ continue;
+
+ sock = SOCKET(u);
+
+ cn_fds = socket_collect_fds(sock, &cfds);
+ if (cn_fds < 0)
+ return cn_fds;
+
+ if (cn_fds <= 0)
+ continue;
+
+ if (!rfds) {
+ rfds = TAKE_PTR(cfds);
+ rn_socket_fds = cn_fds;
+ } else {
+ int *t;
+
+ t = reallocarray(rfds, rn_socket_fds + cn_fds, sizeof(int));
+ if (!t)
+ return -ENOMEM;
+
+ memcpy(t + rn_socket_fds, cfds, cn_fds * sizeof(int));
+
+ rfds = t;
+ rn_socket_fds += cn_fds;
+ }
+
+ r = strv_extend_n(&rfd_names, socket_fdname(sock), cn_fds);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ if (s->n_fd_store > 0) {
+ size_t n_fds;
+ char **nl;
+ int *t;
+
+ t = reallocarray(rfds, rn_socket_fds + s->n_fd_store, sizeof(int));
+ if (!t)
+ return -ENOMEM;
+
+ rfds = t;
+
+ nl = reallocarray(rfd_names, rn_socket_fds + s->n_fd_store + 1, sizeof(char *));
+ if (!nl)
+ return -ENOMEM;
+
+ rfd_names = nl;
+ n_fds = rn_socket_fds;
+
+ LIST_FOREACH(fd_store, fs, s->fd_store) {
+ rfds[n_fds] = fs->fd;
+ rfd_names[n_fds] = strdup(strempty(fs->fdname));
+ if (!rfd_names[n_fds])
+ return -ENOMEM;
+
+ rn_storage_fds++;
+ n_fds++;
+ }
+
+ rfd_names[n_fds] = NULL;
+ }
+
+ *fds = TAKE_PTR(rfds);
+ *fd_names = TAKE_PTR(rfd_names);
+ *n_socket_fds = rn_socket_fds;
+ *n_storage_fds = rn_storage_fds;
+
+ return 0;
+}
+
+static int service_allocate_exec_fd_event_source(
+ Service *s,
+ int fd,
+ sd_event_source **ret_event_source) {
+
+ _cleanup_(sd_event_source_unrefp) sd_event_source *source = NULL;
+ int r;
+
+ assert(s);
+ assert(fd >= 0);
+ assert(ret_event_source);
+
+ r = sd_event_add_io(UNIT(s)->manager->event, &source, fd, 0, service_dispatch_exec_io, s);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r, "Failed to allocate exec_fd event source: %m");
+
+ /* This is a bit lower priority than SIGCHLD, as that carries a lot more interesting failure information */
+
+ r = sd_event_source_set_priority(source, SD_EVENT_PRIORITY_NORMAL-3);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r, "Failed to adjust priority of exec_fd event source: %m");
+
+ (void) sd_event_source_set_description(source, "service exec_fd");
+
+ r = sd_event_source_set_io_fd_own(source, true);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r, "Failed to pass ownership of fd to event source: %m");
+
+ *ret_event_source = TAKE_PTR(source);
+ return 0;
+}
+
+static int service_allocate_exec_fd(
+ Service *s,
+ sd_event_source **ret_event_source,
+ int *ret_exec_fd) {
+
+ _cleanup_close_pair_ int p[] = { -1, -1 };
+ int r;
+
+ assert(s);
+ assert(ret_event_source);
+ assert(ret_exec_fd);
+
+ if (pipe2(p, O_CLOEXEC|O_NONBLOCK) < 0)
+ return log_unit_error_errno(UNIT(s), errno, "Failed to allocate exec_fd pipe: %m");
+
+ r = service_allocate_exec_fd_event_source(s, p[0], ret_event_source);
+ if (r < 0)
+ return r;
+
+ TAKE_FD(p[0]);
+ *ret_exec_fd = TAKE_FD(p[1]);
+
+ return 0;
+}
+
+static bool service_exec_needs_notify_socket(Service *s, ExecFlags flags) {
+ assert(s);
+
+ /* Notifications are accepted depending on the process and
+ * the access setting of the service:
+ * process: \ access: NONE MAIN EXEC ALL
+ * main no yes yes yes
+ * control no no yes yes
+ * other (forked) no no no yes */
+
+ if (flags & EXEC_IS_CONTROL)
+ /* A control process */
+ return IN_SET(s->notify_access, NOTIFY_EXEC, NOTIFY_ALL);
+
+ /* We only spawn main processes and control processes, so any
+ * process that is not a control process is a main process */
+ return s->notify_access != NOTIFY_NONE;
+}
+
+static Service *service_get_triggering_service(Service *s) {
+ Unit *candidate = NULL, *other;
+
+ assert(s);
+
+ /* Return the service which triggered service 's', this means dependency
+ * types which include the UNIT_ATOM_ON_{FAILURE,SUCCESS}_OF atoms.
+ *
+ * N.B. if there are multiple services which could trigger 's' via OnFailure=
+ * or OnSuccess= then we return NULL. This is since we don't know from which
+ * one to propagate the exit status. */
+
+ UNIT_FOREACH_DEPENDENCY(other, UNIT(s), UNIT_ATOM_ON_FAILURE_OF) {
+ if (candidate)
+ goto have_other;
+ candidate = other;
+ }
+
+ UNIT_FOREACH_DEPENDENCY(other, UNIT(s), UNIT_ATOM_ON_SUCCESS_OF) {
+ if (candidate)
+ goto have_other;
+ candidate = other;
+ }
+
+ return SERVICE(candidate);
+
+ have_other:
+ log_unit_warning(UNIT(s), "multiple trigger source candidates for exit status propagation (%s, %s), skipping.",
+ candidate->id, other->id);
+ return NULL;
+}
+
+static int service_spawn_internal(
+ const char *caller,
+ Service *s,
+ ExecCommand *c,
+ usec_t timeout,
+ ExecFlags flags,
+ pid_t *ret_pid) {
+
+ _cleanup_(exec_params_clear) ExecParameters exec_params = {
+ .flags = flags,
+ .stdin_fd = -1,
+ .stdout_fd = -1,
+ .stderr_fd = -1,
+ .exec_fd = -1,
+ };
+ _cleanup_(sd_event_source_unrefp) sd_event_source *exec_fd_source = NULL;
+ _cleanup_strv_free_ char **final_env = NULL, **our_env = NULL;
+ size_t n_env = 0;
+ pid_t pid;
+ int r;
+
+ assert(caller);
+ assert(s);
+ assert(c);
+ assert(ret_pid);
+
+ log_unit_debug(UNIT(s), "Will spawn child (%s): %s", caller, c->path);
+
+ r = unit_prepare_exec(UNIT(s)); /* This realizes the cgroup, among other things */
+ if (r < 0)
+ return r;
+
+ assert(!s->exec_fd_event_source);
+
+ if (flags & EXEC_IS_CONTROL) {
+ /* If this is a control process, mask the permissions/chroot application if this is requested. */
+ if (s->permissions_start_only)
+ exec_params.flags &= ~EXEC_APPLY_SANDBOXING;
+ if (s->root_directory_start_only)
+ exec_params.flags &= ~EXEC_APPLY_CHROOT;
+ }
+
+ if ((flags & EXEC_PASS_FDS) ||
+ s->exec_context.std_input == EXEC_INPUT_SOCKET ||
+ s->exec_context.std_output == EXEC_OUTPUT_SOCKET ||
+ s->exec_context.std_error == EXEC_OUTPUT_SOCKET) {
+
+ r = service_collect_fds(s,
+ &exec_params.fds,
+ &exec_params.fd_names,
+ &exec_params.n_socket_fds,
+ &exec_params.n_storage_fds);
+ if (r < 0)
+ return r;
+
+ log_unit_debug(UNIT(s), "Passing %zu fds to service", exec_params.n_socket_fds + exec_params.n_storage_fds);
+ }
+
+ if (!FLAGS_SET(flags, EXEC_IS_CONTROL) && s->type == SERVICE_EXEC) {
+ r = service_allocate_exec_fd(s, &exec_fd_source, &exec_params.exec_fd);
+ if (r < 0)
+ return r;
+ }
+
+ r = service_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), timeout));
+ if (r < 0)
+ return r;
+
+ our_env = new0(char*, 12);
+ if (!our_env)
+ return -ENOMEM;
+
+ if (service_exec_needs_notify_socket(s, flags)) {
+ if (asprintf(our_env + n_env++, "NOTIFY_SOCKET=%s", UNIT(s)->manager->notify_socket) < 0)
+ return -ENOMEM;
+
+ exec_params.notify_socket = UNIT(s)->manager->notify_socket;
+ }
+
+ if (s->main_pid > 0)
+ if (asprintf(our_env + n_env++, "MAINPID="PID_FMT, s->main_pid) < 0)
+ return -ENOMEM;
+
+ if (MANAGER_IS_USER(UNIT(s)->manager))
+ if (asprintf(our_env + n_env++, "MANAGERPID="PID_FMT, getpid_cached()) < 0)
+ return -ENOMEM;
+
+ if (s->pid_file)
+ if (asprintf(our_env + n_env++, "PIDFILE=%s", s->pid_file) < 0)
+ return -ENOMEM;
+
+ if (s->socket_fd >= 0) {
+ union sockaddr_union sa;
+ socklen_t salen = sizeof(sa);
+
+ /* If this is a per-connection service instance, let's set $REMOTE_ADDR and $REMOTE_PORT to something
+ * useful. Note that we do this only when we are still connected at this point in time, which we might
+ * very well not be. Hence we ignore all errors when retrieving peer information (as that might result
+ * in ENOTCONN), and just use whate we can use. */
+
+ if (getpeername(s->socket_fd, &sa.sa, &salen) >= 0 &&
+ IN_SET(sa.sa.sa_family, AF_INET, AF_INET6, AF_VSOCK)) {
+ _cleanup_free_ char *addr = NULL;
+ char *t;
+ unsigned port;
+
+ r = sockaddr_pretty(&sa.sa, salen, true, false, &addr);
+ if (r < 0)
+ return r;
+
+ t = strjoin("REMOTE_ADDR=", addr);
+ if (!t)
+ return -ENOMEM;
+ our_env[n_env++] = t;
+
+ r = sockaddr_port(&sa.sa, &port);
+ if (r < 0)
+ return r;
+
+ if (asprintf(&t, "REMOTE_PORT=%u", port) < 0)
+ return -ENOMEM;
+ our_env[n_env++] = t;
+ }
+ }
+
+ Service *env_source = NULL;
+ const char *monitor_prefix;
+ if (flags & EXEC_SETENV_RESULT) {
+ env_source = s;
+ monitor_prefix = "";
+ } else if (flags & EXEC_SETENV_MONITOR_RESULT) {
+ env_source = service_get_triggering_service(s);
+ monitor_prefix = "MONITOR_";
+ }
+
+ if (env_source) {
+ if (asprintf(our_env + n_env++, "%sSERVICE_RESULT=%s", monitor_prefix, service_result_to_string(env_source->result)) < 0)
+ return -ENOMEM;
+
+ if (env_source->main_exec_status.pid > 0 &&
+ dual_timestamp_is_set(&env_source->main_exec_status.exit_timestamp)) {
+ if (asprintf(our_env + n_env++, "%sEXIT_CODE=%s", monitor_prefix, sigchld_code_to_string(env_source->main_exec_status.code)) < 0)
+ return -ENOMEM;
+
+ if (env_source->main_exec_status.code == CLD_EXITED)
+ r = asprintf(our_env + n_env++, "%sEXIT_STATUS=%i", monitor_prefix, env_source->main_exec_status.status);
+ else
+ r = asprintf(our_env + n_env++, "%sEXIT_STATUS=%s", monitor_prefix, signal_to_string(env_source->main_exec_status.status));
+
+ if (r < 0)
+ return -ENOMEM;
+ }
+
+ if (env_source != s) {
+ if (!sd_id128_is_null(UNIT(env_source)->invocation_id)) {
+ r = asprintf(our_env + n_env++, "%sINVOCATION_ID=" SD_ID128_FORMAT_STR,
+ monitor_prefix, SD_ID128_FORMAT_VAL(UNIT(env_source)->invocation_id));
+ if (r < 0)
+ return -ENOMEM;
+ }
+
+ if (asprintf(our_env + n_env++, "%sUNIT=%s", monitor_prefix, UNIT(env_source)->id) < 0)
+ return -ENOMEM;
+ }
+ }
+
+ if (UNIT(s)->activation_details) {
+ r = activation_details_append_env(UNIT(s)->activation_details, &our_env);
+ if (r < 0)
+ return r;
+ /* The number of env vars added here can vary, rather than keeping the allocation block in
+ * sync manually, these functions simply use the strv methods to append to it, so we need
+ * to update n_env when we are done in case of future usage. */
+ n_env += r;
+ }
+
+ r = unit_set_exec_params(UNIT(s), &exec_params);
+ if (r < 0)
+ return r;
+
+ final_env = strv_env_merge(exec_params.environment, our_env);
+ if (!final_env)
+ return -ENOMEM;
+
+ /* System D-Bus needs nss-systemd disabled, so that we don't deadlock */
+ SET_FLAG(exec_params.flags, EXEC_NSS_DYNAMIC_BYPASS,
+ MANAGER_IS_SYSTEM(UNIT(s)->manager) && unit_has_name(UNIT(s), SPECIAL_DBUS_SERVICE));
+
+ strv_free_and_replace(exec_params.environment, final_env);
+ exec_params.watchdog_usec = service_get_watchdog_usec(s);
+ exec_params.selinux_context_net = s->socket_fd_selinux_context_net;
+ if (s->type == SERVICE_IDLE)
+ exec_params.idle_pipe = UNIT(s)->manager->idle_pipe;
+ exec_params.stdin_fd = s->stdin_fd;
+ exec_params.stdout_fd = s->stdout_fd;
+ exec_params.stderr_fd = s->stderr_fd;
+
+ r = exec_spawn(UNIT(s),
+ c,
+ &s->exec_context,
+ &exec_params,
+ s->exec_runtime,
+ &s->dynamic_creds,
+ &pid);
+ if (r < 0)
+ return r;
+
+ s->exec_fd_event_source = TAKE_PTR(exec_fd_source);
+ s->exec_fd_hot = false;
+
+ r = unit_watch_pid(UNIT(s), pid, true);
+ if (r < 0)
+ return r;
+
+ *ret_pid = pid;
+
+ return 0;
+}
+
+static int main_pid_good(Service *s) {
+ assert(s);
+
+ /* Returns 0 if the pid is dead, > 0 if it is good, < 0 if we don't know */
+
+ /* If we know the pid file, then let's just check if it is
+ * still valid */
+ if (s->main_pid_known) {
+
+ /* If it's an alien child let's check if it is still
+ * alive ... */
+ if (s->main_pid_alien && s->main_pid > 0)
+ return pid_is_alive(s->main_pid);
+
+ /* .. otherwise assume we'll get a SIGCHLD for it,
+ * which we really should wait for to collect exit
+ * status and code */
+ return s->main_pid > 0;
+ }
+
+ /* We don't know the pid */
+ return -EAGAIN;
+}
+
+static int control_pid_good(Service *s) {
+ assert(s);
+
+ /* Returns 0 if the control PID is dead, > 0 if it is good. We never actually return < 0 here, but in order to
+ * make this function as similar as possible to main_pid_good() and cgroup_good(), we pretend that < 0 also
+ * means: we can't figure it out. */
+
+ return s->control_pid > 0;
+}
+
+static int cgroup_good(Service *s) {
+ int r;
+
+ assert(s);
+
+ /* Returns 0 if the cgroup is empty or doesn't exist, > 0 if it is exists and is populated, < 0 if we can't
+ * figure it out */
+
+ if (!UNIT(s)->cgroup_path)
+ return 0;
+
+ r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, UNIT(s)->cgroup_path);
+ if (r < 0)
+ return r;
+
+ return r == 0;
+}
+
+static bool service_shall_restart(Service *s, const char **reason) {
+ assert(s);
+
+ /* Don't restart after manual stops */
+ if (s->forbid_restart) {
+ *reason = "manual stop";
+ return false;
+ }
+
+ /* Never restart if this is configured as special exception */
+ if (exit_status_set_test(&s->restart_prevent_status, s->main_exec_status.code, s->main_exec_status.status)) {
+ *reason = "prevented by exit status";
+ return false;
+ }
+
+ /* Restart if the exit code/status are configured as restart triggers */
+ if (exit_status_set_test(&s->restart_force_status, s->main_exec_status.code, s->main_exec_status.status)) {
+ *reason = "forced by exit status";
+ return true;
+ }
+
+ *reason = "restart setting";
+ switch (s->restart) {
+
+ case SERVICE_RESTART_NO:
+ return false;
+
+ case SERVICE_RESTART_ALWAYS:
+ return s->result != SERVICE_SKIP_CONDITION;
+
+ case SERVICE_RESTART_ON_SUCCESS:
+ return s->result == SERVICE_SUCCESS;
+
+ case SERVICE_RESTART_ON_FAILURE:
+ return !IN_SET(s->result, SERVICE_SUCCESS, SERVICE_SKIP_CONDITION);
+
+ case SERVICE_RESTART_ON_ABNORMAL:
+ return !IN_SET(s->result, SERVICE_SUCCESS, SERVICE_FAILURE_EXIT_CODE, SERVICE_SKIP_CONDITION);
+
+ case SERVICE_RESTART_ON_WATCHDOG:
+ return s->result == SERVICE_FAILURE_WATCHDOG;
+
+ case SERVICE_RESTART_ON_ABORT:
+ return IN_SET(s->result, SERVICE_FAILURE_SIGNAL, SERVICE_FAILURE_CORE_DUMP);
+
+ default:
+ assert_not_reached();
+ }
+}
+
+static bool service_will_restart(Unit *u) {
+ Service *s = SERVICE(u);
+
+ assert(s);
+
+ if (s->will_auto_restart)
+ return true;
+ if (s->state == SERVICE_AUTO_RESTART)
+ return true;
+
+ return unit_will_restart_default(u);
+}
+
+static void service_enter_dead(Service *s, ServiceResult f, bool allow_restart) {
+ ServiceState end_state;
+ int r;
+
+ assert(s);
+
+ /* If there's a stop job queued before we enter the DEAD state, we shouldn't act on Restart=, in order to not
+ * undo what has already been enqueued. */
+ if (unit_stop_pending(UNIT(s)))
+ allow_restart = false;
+
+ if (s->result == SERVICE_SUCCESS)
+ s->result = f;
+
+ if (s->result == SERVICE_SUCCESS) {
+ unit_log_success(UNIT(s));
+ end_state = SERVICE_DEAD;
+ } else if (s->result == SERVICE_SKIP_CONDITION) {
+ unit_log_skip(UNIT(s), service_result_to_string(s->result));
+ end_state = SERVICE_DEAD;
+ } else {
+ unit_log_failure(UNIT(s), service_result_to_string(s->result));
+ end_state = SERVICE_FAILED;
+ }
+ unit_warn_leftover_processes(UNIT(s), unit_log_leftover_process_stop);
+
+ if (!allow_restart)
+ log_unit_debug(UNIT(s), "Service restart not allowed.");
+ else {
+ const char *reason;
+ bool shall_restart;
+
+ shall_restart = service_shall_restart(s, &reason);
+ log_unit_debug(UNIT(s), "Service will %srestart (%s)",
+ shall_restart ? "" : "not ",
+ reason);
+ if (shall_restart)
+ s->will_auto_restart = true;
+ }
+
+ /* Make sure service_release_resources() doesn't destroy our FD store, while we are changing through
+ * SERVICE_FAILED/SERVICE_DEAD before entering into SERVICE_AUTO_RESTART. */
+ s->n_keep_fd_store ++;
+
+ service_set_state(s, end_state);
+
+ if (s->will_auto_restart) {
+ s->will_auto_restart = false;
+
+ r = service_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->restart_usec));
+ if (r < 0) {
+ s->n_keep_fd_store--;
+ goto fail;
+ }
+
+ service_set_state(s, SERVICE_AUTO_RESTART);
+ } else
+ /* If we shan't restart, then flush out the restart counter. But don't do that immediately, so that the
+ * user can still introspect the counter. Do so on the next start. */
+ s->flush_n_restarts = true;
+
+ /* The new state is in effect, let's decrease the fd store ref counter again. Let's also re-add us to the GC
+ * queue, so that the fd store is possibly gc'ed again */
+ s->n_keep_fd_store--;
+ unit_add_to_gc_queue(UNIT(s));
+
+ /* The next restart might not be a manual stop, hence reset the flag indicating manual stops */
+ s->forbid_restart = false;
+
+ /* We want fresh tmpdirs in case service is started again immediately */
+ s->exec_runtime = exec_runtime_unref(s->exec_runtime, true);
+
+ /* Also, remove the runtime directory */
+ unit_destroy_runtime_data(UNIT(s), &s->exec_context);
+
+ /* Get rid of the IPC bits of the user */
+ unit_unref_uid_gid(UNIT(s), true);
+
+ /* Release the user, and destroy it if we are the only remaining owner */
+ dynamic_creds_destroy(&s->dynamic_creds);
+
+ /* Try to delete the pid file. At this point it will be
+ * out-of-date, and some software might be confused by it, so
+ * let's remove it. */
+ if (s->pid_file)
+ (void) unlink(s->pid_file);
+
+ /* Reset TTY ownership if necessary */
+ exec_context_revert_tty(&s->exec_context);
+
+ return;
+
+fail:
+ log_unit_warning_errno(UNIT(s), r, "Failed to run install restart timer: %m");
+ service_enter_dead(s, SERVICE_FAILURE_RESOURCES, false);
+}
+
+static void service_enter_stop_post(Service *s, ServiceResult f) {
+ int r;
+ assert(s);
+
+ if (s->result == SERVICE_SUCCESS)
+ s->result = f;
+
+ service_unwatch_control_pid(s);
+ (void) unit_enqueue_rewatch_pids(UNIT(s));
+
+ s->control_command = s->exec_command[SERVICE_EXEC_STOP_POST];
+ if (s->control_command) {
+ s->control_command_id = SERVICE_EXEC_STOP_POST;
+
+ r = service_spawn(s,
+ s->control_command,
+ s->timeout_stop_usec,
+ EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN|EXEC_IS_CONTROL|EXEC_SETENV_RESULT|EXEC_CONTROL_CGROUP,
+ &s->control_pid);
+ if (r < 0)
+ goto fail;
+
+ service_set_state(s, SERVICE_STOP_POST);
+ } else
+ service_enter_signal(s, SERVICE_FINAL_SIGTERM, SERVICE_SUCCESS);
+
+ return;
+
+fail:
+ log_unit_warning_errno(UNIT(s), r, "Failed to run 'stop-post' task: %m");
+ service_enter_signal(s, SERVICE_FINAL_SIGTERM, SERVICE_FAILURE_RESOURCES);
+}
+
+static int state_to_kill_operation(Service *s, ServiceState state) {
+ switch (state) {
+
+ case SERVICE_STOP_WATCHDOG:
+ case SERVICE_FINAL_WATCHDOG:
+ return KILL_WATCHDOG;
+
+ case SERVICE_STOP_SIGTERM:
+ if (unit_has_job_type(UNIT(s), JOB_RESTART))
+ return KILL_RESTART;
+ _fallthrough_;
+
+ case SERVICE_FINAL_SIGTERM:
+ return KILL_TERMINATE;
+
+ case SERVICE_STOP_SIGKILL:
+ case SERVICE_FINAL_SIGKILL:
+ return KILL_KILL;
+
+ default:
+ return _KILL_OPERATION_INVALID;
+ }
+}
+
+static void service_enter_signal(Service *s, ServiceState state, ServiceResult f) {
+ int kill_operation, r;
+
+ assert(s);
+
+ if (s->result == SERVICE_SUCCESS)
+ s->result = f;
+
+ /* Before sending any signal, make sure we track all members of this cgroup */
+ (void) unit_watch_all_pids(UNIT(s));
+
+ /* Also, enqueue a job that we recheck all our PIDs a bit later, given that it's likely some processes have
+ * died now */
+ (void) unit_enqueue_rewatch_pids(UNIT(s));
+
+ kill_operation = state_to_kill_operation(s, state);
+ r = unit_kill_context(
+ UNIT(s),
+ &s->kill_context,
+ kill_operation,
+ s->main_pid,
+ s->control_pid,
+ s->main_pid_alien);
+ if (r < 0)
+ goto fail;
+
+ if (r > 0) {
+ r = service_arm_timer(s, usec_add(now(CLOCK_MONOTONIC),
+ kill_operation == KILL_WATCHDOG ? service_timeout_abort_usec(s) : s->timeout_stop_usec));
+ if (r < 0)
+ goto fail;
+
+ service_set_state(s, state);
+ } else if (IN_SET(state, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM) && s->kill_context.send_sigkill)
+ service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_SUCCESS);
+ else if (IN_SET(state, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL))
+ service_enter_stop_post(s, SERVICE_SUCCESS);
+ else if (IN_SET(state, SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM) && s->kill_context.send_sigkill)
+ service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_SUCCESS);
+ else
+ service_enter_dead(s, SERVICE_SUCCESS, true);
+
+ return;
+
+fail:
+ log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m");
+
+ if (IN_SET(state, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL))
+ service_enter_stop_post(s, SERVICE_FAILURE_RESOURCES);
+ else
+ service_enter_dead(s, SERVICE_FAILURE_RESOURCES, true);
+}
+
+static void service_enter_stop_by_notify(Service *s) {
+ assert(s);
+
+ (void) unit_enqueue_rewatch_pids(UNIT(s));
+
+ service_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->timeout_stop_usec));
+
+ /* The service told us it's stopping, so it's as if we SIGTERM'd it. */
+ service_set_state(s, SERVICE_STOP_SIGTERM);
+}
+
+static void service_enter_stop(Service *s, ServiceResult f) {
+ int r;
+
+ assert(s);
+
+ if (s->result == SERVICE_SUCCESS)
+ s->result = f;
+
+ service_unwatch_control_pid(s);
+ (void) unit_enqueue_rewatch_pids(UNIT(s));
+
+ s->control_command = s->exec_command[SERVICE_EXEC_STOP];
+ if (s->control_command) {
+ s->control_command_id = SERVICE_EXEC_STOP;
+
+ r = service_spawn(s,
+ s->control_command,
+ s->timeout_stop_usec,
+ EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_SETENV_RESULT|EXEC_CONTROL_CGROUP,
+ &s->control_pid);
+ if (r < 0)
+ goto fail;
+
+ service_set_state(s, SERVICE_STOP);
+ } else
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_SUCCESS);
+
+ return;
+
+fail:
+ log_unit_warning_errno(UNIT(s), r, "Failed to run 'stop' task: %m");
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES);
+}
+
+static bool service_good(Service *s) {
+ int main_pid_ok;
+ assert(s);
+
+ if (s->type == SERVICE_DBUS && !s->bus_name_good)
+ return false;
+
+ main_pid_ok = main_pid_good(s);
+ if (main_pid_ok > 0) /* It's alive */
+ return true;
+ if (main_pid_ok == 0 && s->exit_type == SERVICE_EXIT_MAIN) /* It's dead */
+ return false;
+
+ /* OK, we don't know anything about the main PID, maybe
+ * because there is none. Let's check the control group
+ * instead. */
+
+ return cgroup_good(s) != 0;
+}
+
+static void service_enter_running(Service *s, ServiceResult f) {
+ assert(s);
+
+ if (s->result == SERVICE_SUCCESS)
+ s->result = f;
+
+ service_unwatch_control_pid(s);
+
+ if (s->result != SERVICE_SUCCESS)
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, f);
+ else if (service_good(s)) {
+
+ /* If there are any queued up sd_notify() notifications, process them now */
+ if (s->notify_state == NOTIFY_RELOADING)
+ service_enter_reload_by_notify(s);
+ else if (s->notify_state == NOTIFY_STOPPING)
+ service_enter_stop_by_notify(s);
+ else {
+ service_set_state(s, SERVICE_RUNNING);
+ service_arm_timer(s, service_running_timeout(s));
+ }
+
+ } else if (s->remain_after_exit)
+ service_set_state(s, SERVICE_EXITED);
+ else
+ service_enter_stop(s, SERVICE_SUCCESS);
+}
+
+static void service_enter_start_post(Service *s) {
+ int r;
+ assert(s);
+
+ service_unwatch_control_pid(s);
+ service_reset_watchdog(s);
+
+ s->control_command = s->exec_command[SERVICE_EXEC_START_POST];
+ if (s->control_command) {
+ s->control_command_id = SERVICE_EXEC_START_POST;
+
+ r = service_spawn(s,
+ s->control_command,
+ s->timeout_start_usec,
+ EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_CONTROL_CGROUP,
+ &s->control_pid);
+ if (r < 0)
+ goto fail;
+
+ service_set_state(s, SERVICE_START_POST);
+ } else
+ service_enter_running(s, SERVICE_SUCCESS);
+
+ return;
+
+fail:
+ log_unit_warning_errno(UNIT(s), r, "Failed to run 'start-post' task: %m");
+ service_enter_stop(s, SERVICE_FAILURE_RESOURCES);
+}
+
+static void service_kill_control_process(Service *s) {
+ int r;
+
+ assert(s);
+
+ if (s->control_pid <= 0)
+ return;
+
+ r = kill_and_sigcont(s->control_pid, SIGKILL);
+ if (r < 0) {
+ _cleanup_free_ char *comm = NULL;
+
+ (void) get_process_comm(s->control_pid, &comm);
+
+ log_unit_debug_errno(UNIT(s), r, "Failed to kill control process " PID_FMT " (%s), ignoring: %m",
+ s->control_pid, strna(comm));
+ }
+}
+
+static int service_adverse_to_leftover_processes(Service *s) {
+ assert(s);
+
+ /* KillMode=mixed and control group are used to indicate that all process should be killed off.
+ * SendSIGKILL= is used for services that require a clean shutdown. These are typically database
+ * service where a SigKilled process would result in a lengthy recovery and who's shutdown or startup
+ * time is quite variable (so Timeout settings aren't of use).
+ *
+ * Here we take these two factors and refuse to start a service if there are existing processes
+ * within a control group. Databases, while generally having some protection against multiple
+ * instances running, lets not stress the rigor of these. Also ExecStartPre= parts of the service
+ * aren't as rigoriously written to protect aganst against multiple use. */
+
+ if (unit_warn_leftover_processes(UNIT(s), unit_log_leftover_process_start) > 0 &&
+ IN_SET(s->kill_context.kill_mode, KILL_MIXED, KILL_CONTROL_GROUP) &&
+ !s->kill_context.send_sigkill)
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(EBUSY),
+ "Will not start SendSIGKILL=no service of type KillMode=control-group or mixed while processes exist");
+
+ return 0;
+}
+
+static void service_enter_start(Service *s) {
+ ExecCommand *c;
+ usec_t timeout;
+ pid_t pid;
+ int r;
+
+ assert(s);
+
+ service_unwatch_control_pid(s);
+ service_unwatch_main_pid(s);
+
+ r = service_adverse_to_leftover_processes(s);
+ if (r < 0)
+ goto fail;
+
+ if (s->type == SERVICE_FORKING) {
+ s->control_command_id = SERVICE_EXEC_START;
+ c = s->control_command = s->exec_command[SERVICE_EXEC_START];
+
+ s->main_command = NULL;
+ } else {
+ s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID;
+ s->control_command = NULL;
+
+ c = s->main_command = s->exec_command[SERVICE_EXEC_START];
+ }
+
+ if (!c) {
+ if (s->type != SERVICE_ONESHOT) {
+ /* There's no command line configured for the main command? Hmm, that is strange.
+ * This can only happen if the configuration changes at runtime. In this case,
+ * let's enter a failure state. */
+ r = log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENXIO), "There's no 'start' task anymore we could start.");
+ goto fail;
+ }
+
+ /* We force a fake state transition here. Otherwise, the unit would go directly from
+ * SERVICE_DEAD to SERVICE_DEAD without SERVICE_ACTIVATING or SERVICE_ACTIVE
+ * in between. This way we can later trigger actions that depend on the state
+ * transition, including SuccessAction=. */
+ service_set_state(s, SERVICE_START);
+
+ service_enter_start_post(s);
+ return;
+ }
+
+ if (IN_SET(s->type, SERVICE_SIMPLE, SERVICE_IDLE))
+ /* For simple + idle this is the main process. We don't apply any timeout here, but
+ * service_enter_running() will later apply the .runtime_max_usec timeout. */
+ timeout = USEC_INFINITY;
+ else
+ timeout = s->timeout_start_usec;
+
+ r = service_spawn(s,
+ c,
+ timeout,
+ EXEC_PASS_FDS|EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN|EXEC_SET_WATCHDOG|EXEC_WRITE_CREDENTIALS|EXEC_SETENV_MONITOR_RESULT,
+ &pid);
+ if (r < 0)
+ goto fail;
+
+ if (IN_SET(s->type, SERVICE_SIMPLE, SERVICE_IDLE)) {
+ /* For simple services we immediately start
+ * the START_POST binaries. */
+
+ (void) service_set_main_pid(s, pid);
+ service_enter_start_post(s);
+
+ } else if (s->type == SERVICE_FORKING) {
+
+ /* For forking services we wait until the start
+ * process exited. */
+
+ s->control_pid = pid;
+ service_set_state(s, SERVICE_START);
+
+ } else if (IN_SET(s->type, SERVICE_ONESHOT, SERVICE_DBUS, SERVICE_NOTIFY, SERVICE_EXEC)) {
+
+ /* For oneshot services we wait until the start process exited, too, but it is our main process. */
+
+ /* For D-Bus services we know the main pid right away, but wait for the bus name to appear on the
+ * bus. 'notify' and 'exec' services are similar. */
+
+ (void) service_set_main_pid(s, pid);
+ service_set_state(s, SERVICE_START);
+ } else
+ assert_not_reached();
+
+ return;
+
+fail:
+ log_unit_warning_errno(UNIT(s), r, "Failed to run 'start' task: %m");
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES);
+}
+
+static void service_enter_start_pre(Service *s) {
+ int r;
+
+ assert(s);
+
+ service_unwatch_control_pid(s);
+
+ s->control_command = s->exec_command[SERVICE_EXEC_START_PRE];
+ if (s->control_command) {
+
+ r = service_adverse_to_leftover_processes(s);
+ if (r < 0)
+ goto fail;
+
+ s->control_command_id = SERVICE_EXEC_START_PRE;
+
+ r = service_spawn(s,
+ s->control_command,
+ s->timeout_start_usec,
+ EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_APPLY_TTY_STDIN|EXEC_SETENV_MONITOR_RESULT|EXEC_WRITE_CREDENTIALS,
+ &s->control_pid);
+ if (r < 0)
+ goto fail;
+
+ service_set_state(s, SERVICE_START_PRE);
+ } else
+ service_enter_start(s);
+
+ return;
+
+fail:
+ log_unit_warning_errno(UNIT(s), r, "Failed to run 'start-pre' task: %m");
+ service_enter_dead(s, SERVICE_FAILURE_RESOURCES, true);
+}
+
+static void service_enter_condition(Service *s) {
+ int r;
+
+ assert(s);
+
+ service_unwatch_control_pid(s);
+
+ s->control_command = s->exec_command[SERVICE_EXEC_CONDITION];
+ if (s->control_command) {
+
+ r = service_adverse_to_leftover_processes(s);
+ if (r < 0)
+ goto fail;
+
+ s->control_command_id = SERVICE_EXEC_CONDITION;
+
+ r = service_spawn(s,
+ s->control_command,
+ s->timeout_start_usec,
+ EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_APPLY_TTY_STDIN,
+ &s->control_pid);
+
+ if (r < 0)
+ goto fail;
+
+ service_set_state(s, SERVICE_CONDITION);
+ } else
+ service_enter_start_pre(s);
+
+ return;
+
+fail:
+ log_unit_warning_errno(UNIT(s), r, "Failed to run 'exec-condition' task: %m");
+ service_enter_dead(s, SERVICE_FAILURE_RESOURCES, true);
+}
+
+static void service_enter_restart(Service *s) {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ int r;
+
+ assert(s);
+
+ if (unit_has_job_type(UNIT(s), JOB_STOP)) {
+ /* Don't restart things if we are going down anyway */
+ log_unit_info(UNIT(s), "Stop job pending for unit, skipping automatic restart.");
+ return;
+ }
+
+ /* Any units that are bound to this service must also be
+ * restarted. We use JOB_RESTART (instead of the more obvious
+ * JOB_START) here so that those dependency jobs will be added
+ * as well. */
+ r = manager_add_job(UNIT(s)->manager, JOB_RESTART, UNIT(s), JOB_REPLACE, NULL, &error, NULL);
+ if (r < 0)
+ goto fail;
+
+ /* Count the jobs we enqueue for restarting. This counter is maintained as long as the unit isn't fully
+ * stopped, i.e. as long as it remains up or remains in auto-start states. The user can reset the counter
+ * explicitly however via the usual "systemctl reset-failure" logic. */
+ s->n_restarts ++;
+ s->flush_n_restarts = false;
+
+ log_unit_struct(UNIT(s), LOG_INFO,
+ "MESSAGE_ID=" SD_MESSAGE_UNIT_RESTART_SCHEDULED_STR,
+ LOG_UNIT_INVOCATION_ID(UNIT(s)),
+ LOG_UNIT_MESSAGE(UNIT(s),
+ "Scheduled restart job, restart counter is at %u.", s->n_restarts),
+ "N_RESTARTS=%u", s->n_restarts);
+
+ /* Notify clients about changed restart counter */
+ unit_add_to_dbus_queue(UNIT(s));
+
+ /* Note that we stay in the SERVICE_AUTO_RESTART state here,
+ * it will be canceled as part of the service_stop() call that
+ * is executed as part of JOB_RESTART. */
+
+ return;
+
+fail:
+ log_unit_warning(UNIT(s), "Failed to schedule restart job: %s", bus_error_message(&error, r));
+ service_enter_dead(s, SERVICE_FAILURE_RESOURCES, false);
+}
+
+static void service_enter_reload_by_notify(Service *s) {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ int r;
+
+ assert(s);
+
+ service_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->timeout_start_usec));
+ service_set_state(s, SERVICE_RELOAD);
+
+ /* service_enter_reload_by_notify is never called during a reload, thus no loops are possible. */
+ r = manager_propagate_reload(UNIT(s)->manager, UNIT(s), JOB_FAIL, &error);
+ if (r < 0)
+ log_unit_warning(UNIT(s), "Failed to schedule propagation of reload: %s", bus_error_message(&error, r));
+}
+
+static void service_enter_reload(Service *s) {
+ int r;
+
+ assert(s);
+
+ service_unwatch_control_pid(s);
+ s->reload_result = SERVICE_SUCCESS;
+
+ s->control_command = s->exec_command[SERVICE_EXEC_RELOAD];
+ if (s->control_command) {
+ s->control_command_id = SERVICE_EXEC_RELOAD;
+
+ r = service_spawn(s,
+ s->control_command,
+ s->timeout_start_usec,
+ EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_CONTROL_CGROUP,
+ &s->control_pid);
+ if (r < 0)
+ goto fail;
+
+ service_set_state(s, SERVICE_RELOAD);
+ } else
+ service_enter_running(s, SERVICE_SUCCESS);
+
+ return;
+
+fail:
+ log_unit_warning_errno(UNIT(s), r, "Failed to run 'reload' task: %m");
+ s->reload_result = SERVICE_FAILURE_RESOURCES;
+ service_enter_running(s, SERVICE_SUCCESS);
+}
+
+static void service_run_next_control(Service *s) {
+ usec_t timeout;
+ int r;
+
+ assert(s);
+ assert(s->control_command);
+ assert(s->control_command->command_next);
+
+ assert(s->control_command_id != SERVICE_EXEC_START);
+
+ s->control_command = s->control_command->command_next;
+ service_unwatch_control_pid(s);
+
+ if (IN_SET(s->state, SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD))
+ timeout = s->timeout_start_usec;
+ else
+ timeout = s->timeout_stop_usec;
+
+ r = service_spawn(s,
+ s->control_command,
+ timeout,
+ EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|
+ (IN_SET(s->state, SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD) ? EXEC_WRITE_CREDENTIALS : 0)|
+ (IN_SET(s->control_command_id, SERVICE_EXEC_CONDITION, SERVICE_EXEC_START_PRE, SERVICE_EXEC_STOP_POST) ? EXEC_APPLY_TTY_STDIN : 0)|
+ (IN_SET(s->control_command_id, SERVICE_EXEC_STOP, SERVICE_EXEC_STOP_POST) ? EXEC_SETENV_RESULT : 0)|
+ (IN_SET(s->control_command_id, SERVICE_EXEC_START_PRE, SERVICE_EXEC_START) ? EXEC_SETENV_MONITOR_RESULT : 0)|
+ (IN_SET(s->control_command_id, SERVICE_EXEC_START_POST, SERVICE_EXEC_RELOAD, SERVICE_EXEC_STOP, SERVICE_EXEC_STOP_POST) ? EXEC_CONTROL_CGROUP : 0),
+ &s->control_pid);
+ if (r < 0)
+ goto fail;
+
+ return;
+
+fail:
+ log_unit_warning_errno(UNIT(s), r, "Failed to run next control task: %m");
+
+ if (IN_SET(s->state, SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START_POST, SERVICE_STOP))
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES);
+ else if (s->state == SERVICE_STOP_POST)
+ service_enter_dead(s, SERVICE_FAILURE_RESOURCES, true);
+ else if (s->state == SERVICE_RELOAD) {
+ s->reload_result = SERVICE_FAILURE_RESOURCES;
+ service_enter_running(s, SERVICE_SUCCESS);
+ } else
+ service_enter_stop(s, SERVICE_FAILURE_RESOURCES);
+}
+
+static void service_run_next_main(Service *s) {
+ pid_t pid;
+ int r;
+
+ assert(s);
+ assert(s->main_command);
+ assert(s->main_command->command_next);
+ assert(s->type == SERVICE_ONESHOT);
+
+ s->main_command = s->main_command->command_next;
+ service_unwatch_main_pid(s);
+
+ r = service_spawn(s,
+ s->main_command,
+ s->timeout_start_usec,
+ EXEC_PASS_FDS|EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN|EXEC_SET_WATCHDOG|EXEC_SETENV_MONITOR_RESULT|EXEC_WRITE_CREDENTIALS,
+ &pid);
+ if (r < 0)
+ goto fail;
+
+ (void) service_set_main_pid(s, pid);
+
+ return;
+
+fail:
+ log_unit_warning_errno(UNIT(s), r, "Failed to run next main task: %m");
+ service_enter_stop(s, SERVICE_FAILURE_RESOURCES);
+}
+
+static int service_start(Unit *u) {
+ Service *s = SERVICE(u);
+ int r;
+
+ assert(s);
+
+ /* We cannot fulfill this request right now, try again later
+ * please! */
+ if (IN_SET(s->state,
+ SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
+ SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL, SERVICE_CLEANING))
+ return -EAGAIN;
+
+ /* Already on it! */
+ if (IN_SET(s->state, SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST))
+ return 0;
+
+ /* A service that will be restarted must be stopped first to
+ * trigger BindsTo and/or OnFailure dependencies. If a user
+ * does not want to wait for the holdoff time to elapse, the
+ * service should be manually restarted, not started. We
+ * simply return EAGAIN here, so that any start jobs stay
+ * queued, and assume that the auto restart timer will
+ * eventually trigger the restart. */
+ if (s->state == SERVICE_AUTO_RESTART)
+ return -EAGAIN;
+
+ assert(IN_SET(s->state, SERVICE_DEAD, SERVICE_FAILED));
+
+ r = unit_acquire_invocation_id(u);
+ if (r < 0)
+ return r;
+
+ s->result = SERVICE_SUCCESS;
+ s->reload_result = SERVICE_SUCCESS;
+ s->main_pid_known = false;
+ s->main_pid_alien = false;
+ s->forbid_restart = false;
+
+ s->status_text = mfree(s->status_text);
+ s->status_errno = 0;
+
+ s->notify_state = NOTIFY_UNKNOWN;
+
+ s->watchdog_original_usec = s->watchdog_usec;
+ s->watchdog_override_enable = false;
+ s->watchdog_override_usec = USEC_INFINITY;
+
+ exec_command_reset_status_list_array(s->exec_command, _SERVICE_EXEC_COMMAND_MAX);
+ exec_status_reset(&s->main_exec_status);
+
+ /* This is not an automatic restart? Flush the restart counter then */
+ if (s->flush_n_restarts) {
+ s->n_restarts = 0;
+ s->flush_n_restarts = false;
+ }
+
+ u->reset_accounting = true;
+
+ service_enter_condition(s);
+ return 1;
+}
+
+static int service_stop(Unit *u) {
+ Service *s = SERVICE(u);
+
+ assert(s);
+
+ /* Don't create restart jobs from manual stops. */
+ s->forbid_restart = true;
+
+ /* Already on it */
+ if (IN_SET(s->state,
+ SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
+ SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL))
+ return 0;
+
+ /* A restart will be scheduled or is in progress. */
+ if (s->state == SERVICE_AUTO_RESTART) {
+ service_set_state(s, SERVICE_DEAD);
+ return 0;
+ }
+
+ /* If there's already something running we go directly into
+ * kill mode. */
+ if (IN_SET(s->state, SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, SERVICE_RELOAD, SERVICE_STOP_WATCHDOG)) {
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_SUCCESS);
+ return 0;
+ }
+
+ /* If we are currently cleaning, then abort it, brutally. */
+ if (s->state == SERVICE_CLEANING) {
+ service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_SUCCESS);
+ return 0;
+ }
+
+ assert(IN_SET(s->state, SERVICE_RUNNING, SERVICE_EXITED));
+
+ service_enter_stop(s, SERVICE_SUCCESS);
+ return 1;
+}
+
+static int service_reload(Unit *u) {
+ Service *s = SERVICE(u);
+
+ assert(s);
+
+ assert(IN_SET(s->state, SERVICE_RUNNING, SERVICE_EXITED));
+
+ service_enter_reload(s);
+ return 1;
+}
+
+_pure_ static bool service_can_reload(Unit *u) {
+ Service *s = SERVICE(u);
+
+ assert(s);
+
+ return !!s->exec_command[SERVICE_EXEC_RELOAD];
+}
+
+static unsigned service_exec_command_index(Unit *u, ServiceExecCommand id, const ExecCommand *current) {
+ Service *s = SERVICE(u);
+ unsigned idx = 0;
+
+ assert(s);
+ assert(id >= 0);
+ assert(id < _SERVICE_EXEC_COMMAND_MAX);
+
+ const ExecCommand *first = s->exec_command[id];
+
+ /* Figure out where we are in the list by walking back to the beginning */
+ for (const ExecCommand *c = current; c != first; c = c->command_prev)
+ idx++;
+
+ return idx;
+}
+
+static int service_serialize_exec_command(Unit *u, FILE *f, const ExecCommand *command) {
+ _cleanup_free_ char *args = NULL, *p = NULL;
+ Service *s = SERVICE(u);
+ const char *type, *key;
+ ServiceExecCommand id;
+ size_t length = 0;
+ unsigned idx;
+
+ assert(s);
+ assert(f);
+
+ if (!command)
+ return 0;
+
+ if (command == s->control_command) {
+ type = "control";
+ id = s->control_command_id;
+ } else {
+ type = "main";
+ id = SERVICE_EXEC_START;
+ }
+
+ idx = service_exec_command_index(u, id, command);
+
+ STRV_FOREACH(arg, command->argv) {
+ _cleanup_free_ char *e = NULL;
+ size_t n;
+
+ e = cescape(*arg);
+ if (!e)
+ return log_oom();
+
+ n = strlen(e);
+ if (!GREEDY_REALLOC(args, length + 2 + n + 2))
+ return log_oom();
+
+ if (length > 0)
+ args[length++] = ' ';
+
+ args[length++] = '"';
+ memcpy(args + length, e, n);
+ length += n;
+ args[length++] = '"';
+ }
+
+ if (!GREEDY_REALLOC(args, length + 1))
+ return log_oom();
+
+ args[length++] = 0;
+
+ p = cescape(command->path);
+ if (!p)
+ return log_oom();
+
+ key = strjoina(type, "-command");
+
+ /* We use '+1234' instead of '1234' to mark the last command in a sequence.
+ * This is used in service_deserialize_exec_command(). */
+ (void) serialize_item_format(
+ f, key,
+ "%s %s%u %s %s",
+ service_exec_command_to_string(id),
+ command->command_next ? "" : "+",
+ idx,
+ p, args);
+
+ return 0;
+}
+
+static int service_serialize(Unit *u, FILE *f, FDSet *fds) {
+ Service *s = SERVICE(u);
+ int r;
+
+ assert(u);
+ assert(f);
+ assert(fds);
+
+ (void) serialize_item(f, "state", service_state_to_string(s->state));
+ (void) serialize_item(f, "result", service_result_to_string(s->result));
+ (void) serialize_item(f, "reload-result", service_result_to_string(s->reload_result));
+
+ if (s->control_pid > 0)
+ (void) serialize_item_format(f, "control-pid", PID_FMT, s->control_pid);
+
+ if (s->main_pid_known && s->main_pid > 0)
+ (void) serialize_item_format(f, "main-pid", PID_FMT, s->main_pid);
+
+ (void) serialize_bool(f, "main-pid-known", s->main_pid_known);
+ (void) serialize_bool(f, "bus-name-good", s->bus_name_good);
+ (void) serialize_bool(f, "bus-name-owner", s->bus_name_owner);
+
+ (void) serialize_item_format(f, "n-restarts", "%u", s->n_restarts);
+ (void) serialize_bool(f, "flush-n-restarts", s->flush_n_restarts);
+
+ r = serialize_item_escaped(f, "status-text", s->status_text);
+ if (r < 0)
+ return r;
+
+ service_serialize_exec_command(u, f, s->control_command);
+ service_serialize_exec_command(u, f, s->main_command);
+
+ r = serialize_fd(f, fds, "stdin-fd", s->stdin_fd);
+ if (r < 0)
+ return r;
+ r = serialize_fd(f, fds, "stdout-fd", s->stdout_fd);
+ if (r < 0)
+ return r;
+ r = serialize_fd(f, fds, "stderr-fd", s->stderr_fd);
+ if (r < 0)
+ return r;
+
+ if (s->exec_fd_event_source) {
+ r = serialize_fd(f, fds, "exec-fd", sd_event_source_get_io_fd(s->exec_fd_event_source));
+ if (r < 0)
+ return r;
+
+ (void) serialize_bool(f, "exec-fd-hot", s->exec_fd_hot);
+ }
+
+ if (UNIT_ISSET(s->accept_socket)) {
+ r = serialize_item(f, "accept-socket", UNIT_DEREF(s->accept_socket)->id);
+ if (r < 0)
+ return r;
+ }
+
+ r = serialize_fd(f, fds, "socket-fd", s->socket_fd);
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(fd_store, fs, s->fd_store) {
+ _cleanup_free_ char *c = NULL;
+ int copy;
+
+ copy = fdset_put_dup(fds, fs->fd);
+ if (copy < 0)
+ return log_error_errno(copy, "Failed to copy file descriptor for serialization: %m");
+
+ c = cescape(fs->fdname);
+ if (!c)
+ return log_oom();
+
+ (void) serialize_item_format(f, "fd-store-fd", "%i \"%s\" %i", copy, c, fs->do_poll);
+ }
+
+ if (s->main_exec_status.pid > 0) {
+ (void) serialize_item_format(f, "main-exec-status-pid", PID_FMT, s->main_exec_status.pid);
+ (void) serialize_dual_timestamp(f, "main-exec-status-start", &s->main_exec_status.start_timestamp);
+ (void) serialize_dual_timestamp(f, "main-exec-status-exit", &s->main_exec_status.exit_timestamp);
+
+ if (dual_timestamp_is_set(&s->main_exec_status.exit_timestamp)) {
+ (void) serialize_item_format(f, "main-exec-status-code", "%i", s->main_exec_status.code);
+ (void) serialize_item_format(f, "main-exec-status-status", "%i", s->main_exec_status.status);
+ }
+ }
+
+ (void) serialize_dual_timestamp(f, "watchdog-timestamp", &s->watchdog_timestamp);
+ (void) serialize_bool(f, "forbid-restart", s->forbid_restart);
+
+ if (s->watchdog_override_enable)
+ (void) serialize_item_format(f, "watchdog-override-usec", USEC_FMT, s->watchdog_override_usec);
+
+ if (s->watchdog_original_usec != USEC_INFINITY)
+ (void) serialize_item_format(f, "watchdog-original-usec", USEC_FMT, s->watchdog_original_usec);
+
+ return 0;
+}
+
+int service_deserialize_exec_command(
+ Unit *u,
+ const char *key,
+ const char *value) {
+
+ Service *s = SERVICE(u);
+ int r;
+ unsigned idx = 0, i;
+ bool control, found = false, last = false;
+ ServiceExecCommand id = _SERVICE_EXEC_COMMAND_INVALID;
+ ExecCommand *command = NULL;
+ _cleanup_free_ char *path = NULL;
+ _cleanup_strv_free_ char **argv = NULL;
+
+ enum ExecCommandState {
+ STATE_EXEC_COMMAND_TYPE,
+ STATE_EXEC_COMMAND_INDEX,
+ STATE_EXEC_COMMAND_PATH,
+ STATE_EXEC_COMMAND_ARGS,
+ _STATE_EXEC_COMMAND_MAX,
+ _STATE_EXEC_COMMAND_INVALID = -EINVAL,
+ } state;
+
+ assert(s);
+ assert(key);
+ assert(value);
+
+ control = streq(key, "control-command");
+
+ state = STATE_EXEC_COMMAND_TYPE;
+
+ for (;;) {
+ _cleanup_free_ char *arg = NULL;
+
+ r = extract_first_word(&value, &arg, NULL, EXTRACT_CUNESCAPE | EXTRACT_UNQUOTE);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ switch (state) {
+ case STATE_EXEC_COMMAND_TYPE:
+ id = service_exec_command_from_string(arg);
+ if (id < 0)
+ return id;
+
+ state = STATE_EXEC_COMMAND_INDEX;
+ break;
+ case STATE_EXEC_COMMAND_INDEX:
+ /* PID 1234 is serialized as either '1234' or '+1234'. The second form is used to
+ * mark the last command in a sequence. We warn if the deserialized command doesn't
+ * match what we have loaded from the unit, but we don't need to warn if that is the
+ * last command. */
+
+ r = safe_atou(arg, &idx);
+ if (r < 0)
+ return r;
+ last = arg[0] == '+';
+
+ state = STATE_EXEC_COMMAND_PATH;
+ break;
+ case STATE_EXEC_COMMAND_PATH:
+ path = TAKE_PTR(arg);
+ state = STATE_EXEC_COMMAND_ARGS;
+ break;
+ case STATE_EXEC_COMMAND_ARGS:
+ r = strv_extend(&argv, arg);
+ if (r < 0)
+ return -ENOMEM;
+ break;
+ default:
+ assert_not_reached();
+ }
+ }
+
+ if (state != STATE_EXEC_COMMAND_ARGS)
+ return -EINVAL;
+ if (strv_isempty(argv))
+ return -EINVAL; /* At least argv[0] must be always present. */
+
+ /* Let's check whether exec command on given offset matches data that we just deserialized */
+ for (command = s->exec_command[id], i = 0; command; command = command->command_next, i++) {
+ if (i != idx)
+ continue;
+
+ found = strv_equal(argv, command->argv) && streq(command->path, path);
+ break;
+ }
+
+ if (!found) {
+ /* Command at the index we serialized is different, let's look for command that exactly
+ * matches but is on different index. If there is no such command we will not resume execution. */
+ for (command = s->exec_command[id]; command; command = command->command_next)
+ if (strv_equal(command->argv, argv) && streq(command->path, path))
+ break;
+ }
+
+ if (command && control) {
+ s->control_command = command;
+ s->control_command_id = id;
+ } else if (command)
+ s->main_command = command;
+ else if (last)
+ log_unit_debug(u, "Current command vanished from the unit file.");
+ else
+ log_unit_warning(u, "Current command vanished from the unit file, execution of the command list won't be resumed.");
+
+ return 0;
+}
+
+static int service_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
+ Service *s = SERVICE(u);
+ int r;
+
+ assert(u);
+ assert(key);
+ assert(value);
+ assert(fds);
+
+ if (streq(key, "state")) {
+ ServiceState state;
+
+ state = service_state_from_string(value);
+ if (state < 0)
+ log_unit_debug(u, "Failed to parse state value: %s", value);
+ else
+ s->deserialized_state = state;
+ } else if (streq(key, "result")) {
+ ServiceResult f;
+
+ f = service_result_from_string(value);
+ if (f < 0)
+ log_unit_debug(u, "Failed to parse result value: %s", value);
+ else if (f != SERVICE_SUCCESS)
+ s->result = f;
+
+ } else if (streq(key, "reload-result")) {
+ ServiceResult f;
+
+ f = service_result_from_string(value);
+ if (f < 0)
+ log_unit_debug(u, "Failed to parse reload result value: %s", value);
+ else if (f != SERVICE_SUCCESS)
+ s->reload_result = f;
+
+ } else if (streq(key, "control-pid")) {
+ pid_t pid;
+
+ if (parse_pid(value, &pid) < 0)
+ log_unit_debug(u, "Failed to parse control-pid value: %s", value);
+ else
+ s->control_pid = pid;
+ } else if (streq(key, "main-pid")) {
+ pid_t pid;
+
+ if (parse_pid(value, &pid) < 0)
+ log_unit_debug(u, "Failed to parse main-pid value: %s", value);
+ else
+ (void) service_set_main_pid(s, pid);
+ } else if (streq(key, "main-pid-known")) {
+ int b;
+
+ b = parse_boolean(value);
+ if (b < 0)
+ log_unit_debug(u, "Failed to parse main-pid-known value: %s", value);
+ else
+ s->main_pid_known = b;
+ } else if (streq(key, "bus-name-good")) {
+ int b;
+
+ b = parse_boolean(value);
+ if (b < 0)
+ log_unit_debug(u, "Failed to parse bus-name-good value: %s", value);
+ else
+ s->bus_name_good = b;
+ } else if (streq(key, "bus-name-owner")) {
+ r = free_and_strdup(&s->bus_name_owner, value);
+ if (r < 0)
+ log_unit_error_errno(u, r, "Unable to deserialize current bus owner %s: %m", value);
+ } else if (streq(key, "status-text")) {
+ char *t;
+ ssize_t l;
+
+ l = cunescape(value, 0, &t);
+ if (l < 0)
+ log_unit_debug_errno(u, l, "Failed to unescape status text '%s': %m", value);
+ else
+ free_and_replace(s->status_text, t);
+
+ } else if (streq(key, "accept-socket")) {
+ Unit *socket;
+
+ if (u->type != UNIT_SOCKET) {
+ log_unit_debug(u, "Failed to deserialize accept-socket: unit is not a socket");
+ return 0;
+ }
+
+ r = manager_load_unit(u->manager, value, NULL, NULL, &socket);
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to load accept-socket unit '%s': %m", value);
+ else {
+ unit_ref_set(&s->accept_socket, u, socket);
+ SOCKET(socket)->n_connections++;
+ }
+
+ } else if (streq(key, "socket-fd")) {
+ int fd;
+
+ if (safe_atoi(value, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd))
+ log_unit_debug(u, "Failed to parse socket-fd value: %s", value);
+ else {
+ asynchronous_close(s->socket_fd);
+ s->socket_fd = fdset_remove(fds, fd);
+ }
+ } else if (streq(key, "fd-store-fd")) {
+ _cleanup_free_ char *fdv = NULL, *fdn = NULL, *fdp = NULL;
+ int fd;
+ int do_poll;
+
+ r = extract_first_word(&value, &fdv, NULL, 0);
+ if (r <= 0 || safe_atoi(fdv, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd)) {
+ log_unit_debug(u, "Failed to parse fd-store-fd value: %s", value);
+ return 0;
+ }
+
+ r = extract_first_word(&value, &fdn, NULL, EXTRACT_CUNESCAPE | EXTRACT_UNQUOTE);
+ if (r <= 0) {
+ log_unit_debug(u, "Failed to parse fd-store-fd value: %s", value);
+ return 0;
+ }
+
+ r = extract_first_word(&value, &fdp, NULL, 0);
+ if (r == 0) {
+ /* If the value is not present, we assume the default */
+ do_poll = 1;
+ } else if (r < 0 || safe_atoi(fdp, &do_poll) < 0) {
+ log_unit_debug_errno(u, r, "Failed to parse fd-store-fd value \"%s\": %m", value);
+ return 0;
+ }
+
+ r = service_add_fd_store(s, fd, fdn, do_poll);
+ if (r < 0)
+ log_unit_error_errno(u, r, "Failed to add fd to store: %m");
+ else
+ fdset_remove(fds, fd);
+ } else if (streq(key, "main-exec-status-pid")) {
+ pid_t pid;
+
+ if (parse_pid(value, &pid) < 0)
+ log_unit_debug(u, "Failed to parse main-exec-status-pid value: %s", value);
+ else
+ s->main_exec_status.pid = pid;
+ } else if (streq(key, "main-exec-status-code")) {
+ int i;
+
+ if (safe_atoi(value, &i) < 0)
+ log_unit_debug(u, "Failed to parse main-exec-status-code value: %s", value);
+ else
+ s->main_exec_status.code = i;
+ } else if (streq(key, "main-exec-status-status")) {
+ int i;
+
+ if (safe_atoi(value, &i) < 0)
+ log_unit_debug(u, "Failed to parse main-exec-status-status value: %s", value);
+ else
+ s->main_exec_status.status = i;
+ } else if (streq(key, "main-exec-status-start"))
+ deserialize_dual_timestamp(value, &s->main_exec_status.start_timestamp);
+ else if (streq(key, "main-exec-status-exit"))
+ deserialize_dual_timestamp(value, &s->main_exec_status.exit_timestamp);
+ else if (streq(key, "watchdog-timestamp"))
+ deserialize_dual_timestamp(value, &s->watchdog_timestamp);
+ else if (streq(key, "forbid-restart")) {
+ int b;
+
+ b = parse_boolean(value);
+ if (b < 0)
+ log_unit_debug(u, "Failed to parse forbid-restart value: %s", value);
+ else
+ s->forbid_restart = b;
+ } else if (streq(key, "stdin-fd")) {
+ int fd;
+
+ if (safe_atoi(value, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd))
+ log_unit_debug(u, "Failed to parse stdin-fd value: %s", value);
+ else {
+ asynchronous_close(s->stdin_fd);
+ s->stdin_fd = fdset_remove(fds, fd);
+ s->exec_context.stdio_as_fds = true;
+ }
+ } else if (streq(key, "stdout-fd")) {
+ int fd;
+
+ if (safe_atoi(value, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd))
+ log_unit_debug(u, "Failed to parse stdout-fd value: %s", value);
+ else {
+ asynchronous_close(s->stdout_fd);
+ s->stdout_fd = fdset_remove(fds, fd);
+ s->exec_context.stdio_as_fds = true;
+ }
+ } else if (streq(key, "stderr-fd")) {
+ int fd;
+
+ if (safe_atoi(value, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd))
+ log_unit_debug(u, "Failed to parse stderr-fd value: %s", value);
+ else {
+ asynchronous_close(s->stderr_fd);
+ s->stderr_fd = fdset_remove(fds, fd);
+ s->exec_context.stdio_as_fds = true;
+ }
+ } else if (streq(key, "exec-fd")) {
+ int fd;
+
+ if (safe_atoi(value, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd))
+ log_unit_debug(u, "Failed to parse exec-fd value: %s", value);
+ else {
+ s->exec_fd_event_source = sd_event_source_disable_unref(s->exec_fd_event_source);
+
+ fd = fdset_remove(fds, fd);
+ if (service_allocate_exec_fd_event_source(s, fd, &s->exec_fd_event_source) < 0)
+ safe_close(fd);
+ }
+ } else if (streq(key, "watchdog-override-usec")) {
+ if (deserialize_usec(value, &s->watchdog_override_usec) < 0)
+ log_unit_debug(u, "Failed to parse watchdog_override_usec value: %s", value);
+ else
+ s->watchdog_override_enable = true;
+
+ } else if (streq(key, "watchdog-original-usec")) {
+ if (deserialize_usec(value, &s->watchdog_original_usec) < 0)
+ log_unit_debug(u, "Failed to parse watchdog_original_usec value: %s", value);
+
+ } else if (STR_IN_SET(key, "main-command", "control-command")) {
+ r = service_deserialize_exec_command(u, key, value);
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to parse serialized command \"%s\": %m", value);
+
+ } else if (streq(key, "n-restarts")) {
+ r = safe_atou(value, &s->n_restarts);
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to parse serialized restart counter '%s': %m", value);
+
+ } else if (streq(key, "flush-n-restarts")) {
+ r = parse_boolean(value);
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to parse serialized flush restart counter setting '%s': %m", value);
+ else
+ s->flush_n_restarts = r;
+ } else
+ log_unit_debug(u, "Unknown serialization key: %s", key);
+
+ return 0;
+}
+
+_pure_ static UnitActiveState service_active_state(Unit *u) {
+ const UnitActiveState *table;
+
+ assert(u);
+
+ table = SERVICE(u)->type == SERVICE_IDLE ? state_translation_table_idle : state_translation_table;
+
+ return table[SERVICE(u)->state];
+}
+
+static const char *service_sub_state_to_string(Unit *u) {
+ assert(u);
+
+ return service_state_to_string(SERVICE(u)->state);
+}
+
+static bool service_may_gc(Unit *u) {
+ Service *s = SERVICE(u);
+
+ assert(s);
+
+ /* Never clean up services that still have a process around, even if the service is formally dead. Note that
+ * unit_may_gc() already checked our cgroup for us, we just check our two additional PIDs, too, in case they
+ * have moved outside of the cgroup. */
+
+ if (main_pid_good(s) > 0 ||
+ control_pid_good(s) > 0)
+ return false;
+
+ return true;
+}
+
+static int service_retry_pid_file(Service *s) {
+ int r;
+
+ assert(s->pid_file);
+ assert(IN_SET(s->state, SERVICE_START, SERVICE_START_POST));
+
+ r = service_load_pid_file(s, false);
+ if (r < 0)
+ return r;
+
+ service_unwatch_pid_file(s);
+
+ service_enter_running(s, SERVICE_SUCCESS);
+ return 0;
+}
+
+static int service_watch_pid_file(Service *s) {
+ int r;
+
+ log_unit_debug(UNIT(s), "Setting watch for PID file %s", s->pid_file_pathspec->path);
+
+ r = path_spec_watch(s->pid_file_pathspec, service_dispatch_inotify_io);
+ if (r < 0)
+ goto fail;
+
+ /* the pidfile might have appeared just before we set the watch */
+ log_unit_debug(UNIT(s), "Trying to read PID file %s in case it changed", s->pid_file_pathspec->path);
+ service_retry_pid_file(s);
+
+ return 0;
+fail:
+ log_unit_error_errno(UNIT(s), r, "Failed to set a watch for PID file %s: %m", s->pid_file_pathspec->path);
+ service_unwatch_pid_file(s);
+ return r;
+}
+
+static int service_demand_pid_file(Service *s) {
+ PathSpec *ps;
+
+ assert(s->pid_file);
+ assert(!s->pid_file_pathspec);
+
+ ps = new0(PathSpec, 1);
+ if (!ps)
+ return -ENOMEM;
+
+ ps->unit = UNIT(s);
+ ps->path = strdup(s->pid_file);
+ if (!ps->path) {
+ free(ps);
+ return -ENOMEM;
+ }
+
+ path_simplify(ps->path);
+
+ /* PATH_CHANGED would not be enough. There are daemons (sendmail) that
+ * keep their PID file open all the time. */
+ ps->type = PATH_MODIFIED;
+ ps->inotify_fd = -1;
+
+ s->pid_file_pathspec = ps;
+
+ return service_watch_pid_file(s);
+}
+
+static int service_dispatch_inotify_io(sd_event_source *source, int fd, uint32_t events, void *userdata) {
+ PathSpec *p = ASSERT_PTR(userdata);
+ Service *s;
+
+ s = SERVICE(p->unit);
+
+ assert(s);
+ assert(fd >= 0);
+ assert(IN_SET(s->state, SERVICE_START, SERVICE_START_POST));
+ assert(s->pid_file_pathspec);
+ assert(path_spec_owns_inotify_fd(s->pid_file_pathspec, fd));
+
+ log_unit_debug(UNIT(s), "inotify event");
+
+ if (path_spec_fd_event(p, events) < 0)
+ goto fail;
+
+ if (service_retry_pid_file(s) == 0)
+ return 0;
+
+ if (service_watch_pid_file(s) < 0)
+ goto fail;
+
+ return 0;
+
+fail:
+ service_unwatch_pid_file(s);
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES);
+ return 0;
+}
+
+static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t events, void *userdata) {
+ Service *s = SERVICE(userdata);
+
+ assert(s);
+
+ log_unit_debug(UNIT(s), "got exec-fd event");
+
+ /* If Type=exec is set, we'll consider a service started successfully the instant we invoked execve()
+ * successfully for it. We implement this through a pipe() towards the child, which the kernel automatically
+ * closes for us due to O_CLOEXEC on execve() in the child, which then triggers EOF on the pipe in the
+ * parent. We need to be careful however, as there are other reasons that we might cause the child's side of
+ * the pipe to be closed (for example, a simple exit()). To deal with that we'll ignore EOFs on the pipe unless
+ * the child signalled us first that it is about to call the execve(). It does so by sending us a simple
+ * non-zero byte via the pipe. We also provide the child with a way to inform us in case execve() failed: if it
+ * sends a zero byte we'll ignore POLLHUP on the fd again. */
+
+ for (;;) {
+ uint8_t x;
+ ssize_t n;
+
+ n = read(fd, &x, sizeof(x));
+ if (n < 0) {
+ if (errno == EAGAIN) /* O_NONBLOCK in effect → everything queued has now been processed. */
+ return 0;
+
+ return log_unit_error_errno(UNIT(s), errno, "Failed to read from exec_fd: %m");
+ }
+ if (n == 0) { /* EOF → the event we are waiting for */
+
+ s->exec_fd_event_source = sd_event_source_disable_unref(s->exec_fd_event_source);
+
+ if (s->exec_fd_hot) { /* Did the child tell us to expect EOF now? */
+ log_unit_debug(UNIT(s), "Got EOF on exec-fd");
+
+ s->exec_fd_hot = false;
+
+ /* Nice! This is what we have been waiting for. Transition to next state. */
+ if (s->type == SERVICE_EXEC && s->state == SERVICE_START)
+ service_enter_start_post(s);
+ } else
+ log_unit_debug(UNIT(s), "Got EOF on exec-fd while it was disabled, ignoring.");
+
+ return 0;
+ }
+
+ /* A byte was read → this turns on/off the exec fd logic */
+ assert(n == sizeof(x));
+ s->exec_fd_hot = x;
+ }
+
+ return 0;
+}
+
+static void service_notify_cgroup_empty_event(Unit *u) {
+ Service *s = SERVICE(u);
+
+ assert(u);
+
+ log_unit_debug(u, "Control group is empty.");
+
+ switch (s->state) {
+
+ /* Waiting for SIGCHLD is usually more interesting,
+ * because it includes return codes/signals. Which is
+ * why we ignore the cgroup events for most cases,
+ * except when we don't know pid which to expect the
+ * SIGCHLD for. */
+
+ case SERVICE_START:
+ if (s->type == SERVICE_NOTIFY &&
+ main_pid_good(s) == 0 &&
+ control_pid_good(s) == 0) {
+ /* No chance of getting a ready notification anymore */
+ service_enter_stop_post(s, SERVICE_FAILURE_PROTOCOL);
+ break;
+ }
+
+ if (s->exit_type == SERVICE_EXIT_CGROUP && main_pid_good(s) <= 0)
+ service_enter_start_post(s);
+
+ _fallthrough_;
+ case SERVICE_START_POST:
+ if (s->pid_file_pathspec &&
+ main_pid_good(s) == 0 &&
+ control_pid_good(s) == 0) {
+
+ /* Give up hoping for the daemon to write its PID file */
+ log_unit_warning(u, "Daemon never wrote its PID file. Failing.");
+
+ service_unwatch_pid_file(s);
+ if (s->state == SERVICE_START)
+ service_enter_stop_post(s, SERVICE_FAILURE_PROTOCOL);
+ else
+ service_enter_stop(s, SERVICE_FAILURE_PROTOCOL);
+ }
+ break;
+
+ case SERVICE_RUNNING:
+ /* service_enter_running() will figure out what to do */
+ service_enter_running(s, SERVICE_SUCCESS);
+ break;
+
+ case SERVICE_STOP_WATCHDOG:
+ case SERVICE_STOP_SIGTERM:
+ case SERVICE_STOP_SIGKILL:
+
+ if (main_pid_good(s) <= 0 && control_pid_good(s) <= 0)
+ service_enter_stop_post(s, SERVICE_SUCCESS);
+
+ break;
+
+ case SERVICE_STOP_POST:
+ case SERVICE_FINAL_WATCHDOG:
+ case SERVICE_FINAL_SIGTERM:
+ case SERVICE_FINAL_SIGKILL:
+ if (main_pid_good(s) <= 0 && control_pid_good(s) <= 0)
+ service_enter_dead(s, SERVICE_SUCCESS, true);
+
+ break;
+
+ /* If the cgroup empty notification comes when the unit is not active, we must have failed to clean
+ * up the cgroup earlier and should do it now. */
+ case SERVICE_DEAD:
+ case SERVICE_FAILED:
+ unit_prune_cgroup(u);
+ break;
+
+ default:
+ ;
+ }
+}
+
+static void service_notify_cgroup_oom_event(Unit *u, bool managed_oom) {
+ Service *s = SERVICE(u);
+
+ if (managed_oom)
+ log_unit_debug(u, "Process(es) of control group were killed by systemd-oomd.");
+ else
+ log_unit_debug(u, "Process of control group was killed by the OOM killer.");
+
+ if (s->oom_policy == OOM_CONTINUE)
+ return;
+
+ switch (s->state) {
+
+ case SERVICE_CONDITION:
+ case SERVICE_START_PRE:
+ case SERVICE_START:
+ case SERVICE_START_POST:
+ case SERVICE_STOP:
+ if (s->oom_policy == OOM_STOP)
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_OOM_KILL);
+ else if (s->oom_policy == OOM_KILL)
+ service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_OOM_KILL);
+
+ break;
+
+ case SERVICE_EXITED:
+ case SERVICE_RUNNING:
+ if (s->oom_policy == OOM_STOP)
+ service_enter_stop(s, SERVICE_FAILURE_OOM_KILL);
+ else if (s->oom_policy == OOM_KILL)
+ service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_OOM_KILL);
+
+ break;
+
+ case SERVICE_STOP_WATCHDOG:
+ case SERVICE_STOP_SIGTERM:
+ service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_OOM_KILL);
+ break;
+
+ case SERVICE_STOP_SIGKILL:
+ case SERVICE_FINAL_SIGKILL:
+ if (s->result == SERVICE_SUCCESS)
+ s->result = SERVICE_FAILURE_OOM_KILL;
+ break;
+
+ case SERVICE_STOP_POST:
+ case SERVICE_FINAL_SIGTERM:
+ service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_FAILURE_OOM_KILL);
+ break;
+
+ default:
+ ;
+ }
+}
+
+static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
+ bool notify_dbus = true;
+ Service *s = SERVICE(u);
+ ServiceResult f;
+ ExitClean clean_mode;
+
+ assert(s);
+ assert(pid >= 0);
+
+ /* Oneshot services and non-SERVICE_EXEC_START commands should not be
+ * considered daemons as they are typically not long running. */
+ if (s->type == SERVICE_ONESHOT || (s->control_pid == pid && s->control_command_id != SERVICE_EXEC_START))
+ clean_mode = EXIT_CLEAN_COMMAND;
+ else
+ clean_mode = EXIT_CLEAN_DAEMON;
+
+ if (is_clean_exit(code, status, clean_mode, &s->success_status))
+ f = SERVICE_SUCCESS;
+ else if (code == CLD_EXITED)
+ f = SERVICE_FAILURE_EXIT_CODE;
+ else if (code == CLD_KILLED)
+ f = SERVICE_FAILURE_SIGNAL;
+ else if (code == CLD_DUMPED)
+ f = SERVICE_FAILURE_CORE_DUMP;
+ else
+ assert_not_reached();
+
+ if (s->main_pid == pid) {
+ /* Clean up the exec_fd event source. We want to do this here, not later in
+ * service_set_state(), because service_enter_stop_post() calls service_spawn().
+ * The source owns its end of the pipe, so this will close that too. */
+ s->exec_fd_event_source = sd_event_source_disable_unref(s->exec_fd_event_source);
+
+ /* Forking services may occasionally move to a new PID.
+ * As long as they update the PID file before exiting the old
+ * PID, they're fine. */
+ if (service_load_pid_file(s, false) > 0)
+ return;
+
+ s->main_pid = 0;
+ exec_status_exit(&s->main_exec_status, &s->exec_context, pid, code, status);
+
+ if (s->main_command) {
+ /* If this is not a forking service than the
+ * main process got started and hence we copy
+ * the exit status so that it is recorded both
+ * as main and as control process exit
+ * status */
+
+ s->main_command->exec_status = s->main_exec_status;
+
+ if (s->main_command->flags & EXEC_COMMAND_IGNORE_FAILURE)
+ f = SERVICE_SUCCESS;
+ } else if (s->exec_command[SERVICE_EXEC_START]) {
+
+ /* If this is a forked process, then we should
+ * ignore the return value if this was
+ * configured for the starter process */
+
+ if (s->exec_command[SERVICE_EXEC_START]->flags & EXEC_COMMAND_IGNORE_FAILURE)
+ f = SERVICE_SUCCESS;
+ }
+
+ unit_log_process_exit(
+ u,
+ "Main process",
+ service_exec_command_to_string(SERVICE_EXEC_START),
+ f == SERVICE_SUCCESS,
+ code, status);
+
+ if (s->result == SERVICE_SUCCESS)
+ s->result = f;
+
+ if (s->main_command &&
+ s->main_command->command_next &&
+ s->type == SERVICE_ONESHOT &&
+ f == SERVICE_SUCCESS) {
+
+ /* There is another command to execute, so let's do that. */
+
+ log_unit_debug(u, "Running next main command for state %s.", service_state_to_string(s->state));
+ service_run_next_main(s);
+
+ } else {
+ s->main_command = NULL;
+
+ /* Services with ExitType=cgroup do not act on main PID exiting,
+ * unless the cgroup is already empty */
+ if (s->exit_type == SERVICE_EXIT_MAIN || cgroup_good(s) <= 0) {
+ /* The service exited, so the service is officially gone. */
+ switch (s->state) {
+
+ case SERVICE_START_POST:
+ case SERVICE_RELOAD:
+ /* If neither main nor control processes are running then
+ * the current state can never exit cleanly, hence immediately
+ * terminate the service. */
+ if (control_pid_good(s) <= 0)
+ service_enter_stop(s, f);
+
+ /* Otherwise need to wait until the operation is done. */
+ break;
+
+ case SERVICE_STOP:
+ /* Need to wait until the operation is done. */
+ break;
+
+ case SERVICE_START:
+ if (s->type == SERVICE_ONESHOT) {
+ /* This was our main goal, so let's go on */
+ if (f == SERVICE_SUCCESS)
+ service_enter_start_post(s);
+ else
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, f);
+ break;
+ } else if (s->type == SERVICE_NOTIFY) {
+ /* Only enter running through a notification, so that the
+ * SERVICE_START state signifies that no ready notification
+ * has been received */
+ if (f != SERVICE_SUCCESS)
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, f);
+ else if (!s->remain_after_exit || s->notify_access == NOTIFY_MAIN)
+ /* The service has never been and will never be active */
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_PROTOCOL);
+ break;
+ }
+
+ _fallthrough_;
+ case SERVICE_RUNNING:
+ service_enter_running(s, f);
+ break;
+
+ case SERVICE_STOP_WATCHDOG:
+ case SERVICE_STOP_SIGTERM:
+ case SERVICE_STOP_SIGKILL:
+
+ if (control_pid_good(s) <= 0)
+ service_enter_stop_post(s, f);
+
+ /* If there is still a control process, wait for that first */
+ break;
+
+ case SERVICE_STOP_POST:
+
+ if (control_pid_good(s) <= 0)
+ service_enter_signal(s, SERVICE_FINAL_SIGTERM, f);
+
+ break;
+
+ case SERVICE_FINAL_WATCHDOG:
+ case SERVICE_FINAL_SIGTERM:
+ case SERVICE_FINAL_SIGKILL:
+
+ if (control_pid_good(s) <= 0)
+ service_enter_dead(s, f, true);
+ break;
+
+ default:
+ assert_not_reached();
+ }
+ } else if (s->exit_type == SERVICE_EXIT_CGROUP && s->state == SERVICE_START)
+ /* If a main process exits very quickly, this function might be executed
+ * before service_dispatch_exec_io(). Since this function disabled IO events
+ * to monitor the main process above, we need to update the state here too.
+ * Let's consider the process is successfully launched and exited. */
+ service_enter_start_post(s);
+ }
+
+ } else if (s->control_pid == pid) {
+ const char *kind;
+ bool success;
+
+ s->control_pid = 0;
+
+ if (s->control_command) {
+ exec_status_exit(&s->control_command->exec_status, &s->exec_context, pid, code, status);
+
+ if (s->control_command->flags & EXEC_COMMAND_IGNORE_FAILURE)
+ f = SERVICE_SUCCESS;
+ }
+
+ /* ExecCondition= calls that exit with (0, 254] should invoke skip-like behavior instead of failing */
+ if (s->state == SERVICE_CONDITION) {
+ if (f == SERVICE_FAILURE_EXIT_CODE && status < 255) {
+ UNIT(s)->condition_result = false;
+ f = SERVICE_SKIP_CONDITION;
+ success = true;
+ } else if (f == SERVICE_SUCCESS) {
+ UNIT(s)->condition_result = true;
+ success = true;
+ } else
+ success = false;
+
+ kind = "Condition check process";
+ } else {
+ kind = "Control process";
+ success = f == SERVICE_SUCCESS;
+ }
+
+ unit_log_process_exit(
+ u,
+ kind,
+ service_exec_command_to_string(s->control_command_id),
+ success,
+ code, status);
+
+ if (s->state != SERVICE_RELOAD && s->result == SERVICE_SUCCESS)
+ s->result = f;
+
+ if (s->control_command &&
+ s->control_command->command_next &&
+ f == SERVICE_SUCCESS) {
+
+ /* There is another command to *
+ * execute, so let's do that. */
+
+ log_unit_debug(u, "Running next control command for state %s.", service_state_to_string(s->state));
+ service_run_next_control(s);
+
+ } else {
+ /* No further commands for this step, so let's
+ * figure out what to do next */
+
+ s->control_command = NULL;
+ s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID;
+
+ log_unit_debug(u, "Got final SIGCHLD for state %s.", service_state_to_string(s->state));
+
+ switch (s->state) {
+
+ case SERVICE_CONDITION:
+ if (f == SERVICE_SUCCESS)
+ service_enter_start_pre(s);
+ else
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, f);
+ break;
+
+ case SERVICE_START_PRE:
+ if (f == SERVICE_SUCCESS)
+ service_enter_start(s);
+ else
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, f);
+ break;
+
+ case SERVICE_START:
+ if (s->type != SERVICE_FORKING)
+ /* Maybe spurious event due to a reload that changed the type? */
+ break;
+
+ if (f != SERVICE_SUCCESS) {
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, f);
+ break;
+ }
+
+ if (s->pid_file) {
+ bool has_start_post;
+ int r;
+
+ /* Let's try to load the pid file here if we can.
+ * The PID file might actually be created by a START_POST
+ * script. In that case don't worry if the loading fails. */
+
+ has_start_post = s->exec_command[SERVICE_EXEC_START_POST];
+ r = service_load_pid_file(s, !has_start_post);
+ if (!has_start_post && r < 0) {
+ r = service_demand_pid_file(s);
+ if (r < 0 || cgroup_good(s) == 0)
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_PROTOCOL);
+ break;
+ }
+ } else
+ service_search_main_pid(s);
+
+ service_enter_start_post(s);
+ break;
+
+ case SERVICE_START_POST:
+ if (f != SERVICE_SUCCESS) {
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, f);
+ break;
+ }
+
+ if (s->pid_file) {
+ int r;
+
+ r = service_load_pid_file(s, true);
+ if (r < 0) {
+ r = service_demand_pid_file(s);
+ if (r < 0 || cgroup_good(s) == 0)
+ service_enter_stop(s, SERVICE_FAILURE_PROTOCOL);
+ break;
+ }
+ } else
+ service_search_main_pid(s);
+
+ service_enter_running(s, SERVICE_SUCCESS);
+ break;
+
+ case SERVICE_RELOAD:
+ if (f == SERVICE_SUCCESS)
+ if (service_load_pid_file(s, true) < 0)
+ service_search_main_pid(s);
+
+ s->reload_result = f;
+ service_enter_running(s, SERVICE_SUCCESS);
+ break;
+
+ case SERVICE_STOP:
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, f);
+ break;
+
+ case SERVICE_STOP_WATCHDOG:
+ case SERVICE_STOP_SIGTERM:
+ case SERVICE_STOP_SIGKILL:
+ if (main_pid_good(s) <= 0)
+ service_enter_stop_post(s, f);
+
+ /* If there is still a service process around, wait until
+ * that one quit, too */
+ break;
+
+ case SERVICE_STOP_POST:
+ if (main_pid_good(s) <= 0)
+ service_enter_signal(s, SERVICE_FINAL_SIGTERM, f);
+ break;
+
+ case SERVICE_FINAL_WATCHDOG:
+ case SERVICE_FINAL_SIGTERM:
+ case SERVICE_FINAL_SIGKILL:
+ if (main_pid_good(s) <= 0)
+ service_enter_dead(s, f, true);
+ break;
+
+ case SERVICE_CLEANING:
+
+ if (s->clean_result == SERVICE_SUCCESS)
+ s->clean_result = f;
+
+ service_enter_dead(s, SERVICE_SUCCESS, false);
+ break;
+
+ default:
+ assert_not_reached();
+ }
+ }
+ } else /* Neither control nor main PID? If so, don't notify about anything */
+ notify_dbus = false;
+
+ /* Notify clients about changed exit status */
+ if (notify_dbus)
+ unit_add_to_dbus_queue(u);
+
+ /* We watch the main/control process otherwise we can't retrieve the unit they
+ * belong to with cgroupv1. But if they are not our direct child, we won't get a
+ * SIGCHLD for them. Therefore we need to look for others to watch so we can
+ * detect when the cgroup becomes empty. Note that the control process is always
+ * our child so it's pointless to watch all other processes. */
+ if (!control_pid_good(s))
+ if (!s->main_pid_known || s->main_pid_alien)
+ (void) unit_enqueue_rewatch_pids(u);
+}
+
+static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) {
+ Service *s = SERVICE(userdata);
+
+ assert(s);
+ assert(source == s->timer_event_source);
+
+ switch (s->state) {
+
+ case SERVICE_CONDITION:
+ case SERVICE_START_PRE:
+ case SERVICE_START:
+ case SERVICE_START_POST:
+ switch (s->timeout_start_failure_mode) {
+
+ case SERVICE_TIMEOUT_TERMINATE:
+ log_unit_warning(UNIT(s), "%s operation timed out. Terminating.", service_state_to_string(s->state));
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_TIMEOUT);
+ break;
+
+ case SERVICE_TIMEOUT_ABORT:
+ log_unit_warning(UNIT(s), "%s operation timed out. Aborting.", service_state_to_string(s->state));
+ service_enter_signal(s, SERVICE_STOP_WATCHDOG, SERVICE_FAILURE_TIMEOUT);
+ break;
+
+ case SERVICE_TIMEOUT_KILL:
+ if (s->kill_context.send_sigkill) {
+ log_unit_warning(UNIT(s), "%s operation timed out. Killing.", service_state_to_string(s->state));
+ service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_TIMEOUT);
+ } else {
+ log_unit_warning(UNIT(s), "%s operation timed out. Skipping SIGKILL.", service_state_to_string(s->state));
+ service_enter_stop_post(s, SERVICE_FAILURE_TIMEOUT);
+ }
+ break;
+
+ default:
+ assert_not_reached();
+ }
+ break;
+
+ case SERVICE_RUNNING:
+ log_unit_warning(UNIT(s), "Service reached runtime time limit. Stopping.");
+ service_enter_stop(s, SERVICE_FAILURE_TIMEOUT);
+ break;
+
+ case SERVICE_RELOAD:
+ log_unit_warning(UNIT(s), "Reload operation timed out. Killing reload process.");
+ service_kill_control_process(s);
+ s->reload_result = SERVICE_FAILURE_TIMEOUT;
+ service_enter_running(s, SERVICE_SUCCESS);
+ break;
+
+ case SERVICE_STOP:
+ switch (s->timeout_stop_failure_mode) {
+
+ case SERVICE_TIMEOUT_TERMINATE:
+ log_unit_warning(UNIT(s), "Stopping timed out. Terminating.");
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_TIMEOUT);
+ break;
+
+ case SERVICE_TIMEOUT_ABORT:
+ log_unit_warning(UNIT(s), "Stopping timed out. Aborting.");
+ service_enter_signal(s, SERVICE_STOP_WATCHDOG, SERVICE_FAILURE_TIMEOUT);
+ break;
+
+ case SERVICE_TIMEOUT_KILL:
+ if (s->kill_context.send_sigkill) {
+ log_unit_warning(UNIT(s), "Stopping timed out. Killing.");
+ service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_TIMEOUT);
+ } else {
+ log_unit_warning(UNIT(s), "Stopping timed out. Skipping SIGKILL.");
+ service_enter_stop_post(s, SERVICE_FAILURE_TIMEOUT);
+ }
+ break;
+
+ default:
+ assert_not_reached();
+ }
+ break;
+
+ case SERVICE_STOP_WATCHDOG:
+ if (s->kill_context.send_sigkill) {
+ log_unit_warning(UNIT(s), "State 'stop-watchdog' timed out. Killing.");
+ service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_TIMEOUT);
+ } else {
+ log_unit_warning(UNIT(s), "State 'stop-watchdog' timed out. Skipping SIGKILL.");
+ service_enter_stop_post(s, SERVICE_FAILURE_TIMEOUT);
+ }
+ break;
+
+ case SERVICE_STOP_SIGTERM:
+ if (s->timeout_stop_failure_mode == SERVICE_TIMEOUT_ABORT) {
+ log_unit_warning(UNIT(s), "State 'stop-sigterm' timed out. Aborting.");
+ service_enter_signal(s, SERVICE_STOP_WATCHDOG, SERVICE_FAILURE_TIMEOUT);
+ } else if (s->kill_context.send_sigkill) {
+ log_unit_warning(UNIT(s), "State 'stop-sigterm' timed out. Killing.");
+ service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_TIMEOUT);
+ } else {
+ log_unit_warning(UNIT(s), "State 'stop-sigterm' timed out. Skipping SIGKILL.");
+ service_enter_stop_post(s, SERVICE_FAILURE_TIMEOUT);
+ }
+
+ break;
+
+ case SERVICE_STOP_SIGKILL:
+ /* Uh, we sent a SIGKILL and it is still not gone?
+ * Must be something we cannot kill, so let's just be
+ * weirded out and continue */
+
+ log_unit_warning(UNIT(s), "Processes still around after SIGKILL. Ignoring.");
+ service_enter_stop_post(s, SERVICE_FAILURE_TIMEOUT);
+ break;
+
+ case SERVICE_STOP_POST:
+ switch (s->timeout_stop_failure_mode) {
+
+ case SERVICE_TIMEOUT_TERMINATE:
+ log_unit_warning(UNIT(s), "State 'stop-post' timed out. Terminating.");
+ service_enter_signal(s, SERVICE_FINAL_SIGTERM, SERVICE_FAILURE_TIMEOUT);
+ break;
+
+ case SERVICE_TIMEOUT_ABORT:
+ log_unit_warning(UNIT(s), "State 'stop-post' timed out. Aborting.");
+ service_enter_signal(s, SERVICE_FINAL_WATCHDOG, SERVICE_FAILURE_TIMEOUT);
+ break;
+
+ case SERVICE_TIMEOUT_KILL:
+ if (s->kill_context.send_sigkill) {
+ log_unit_warning(UNIT(s), "State 'stop-post' timed out. Killing.");
+ service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_FAILURE_TIMEOUT);
+ } else {
+ log_unit_warning(UNIT(s), "State 'stop-post' timed out. Skipping SIGKILL. Entering failed mode.");
+ service_enter_dead(s, SERVICE_FAILURE_TIMEOUT, false);
+ }
+ break;
+
+ default:
+ assert_not_reached();
+ }
+ break;
+
+ case SERVICE_FINAL_WATCHDOG:
+ if (s->kill_context.send_sigkill) {
+ log_unit_warning(UNIT(s), "State 'final-watchdog' timed out. Killing.");
+ service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_FAILURE_TIMEOUT);
+ } else {
+ log_unit_warning(UNIT(s), "State 'final-watchdog' timed out. Skipping SIGKILL. Entering failed mode.");
+ service_enter_dead(s, SERVICE_FAILURE_TIMEOUT, false);
+ }
+ break;
+
+ case SERVICE_FINAL_SIGTERM:
+ if (s->timeout_stop_failure_mode == SERVICE_TIMEOUT_ABORT) {
+ log_unit_warning(UNIT(s), "State 'final-sigterm' timed out. Aborting.");
+ service_enter_signal(s, SERVICE_FINAL_WATCHDOG, SERVICE_FAILURE_TIMEOUT);
+ } else if (s->kill_context.send_sigkill) {
+ log_unit_warning(UNIT(s), "State 'final-sigterm' timed out. Killing.");
+ service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_FAILURE_TIMEOUT);
+ } else {
+ log_unit_warning(UNIT(s), "State 'final-sigterm' timed out. Skipping SIGKILL. Entering failed mode.");
+ service_enter_dead(s, SERVICE_FAILURE_TIMEOUT, false);
+ }
+
+ break;
+
+ case SERVICE_FINAL_SIGKILL:
+ log_unit_warning(UNIT(s), "Processes still around after final SIGKILL. Entering failed mode.");
+ service_enter_dead(s, SERVICE_FAILURE_TIMEOUT, true);
+ break;
+
+ case SERVICE_AUTO_RESTART:
+ if (s->restart_usec > 0)
+ log_unit_debug(UNIT(s),
+ "Service RestartSec=%s expired, scheduling restart.",
+ FORMAT_TIMESPAN(s->restart_usec, USEC_PER_SEC));
+ else
+ log_unit_debug(UNIT(s),
+ "Service has no hold-off time (RestartSec=0), scheduling restart.");
+
+ service_enter_restart(s);
+ break;
+
+ case SERVICE_CLEANING:
+ log_unit_warning(UNIT(s), "Cleaning timed out. killing.");
+
+ if (s->clean_result == SERVICE_SUCCESS)
+ s->clean_result = SERVICE_FAILURE_TIMEOUT;
+
+ service_enter_signal(s, SERVICE_FINAL_SIGKILL, 0);
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ return 0;
+}
+
+static int service_dispatch_watchdog(sd_event_source *source, usec_t usec, void *userdata) {
+ Service *s = SERVICE(userdata);
+ usec_t watchdog_usec;
+
+ assert(s);
+ assert(source == s->watchdog_event_source);
+
+ watchdog_usec = service_get_watchdog_usec(s);
+
+ if (UNIT(s)->manager->service_watchdogs) {
+ log_unit_error(UNIT(s), "Watchdog timeout (limit %s)!",
+ FORMAT_TIMESPAN(watchdog_usec, 1));
+
+ service_enter_signal(s, SERVICE_STOP_WATCHDOG, SERVICE_FAILURE_WATCHDOG);
+ } else
+ log_unit_warning(UNIT(s), "Watchdog disabled! Ignoring watchdog timeout (limit %s)!",
+ FORMAT_TIMESPAN(watchdog_usec, 1));
+
+ return 0;
+}
+
+static bool service_notify_message_authorized(Service *s, pid_t pid, FDSet *fds) {
+ assert(s);
+
+ if (s->notify_access == NOTIFY_NONE) {
+ log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception is disabled.", pid);
+ return false;
+ }
+
+ if (s->notify_access == NOTIFY_MAIN && pid != s->main_pid) {
+ if (s->main_pid != 0)
+ log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID "PID_FMT, pid, s->main_pid);
+ else
+ log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID which is currently not known", pid);
+
+ return false;
+ }
+
+ if (s->notify_access == NOTIFY_EXEC && pid != s->main_pid && pid != s->control_pid) {
+ if (s->main_pid != 0 && s->control_pid != 0)
+ log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID "PID_FMT" and control PID "PID_FMT,
+ pid, s->main_pid, s->control_pid);
+ else if (s->main_pid != 0)
+ log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID "PID_FMT, pid, s->main_pid);
+ else if (s->control_pid != 0)
+ log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for control PID "PID_FMT, pid, s->control_pid);
+ else
+ log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID and control PID which are currently not known", pid);
+
+ return false;
+ }
+
+ return true;
+}
+
+static void service_force_watchdog(Service *s) {
+ if (!UNIT(s)->manager->service_watchdogs)
+ return;
+
+ log_unit_error(UNIT(s), "Watchdog request (last status: %s)!",
+ s->status_text ? s->status_text : "<unset>");
+
+ service_enter_signal(s, SERVICE_STOP_WATCHDOG, SERVICE_FAILURE_WATCHDOG);
+}
+
+static void service_notify_message(
+ Unit *u,
+ const struct ucred *ucred,
+ char * const *tags,
+ FDSet *fds) {
+
+ Service *s = SERVICE(u);
+ bool notify_dbus = false;
+ const char *e;
+ int r;
+
+ assert(u);
+ assert(ucred);
+
+ if (!service_notify_message_authorized(SERVICE(u), ucred->pid, fds))
+ return;
+
+ if (DEBUG_LOGGING) {
+ _cleanup_free_ char *cc = NULL;
+
+ cc = strv_join(tags, ", ");
+ log_unit_debug(u, "Got notification message from PID "PID_FMT" (%s)", ucred->pid, isempty(cc) ? "n/a" : cc);
+ }
+
+ /* Interpret MAINPID= */
+ e = strv_find_startswith(tags, "MAINPID=");
+ if (e && IN_SET(s->state, SERVICE_START, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD)) {
+ pid_t new_main_pid;
+
+ if (parse_pid(e, &new_main_pid) < 0)
+ log_unit_warning(u, "Failed to parse MAINPID= field in notification message, ignoring: %s", e);
+ else if (!s->main_pid_known || new_main_pid != s->main_pid) {
+
+ r = service_is_suitable_main_pid(s, new_main_pid, LOG_WARNING);
+ if (r == 0) {
+ /* The new main PID is a bit suspicious, which is OK if the sender is privileged. */
+
+ if (ucred->uid == 0) {
+ log_unit_debug(u, "New main PID "PID_FMT" does not belong to service, but we'll accept it as the request to change it came from a privileged process.", new_main_pid);
+ r = 1;
+ } else
+ log_unit_debug(u, "New main PID "PID_FMT" does not belong to service, refusing.", new_main_pid);
+ }
+ if (r > 0) {
+ (void) service_set_main_pid(s, new_main_pid);
+
+ r = unit_watch_pid(UNIT(s), new_main_pid, false);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "Failed to watch new main PID "PID_FMT" for service: %m", new_main_pid);
+
+ notify_dbus = true;
+ }
+ }
+ }
+
+ /* Interpret READY=/STOPPING=/RELOADING=. Last one wins. */
+ STRV_FOREACH_BACKWARDS(i, tags) {
+
+ if (streq(*i, "READY=1")) {
+ s->notify_state = NOTIFY_READY;
+
+ /* Type=notify services inform us about completed
+ * initialization with READY=1 */
+ if (s->type == SERVICE_NOTIFY && s->state == SERVICE_START)
+ service_enter_start_post(s);
+
+ /* Sending READY=1 while we are reloading informs us
+ * that the reloading is complete */
+ if (s->state == SERVICE_RELOAD && s->control_pid == 0)
+ service_enter_running(s, SERVICE_SUCCESS);
+
+ notify_dbus = true;
+ break;
+
+ } else if (streq(*i, "RELOADING=1")) {
+ s->notify_state = NOTIFY_RELOADING;
+
+ if (s->state == SERVICE_RUNNING)
+ service_enter_reload_by_notify(s);
+
+ notify_dbus = true;
+ break;
+
+ } else if (streq(*i, "STOPPING=1")) {
+ s->notify_state = NOTIFY_STOPPING;
+
+ if (s->state == SERVICE_RUNNING)
+ service_enter_stop_by_notify(s);
+
+ notify_dbus = true;
+ break;
+ }
+ }
+
+ /* Interpret STATUS= */
+ e = strv_find_startswith(tags, "STATUS=");
+ if (e) {
+ _cleanup_free_ char *t = NULL;
+
+ if (!isempty(e)) {
+ /* Note that this size limit check is mostly paranoia: since the datagram size we are willing
+ * to process is already limited to NOTIFY_BUFFER_MAX, this limit here should never be hit. */
+ if (strlen(e) > STATUS_TEXT_MAX)
+ log_unit_warning(u, "Status message overly long (%zu > %u), ignoring.", strlen(e), STATUS_TEXT_MAX);
+ else if (!utf8_is_valid(e))
+ log_unit_warning(u, "Status message in notification message is not UTF-8 clean, ignoring.");
+ else {
+ t = strdup(e);
+ if (!t)
+ log_oom();
+ }
+ }
+
+ if (!streq_ptr(s->status_text, t)) {
+ free_and_replace(s->status_text, t);
+ notify_dbus = true;
+ }
+ }
+
+ /* Interpret ERRNO= */
+ e = strv_find_startswith(tags, "ERRNO=");
+ if (e) {
+ int status_errno;
+
+ status_errno = parse_errno(e);
+ if (status_errno < 0)
+ log_unit_warning_errno(u, status_errno,
+ "Failed to parse ERRNO= field value '%s' in notification message: %m", e);
+ else if (s->status_errno != status_errno) {
+ s->status_errno = status_errno;
+ notify_dbus = true;
+ }
+ }
+
+ /* Interpret EXTEND_TIMEOUT= */
+ e = strv_find_startswith(tags, "EXTEND_TIMEOUT_USEC=");
+ if (e) {
+ usec_t extend_timeout_usec;
+ if (safe_atou64(e, &extend_timeout_usec) < 0)
+ log_unit_warning(u, "Failed to parse EXTEND_TIMEOUT_USEC=%s", e);
+ else
+ service_extend_timeout(s, extend_timeout_usec);
+ }
+
+ /* Interpret WATCHDOG= */
+ e = strv_find_startswith(tags, "WATCHDOG=");
+ if (e) {
+ if (streq(e, "1"))
+ service_reset_watchdog(s);
+ else if (streq(e, "trigger"))
+ service_force_watchdog(s);
+ else
+ log_unit_warning(u, "Passed WATCHDOG= field is invalid, ignoring.");
+ }
+
+ e = strv_find_startswith(tags, "WATCHDOG_USEC=");
+ if (e) {
+ usec_t watchdog_override_usec;
+ if (safe_atou64(e, &watchdog_override_usec) < 0)
+ log_unit_warning(u, "Failed to parse WATCHDOG_USEC=%s", e);
+ else
+ service_override_watchdog_timeout(s, watchdog_override_usec);
+ }
+
+ /* Process FD store messages. Either FDSTOREREMOVE=1 for removal, or FDSTORE=1 for addition. In both cases,
+ * process FDNAME= for picking the file descriptor name to use. Note that FDNAME= is required when removing
+ * fds, but optional when pushing in new fds, for compatibility reasons. */
+ if (strv_contains(tags, "FDSTOREREMOVE=1")) {
+ const char *name;
+
+ name = strv_find_startswith(tags, "FDNAME=");
+ if (!name || !fdname_is_valid(name))
+ log_unit_warning(u, "FDSTOREREMOVE=1 requested, but no valid file descriptor name passed, ignoring.");
+ else
+ service_remove_fd_store(s, name);
+
+ } else if (strv_contains(tags, "FDSTORE=1")) {
+ const char *name;
+
+ name = strv_find_startswith(tags, "FDNAME=");
+ if (name && !fdname_is_valid(name)) {
+ log_unit_warning(u, "Passed FDNAME= name is invalid, ignoring.");
+ name = NULL;
+ }
+
+ (void) service_add_fd_store_set(s, fds, name, !strv_contains(tags, "FDPOLL=0"));
+ }
+
+ /* Notify clients about changed status or main pid */
+ if (notify_dbus)
+ unit_add_to_dbus_queue(u);
+}
+
+static int service_get_timeout(Unit *u, usec_t *timeout) {
+ Service *s = SERVICE(u);
+ uint64_t t;
+ int r;
+
+ if (!s->timer_event_source)
+ return 0;
+
+ r = sd_event_source_get_time(s->timer_event_source, &t);
+ if (r < 0)
+ return r;
+ if (t == USEC_INFINITY)
+ return 0;
+
+ *timeout = t;
+ return 1;
+}
+
+static bool pick_up_pid_from_bus_name(Service *s) {
+ assert(s);
+
+ /* If the service is running but we have no main PID yet, get it from the owner of the D-Bus name */
+
+ return !pid_is_valid(s->main_pid) &&
+ IN_SET(s->state,
+ SERVICE_START,
+ SERVICE_START_POST,
+ SERVICE_RUNNING,
+ SERVICE_RELOAD);
+}
+
+static int bus_name_pid_lookup_callback(sd_bus_message *reply, void *userdata, sd_bus_error *ret_error) {
+ const sd_bus_error *e;
+ Unit *u = ASSERT_PTR(userdata);
+ uint32_t pid;
+ Service *s;
+ int r;
+
+ assert(reply);
+
+ s = SERVICE(u);
+ s->bus_name_pid_lookup_slot = sd_bus_slot_unref(s->bus_name_pid_lookup_slot);
+
+ if (!s->bus_name || !pick_up_pid_from_bus_name(s))
+ return 1;
+
+ e = sd_bus_message_get_error(reply);
+ if (e) {
+ r = sd_bus_error_get_errno(e);
+ log_warning_errno(r, "GetConnectionUnixProcessID() failed: %s", bus_error_message(e, r));
+ return 1;
+ }
+
+ r = sd_bus_message_read(reply, "u", &pid);
+ if (r < 0) {
+ bus_log_parse_error(r);
+ return 1;
+ }
+
+ if (!pid_is_valid(pid)) {
+ log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "GetConnectionUnixProcessID() returned invalid PID");
+ return 1;
+ }
+
+ log_unit_debug(u, "D-Bus name %s is now owned by process " PID_FMT, s->bus_name, (pid_t) pid);
+
+ (void) service_set_main_pid(s, pid);
+ (void) unit_watch_pid(UNIT(s), pid, false);
+ return 1;
+}
+
+static void service_bus_name_owner_change(Unit *u, const char *new_owner) {
+
+ Service *s = SERVICE(u);
+ int r;
+
+ assert(s);
+
+ if (new_owner)
+ log_unit_debug(u, "D-Bus name %s now owned by %s", s->bus_name, new_owner);
+ else
+ log_unit_debug(u, "D-Bus name %s now not owned by anyone.", s->bus_name);
+
+ s->bus_name_good = new_owner;
+
+ /* Track the current owner, so we can reconstruct changes after a daemon reload */
+ r = free_and_strdup(&s->bus_name_owner, new_owner);
+ if (r < 0) {
+ log_unit_error_errno(u, r, "Unable to set new bus name owner %s: %m", new_owner);
+ return;
+ }
+
+ if (s->type == SERVICE_DBUS) {
+
+ /* service_enter_running() will figure out what to
+ * do */
+ if (s->state == SERVICE_RUNNING)
+ service_enter_running(s, SERVICE_SUCCESS);
+ else if (s->state == SERVICE_START && new_owner)
+ service_enter_start_post(s);
+
+ } else if (new_owner && pick_up_pid_from_bus_name(s)) {
+
+ /* Try to acquire PID from bus service */
+
+ s->bus_name_pid_lookup_slot = sd_bus_slot_unref(s->bus_name_pid_lookup_slot);
+
+ r = sd_bus_call_method_async(
+ u->manager->api_bus,
+ &s->bus_name_pid_lookup_slot,
+ "org.freedesktop.DBus",
+ "/org/freedesktop/DBus",
+ "org.freedesktop.DBus",
+ "GetConnectionUnixProcessID",
+ bus_name_pid_lookup_callback,
+ s,
+ "s",
+ s->bus_name);
+ if (r < 0)
+ log_debug_errno(r, "Failed to request owner PID of service name, ignoring: %m");
+ }
+}
+
+int service_set_socket_fd(
+ Service *s,
+ int fd,
+ Socket *sock,
+ SocketPeer *peer,
+ bool selinux_context_net) {
+
+ _cleanup_free_ char *peer_text = NULL;
+ int r;
+
+ assert(s);
+ assert(fd >= 0);
+
+ /* This is called by the socket code when instantiating a new service for a stream socket and the socket needs
+ * to be configured. We take ownership of the passed fd on success. */
+
+ if (UNIT(s)->load_state != UNIT_LOADED)
+ return -EINVAL;
+
+ if (s->socket_fd >= 0)
+ return -EBUSY;
+
+ assert(!s->socket_peer);
+
+ if (s->state != SERVICE_DEAD)
+ return -EAGAIN;
+
+ if (getpeername_pretty(fd, true, &peer_text) >= 0) {
+
+ if (UNIT(s)->description) {
+ _cleanup_free_ char *a = NULL;
+
+ a = strjoin(UNIT(s)->description, " (", peer_text, ")");
+ if (!a)
+ return -ENOMEM;
+
+ r = unit_set_description(UNIT(s), a);
+ } else
+ r = unit_set_description(UNIT(s), peer_text);
+ if (r < 0)
+ return r;
+ }
+
+ r = unit_add_two_dependencies(UNIT(sock), UNIT_BEFORE, UNIT_TRIGGERS, UNIT(s), false, UNIT_DEPENDENCY_IMPLICIT);
+ if (r < 0)
+ return r;
+
+ s->socket_fd = fd;
+ s->socket_peer = socket_peer_ref(peer);
+ s->socket_fd_selinux_context_net = selinux_context_net;
+
+ unit_ref_set(&s->accept_socket, UNIT(s), UNIT(sock));
+ return 0;
+}
+
+static void service_reset_failed(Unit *u) {
+ Service *s = SERVICE(u);
+
+ assert(s);
+
+ if (s->state == SERVICE_FAILED)
+ service_set_state(s, SERVICE_DEAD);
+
+ s->result = SERVICE_SUCCESS;
+ s->reload_result = SERVICE_SUCCESS;
+ s->clean_result = SERVICE_SUCCESS;
+ s->n_restarts = 0;
+ s->flush_n_restarts = false;
+}
+
+static int service_kill(Unit *u, KillWho who, int signo, sd_bus_error *error) {
+ Service *s = SERVICE(u);
+
+ assert(s);
+
+ return unit_kill_common(u, who, signo, s->main_pid, s->control_pid, error);
+}
+
+static int service_main_pid(Unit *u) {
+ Service *s = SERVICE(u);
+
+ assert(s);
+
+ return s->main_pid;
+}
+
+static int service_control_pid(Unit *u) {
+ Service *s = SERVICE(u);
+
+ assert(s);
+
+ return s->control_pid;
+}
+
+static bool service_needs_console(Unit *u) {
+ Service *s = SERVICE(u);
+
+ assert(s);
+
+ /* We provide our own implementation of this here, instead of relying of the generic implementation
+ * unit_needs_console() provides, since we want to return false if we are in SERVICE_EXITED state. */
+
+ if (!exec_context_may_touch_console(&s->exec_context))
+ return false;
+
+ return IN_SET(s->state,
+ SERVICE_CONDITION,
+ SERVICE_START_PRE,
+ SERVICE_START,
+ SERVICE_START_POST,
+ SERVICE_RUNNING,
+ SERVICE_RELOAD,
+ SERVICE_STOP,
+ SERVICE_STOP_WATCHDOG,
+ SERVICE_STOP_SIGTERM,
+ SERVICE_STOP_SIGKILL,
+ SERVICE_STOP_POST,
+ SERVICE_FINAL_WATCHDOG,
+ SERVICE_FINAL_SIGTERM,
+ SERVICE_FINAL_SIGKILL);
+}
+
+static int service_exit_status(Unit *u) {
+ Service *s = SERVICE(u);
+
+ assert(u);
+
+ if (s->main_exec_status.pid <= 0 ||
+ !dual_timestamp_is_set(&s->main_exec_status.exit_timestamp))
+ return -ENODATA;
+
+ if (s->main_exec_status.code != CLD_EXITED)
+ return -EBADE;
+
+ return s->main_exec_status.status;
+}
+
+static const char* service_status_text(Unit *u) {
+ Service *s = SERVICE(u);
+
+ assert(s);
+
+ return s->status_text;
+}
+
+static int service_clean(Unit *u, ExecCleanMask mask) {
+ _cleanup_strv_free_ char **l = NULL;
+ Service *s = SERVICE(u);
+ int r;
+
+ assert(s);
+ assert(mask != 0);
+
+ if (s->state != SERVICE_DEAD)
+ return -EBUSY;
+
+ r = exec_context_get_clean_directories(&s->exec_context, u->manager->prefix, mask, &l);
+ if (r < 0)
+ return r;
+
+ if (strv_isempty(l))
+ return -EUNATCH;
+
+ service_unwatch_control_pid(s);
+ s->clean_result = SERVICE_SUCCESS;
+ s->control_command = NULL;
+ s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID;
+
+ r = service_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->exec_context.timeout_clean_usec));
+ if (r < 0)
+ goto fail;
+
+ r = unit_fork_and_watch_rm_rf(u, l, &s->control_pid);
+ if (r < 0)
+ goto fail;
+
+ service_set_state(s, SERVICE_CLEANING);
+
+ return 0;
+
+fail:
+ log_unit_warning_errno(u, r, "Failed to initiate cleaning: %m");
+ s->clean_result = SERVICE_FAILURE_RESOURCES;
+ s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+ return r;
+}
+
+static int service_can_clean(Unit *u, ExecCleanMask *ret) {
+ Service *s = SERVICE(u);
+
+ assert(s);
+
+ return exec_context_get_clean_mask(&s->exec_context, ret);
+}
+
+static const char *service_finished_job(Unit *u, JobType t, JobResult result) {
+ if (t == JOB_START &&
+ result == JOB_DONE &&
+ SERVICE(u)->type == SERVICE_ONESHOT)
+ return "Finished %s.";
+
+ /* Fall back to generic */
+ return NULL;
+}
+
+static int service_can_start(Unit *u) {
+ Service *s = SERVICE(u);
+ int r;
+
+ assert(s);
+
+ /* Make sure we don't enter a busy loop of some kind. */
+ r = unit_test_start_limit(u);
+ if (r < 0) {
+ service_enter_dead(s, SERVICE_FAILURE_START_LIMIT_HIT, false);
+ return r;
+ }
+
+ return 1;
+}
+
+static const char* const service_restart_table[_SERVICE_RESTART_MAX] = {
+ [SERVICE_RESTART_NO] = "no",
+ [SERVICE_RESTART_ON_SUCCESS] = "on-success",
+ [SERVICE_RESTART_ON_FAILURE] = "on-failure",
+ [SERVICE_RESTART_ON_ABNORMAL] = "on-abnormal",
+ [SERVICE_RESTART_ON_WATCHDOG] = "on-watchdog",
+ [SERVICE_RESTART_ON_ABORT] = "on-abort",
+ [SERVICE_RESTART_ALWAYS] = "always",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(service_restart, ServiceRestart);
+
+static const char* const service_type_table[_SERVICE_TYPE_MAX] = {
+ [SERVICE_SIMPLE] = "simple",
+ [SERVICE_FORKING] = "forking",
+ [SERVICE_ONESHOT] = "oneshot",
+ [SERVICE_DBUS] = "dbus",
+ [SERVICE_NOTIFY] = "notify",
+ [SERVICE_IDLE] = "idle",
+ [SERVICE_EXEC] = "exec",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(service_type, ServiceType);
+
+static const char* const service_exit_type_table[_SERVICE_EXIT_TYPE_MAX] = {
+ [SERVICE_EXIT_MAIN] = "main",
+ [SERVICE_EXIT_CGROUP] = "cgroup",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(service_exit_type, ServiceExitType);
+
+static const char* const service_exec_command_table[_SERVICE_EXEC_COMMAND_MAX] = {
+ [SERVICE_EXEC_CONDITION] = "ExecCondition",
+ [SERVICE_EXEC_START_PRE] = "ExecStartPre",
+ [SERVICE_EXEC_START] = "ExecStart",
+ [SERVICE_EXEC_START_POST] = "ExecStartPost",
+ [SERVICE_EXEC_RELOAD] = "ExecReload",
+ [SERVICE_EXEC_STOP] = "ExecStop",
+ [SERVICE_EXEC_STOP_POST] = "ExecStopPost",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(service_exec_command, ServiceExecCommand);
+
+static const char* const service_exec_ex_command_table[_SERVICE_EXEC_COMMAND_MAX] = {
+ [SERVICE_EXEC_CONDITION] = "ExecConditionEx",
+ [SERVICE_EXEC_START_PRE] = "ExecStartPreEx",
+ [SERVICE_EXEC_START] = "ExecStartEx",
+ [SERVICE_EXEC_START_POST] = "ExecStartPostEx",
+ [SERVICE_EXEC_RELOAD] = "ExecReloadEx",
+ [SERVICE_EXEC_STOP] = "ExecStopEx",
+ [SERVICE_EXEC_STOP_POST] = "ExecStopPostEx",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(service_exec_ex_command, ServiceExecCommand);
+
+static const char* const notify_state_table[_NOTIFY_STATE_MAX] = {
+ [NOTIFY_UNKNOWN] = "unknown",
+ [NOTIFY_READY] = "ready",
+ [NOTIFY_RELOADING] = "reloading",
+ [NOTIFY_STOPPING] = "stopping",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(notify_state, NotifyState);
+
+static const char* const service_result_table[_SERVICE_RESULT_MAX] = {
+ [SERVICE_SUCCESS] = "success",
+ [SERVICE_FAILURE_RESOURCES] = "resources",
+ [SERVICE_FAILURE_PROTOCOL] = "protocol",
+ [SERVICE_FAILURE_TIMEOUT] = "timeout",
+ [SERVICE_FAILURE_EXIT_CODE] = "exit-code",
+ [SERVICE_FAILURE_SIGNAL] = "signal",
+ [SERVICE_FAILURE_CORE_DUMP] = "core-dump",
+ [SERVICE_FAILURE_WATCHDOG] = "watchdog",
+ [SERVICE_FAILURE_START_LIMIT_HIT] = "start-limit-hit",
+ [SERVICE_FAILURE_OOM_KILL] = "oom-kill",
+ [SERVICE_SKIP_CONDITION] = "exec-condition",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(service_result, ServiceResult);
+
+static const char* const service_timeout_failure_mode_table[_SERVICE_TIMEOUT_FAILURE_MODE_MAX] = {
+ [SERVICE_TIMEOUT_TERMINATE] = "terminate",
+ [SERVICE_TIMEOUT_ABORT] = "abort",
+ [SERVICE_TIMEOUT_KILL] = "kill",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(service_timeout_failure_mode, ServiceTimeoutFailureMode);
+
+const UnitVTable service_vtable = {
+ .object_size = sizeof(Service),
+ .exec_context_offset = offsetof(Service, exec_context),
+ .cgroup_context_offset = offsetof(Service, cgroup_context),
+ .kill_context_offset = offsetof(Service, kill_context),
+ .exec_runtime_offset = offsetof(Service, exec_runtime),
+ .dynamic_creds_offset = offsetof(Service, dynamic_creds),
+
+ .sections =
+ "Unit\0"
+ "Service\0"
+ "Install\0",
+ .private_section = "Service",
+
+ .can_transient = true,
+ .can_delegate = true,
+ .can_fail = true,
+ .can_set_managed_oom = true,
+
+ .init = service_init,
+ .done = service_done,
+ .load = service_load,
+ .release_resources = service_release_resources,
+
+ .coldplug = service_coldplug,
+
+ .dump = service_dump,
+
+ .start = service_start,
+ .stop = service_stop,
+ .reload = service_reload,
+
+ .can_reload = service_can_reload,
+
+ .kill = service_kill,
+ .clean = service_clean,
+ .can_clean = service_can_clean,
+
+ .freeze = unit_freeze_vtable_common,
+ .thaw = unit_thaw_vtable_common,
+
+ .serialize = service_serialize,
+ .deserialize_item = service_deserialize_item,
+
+ .active_state = service_active_state,
+ .sub_state_to_string = service_sub_state_to_string,
+
+ .will_restart = service_will_restart,
+
+ .may_gc = service_may_gc,
+
+ .sigchld_event = service_sigchld_event,
+
+ .reset_failed = service_reset_failed,
+
+ .notify_cgroup_empty = service_notify_cgroup_empty_event,
+ .notify_cgroup_oom = service_notify_cgroup_oom_event,
+ .notify_message = service_notify_message,
+
+ .main_pid = service_main_pid,
+ .control_pid = service_control_pid,
+
+ .bus_name_owner_change = service_bus_name_owner_change,
+
+ .bus_set_property = bus_service_set_property,
+ .bus_commit_properties = bus_service_commit_properties,
+
+ .get_timeout = service_get_timeout,
+ .needs_console = service_needs_console,
+ .exit_status = service_exit_status,
+ .status_text = service_status_text,
+
+ .status_message_formats = {
+ .finished_start_job = {
+ [JOB_FAILED] = "Failed to start %s.",
+ },
+ .finished_stop_job = {
+ [JOB_DONE] = "Stopped %s.",
+ [JOB_FAILED] = "Stopped (with error) %s.",
+ },
+ .finished_job = service_finished_job,
+ },
+
+ .can_start = service_can_start,
+};
diff --git a/src/core/service.h b/src/core/service.h
new file mode 100644
index 0000000..91e02e6
--- /dev/null
+++ b/src/core/service.h
@@ -0,0 +1,264 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct Service Service;
+typedef struct ServiceFDStore ServiceFDStore;
+
+#include "exit-status.h"
+#include "kill.h"
+#include "path.h"
+#include "ratelimit.h"
+#include "socket.h"
+#include "unit.h"
+
+typedef enum ServiceRestart {
+ SERVICE_RESTART_NO,
+ SERVICE_RESTART_ON_SUCCESS,
+ SERVICE_RESTART_ON_FAILURE,
+ SERVICE_RESTART_ON_ABNORMAL,
+ SERVICE_RESTART_ON_WATCHDOG,
+ SERVICE_RESTART_ON_ABORT,
+ SERVICE_RESTART_ALWAYS,
+ _SERVICE_RESTART_MAX,
+ _SERVICE_RESTART_INVALID = -EINVAL,
+} ServiceRestart;
+
+typedef enum ServiceType {
+ SERVICE_SIMPLE, /* we fork and go on right-away (i.e. modern socket activated daemons) */
+ SERVICE_FORKING, /* forks by itself (i.e. traditional daemons) */
+ SERVICE_ONESHOT, /* we fork and wait until the program finishes (i.e. programs like fsck which run and need to finish before we continue) */
+ SERVICE_DBUS, /* we fork and wait until a specific D-Bus name appears on the bus */
+ SERVICE_NOTIFY, /* we fork and wait until a daemon sends us a ready message with sd_notify() */
+ SERVICE_IDLE, /* much like simple, but delay exec() until all jobs are dispatched. */
+ SERVICE_EXEC, /* we fork and wait until we execute exec() (this means our own setup is waited for) */
+ _SERVICE_TYPE_MAX,
+ _SERVICE_TYPE_INVALID = -EINVAL,
+} ServiceType;
+
+typedef enum ServiceExitType {
+ SERVICE_EXIT_MAIN, /* we consider the main PID when deciding if the service exited */
+ SERVICE_EXIT_CGROUP, /* we wait for the last process in the cgroup to exit */
+ _SERVICE_EXIT_TYPE_MAX,
+ _SERVICE_EXIT_TYPE_INVALID = -EINVAL,
+} ServiceExitType;
+
+typedef enum ServiceExecCommand {
+ SERVICE_EXEC_CONDITION,
+ SERVICE_EXEC_START_PRE,
+ SERVICE_EXEC_START,
+ SERVICE_EXEC_START_POST,
+ SERVICE_EXEC_RELOAD,
+ SERVICE_EXEC_STOP,
+ SERVICE_EXEC_STOP_POST,
+ _SERVICE_EXEC_COMMAND_MAX,
+ _SERVICE_EXEC_COMMAND_INVALID = -EINVAL,
+} ServiceExecCommand;
+
+typedef enum NotifyState {
+ NOTIFY_UNKNOWN,
+ NOTIFY_READY,
+ NOTIFY_RELOADING,
+ NOTIFY_STOPPING,
+ _NOTIFY_STATE_MAX,
+ _NOTIFY_STATE_INVALID = -EINVAL,
+} NotifyState;
+
+/* The values of this enum are referenced in man/systemd.exec.xml and src/shared/bus-unit-util.c.
+ * Update those sources for each change to this enum. */
+typedef enum ServiceResult {
+ SERVICE_SUCCESS,
+ SERVICE_FAILURE_RESOURCES, /* a bit of a misnomer, just our catch-all error for errnos we didn't expect */
+ SERVICE_FAILURE_PROTOCOL,
+ SERVICE_FAILURE_TIMEOUT,
+ SERVICE_FAILURE_EXIT_CODE,
+ SERVICE_FAILURE_SIGNAL,
+ SERVICE_FAILURE_CORE_DUMP,
+ SERVICE_FAILURE_WATCHDOG,
+ SERVICE_FAILURE_START_LIMIT_HIT,
+ SERVICE_FAILURE_OOM_KILL, /* OOM Kill by the Kernel or systemd-oomd */
+ SERVICE_SKIP_CONDITION,
+ _SERVICE_RESULT_MAX,
+ _SERVICE_RESULT_INVALID = -EINVAL,
+} ServiceResult;
+
+typedef enum ServiceTimeoutFailureMode {
+ SERVICE_TIMEOUT_TERMINATE,
+ SERVICE_TIMEOUT_ABORT,
+ SERVICE_TIMEOUT_KILL,
+ _SERVICE_TIMEOUT_FAILURE_MODE_MAX,
+ _SERVICE_TIMEOUT_FAILURE_MODE_INVALID = -EINVAL,
+} ServiceTimeoutFailureMode;
+
+struct ServiceFDStore {
+ Service *service;
+
+ int fd;
+ char *fdname;
+ sd_event_source *event_source;
+ bool do_poll;
+
+ LIST_FIELDS(ServiceFDStore, fd_store);
+};
+
+struct Service {
+ Unit meta;
+
+ ServiceType type;
+ ServiceExitType exit_type;
+ ServiceRestart restart;
+ ExitStatusSet restart_prevent_status;
+ ExitStatusSet restart_force_status;
+ ExitStatusSet success_status;
+
+ /* If set we'll read the main daemon PID from this file */
+ char *pid_file;
+
+ usec_t restart_usec;
+ usec_t timeout_start_usec;
+ usec_t timeout_stop_usec;
+ usec_t timeout_abort_usec;
+ bool timeout_abort_set;
+ usec_t runtime_max_usec;
+ usec_t runtime_rand_extra_usec;
+ ServiceTimeoutFailureMode timeout_start_failure_mode;
+ ServiceTimeoutFailureMode timeout_stop_failure_mode;
+
+ dual_timestamp watchdog_timestamp;
+ usec_t watchdog_usec; /* the requested watchdog timeout in the unit file */
+ usec_t watchdog_original_usec; /* the watchdog timeout that was in effect when the unit was started, i.e. the timeout the forked off processes currently see */
+ usec_t watchdog_override_usec; /* the watchdog timeout requested by the service itself through sd_notify() */
+ bool watchdog_override_enable;
+ sd_event_source *watchdog_event_source;
+
+ ExecCommand* exec_command[_SERVICE_EXEC_COMMAND_MAX];
+
+ ExecContext exec_context;
+ KillContext kill_context;
+ CGroupContext cgroup_context;
+
+ ServiceState state, deserialized_state;
+
+ /* The exit status of the real main process */
+ ExecStatus main_exec_status;
+
+ /* The currently executed control process */
+ ExecCommand *control_command;
+
+ /* The currently executed main process, which may be NULL if
+ * the main process got started via forking mode and not by
+ * us */
+ ExecCommand *main_command;
+
+ /* The ID of the control command currently being executed */
+ ServiceExecCommand control_command_id;
+
+ /* Runtime data of the execution context */
+ ExecRuntime *exec_runtime;
+ DynamicCreds dynamic_creds;
+
+ pid_t main_pid, control_pid;
+
+ /* if we are a socket activated service instance, store information of the connection/peer/socket */
+ int socket_fd;
+ SocketPeer *socket_peer;
+ UnitRef accept_socket;
+ bool socket_fd_selinux_context_net;
+
+ bool permissions_start_only;
+ bool root_directory_start_only;
+ bool remain_after_exit;
+ bool guess_main_pid;
+
+ /* If we shut down, remember why */
+ ServiceResult result;
+ ServiceResult reload_result;
+ ServiceResult clean_result;
+
+ bool main_pid_known:1;
+ bool main_pid_alien:1;
+ bool bus_name_good:1;
+ bool forbid_restart:1;
+ /* Keep restart intention between UNIT_FAILED and UNIT_ACTIVATING */
+ bool will_auto_restart:1;
+ bool start_timeout_defined:1;
+ bool exec_fd_hot:1;
+
+ char *bus_name;
+ char *bus_name_owner; /* unique name of the current owner */
+
+ char *status_text;
+ int status_errno;
+
+ sd_event_source *timer_event_source;
+ PathSpec *pid_file_pathspec;
+
+ NotifyAccess notify_access;
+ NotifyState notify_state;
+
+ sd_bus_slot *bus_name_pid_lookup_slot;
+
+ sd_event_source *exec_fd_event_source;
+
+ ServiceFDStore *fd_store;
+ size_t n_fd_store;
+ unsigned n_fd_store_max;
+ unsigned n_keep_fd_store;
+
+ char *usb_function_descriptors;
+ char *usb_function_strings;
+
+ int stdin_fd;
+ int stdout_fd;
+ int stderr_fd;
+
+ unsigned n_restarts;
+ bool flush_n_restarts;
+
+ OOMPolicy oom_policy;
+};
+
+static inline usec_t service_timeout_abort_usec(Service *s) {
+ assert(s);
+ return s->timeout_abort_set ? s->timeout_abort_usec : s->timeout_stop_usec;
+}
+
+static inline usec_t service_get_watchdog_usec(Service *s) {
+ assert(s);
+ return s->watchdog_override_enable ? s->watchdog_override_usec : s->watchdog_original_usec;
+}
+
+extern const UnitVTable service_vtable;
+
+int service_set_socket_fd(Service *s, int fd, struct Socket *socket, struct SocketPeer *peer, bool selinux_context_net);
+void service_close_socket_fd(Service *s);
+
+const char* service_restart_to_string(ServiceRestart i) _const_;
+ServiceRestart service_restart_from_string(const char *s) _pure_;
+
+const char* service_type_to_string(ServiceType i) _const_;
+ServiceType service_type_from_string(const char *s) _pure_;
+
+const char* service_exit_type_to_string(ServiceExitType i) _const_;
+ServiceExitType service_exit_type_from_string(const char *s) _pure_;
+
+const char* service_exec_command_to_string(ServiceExecCommand i) _const_;
+ServiceExecCommand service_exec_command_from_string(const char *s) _pure_;
+
+const char* service_exec_ex_command_to_string(ServiceExecCommand i) _const_;
+ServiceExecCommand service_exec_ex_command_from_string(const char *s) _pure_;
+
+const char* notify_state_to_string(NotifyState i) _const_;
+NotifyState notify_state_from_string(const char *s) _pure_;
+
+const char* service_result_to_string(ServiceResult i) _const_;
+ServiceResult service_result_from_string(const char *s) _pure_;
+
+const char* service_timeout_failure_mode_to_string(ServiceTimeoutFailureMode i) _const_;
+ServiceTimeoutFailureMode service_timeout_failure_mode_from_string(const char *s) _pure_;
+
+DEFINE_CAST(SERVICE, Service);
+
+#define STATUS_TEXT_MAX (16U*1024U)
+
+/* Only exported for unit tests */
+int service_deserialize_exec_command(Unit *u, const char *key, const char *value);
diff --git a/src/core/show-status.c b/src/core/show-status.c
new file mode 100644
index 0000000..ed5d6f0
--- /dev/null
+++ b/src/core/show-status.c
@@ -0,0 +1,129 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "alloc-util.h"
+#include "fd-util.h"
+#include "io-util.h"
+#include "parse-util.h"
+#include "show-status.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "terminal-util.h"
+#include "util.h"
+
+static const char* const show_status_table[_SHOW_STATUS_MAX] = {
+ [SHOW_STATUS_NO] = "no",
+ [SHOW_STATUS_ERROR] = "error",
+ [SHOW_STATUS_AUTO] = "auto",
+ [SHOW_STATUS_TEMPORARY] = "temporary",
+ [SHOW_STATUS_YES] = "yes",
+};
+
+DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(show_status, ShowStatus, SHOW_STATUS_YES);
+
+int parse_show_status(const char *v, ShowStatus *ret) {
+ ShowStatus s;
+
+ assert(ret);
+
+ s = show_status_from_string(v);
+ if (s < 0 || s == SHOW_STATUS_TEMPORARY)
+ return -EINVAL;
+
+ *ret = s;
+ return 0;
+}
+
+int status_vprintf(const char *status, ShowStatusFlags flags, const char *format, va_list ap) {
+ static const char status_indent[] = " "; /* "[" STATUS "] " */
+ _cleanup_free_ char *s = NULL;
+ _cleanup_close_ int fd = -1;
+ struct iovec iovec[7] = {};
+ int n = 0;
+ static bool prev_ephemeral;
+
+ assert(format);
+
+ /* This is independent of logging, as status messages are
+ * optional and go exclusively to the console. */
+
+ if (vasprintf(&s, format, ap) < 0)
+ return log_oom();
+
+ /* Before you ask: yes, on purpose we open/close the console for each status line we write individually. This
+ * is a good strategy to avoid PID 1 getting killed by the kernel's SAK concept (it doesn't fix this entirely,
+ * but minimizes the time window the kernel might end up killing PID 1 due to SAK). It also makes things easier
+ * for us so that we don't have to recover from hangups and suchlike triggered on the console. */
+
+ fd = open_terminal("/dev/console", O_WRONLY|O_NOCTTY|O_CLOEXEC);
+ if (fd < 0)
+ return fd;
+
+ if (FLAGS_SET(flags, SHOW_STATUS_ELLIPSIZE)) {
+ char *e;
+ size_t emax, sl;
+ int c;
+
+ c = fd_columns(fd);
+ if (c <= 0)
+ c = 80;
+
+ sl = status ? sizeof(status_indent)-1 : 0;
+
+ emax = c - sl - 1;
+ if (emax < 3)
+ emax = 3;
+
+ e = ellipsize(s, emax, 50);
+ if (e)
+ free_and_replace(s, e);
+ }
+
+ if (prev_ephemeral)
+ iovec[n++] = IOVEC_MAKE_STRING(ANSI_REVERSE_LINEFEED "\r" ANSI_ERASE_TO_END_OF_LINE);
+
+ if (status) {
+ if (!isempty(status)) {
+ iovec[n++] = IOVEC_MAKE_STRING("[");
+ iovec[n++] = IOVEC_MAKE_STRING(status);
+ iovec[n++] = IOVEC_MAKE_STRING("] ");
+ } else
+ iovec[n++] = IOVEC_MAKE_STRING(status_indent);
+ }
+
+ iovec[n++] = IOVEC_MAKE_STRING(s);
+ iovec[n++] = IOVEC_MAKE_STRING("\r\n"); /* use CRNL instead of just NL, to be robust towards TTYs in raw mode */
+
+ if (prev_ephemeral && !FLAGS_SET(flags, SHOW_STATUS_EPHEMERAL))
+ iovec[n++] = IOVEC_MAKE_STRING(ANSI_ERASE_TO_END_OF_LINE);
+ prev_ephemeral = FLAGS_SET(flags, SHOW_STATUS_EPHEMERAL) ;
+
+ if (writev(fd, iovec, n) < 0)
+ return -errno;
+
+ return 0;
+}
+
+int status_printf(const char *status, ShowStatusFlags flags, const char *format, ...) {
+ va_list ap;
+ int r;
+
+ assert(format);
+
+ va_start(ap, format);
+ r = status_vprintf(status, flags, format, ap);
+ va_end(ap);
+
+ return r;
+}
+
+static const char* const status_unit_format_table[_STATUS_UNIT_FORMAT_MAX] = {
+ [STATUS_UNIT_FORMAT_NAME] = "name",
+ [STATUS_UNIT_FORMAT_DESCRIPTION] = "description",
+ [STATUS_UNIT_FORMAT_COMBINED] = "combined",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(status_unit_format, StatusUnitFormat);
diff --git a/src/core/show-status.h b/src/core/show-status.h
new file mode 100644
index 0000000..f441223
--- /dev/null
+++ b/src/core/show-status.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "macro.h"
+
+/* Manager status */
+
+typedef enum ShowStatus {
+ SHOW_STATUS_NO, /* printing of status is disabled */
+ SHOW_STATUS_ERROR, /* only print errors */
+ SHOW_STATUS_AUTO, /* disabled but may flip to _TEMPORARY */
+ SHOW_STATUS_TEMPORARY, /* enabled temporarily, may flip back to _AUTO */
+ SHOW_STATUS_YES, /* printing of status is enabled */
+ _SHOW_STATUS_MAX,
+ _SHOW_STATUS_INVALID = -EINVAL,
+} ShowStatus;
+
+typedef enum ShowStatusFlags {
+ SHOW_STATUS_ELLIPSIZE = 1 << 0,
+ SHOW_STATUS_EPHEMERAL = 1 << 1,
+} ShowStatusFlags;
+
+typedef enum StatusUnitFormat {
+ STATUS_UNIT_FORMAT_NAME,
+ STATUS_UNIT_FORMAT_DESCRIPTION,
+ STATUS_UNIT_FORMAT_COMBINED,
+ _STATUS_UNIT_FORMAT_MAX,
+ _STATUS_UNIT_FORMAT_INVALID = -EINVAL,
+} StatusUnitFormat;
+
+static inline bool show_status_on(ShowStatus s) {
+ return IN_SET(s, SHOW_STATUS_TEMPORARY, SHOW_STATUS_YES);
+}
+ShowStatus show_status_from_string(const char *v) _const_;
+const char* show_status_to_string(ShowStatus s) _pure_;
+int parse_show_status(const char *v, ShowStatus *ret);
+
+StatusUnitFormat status_unit_format_from_string(const char *v) _const_;
+const char* status_unit_format_to_string(StatusUnitFormat s) _pure_;
+
+int status_vprintf(const char *status, ShowStatusFlags flags, const char *format, va_list ap) _printf_(3,0);
+int status_printf(const char *status, ShowStatusFlags flags, const char *format, ...) _printf_(3,4);
diff --git a/src/core/slice.c b/src/core/slice.c
new file mode 100644
index 0000000..a6c71e8
--- /dev/null
+++ b/src/core/slice.c
@@ -0,0 +1,471 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+
+#include "alloc-util.h"
+#include "dbus-slice.h"
+#include "dbus-unit.h"
+#include "fd-util.h"
+#include "log.h"
+#include "serialize.h"
+#include "slice.h"
+#include "special.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit-name.h"
+#include "unit.h"
+
+static const UnitActiveState state_translation_table[_SLICE_STATE_MAX] = {
+ [SLICE_DEAD] = UNIT_INACTIVE,
+ [SLICE_ACTIVE] = UNIT_ACTIVE
+};
+
+static void slice_init(Unit *u) {
+ assert(u);
+ assert(u->load_state == UNIT_STUB);
+
+ u->ignore_on_isolate = true;
+}
+
+static void slice_set_state(Slice *t, SliceState state) {
+ SliceState old_state;
+ assert(t);
+
+ if (t->state != state)
+ bus_unit_send_pending_change_signal(UNIT(t), false);
+
+ old_state = t->state;
+ t->state = state;
+
+ if (state != old_state)
+ log_debug("%s changed %s -> %s",
+ UNIT(t)->id,
+ slice_state_to_string(old_state),
+ slice_state_to_string(state));
+
+ unit_notify(UNIT(t), state_translation_table[old_state], state_translation_table[state], 0);
+}
+
+static int slice_add_parent_slice(Slice *s) {
+ Unit *u = UNIT(s);
+ _cleanup_free_ char *a = NULL;
+ int r;
+
+ assert(s);
+
+ if (UNIT_GET_SLICE(u))
+ return 0;
+
+ r = slice_build_parent_slice(u->id, &a);
+ if (r <= 0) /* 0 means root slice */
+ return r;
+
+ return unit_add_dependency_by_name(u, UNIT_IN_SLICE, a, true, UNIT_DEPENDENCY_IMPLICIT);
+}
+
+static int slice_add_default_dependencies(Slice *s) {
+ int r;
+
+ assert(s);
+
+ if (!UNIT(s)->default_dependencies)
+ return 0;
+
+ /* Make sure slices are unloaded on shutdown */
+ r = unit_add_two_dependencies_by_name(
+ UNIT(s),
+ UNIT_BEFORE, UNIT_CONFLICTS,
+ SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+static int slice_verify(Slice *s) {
+ _cleanup_free_ char *parent = NULL;
+ int r;
+
+ assert(s);
+ assert(UNIT(s)->load_state == UNIT_LOADED);
+
+ if (!slice_name_is_valid(UNIT(s)->id))
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Slice name %s is not valid. Refusing.", UNIT(s)->id);
+
+ r = slice_build_parent_slice(UNIT(s)->id, &parent);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r, "Failed to determine parent slice: %m");
+
+ /* If recursive errors are to be ignored, the parent slice should not be verified */
+ if (UNIT(s)->manager && FLAGS_SET(UNIT(s)->manager->test_run_flags, MANAGER_TEST_RUN_IGNORE_DEPENDENCIES))
+ return 0;
+
+ if (parent ? !unit_has_name(UNIT_GET_SLICE(UNIT(s)), parent) : !!UNIT_GET_SLICE(UNIT(s)))
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Located outside of parent slice. Refusing.");
+
+ return 0;
+}
+
+static int slice_load_root_slice(Unit *u) {
+ assert(u);
+
+ if (!unit_has_name(u, SPECIAL_ROOT_SLICE))
+ return 0;
+
+ u->perpetual = true;
+
+ /* The root slice is a bit special. For example it is always running and cannot be terminated. Because of its
+ * special semantics we synthesize it here, instead of relying on the unit file on disk. */
+
+ u->default_dependencies = false;
+
+ if (!u->description)
+ u->description = strdup("Root Slice");
+ if (!u->documentation)
+ u->documentation = strv_new("man:systemd.special(7)");
+
+ return 1;
+}
+
+static int slice_load_system_slice(Unit *u) {
+ assert(u);
+
+ if (!MANAGER_IS_SYSTEM(u->manager))
+ return 0;
+ if (!unit_has_name(u, SPECIAL_SYSTEM_SLICE))
+ return 0;
+
+ u->perpetual = true;
+
+ /* The system slice is a bit special. For example it is always running and cannot be terminated. Because of its
+ * special semantics we synthesize it here, instead of relying on the unit file on disk. */
+
+ u->default_dependencies = false;
+
+ if (!u->description)
+ u->description = strdup("System Slice");
+ if (!u->documentation)
+ u->documentation = strv_new("man:systemd.special(7)");
+
+ return 1;
+}
+
+static int slice_load(Unit *u) {
+ Slice *s = SLICE(u);
+ int r;
+
+ assert(s);
+ assert(u->load_state == UNIT_STUB);
+
+ r = slice_load_root_slice(u);
+ if (r < 0)
+ return r;
+ r = slice_load_system_slice(u);
+ if (r < 0)
+ return r;
+
+ r = unit_load_fragment_and_dropin(u, false);
+ if (r < 0)
+ return r;
+
+ if (u->load_state != UNIT_LOADED)
+ return 0;
+
+ /* This is a new unit? Then let's add in some extras */
+ r = unit_patch_contexts(u);
+ if (r < 0)
+ return r;
+
+ r = slice_add_parent_slice(s);
+ if (r < 0)
+ return r;
+
+ r = slice_add_default_dependencies(s);
+ if (r < 0)
+ return r;
+
+ if (!u->description) {
+ _cleanup_free_ char *tmp = NULL;
+
+ r = unit_name_to_path(u->id, &tmp);
+ if (r >= 0) /* Failure is ignored… */
+ u->description = strjoin("Slice ", tmp);
+ }
+
+ return slice_verify(s);
+}
+
+static int slice_coldplug(Unit *u) {
+ Slice *t = SLICE(u);
+
+ assert(t);
+ assert(t->state == SLICE_DEAD);
+
+ if (t->deserialized_state != t->state)
+ slice_set_state(t, t->deserialized_state);
+
+ return 0;
+}
+
+static void slice_dump(Unit *u, FILE *f, const char *prefix) {
+ Slice *t = SLICE(u);
+
+ assert(t);
+ assert(f);
+
+ fprintf(f,
+ "%sSlice State: %s\n",
+ prefix, slice_state_to_string(t->state));
+
+ cgroup_context_dump(UNIT(t), f, prefix);
+}
+
+static int slice_start(Unit *u) {
+ Slice *t = SLICE(u);
+ int r;
+
+ assert(t);
+ assert(t->state == SLICE_DEAD);
+
+ r = unit_acquire_invocation_id(u);
+ if (r < 0)
+ return r;
+
+ (void) unit_realize_cgroup(u);
+ (void) unit_reset_accounting(u);
+
+ slice_set_state(t, SLICE_ACTIVE);
+ return 1;
+}
+
+static int slice_stop(Unit *u) {
+ Slice *t = SLICE(u);
+
+ assert(t);
+ assert(t->state == SLICE_ACTIVE);
+
+ /* We do not need to destroy the cgroup explicitly,
+ * unit_notify() will do that for us anyway. */
+
+ slice_set_state(t, SLICE_DEAD);
+ return 1;
+}
+
+static int slice_kill(Unit *u, KillWho who, int signo, sd_bus_error *error) {
+ return unit_kill_common(u, who, signo, -1, -1, error);
+}
+
+static int slice_serialize(Unit *u, FILE *f, FDSet *fds) {
+ Slice *s = SLICE(u);
+
+ assert(s);
+ assert(f);
+ assert(fds);
+
+ (void) serialize_item(f, "state", slice_state_to_string(s->state));
+
+ return 0;
+}
+
+static int slice_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
+ Slice *s = SLICE(u);
+
+ assert(u);
+ assert(key);
+ assert(value);
+ assert(fds);
+
+ if (streq(key, "state")) {
+ SliceState state;
+
+ state = slice_state_from_string(value);
+ if (state < 0)
+ log_debug("Failed to parse state value %s", value);
+ else
+ s->deserialized_state = state;
+
+ } else
+ log_debug("Unknown serialization key '%s'", key);
+
+ return 0;
+}
+
+_pure_ static UnitActiveState slice_active_state(Unit *u) {
+ assert(u);
+
+ return state_translation_table[SLICE(u)->state];
+}
+
+_pure_ static const char *slice_sub_state_to_string(Unit *u) {
+ assert(u);
+
+ return slice_state_to_string(SLICE(u)->state);
+}
+
+static int slice_make_perpetual(Manager *m, const char *name, Unit **ret) {
+ Unit *u;
+ int r;
+
+ assert(m);
+ assert(name);
+
+ u = manager_get_unit(m, name);
+ if (!u) {
+ r = unit_new_for_name(m, sizeof(Slice), name, &u);
+ if (r < 0)
+ return log_error_errno(r, "Failed to allocate the special %s unit: %m", name);
+ }
+
+ u->perpetual = true;
+ SLICE(u)->deserialized_state = SLICE_ACTIVE;
+
+ unit_add_to_load_queue(u);
+ unit_add_to_dbus_queue(u);
+
+ if (ret)
+ *ret = u;
+
+ return 0;
+}
+
+static void slice_enumerate_perpetual(Manager *m) {
+ Unit *u;
+ int r;
+
+ assert(m);
+
+ r = slice_make_perpetual(m, SPECIAL_ROOT_SLICE, &u);
+ if (r >= 0 && manager_owns_host_root_cgroup(m)) {
+ Slice *s = SLICE(u);
+
+ /* If we are managing the root cgroup then this means our root slice covers the whole system, which
+ * means the kernel will track CPU/tasks/memory for us anyway, and it is all available in /proc. Let's
+ * hence turn accounting on here, so that our APIs to query this data are available. */
+
+ s->cgroup_context.cpu_accounting = true;
+ s->cgroup_context.tasks_accounting = true;
+ s->cgroup_context.memory_accounting = true;
+ }
+
+ if (MANAGER_IS_SYSTEM(m))
+ (void) slice_make_perpetual(m, SPECIAL_SYSTEM_SLICE, NULL);
+}
+
+static bool slice_freezer_action_supported_by_children(Unit *s) {
+ Unit *member;
+ int r;
+
+ assert(s);
+
+ UNIT_FOREACH_DEPENDENCY(member, s, UNIT_ATOM_SLICE_OF) {
+
+ if (member->type == UNIT_SLICE) {
+ r = slice_freezer_action_supported_by_children(member);
+ if (!r)
+ return r;
+ }
+
+ if (!UNIT_VTABLE(member)->freeze)
+ return false;
+ }
+
+ return true;
+}
+
+static int slice_freezer_action(Unit *s, FreezerAction action) {
+ Unit *member;
+ int r;
+
+ assert(s);
+ assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW));
+
+ if (action == FREEZER_FREEZE && !slice_freezer_action_supported_by_children(s)) {
+ log_unit_warning(s, "Requested freezer operation is not supported by all children of the slice");
+ return 0;
+ }
+
+ UNIT_FOREACH_DEPENDENCY(member, s, UNIT_ATOM_SLICE_OF) {
+ if (!member->cgroup_realized)
+ continue;
+
+ if (action == FREEZER_FREEZE)
+ r = UNIT_VTABLE(member)->freeze(member);
+ else if (UNIT_VTABLE(member)->thaw)
+ r = UNIT_VTABLE(member)->thaw(member);
+ else
+ /* Thawing is requested but no corresponding method is available, ignore. */
+ r = 0;
+ if (r < 0)
+ return r;
+ }
+
+ return unit_cgroup_freezer_action(s, action);
+}
+
+static int slice_freeze(Unit *s) {
+ assert(s);
+
+ return slice_freezer_action(s, FREEZER_FREEZE);
+}
+
+static int slice_thaw(Unit *s) {
+ assert(s);
+
+ return slice_freezer_action(s, FREEZER_THAW);
+}
+
+static bool slice_can_freeze(Unit *s) {
+ assert(s);
+
+ return slice_freezer_action_supported_by_children(s);
+}
+
+const UnitVTable slice_vtable = {
+ .object_size = sizeof(Slice),
+ .cgroup_context_offset = offsetof(Slice, cgroup_context),
+
+ .sections =
+ "Unit\0"
+ "Slice\0"
+ "Install\0",
+ .private_section = "Slice",
+
+ .can_transient = true,
+ .can_set_managed_oom = true,
+
+ .init = slice_init,
+ .load = slice_load,
+
+ .coldplug = slice_coldplug,
+
+ .dump = slice_dump,
+
+ .start = slice_start,
+ .stop = slice_stop,
+
+ .kill = slice_kill,
+
+ .freeze = slice_freeze,
+ .thaw = slice_thaw,
+ .can_freeze = slice_can_freeze,
+
+ .serialize = slice_serialize,
+ .deserialize_item = slice_deserialize_item,
+
+ .active_state = slice_active_state,
+ .sub_state_to_string = slice_sub_state_to_string,
+
+ .bus_set_property = bus_slice_set_property,
+ .bus_commit_properties = bus_slice_commit_properties,
+
+ .enumerate_perpetual = slice_enumerate_perpetual,
+
+ .status_message_formats = {
+ .finished_start_job = {
+ [JOB_DONE] = "Created slice %s.",
+ },
+ .finished_stop_job = {
+ [JOB_DONE] = "Removed slice %s.",
+ },
+ },
+};
diff --git a/src/core/slice.h b/src/core/slice.h
new file mode 100644
index 0000000..e2f9274
--- /dev/null
+++ b/src/core/slice.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "unit.h"
+
+typedef struct Slice Slice;
+
+struct Slice {
+ Unit meta;
+
+ SliceState state, deserialized_state;
+
+ CGroupContext cgroup_context;
+};
+
+extern const UnitVTable slice_vtable;
+
+DEFINE_CAST(SLICE, Slice);
diff --git a/src/core/smack-setup.c b/src/core/smack-setup.c
new file mode 100644
index 0000000..b76bb74
--- /dev/null
+++ b/src/core/smack-setup.c
@@ -0,0 +1,390 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+/***
+ Copyright © 2013 Intel Corporation
+ Authors:
+ Nathaniel Chen <nathaniel.chen@intel.com>
+***/
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "dirent-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "log.h"
+#include "macro.h"
+#include "smack-setup.h"
+#include "string-util.h"
+#include "util.h"
+
+#if ENABLE_SMACK
+
+static int fdopen_unlocked_at(int dfd, const char *dir, const char *name, int *status, FILE **ret_file) {
+ int fd, r;
+ FILE *f;
+
+ fd = openat(dfd, name, O_RDONLY|O_CLOEXEC);
+ if (fd < 0) {
+ if (*status == 0)
+ *status = -errno;
+
+ return log_warning_errno(errno, "Failed to open \"%s/%s\": %m", dir, name);
+ }
+
+ r = fdopen_unlocked(fd, "r", &f);
+ if (r < 0) {
+ if (*status == 0)
+ *status = r;
+
+ safe_close(fd);
+ return log_error_errno(r, "Failed to open \"%s/%s\": %m", dir, name);
+ }
+
+ *ret_file = f;
+ return 0;
+}
+
+static int write_access2_rules(const char *srcdir) {
+ _cleanup_close_ int load2_fd = -1, change_fd = -1;
+ _cleanup_closedir_ DIR *dir = NULL;
+ int dfd = -1, r = 0;
+
+ load2_fd = open("/sys/fs/smackfs/load2", O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
+ if (load2_fd < 0) {
+ if (errno != ENOENT)
+ log_warning_errno(errno, "Failed to open '/sys/fs/smackfs/load2': %m");
+ return -errno; /* negative error */
+ }
+
+ change_fd = open("/sys/fs/smackfs/change-rule", O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
+ if (change_fd < 0) {
+ if (errno != ENOENT)
+ log_warning_errno(errno, "Failed to open '/sys/fs/smackfs/change-rule': %m");
+ return -errno; /* negative error */
+ }
+
+ /* write rules to load2 or change-rule from every file in the directory */
+ dir = opendir(srcdir);
+ if (!dir) {
+ if (errno != ENOENT)
+ log_warning_errno(errno, "Failed to opendir '%s': %m", srcdir);
+ return errno; /* positive on purpose */
+ }
+
+ dfd = dirfd(dir);
+ assert(dfd >= 0);
+
+ FOREACH_DIRENT(entry, dir, return 0) {
+ _cleanup_fclose_ FILE *policy = NULL;
+
+ if (!dirent_is_file(entry))
+ continue;
+
+ if (fdopen_unlocked_at(dfd, srcdir, entry->d_name, &r, &policy) < 0)
+ continue;
+
+ /* load2 write rules in the kernel require a line buffered stream */
+ for (;;) {
+ _cleanup_free_ char *buf = NULL, *sbj = NULL, *obj = NULL, *acc1 = NULL, *acc2 = NULL;
+ int q;
+
+ q = read_line(policy, NAME_MAX, &buf);
+ if (q < 0)
+ return log_error_errno(q, "Failed to read line from '%s': %m", entry->d_name);
+ if (q == 0)
+ break;
+
+ if (isempty(buf) || strchr(COMMENTS, buf[0]))
+ continue;
+
+ /* if 3 args -> load rule : subject object access1 */
+ /* if 4 args -> change rule : subject object access1 access2 */
+ if (sscanf(buf, "%ms %ms %ms %ms", &sbj, &obj, &acc1, &acc2) < 3) {
+ log_error_errno(errno, "Failed to parse rule '%s' in '%s', ignoring.", buf, entry->d_name);
+ continue;
+ }
+
+ if (write(isempty(acc2) ? load2_fd : change_fd, buf, strlen(buf)) < 0) {
+ if (r == 0)
+ r = -errno;
+ log_error_errno(errno, "Failed to write '%s' to '%s' in '%s': %m",
+ buf, isempty(acc2) ? "/sys/fs/smackfs/load2" : "/sys/fs/smackfs/change-rule", entry->d_name);
+ }
+ }
+ }
+
+ return r;
+}
+
+static int write_cipso2_rules(const char *srcdir) {
+ _cleanup_close_ int cipso2_fd = -1;
+ _cleanup_closedir_ DIR *dir = NULL;
+ int dfd = -1, r = 0;
+
+ cipso2_fd = open("/sys/fs/smackfs/cipso2", O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
+ if (cipso2_fd < 0) {
+ if (errno != ENOENT)
+ log_warning_errno(errno, "Failed to open '/sys/fs/smackfs/cipso2': %m");
+ return -errno; /* negative error */
+ }
+
+ /* write rules to cipso2 from every file in the directory */
+ dir = opendir(srcdir);
+ if (!dir) {
+ if (errno != ENOENT)
+ log_warning_errno(errno, "Failed to opendir '%s': %m", srcdir);
+ return errno; /* positive on purpose */
+ }
+
+ dfd = dirfd(dir);
+ assert(dfd >= 0);
+
+ FOREACH_DIRENT(entry, dir, return 0) {
+ _cleanup_fclose_ FILE *policy = NULL;
+
+ if (!dirent_is_file(entry))
+ continue;
+
+ if (fdopen_unlocked_at(dfd, srcdir, entry->d_name, &r, &policy) < 0)
+ continue;
+
+ /* cipso2 write rules in the kernel require a line buffered stream */
+ for (;;) {
+ _cleanup_free_ char *buf = NULL;
+ int q;
+
+ q = read_line(policy, NAME_MAX, &buf);
+ if (q < 0)
+ return log_error_errno(q, "Failed to read line from '%s': %m", entry->d_name);
+ if (q == 0)
+ break;
+
+ if (isempty(buf) || strchr(COMMENTS, buf[0]))
+ continue;
+
+ if (write(cipso2_fd, buf, strlen(buf)) < 0) {
+ if (r == 0)
+ r = -errno;
+ log_error_errno(errno, "Failed to write '%s' to '/sys/fs/smackfs/cipso2' in '%s': %m",
+ buf, entry->d_name);
+ break;
+ }
+ }
+ }
+
+ return r;
+}
+
+static int write_netlabel_rules(const char *srcdir) {
+ _cleanup_fclose_ FILE *dst = NULL;
+ _cleanup_closedir_ DIR *dir = NULL;
+ int dfd = -1, r = 0;
+
+ dst = fopen("/sys/fs/smackfs/netlabel", "we");
+ if (!dst) {
+ if (errno != ENOENT)
+ log_warning_errno(errno, "Failed to open /sys/fs/smackfs/netlabel: %m");
+ return -errno; /* negative error */
+ }
+
+ /* write rules to dst from every file in the directory */
+ dir = opendir(srcdir);
+ if (!dir) {
+ if (errno != ENOENT)
+ log_warning_errno(errno, "Failed to opendir %s: %m", srcdir);
+ return errno; /* positive on purpose */
+ }
+
+ dfd = dirfd(dir);
+ assert(dfd >= 0);
+
+ FOREACH_DIRENT(entry, dir, return 0) {
+ _cleanup_fclose_ FILE *policy = NULL;
+
+ if (fdopen_unlocked_at(dfd, srcdir, entry->d_name, &r, &policy) < 0)
+ continue;
+
+ /* load2 write rules in the kernel require a line buffered stream */
+ for (;;) {
+ _cleanup_free_ char *buf = NULL;
+ int q;
+
+ q = read_line(policy, NAME_MAX, &buf);
+ if (q < 0)
+ return log_error_errno(q, "Failed to read line from %s: %m", entry->d_name);
+ if (q == 0)
+ break;
+
+ if (!fputs(buf, dst)) {
+ if (r == 0)
+ r = -EINVAL;
+ log_error_errno(errno, "Failed to write line to /sys/fs/smackfs/netlabel: %m");
+ break;
+ }
+ q = fflush_and_check(dst);
+ if (q < 0) {
+ if (r == 0)
+ r = q;
+ log_error_errno(q, "Failed to flush writes to /sys/fs/smackfs/netlabel: %m");
+ break;
+ }
+ }
+ }
+
+ return r;
+}
+
+static int write_onlycap_list(void) {
+ _cleanup_close_ int onlycap_fd = -1;
+ _cleanup_free_ char *list = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+ size_t len = 0;
+ int r;
+
+ f = fopen("/etc/smack/onlycap", "re");
+ if (!f) {
+ if (errno != ENOENT)
+ log_warning_errno(errno, "Failed to read '/etc/smack/onlycap': %m");
+
+ return errno == ENOENT ? ENOENT : -errno;
+ }
+
+ for (;;) {
+ _cleanup_free_ char *buf = NULL;
+ size_t l;
+
+ r = read_line(f, LONG_LINE_MAX, &buf);
+ if (r < 0)
+ return log_error_errno(r, "Failed to read line from /etc/smack/onlycap: %m");
+ if (r == 0)
+ break;
+
+ if (isempty(buf) || strchr(COMMENTS, *buf))
+ continue;
+
+ l = strlen(buf);
+ if (!GREEDY_REALLOC(list, len + l + 1))
+ return log_oom();
+
+ stpcpy(list + len, buf)[0] = ' ';
+ len += l + 1;
+ }
+
+ if (len == 0)
+ return 0;
+
+ list[len - 1] = 0;
+
+ onlycap_fd = open("/sys/fs/smackfs/onlycap", O_WRONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
+ if (onlycap_fd < 0) {
+ if (errno != ENOENT)
+ log_warning_errno(errno, "Failed to open '/sys/fs/smackfs/onlycap': %m");
+ return -errno; /* negative error */
+ }
+
+ r = write(onlycap_fd, list, len);
+ if (r < 0)
+ return log_error_errno(errno, "Failed to write onlycap list(%s) to '/sys/fs/smackfs/onlycap': %m", list);
+
+ return 0;
+}
+
+#endif
+
+int mac_smack_setup(bool *loaded_policy) {
+
+#if ENABLE_SMACK
+
+ int r;
+
+ assert(loaded_policy);
+
+ r = write_access2_rules("/etc/smack/accesses.d/");
+ switch (r) {
+ case -ENOENT:
+ log_debug("Smack is not enabled in the kernel.");
+ return 0;
+ case ENOENT:
+ log_debug("Smack access rules directory '/etc/smack/accesses.d/' not found");
+ return 0;
+ case 0:
+ log_info("Successfully loaded Smack policies.");
+ break;
+ default:
+ log_warning_errno(r, "Failed to load Smack access rules, ignoring: %m");
+ return 0;
+ }
+
+#if HAVE_SMACK_RUN_LABEL
+ r = write_string_file("/proc/self/attr/current", SMACK_RUN_LABEL, WRITE_STRING_FILE_DISABLE_BUFFER);
+ if (r < 0)
+ log_warning_errno(r, "Failed to set SMACK label \"" SMACK_RUN_LABEL "\" on self: %m");
+ r = write_string_file("/sys/fs/smackfs/ambient", SMACK_RUN_LABEL, WRITE_STRING_FILE_DISABLE_BUFFER);
+ if (r < 0)
+ log_warning_errno(r, "Failed to set SMACK ambient label \"" SMACK_RUN_LABEL "\": %m");
+ r = write_string_file("/sys/fs/smackfs/netlabel",
+ "0.0.0.0/0 " SMACK_RUN_LABEL, WRITE_STRING_FILE_DISABLE_BUFFER);
+ if (r < 0)
+ log_warning_errno(r, "Failed to set SMACK netlabel rule \"0.0.0.0/0 " SMACK_RUN_LABEL "\": %m");
+ r = write_string_file("/sys/fs/smackfs/netlabel", "127.0.0.1 -CIPSO", WRITE_STRING_FILE_DISABLE_BUFFER);
+ if (r < 0)
+ log_warning_errno(r, "Failed to set SMACK netlabel rule \"127.0.0.1 -CIPSO\": %m");
+#endif
+
+ r = write_cipso2_rules("/etc/smack/cipso.d/");
+ switch (r) {
+ case -ENOENT:
+ log_debug("Smack/CIPSO is not enabled in the kernel.");
+ return 0;
+ case ENOENT:
+ log_debug("Smack/CIPSO access rules directory '/etc/smack/cipso.d/' not found");
+ break;
+ case 0:
+ log_info("Successfully loaded Smack/CIPSO policies.");
+ break;
+ default:
+ log_warning_errno(r, "Failed to load Smack/CIPSO access rules, ignoring: %m");
+ break;
+ }
+
+ r = write_netlabel_rules("/etc/smack/netlabel.d/");
+ switch (r) {
+ case -ENOENT:
+ log_debug("Smack/CIPSO is not enabled in the kernel.");
+ return 0;
+ case ENOENT:
+ log_debug("Smack network host rules directory '/etc/smack/netlabel.d/' not found");
+ break;
+ case 0:
+ log_info("Successfully loaded Smack network host rules.");
+ break;
+ default:
+ log_warning_errno(r, "Failed to load Smack network host rules: %m, ignoring.");
+ break;
+ }
+
+ r = write_onlycap_list();
+ switch (r) {
+ case -ENOENT:
+ log_debug("Smack is not enabled in the kernel.");
+ break;
+ case ENOENT:
+ log_debug("Smack onlycap list file '/etc/smack/onlycap' not found");
+ break;
+ case 0:
+ log_info("Successfully wrote Smack onlycap list.");
+ break;
+ default:
+ return log_emergency_errno(r, "Failed to write Smack onlycap list: %m");
+ }
+
+ *loaded_policy = true;
+
+#endif
+
+ return 0;
+}
diff --git a/src/core/smack-setup.h b/src/core/smack-setup.h
new file mode 100644
index 0000000..d29370d
--- /dev/null
+++ b/src/core/smack-setup.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+/***
+ Copyright © 2013 Intel Corporation
+ Authors:
+ Nathaniel Chen <nathaniel.chen@intel.com>
+***/
+
+int mac_smack_setup(bool *loaded_policy);
diff --git a/src/core/socket.c b/src/core/socket.c
new file mode 100644
index 0000000..e86e9c8
--- /dev/null
+++ b/src/core/socket.c
@@ -0,0 +1,3544 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <mqueue.h>
+#include <netinet/tcp.h>
+#include <sys/epoll.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <linux/sctp.h>
+
+#include "alloc-util.h"
+#include "bpf-firewall.h"
+#include "bus-error.h"
+#include "bus-util.h"
+#include "chase-symlinks.h"
+#include "copy.h"
+#include "dbus-socket.h"
+#include "dbus-unit.h"
+#include "def.h"
+#include "errno-list.h"
+#include "exit-status.h"
+#include "fd-util.h"
+#include "format-util.h"
+#include "in-addr-util.h"
+#include "io-util.h"
+#include "ip-protocol-list.h"
+#include "label.h"
+#include "log.h"
+#include "mkdir-label.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "selinux-util.h"
+#include "serialize.h"
+#include "service.h"
+#include "signal-util.h"
+#include "smack-util.h"
+#include "socket.h"
+#include "socket-netlink.h"
+#include "special.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit-name.h"
+#include "unit.h"
+#include "user-util.h"
+
+struct SocketPeer {
+ unsigned n_ref;
+
+ Socket *socket;
+ union sockaddr_union peer;
+ socklen_t peer_salen;
+};
+
+static const UnitActiveState state_translation_table[_SOCKET_STATE_MAX] = {
+ [SOCKET_DEAD] = UNIT_INACTIVE,
+ [SOCKET_START_PRE] = UNIT_ACTIVATING,
+ [SOCKET_START_CHOWN] = UNIT_ACTIVATING,
+ [SOCKET_START_POST] = UNIT_ACTIVATING,
+ [SOCKET_LISTENING] = UNIT_ACTIVE,
+ [SOCKET_RUNNING] = UNIT_ACTIVE,
+ [SOCKET_STOP_PRE] = UNIT_DEACTIVATING,
+ [SOCKET_STOP_PRE_SIGTERM] = UNIT_DEACTIVATING,
+ [SOCKET_STOP_PRE_SIGKILL] = UNIT_DEACTIVATING,
+ [SOCKET_STOP_POST] = UNIT_DEACTIVATING,
+ [SOCKET_FINAL_SIGTERM] = UNIT_DEACTIVATING,
+ [SOCKET_FINAL_SIGKILL] = UNIT_DEACTIVATING,
+ [SOCKET_FAILED] = UNIT_FAILED,
+ [SOCKET_CLEANING] = UNIT_MAINTENANCE,
+};
+
+static int socket_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static int socket_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata);
+static void flush_ports(Socket *s);
+
+static void socket_init(Unit *u) {
+ Socket *s = SOCKET(u);
+
+ assert(u);
+ assert(u->load_state == UNIT_STUB);
+
+ s->backlog = SOMAXCONN_DELUXE;
+ s->timeout_usec = u->manager->default_timeout_start_usec;
+ s->directory_mode = 0755;
+ s->socket_mode = 0666;
+
+ s->max_connections = 64;
+
+ s->priority = -1;
+ s->ip_tos = -1;
+ s->ip_ttl = -1;
+ s->mark = -1;
+
+ s->exec_context.std_output = u->manager->default_std_output;
+ s->exec_context.std_error = u->manager->default_std_error;
+
+ s->control_command_id = _SOCKET_EXEC_COMMAND_INVALID;
+
+ s->trigger_limit.interval = USEC_INFINITY;
+ s->trigger_limit.burst = UINT_MAX;
+}
+
+static void socket_unwatch_control_pid(Socket *s) {
+ assert(s);
+
+ if (s->control_pid <= 0)
+ return;
+
+ unit_unwatch_pid(UNIT(s), TAKE_PID(s->control_pid));
+}
+
+static void socket_cleanup_fd_list(SocketPort *p) {
+ assert(p);
+
+ close_many(p->auxiliary_fds, p->n_auxiliary_fds);
+ p->auxiliary_fds = mfree(p->auxiliary_fds);
+ p->n_auxiliary_fds = 0;
+}
+
+SocketPort *socket_port_free(SocketPort *p) {
+ if (!p)
+ return NULL;
+
+ sd_event_source_unref(p->event_source);
+
+ socket_cleanup_fd_list(p);
+ safe_close(p->fd);
+ free(p->path);
+
+ return mfree(p);
+}
+
+void socket_free_ports(Socket *s) {
+ SocketPort *p;
+
+ assert(s);
+
+ while ((p = s->ports)) {
+ LIST_REMOVE(port, s->ports, p);
+ socket_port_free(p);
+ }
+}
+
+static void socket_done(Unit *u) {
+ Socket *s = SOCKET(u);
+ SocketPeer *p;
+
+ assert(s);
+
+ socket_free_ports(s);
+
+ while ((p = set_steal_first(s->peers_by_address)))
+ p->socket = NULL;
+
+ s->peers_by_address = set_free(s->peers_by_address);
+
+ s->exec_runtime = exec_runtime_unref(s->exec_runtime, false);
+ exec_command_free_array(s->exec_command, _SOCKET_EXEC_COMMAND_MAX);
+ s->control_command = NULL;
+
+ dynamic_creds_unref(&s->dynamic_creds);
+
+ socket_unwatch_control_pid(s);
+
+ unit_ref_unset(&s->service);
+
+ s->tcp_congestion = mfree(s->tcp_congestion);
+ s->bind_to_device = mfree(s->bind_to_device);
+
+ s->smack = mfree(s->smack);
+ s->smack_ip_in = mfree(s->smack_ip_in);
+ s->smack_ip_out = mfree(s->smack_ip_out);
+
+ strv_free(s->symlinks);
+
+ s->user = mfree(s->user);
+ s->group = mfree(s->group);
+
+ s->fdname = mfree(s->fdname);
+
+ s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+}
+
+static int socket_arm_timer(Socket *s, usec_t usec) {
+ int r;
+
+ assert(s);
+
+ if (s->timer_event_source) {
+ r = sd_event_source_set_time(s->timer_event_source, usec);
+ if (r < 0)
+ return r;
+
+ return sd_event_source_set_enabled(s->timer_event_source, SD_EVENT_ONESHOT);
+ }
+
+ if (usec == USEC_INFINITY)
+ return 0;
+
+ r = sd_event_add_time(
+ UNIT(s)->manager->event,
+ &s->timer_event_source,
+ CLOCK_MONOTONIC,
+ usec, 0,
+ socket_dispatch_timer, s);
+ if (r < 0)
+ return r;
+
+ (void) sd_event_source_set_description(s->timer_event_source, "socket-timer");
+
+ return 0;
+}
+
+static bool have_non_accept_socket(Socket *s) {
+ assert(s);
+
+ if (!s->accept)
+ return true;
+
+ LIST_FOREACH(port, p, s->ports) {
+
+ if (p->type != SOCKET_SOCKET)
+ return true;
+
+ if (!socket_address_can_accept(&p->address))
+ return true;
+ }
+
+ return false;
+}
+
+static int socket_add_mount_dependencies(Socket *s) {
+ int r;
+
+ assert(s);
+
+ LIST_FOREACH(port, p, s->ports) {
+ const char *path = NULL;
+
+ if (p->type == SOCKET_SOCKET)
+ path = socket_address_get_path(&p->address);
+ else if (IN_SET(p->type, SOCKET_FIFO, SOCKET_SPECIAL, SOCKET_USB_FUNCTION))
+ path = p->path;
+
+ if (!path)
+ continue;
+
+ r = unit_require_mounts_for(UNIT(s), path, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+static int socket_add_device_dependencies(Socket *s) {
+ char *t;
+
+ assert(s);
+
+ if (!s->bind_to_device || streq(s->bind_to_device, "lo"))
+ return 0;
+
+ t = strjoina("/sys/subsystem/net/devices/", s->bind_to_device);
+ return unit_add_node_dependency(UNIT(s), t, UNIT_BINDS_TO, UNIT_DEPENDENCY_FILE);
+}
+
+static int socket_add_default_dependencies(Socket *s) {
+ int r;
+ assert(s);
+
+ if (!UNIT(s)->default_dependencies)
+ return 0;
+
+ r = unit_add_dependency_by_name(UNIT(s), UNIT_BEFORE, SPECIAL_SOCKETS_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+ if (r < 0)
+ return r;
+
+ if (MANAGER_IS_SYSTEM(UNIT(s)->manager)) {
+ r = unit_add_two_dependencies_by_name(UNIT(s), UNIT_AFTER, UNIT_REQUIRES, SPECIAL_SYSINIT_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+ if (r < 0)
+ return r;
+ }
+
+ return unit_add_two_dependencies_by_name(UNIT(s), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+}
+
+_pure_ static bool socket_has_exec(Socket *s) {
+ unsigned i;
+ assert(s);
+
+ for (i = 0; i < _SOCKET_EXEC_COMMAND_MAX; i++)
+ if (s->exec_command[i])
+ return true;
+
+ return false;
+}
+
+static int socket_add_extras(Socket *s) {
+ Unit *u = UNIT(s);
+ int r;
+
+ assert(s);
+
+ /* Pick defaults for the trigger limit, if nothing was explicitly configured. We pick a relatively high limit
+ * in Accept=yes mode, and a lower limit for Accept=no. Reason: in Accept=yes mode we are invoking accept()
+ * ourselves before the trigger limit can hit, thus incoming connections are taken off the socket queue quickly
+ * and reliably. This is different for Accept=no, where the spawned service has to take the incoming traffic
+ * off the queues, which it might not necessarily do. Moreover, while Accept=no services are supposed to
+ * process whatever is queued in one go, and thus should normally never have to be started frequently. This is
+ * different for Accept=yes where each connection is processed by a new service instance, and thus frequent
+ * service starts are typical. */
+
+ if (s->trigger_limit.interval == USEC_INFINITY)
+ s->trigger_limit.interval = 2 * USEC_PER_SEC;
+
+ if (s->trigger_limit.burst == UINT_MAX) {
+ if (s->accept)
+ s->trigger_limit.burst = 200;
+ else
+ s->trigger_limit.burst = 20;
+ }
+
+ if (have_non_accept_socket(s)) {
+
+ if (!UNIT_DEREF(s->service)) {
+ Unit *x;
+
+ r = unit_load_related_unit(u, ".service", &x);
+ if (r < 0)
+ return r;
+
+ unit_ref_set(&s->service, u, x);
+ }
+
+ r = unit_add_two_dependencies(u, UNIT_BEFORE, UNIT_TRIGGERS, UNIT_DEREF(s->service), true, UNIT_DEPENDENCY_IMPLICIT);
+ if (r < 0)
+ return r;
+ }
+
+ r = socket_add_mount_dependencies(s);
+ if (r < 0)
+ return r;
+
+ r = socket_add_device_dependencies(s);
+ if (r < 0)
+ return r;
+
+ r = unit_patch_contexts(u);
+ if (r < 0)
+ return r;
+
+ if (socket_has_exec(s)) {
+ r = unit_add_exec_dependencies(u, &s->exec_context);
+ if (r < 0)
+ return r;
+ }
+
+ r = unit_set_default_slice(u);
+ if (r < 0)
+ return r;
+
+ r = socket_add_default_dependencies(s);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+static const char *socket_find_symlink_target(Socket *s) {
+ const char *found = NULL;
+
+ LIST_FOREACH(port, p, s->ports) {
+ const char *f = NULL;
+
+ switch (p->type) {
+
+ case SOCKET_FIFO:
+ f = p->path;
+ break;
+
+ case SOCKET_SOCKET:
+ f = socket_address_get_path(&p->address);
+ break;
+
+ default:
+ break;
+ }
+
+ if (f) {
+ if (found)
+ return NULL;
+
+ found = f;
+ }
+ }
+
+ return found;
+}
+
+static int socket_verify(Socket *s) {
+ assert(s);
+ assert(UNIT(s)->load_state == UNIT_LOADED);
+
+ if (!s->ports)
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Unit has no Listen setting (ListenStream=, ListenDatagram=, ListenFIFO=, ...). Refusing.");
+
+ if (s->accept && have_non_accept_socket(s))
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Unit configured for accepting sockets, but sockets are non-accepting. Refusing.");
+
+ if (s->accept && s->max_connections <= 0)
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "MaxConnection= setting too small. Refusing.");
+
+ if (s->accept && UNIT_DEREF(s->service))
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Explicit service configuration for accepting socket units not supported. Refusing.");
+
+ if (s->exec_context.pam_name && s->kill_context.kill_mode != KILL_CONTROL_GROUP)
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Unit has PAM enabled. Kill mode must be set to 'control-group'. Refusing.");
+
+ if (!strv_isempty(s->symlinks) && !socket_find_symlink_target(s))
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Unit has symlinks set but none or more than one node in the file system. Refusing.");
+
+ return 0;
+}
+
+static void peer_address_hash_func(const SocketPeer *s, struct siphash *state) {
+ assert(s);
+
+ if (s->peer.sa.sa_family == AF_INET)
+ siphash24_compress(&s->peer.in.sin_addr, sizeof(s->peer.in.sin_addr), state);
+ else if (s->peer.sa.sa_family == AF_INET6)
+ siphash24_compress(&s->peer.in6.sin6_addr, sizeof(s->peer.in6.sin6_addr), state);
+ else if (s->peer.sa.sa_family == AF_VSOCK)
+ siphash24_compress(&s->peer.vm.svm_cid, sizeof(s->peer.vm.svm_cid), state);
+ else
+ assert_not_reached();
+}
+
+static int peer_address_compare_func(const SocketPeer *x, const SocketPeer *y) {
+ int r;
+
+ r = CMP(x->peer.sa.sa_family, y->peer.sa.sa_family);
+ if (r != 0)
+ return r;
+
+ switch (x->peer.sa.sa_family) {
+ case AF_INET:
+ return memcmp(&x->peer.in.sin_addr, &y->peer.in.sin_addr, sizeof(x->peer.in.sin_addr));
+ case AF_INET6:
+ return memcmp(&x->peer.in6.sin6_addr, &y->peer.in6.sin6_addr, sizeof(x->peer.in6.sin6_addr));
+ case AF_VSOCK:
+ return CMP(x->peer.vm.svm_cid, y->peer.vm.svm_cid);
+ }
+ assert_not_reached();
+}
+
+DEFINE_PRIVATE_HASH_OPS(peer_address_hash_ops, SocketPeer, peer_address_hash_func, peer_address_compare_func);
+
+static int socket_load(Unit *u) {
+ Socket *s = SOCKET(u);
+ int r;
+
+ assert(u);
+ assert(u->load_state == UNIT_STUB);
+
+ r = unit_load_fragment_and_dropin(u, true);
+ if (r < 0)
+ return r;
+
+ if (u->load_state != UNIT_LOADED)
+ return 0;
+
+ /* This is a new unit? Then let's add in some extras */
+ r = socket_add_extras(s);
+ if (r < 0)
+ return r;
+
+ return socket_verify(s);
+}
+
+static SocketPeer *socket_peer_new(void) {
+ SocketPeer *p;
+
+ p = new(SocketPeer, 1);
+ if (!p)
+ return NULL;
+
+ *p = (SocketPeer) {
+ .n_ref = 1,
+ };
+ return p;
+}
+
+static SocketPeer *socket_peer_free(SocketPeer *p) {
+ assert(p);
+
+ if (p->socket)
+ set_remove(p->socket->peers_by_address, p);
+
+ return mfree(p);
+}
+
+DEFINE_TRIVIAL_REF_UNREF_FUNC(SocketPeer, socket_peer, socket_peer_free);
+
+int socket_acquire_peer(Socket *s, int fd, SocketPeer **p) {
+ _cleanup_(socket_peer_unrefp) SocketPeer *remote = NULL;
+ SocketPeer sa = {
+ .peer_salen = sizeof(union sockaddr_union),
+ }, *i;
+ int r;
+
+ assert(fd >= 0);
+ assert(s);
+
+ if (getpeername(fd, &sa.peer.sa, &sa.peer_salen) < 0)
+ return log_unit_error_errno(UNIT(s), errno, "getpeername failed: %m");
+
+ if (!IN_SET(sa.peer.sa.sa_family, AF_INET, AF_INET6, AF_VSOCK)) {
+ *p = NULL;
+ return 0;
+ }
+
+ r = set_ensure_allocated(&s->peers_by_address, &peer_address_hash_ops);
+ if (r < 0)
+ return r;
+
+ i = set_get(s->peers_by_address, &sa);
+ if (i) {
+ *p = socket_peer_ref(i);
+ return 1;
+ }
+
+ remote = socket_peer_new();
+ if (!remote)
+ return log_oom();
+
+ remote->peer = sa.peer;
+ remote->peer_salen = sa.peer_salen;
+
+ r = set_put(s->peers_by_address, remote);
+ if (r < 0)
+ return r;
+
+ remote->socket = s;
+
+ *p = TAKE_PTR(remote);
+ return 1;
+}
+
+_const_ static const char* listen_lookup(int family, int type) {
+
+ if (family == AF_NETLINK)
+ return "ListenNetlink";
+
+ if (type == SOCK_STREAM)
+ return "ListenStream";
+ else if (type == SOCK_DGRAM)
+ return "ListenDatagram";
+ else if (type == SOCK_SEQPACKET)
+ return "ListenSequentialPacket";
+
+ assert_not_reached();
+ return NULL;
+}
+
+static void socket_dump(Unit *u, FILE *f, const char *prefix) {
+ Socket *s = SOCKET(u);
+ const char *prefix2, *str;
+
+ assert(s);
+ assert(f);
+
+ prefix = strempty(prefix);
+ prefix2 = strjoina(prefix, "\t");
+
+ fprintf(f,
+ "%sSocket State: %s\n"
+ "%sResult: %s\n"
+ "%sClean Result: %s\n"
+ "%sBindIPv6Only: %s\n"
+ "%sBacklog: %u\n"
+ "%sSocketMode: %04o\n"
+ "%sDirectoryMode: %04o\n"
+ "%sKeepAlive: %s\n"
+ "%sNoDelay: %s\n"
+ "%sFreeBind: %s\n"
+ "%sTransparent: %s\n"
+ "%sBroadcast: %s\n"
+ "%sPassCredentials: %s\n"
+ "%sPassSecurity: %s\n"
+ "%sPassPacketInfo: %s\n"
+ "%sTCPCongestion: %s\n"
+ "%sRemoveOnStop: %s\n"
+ "%sWritable: %s\n"
+ "%sFileDescriptorName: %s\n"
+ "%sSELinuxContextFromNet: %s\n",
+ prefix, socket_state_to_string(s->state),
+ prefix, socket_result_to_string(s->result),
+ prefix, socket_result_to_string(s->clean_result),
+ prefix, socket_address_bind_ipv6_only_to_string(s->bind_ipv6_only),
+ prefix, s->backlog,
+ prefix, s->socket_mode,
+ prefix, s->directory_mode,
+ prefix, yes_no(s->keep_alive),
+ prefix, yes_no(s->no_delay),
+ prefix, yes_no(s->free_bind),
+ prefix, yes_no(s->transparent),
+ prefix, yes_no(s->broadcast),
+ prefix, yes_no(s->pass_cred),
+ prefix, yes_no(s->pass_sec),
+ prefix, yes_no(s->pass_pktinfo),
+ prefix, strna(s->tcp_congestion),
+ prefix, yes_no(s->remove_on_stop),
+ prefix, yes_no(s->writable),
+ prefix, socket_fdname(s),
+ prefix, yes_no(s->selinux_context_from_net));
+
+ if (s->timestamping != SOCKET_TIMESTAMPING_OFF)
+ fprintf(f,
+ "%sTimestamping: %s\n",
+ prefix, socket_timestamping_to_string(s->timestamping));
+
+ if (s->control_pid > 0)
+ fprintf(f,
+ "%sControl PID: "PID_FMT"\n",
+ prefix, s->control_pid);
+
+ if (s->bind_to_device)
+ fprintf(f,
+ "%sBindToDevice: %s\n",
+ prefix, s->bind_to_device);
+
+ if (s->accept)
+ fprintf(f,
+ "%sAccepted: %u\n"
+ "%sNConnections: %u\n"
+ "%sMaxConnections: %u\n"
+ "%sMaxConnectionsPerSource: %u\n",
+ prefix, s->n_accepted,
+ prefix, s->n_connections,
+ prefix, s->max_connections,
+ prefix, s->max_connections_per_source);
+ else
+ fprintf(f,
+ "%sFlushPending: %s\n",
+ prefix, yes_no(s->flush_pending));
+
+
+ if (s->priority >= 0)
+ fprintf(f,
+ "%sPriority: %i\n",
+ prefix, s->priority);
+
+ if (s->receive_buffer > 0)
+ fprintf(f,
+ "%sReceiveBuffer: %zu\n",
+ prefix, s->receive_buffer);
+
+ if (s->send_buffer > 0)
+ fprintf(f,
+ "%sSendBuffer: %zu\n",
+ prefix, s->send_buffer);
+
+ if (s->ip_tos >= 0)
+ fprintf(f,
+ "%sIPTOS: %i\n",
+ prefix, s->ip_tos);
+
+ if (s->ip_ttl >= 0)
+ fprintf(f,
+ "%sIPTTL: %i\n",
+ prefix, s->ip_ttl);
+
+ if (s->pipe_size > 0)
+ fprintf(f,
+ "%sPipeSize: %zu\n",
+ prefix, s->pipe_size);
+
+ if (s->mark >= 0)
+ fprintf(f,
+ "%sMark: %i\n",
+ prefix, s->mark);
+
+ if (s->mq_maxmsg > 0)
+ fprintf(f,
+ "%sMessageQueueMaxMessages: %li\n",
+ prefix, s->mq_maxmsg);
+
+ if (s->mq_msgsize > 0)
+ fprintf(f,
+ "%sMessageQueueMessageSize: %li\n",
+ prefix, s->mq_msgsize);
+
+ if (s->reuse_port)
+ fprintf(f,
+ "%sReusePort: %s\n",
+ prefix, yes_no(s->reuse_port));
+
+ if (s->smack)
+ fprintf(f,
+ "%sSmackLabel: %s\n",
+ prefix, s->smack);
+
+ if (s->smack_ip_in)
+ fprintf(f,
+ "%sSmackLabelIPIn: %s\n",
+ prefix, s->smack_ip_in);
+
+ if (s->smack_ip_out)
+ fprintf(f,
+ "%sSmackLabelIPOut: %s\n",
+ prefix, s->smack_ip_out);
+
+ if (!isempty(s->user) || !isempty(s->group))
+ fprintf(f,
+ "%sSocketUser: %s\n"
+ "%sSocketGroup: %s\n",
+ prefix, strna(s->user),
+ prefix, strna(s->group));
+
+ if (s->keep_alive_time > 0)
+ fprintf(f,
+ "%sKeepAliveTimeSec: %s\n",
+ prefix, FORMAT_TIMESPAN(s->keep_alive_time, USEC_PER_SEC));
+
+ if (s->keep_alive_interval > 0)
+ fprintf(f,
+ "%sKeepAliveIntervalSec: %s\n",
+ prefix, FORMAT_TIMESPAN(s->keep_alive_interval, USEC_PER_SEC));
+
+ if (s->keep_alive_cnt > 0)
+ fprintf(f,
+ "%sKeepAliveProbes: %u\n",
+ prefix, s->keep_alive_cnt);
+
+ if (s->defer_accept > 0)
+ fprintf(f,
+ "%sDeferAcceptSec: %s\n",
+ prefix, FORMAT_TIMESPAN(s->defer_accept, USEC_PER_SEC));
+
+ LIST_FOREACH(port, p, s->ports) {
+
+ switch (p->type) {
+ case SOCKET_SOCKET: {
+ _cleanup_free_ char *k = NULL;
+ int r;
+
+ r = socket_address_print(&p->address, &k);
+ if (r < 0) {
+ errno = -r;
+ fprintf(f, "%s%s: %m\n", prefix, listen_lookup(socket_address_family(&p->address), p->address.type));
+ } else
+ fprintf(f, "%s%s: %s\n", prefix, listen_lookup(socket_address_family(&p->address), p->address.type), k);
+ break;
+ }
+ case SOCKET_SPECIAL:
+ fprintf(f, "%sListenSpecial: %s\n", prefix, p->path);
+ break;
+ case SOCKET_USB_FUNCTION:
+ fprintf(f, "%sListenUSBFunction: %s\n", prefix, p->path);
+ break;
+ case SOCKET_MQUEUE:
+ fprintf(f, "%sListenMessageQueue: %s\n", prefix, p->path);
+ break;
+ default:
+ fprintf(f, "%sListenFIFO: %s\n", prefix, p->path);
+ }
+ }
+
+ fprintf(f,
+ "%sTriggerLimitIntervalSec: %s\n"
+ "%sTriggerLimitBurst: %u\n",
+ prefix, FORMAT_TIMESPAN(s->trigger_limit.interval, USEC_PER_SEC),
+ prefix, s->trigger_limit.burst);
+
+ str = ip_protocol_to_name(s->socket_protocol);
+ if (str)
+ fprintf(f, "%sSocketProtocol: %s\n", prefix, str);
+
+ if (!strv_isempty(s->symlinks)) {
+ fprintf(f, "%sSymlinks:", prefix);
+ STRV_FOREACH(q, s->symlinks)
+ fprintf(f, " %s", *q);
+
+ fprintf(f, "\n");
+ }
+
+ fprintf(f,
+ "%sTimeoutSec: %s\n",
+ prefix, FORMAT_TIMESPAN(s->timeout_usec, USEC_PER_SEC));
+
+ exec_context_dump(&s->exec_context, f, prefix);
+ kill_context_dump(&s->kill_context, f, prefix);
+
+ for (SocketExecCommand c = 0; c < _SOCKET_EXEC_COMMAND_MAX; c++) {
+ if (!s->exec_command[c])
+ continue;
+
+ fprintf(f, "%s-> %s:\n",
+ prefix, socket_exec_command_to_string(c));
+
+ exec_command_dump_list(s->exec_command[c], f, prefix2);
+ }
+
+ cgroup_context_dump(UNIT(s), f, prefix);
+}
+
+static int instance_from_socket(int fd, unsigned nr, char **instance) {
+ socklen_t l;
+ char *r;
+ union sockaddr_union local, remote;
+
+ assert(fd >= 0);
+ assert(instance);
+
+ l = sizeof(local);
+ if (getsockname(fd, &local.sa, &l) < 0)
+ return -errno;
+
+ l = sizeof(remote);
+ if (getpeername(fd, &remote.sa, &l) < 0)
+ return -errno;
+
+ switch (local.sa.sa_family) {
+
+ case AF_INET: {
+ uint32_t
+ a = be32toh(local.in.sin_addr.s_addr),
+ b = be32toh(remote.in.sin_addr.s_addr);
+
+ if (asprintf(&r,
+ "%u-%u.%u.%u.%u:%u-%u.%u.%u.%u:%u",
+ nr,
+ a >> 24, (a >> 16) & 0xFF, (a >> 8) & 0xFF, a & 0xFF,
+ be16toh(local.in.sin_port),
+ b >> 24, (b >> 16) & 0xFF, (b >> 8) & 0xFF, b & 0xFF,
+ be16toh(remote.in.sin_port)) < 0)
+ return -ENOMEM;
+
+ break;
+ }
+
+ case AF_INET6: {
+ static const unsigned char ipv4_prefix[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF
+ };
+
+ if (memcmp(&local.in6.sin6_addr, ipv4_prefix, sizeof(ipv4_prefix)) == 0 &&
+ memcmp(&remote.in6.sin6_addr, ipv4_prefix, sizeof(ipv4_prefix)) == 0) {
+ const uint8_t
+ *a = local.in6.sin6_addr.s6_addr+12,
+ *b = remote.in6.sin6_addr.s6_addr+12;
+
+ if (asprintf(&r,
+ "%u-%u.%u.%u.%u:%u-%u.%u.%u.%u:%u",
+ nr,
+ a[0], a[1], a[2], a[3],
+ be16toh(local.in6.sin6_port),
+ b[0], b[1], b[2], b[3],
+ be16toh(remote.in6.sin6_port)) < 0)
+ return -ENOMEM;
+ } else {
+ if (asprintf(&r,
+ "%u-%s:%u-%s:%u",
+ nr,
+ IN6_ADDR_TO_STRING(&local.in6.sin6_addr),
+ be16toh(local.in6.sin6_port),
+ IN6_ADDR_TO_STRING(&remote.in6.sin6_addr),
+ be16toh(remote.in6.sin6_port)) < 0)
+ return -ENOMEM;
+ }
+
+ break;
+ }
+
+ case AF_UNIX: {
+ struct ucred ucred;
+ int k;
+
+ k = getpeercred(fd, &ucred);
+ if (k >= 0) {
+ if (asprintf(&r,
+ "%u-"PID_FMT"-"UID_FMT,
+ nr, ucred.pid, ucred.uid) < 0)
+ return -ENOMEM;
+ } else if (k == -ENODATA) {
+ /* This handles the case where somebody is
+ * connecting from another pid/uid namespace
+ * (e.g. from outside of our container). */
+ if (asprintf(&r,
+ "%u-unknown",
+ nr) < 0)
+ return -ENOMEM;
+ } else
+ return k;
+
+ break;
+ }
+
+ case AF_VSOCK:
+ if (asprintf(&r,
+ "%u-%u:%u-%u:%u",
+ nr,
+ local.vm.svm_cid, local.vm.svm_port,
+ remote.vm.svm_cid, remote.vm.svm_port) < 0)
+ return -ENOMEM;
+
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ *instance = r;
+ return 0;
+}
+
+static void socket_close_fds(Socket *s) {
+ assert(s);
+
+ LIST_FOREACH(port, p, s->ports) {
+ bool was_open;
+
+ was_open = p->fd >= 0;
+
+ p->event_source = sd_event_source_disable_unref(p->event_source);
+ p->fd = safe_close(p->fd);
+ socket_cleanup_fd_list(p);
+
+ /* One little note: we should normally not delete any sockets in the file system here! After all some
+ * other process we spawned might still have a reference of this fd and wants to continue to use
+ * it. Therefore we normally delete sockets in the file system before we create a new one, not after we
+ * stopped using one! That all said, if the user explicitly requested this, we'll delete them here
+ * anyway, but only then. */
+
+ if (!was_open || !s->remove_on_stop)
+ continue;
+
+ switch (p->type) {
+
+ case SOCKET_FIFO:
+ (void) unlink(p->path);
+ break;
+
+ case SOCKET_MQUEUE:
+ (void) mq_unlink(p->path);
+ break;
+
+ case SOCKET_SOCKET:
+ (void) socket_address_unlink(&p->address);
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ if (s->remove_on_stop)
+ STRV_FOREACH(i, s->symlinks)
+ (void) unlink(*i);
+
+ /* Note that we don't return NULL here, since s has not been freed. */
+}
+
+static void socket_apply_socket_options(Socket *s, SocketPort *p, int fd) {
+ int r;
+
+ assert(s);
+ assert(p);
+ assert(fd >= 0);
+
+ if (s->keep_alive) {
+ r = setsockopt_int(fd, SOL_SOCKET, SO_KEEPALIVE, true);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "SO_KEEPALIVE failed: %m");
+ }
+
+ if (s->keep_alive_time > 0) {
+ r = setsockopt_int(fd, SOL_TCP, TCP_KEEPIDLE, s->keep_alive_time / USEC_PER_SEC);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "TCP_KEEPIDLE failed: %m");
+ }
+
+ if (s->keep_alive_interval > 0) {
+ r = setsockopt_int(fd, SOL_TCP, TCP_KEEPINTVL, s->keep_alive_interval / USEC_PER_SEC);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "TCP_KEEPINTVL failed: %m");
+ }
+
+ if (s->keep_alive_cnt > 0) {
+ r = setsockopt_int(fd, SOL_TCP, TCP_KEEPCNT, s->keep_alive_cnt);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "TCP_KEEPCNT failed: %m");
+ }
+
+ if (s->defer_accept > 0) {
+ r = setsockopt_int(fd, SOL_TCP, TCP_DEFER_ACCEPT, s->defer_accept / USEC_PER_SEC);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "TCP_DEFER_ACCEPT failed: %m");
+ }
+
+ if (s->no_delay) {
+ if (s->socket_protocol == IPPROTO_SCTP) {
+ r = setsockopt_int(fd, SOL_SCTP, SCTP_NODELAY, true);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "SCTP_NODELAY failed: %m");
+ } else {
+ r = setsockopt_int(fd, SOL_TCP, TCP_NODELAY, true);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "TCP_NODELAY failed: %m");
+ }
+ }
+
+ if (s->broadcast) {
+ r = setsockopt_int(fd, SOL_SOCKET, SO_BROADCAST, true);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "SO_BROADCAST failed: %m");
+ }
+
+ if (s->pass_cred) {
+ r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "SO_PASSCRED failed: %m");
+ }
+
+ if (s->pass_sec) {
+ r = setsockopt_int(fd, SOL_SOCKET, SO_PASSSEC, true);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "SO_PASSSEC failed: %m");
+ }
+
+ if (s->pass_pktinfo) {
+ r = socket_set_recvpktinfo(fd, socket_address_family(&p->address), true);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "Failed to enable packet info socket option: %m");
+ }
+
+ if (s->timestamping != SOCKET_TIMESTAMPING_OFF) {
+ r = setsockopt_int(fd, SOL_SOCKET,
+ s->timestamping == SOCKET_TIMESTAMPING_NS ? SO_TIMESTAMPNS : SO_TIMESTAMP,
+ true);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "Failed to enable timestamping socket option, ignoring: %m");
+ }
+
+ if (s->priority >= 0) {
+ r = setsockopt_int(fd, SOL_SOCKET, SO_PRIORITY, s->priority);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "SO_PRIORITY failed: %m");
+ }
+
+ if (s->receive_buffer > 0) {
+ r = fd_set_rcvbuf(fd, s->receive_buffer, false);
+ if (r < 0)
+ log_unit_full_errno(UNIT(s), ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
+ "SO_RCVBUF/SO_RCVBUFFORCE failed: %m");
+ }
+
+ if (s->send_buffer > 0) {
+ r = fd_set_sndbuf(fd, s->send_buffer, false);
+ if (r < 0)
+ log_unit_full_errno(UNIT(s), ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
+ "SO_SNDBUF/SO_SNDBUFFORCE failed: %m");
+ }
+
+ if (s->mark >= 0) {
+ r = setsockopt_int(fd, SOL_SOCKET, SO_MARK, s->mark);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "SO_MARK failed: %m");
+ }
+
+ if (s->ip_tos >= 0) {
+ r = setsockopt_int(fd, IPPROTO_IP, IP_TOS, s->ip_tos);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "IP_TOS failed: %m");
+ }
+
+ if (s->ip_ttl >= 0) {
+ r = socket_set_ttl(fd, socket_address_family(&p->address), s->ip_ttl);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "IP_TTL/IPV6_UNICAST_HOPS failed: %m");
+ }
+
+ if (s->tcp_congestion)
+ if (setsockopt(fd, SOL_TCP, TCP_CONGESTION, s->tcp_congestion, strlen(s->tcp_congestion)+1) < 0)
+ log_unit_warning_errno(UNIT(s), errno, "TCP_CONGESTION failed: %m");
+
+ if (s->smack_ip_in) {
+ r = mac_smack_apply_fd(fd, SMACK_ATTR_IPIN, s->smack_ip_in);
+ if (r < 0)
+ log_unit_error_errno(UNIT(s), r, "mac_smack_apply_ip_in_fd: %m");
+ }
+
+ if (s->smack_ip_out) {
+ r = mac_smack_apply_fd(fd, SMACK_ATTR_IPOUT, s->smack_ip_out);
+ if (r < 0)
+ log_unit_error_errno(UNIT(s), r, "mac_smack_apply_ip_out_fd: %m");
+ }
+}
+
+static void socket_apply_fifo_options(Socket *s, int fd) {
+ int r;
+
+ assert(s);
+ assert(fd >= 0);
+
+ if (s->pipe_size > 0)
+ if (fcntl(fd, F_SETPIPE_SZ, s->pipe_size) < 0)
+ log_unit_warning_errno(UNIT(s), errno, "Setting pipe size failed, ignoring: %m");
+
+ if (s->smack) {
+ r = mac_smack_apply_fd(fd, SMACK_ATTR_ACCESS, s->smack);
+ if (r < 0)
+ log_unit_error_errno(UNIT(s), r, "SMACK relabelling failed, ignoring: %m");
+ }
+}
+
+static int fifo_address_create(
+ const char *path,
+ mode_t directory_mode,
+ mode_t socket_mode) {
+
+ _cleanup_close_ int fd = -1;
+ mode_t old_mask;
+ struct stat st;
+ int r;
+
+ assert(path);
+
+ (void) mkdir_parents_label(path, directory_mode);
+
+ r = mac_selinux_create_file_prepare(path, S_IFIFO);
+ if (r < 0)
+ return r;
+
+ /* Enforce the right access mode for the fifo */
+ old_mask = umask(~socket_mode);
+
+ /* Include the original umask in our mask */
+ (void) umask(~socket_mode | old_mask);
+
+ r = mkfifo(path, socket_mode);
+ (void) umask(old_mask);
+
+ if (r < 0 && errno != EEXIST) {
+ r = -errno;
+ goto fail;
+ }
+
+ fd = open(path, O_RDWR | O_CLOEXEC | O_NOCTTY | O_NONBLOCK | O_NOFOLLOW);
+ if (fd < 0) {
+ r = -errno;
+ goto fail;
+ }
+
+ mac_selinux_create_file_clear();
+
+ if (fstat(fd, &st) < 0) {
+ r = -errno;
+ goto fail;
+ }
+
+ if (!S_ISFIFO(st.st_mode) ||
+ (st.st_mode & 0777) != (socket_mode & ~old_mask) ||
+ st.st_uid != getuid() ||
+ st.st_gid != getgid()) {
+ r = -EEXIST;
+ goto fail;
+ }
+
+ return TAKE_FD(fd);
+
+fail:
+ mac_selinux_create_file_clear();
+ return r;
+}
+
+static int special_address_create(const char *path, bool writable) {
+ _cleanup_close_ int fd = -1;
+ struct stat st;
+
+ assert(path);
+
+ fd = open(path, (writable ? O_RDWR : O_RDONLY)|O_CLOEXEC|O_NOCTTY|O_NONBLOCK|O_NOFOLLOW);
+ if (fd < 0)
+ return -errno;
+
+ if (fstat(fd, &st) < 0)
+ return -errno;
+
+ /* Check whether this is a /proc, /sys or /dev file or char device */
+ if (!S_ISREG(st.st_mode) && !S_ISCHR(st.st_mode))
+ return -EEXIST;
+
+ return TAKE_FD(fd);
+}
+
+static int usbffs_address_create(const char *path) {
+ _cleanup_close_ int fd = -1;
+ struct stat st;
+
+ assert(path);
+
+ fd = open(path, O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK|O_NOFOLLOW);
+ if (fd < 0)
+ return -errno;
+
+ if (fstat(fd, &st) < 0)
+ return -errno;
+
+ /* Check whether this is a regular file (ffs endpoint) */
+ if (!S_ISREG(st.st_mode))
+ return -EEXIST;
+
+ return TAKE_FD(fd);
+}
+
+static int mq_address_create(
+ const char *path,
+ mode_t mq_mode,
+ long maxmsg,
+ long msgsize) {
+
+ _cleanup_close_ int fd = -1;
+ struct stat st;
+ mode_t old_mask;
+ struct mq_attr _attr, *attr = NULL;
+
+ assert(path);
+
+ if (maxmsg > 0 && msgsize > 0) {
+ _attr = (struct mq_attr) {
+ .mq_flags = O_NONBLOCK,
+ .mq_maxmsg = maxmsg,
+ .mq_msgsize = msgsize,
+ };
+ attr = &_attr;
+ }
+
+ /* Enforce the right access mode for the mq */
+ old_mask = umask(~mq_mode);
+
+ /* Include the original umask in our mask */
+ (void) umask(~mq_mode | old_mask);
+ fd = mq_open(path, O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_CREAT, mq_mode, attr);
+ (void) umask(old_mask);
+
+ if (fd < 0)
+ return -errno;
+
+ if (fstat(fd, &st) < 0)
+ return -errno;
+
+ if ((st.st_mode & 0777) != (mq_mode & ~old_mask) ||
+ st.st_uid != getuid() ||
+ st.st_gid != getgid())
+ return -EEXIST;
+
+ return TAKE_FD(fd);
+}
+
+static int socket_symlink(Socket *s) {
+ const char *p;
+ int r;
+
+ assert(s);
+
+ p = socket_find_symlink_target(s);
+ if (!p)
+ return 0;
+
+ STRV_FOREACH(i, s->symlinks) {
+ (void) mkdir_parents_label(*i, s->directory_mode);
+
+ r = symlink_idempotent(p, *i, false);
+
+ if (r == -EEXIST && s->remove_on_stop) {
+ /* If there's already something where we want to create the symlink, and the destructive
+ * RemoveOnStop= mode is set, then we might as well try to remove what already exists and try
+ * again. */
+
+ if (unlink(*i) >= 0)
+ r = symlink_idempotent(p, *i, false);
+ }
+
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "Failed to create symlink %s %s %s, ignoring: %m",
+ p, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), *i);
+ }
+
+ return 0;
+}
+
+static int usbffs_write_descs(int fd, Service *s) {
+ int r;
+
+ if (!s->usb_function_descriptors || !s->usb_function_strings)
+ return -EINVAL;
+
+ r = copy_file_fd(s->usb_function_descriptors, fd, 0);
+ if (r < 0)
+ return r;
+
+ return copy_file_fd(s->usb_function_strings, fd, 0);
+}
+
+static int usbffs_select_ep(const struct dirent *d) {
+ return d->d_name[0] != '.' && !streq(d->d_name, "ep0");
+}
+
+static int usbffs_dispatch_eps(SocketPort *p) {
+ _cleanup_free_ struct dirent **ent = NULL;
+ size_t n, k;
+ int r;
+
+ r = scandir(p->path, &ent, usbffs_select_ep, alphasort);
+ if (r < 0)
+ return -errno;
+
+ n = (size_t) r;
+ p->auxiliary_fds = new(int, n);
+ if (!p->auxiliary_fds) {
+ r = -ENOMEM;
+ goto clear;
+ }
+
+ p->n_auxiliary_fds = n;
+
+ k = 0;
+ for (size_t i = 0; i < n; ++i) {
+ _cleanup_free_ char *ep = NULL;
+
+ ep = path_make_absolute(ent[i]->d_name, p->path);
+ if (!ep) {
+ r = -ENOMEM;
+ goto fail;
+ }
+
+ path_simplify(ep);
+
+ r = usbffs_address_create(ep);
+ if (r < 0)
+ goto fail;
+
+ p->auxiliary_fds[k++] = r;
+ }
+
+ r = 0;
+ goto clear;
+
+fail:
+ close_many(p->auxiliary_fds, k);
+ p->auxiliary_fds = mfree(p->auxiliary_fds);
+ p->n_auxiliary_fds = 0;
+
+clear:
+ for (size_t i = 0; i < n; ++i)
+ free(ent[i]);
+
+ return r;
+}
+
+int socket_load_service_unit(Socket *s, int cfd, Unit **ret) {
+ /* Figure out what the unit that will be used to handle the connections on the socket looks like.
+ *
+ * If cfd < 0, then we don't have a connection yet. In case of Accept=yes sockets, use a fake
+ * instance name.
+ */
+
+ if (UNIT_ISSET(s->service)) {
+ *ret = UNIT_DEREF(s->service);
+ return 0;
+ }
+
+ if (!s->accept)
+ return -ENODATA;
+
+ /* Build the instance name and load the unit */
+ _cleanup_free_ char *prefix = NULL, *instance = NULL, *name = NULL;
+ int r;
+
+ r = unit_name_to_prefix(UNIT(s)->id, &prefix);
+ if (r < 0)
+ return r;
+
+ if (cfd >= 0) {
+ r = instance_from_socket(cfd, s->n_accepted, &instance);
+ if (ERRNO_IS_DISCONNECT(r))
+ /* ENOTCONN is legitimate if TCP RST was received. Other socket families might return
+ * different errors. This connection is over, but the socket unit lives on. */
+ return log_unit_debug_errno(UNIT(s), r,
+ "Got %s on incoming socket, assuming aborted connection attempt, ignoring.",
+ errno_to_name(r));
+ if (r < 0)
+ return r;
+ }
+
+ /* For accepting sockets, we don't know how the instance will be called until we get a connection and
+ * can figure out what the peer name is. So let's use "internal" as the instance to make it clear
+ * that this is not an actual peer name. We use "unknown" when we cannot figure out the peer. */
+ r = unit_name_build(prefix, instance ?: "internal", ".service", &name);
+ if (r < 0)
+ return r;
+
+ return manager_load_unit(UNIT(s)->manager, name, NULL, NULL, ret);
+}
+
+static int socket_determine_selinux_label(Socket *s, char **ret) {
+ int r;
+
+ assert(s);
+ assert(ret);
+
+ Unit *service;
+ ExecCommand *c;
+ const char *exec_context;
+ _cleanup_free_ char *path = NULL;
+
+ r = socket_load_service_unit(s, -1, &service);
+ if (r == -ENODATA)
+ goto no_label;
+ if (r < 0)
+ return r;
+
+ exec_context = SERVICE(service)->exec_context.selinux_context;
+ if (exec_context) {
+ char *con;
+
+ con = strdup(exec_context);
+ if (!con)
+ return -ENOMEM;
+
+ *ret = TAKE_PTR(con);
+ return 0;
+ }
+
+ c = SERVICE(service)->exec_command[SERVICE_EXEC_START];
+ if (!c)
+ goto no_label;
+
+ r = chase_symlinks(c->path, SERVICE(service)->exec_context.root_directory, CHASE_PREFIX_ROOT, &path, NULL);
+ if (r < 0)
+ goto no_label;
+
+ r = mac_selinux_get_create_label_from_exe(path, ret);
+ if (IN_SET(r, -EPERM, -EOPNOTSUPP))
+ goto no_label;
+ return r;
+
+no_label:
+ *ret = NULL;
+ return 0;
+}
+
+static int socket_address_listen_do(
+ Socket *s,
+ const SocketAddress *address,
+ const char *label) {
+
+ assert(s);
+ assert(address);
+
+ return socket_address_listen(
+ address,
+ SOCK_CLOEXEC|SOCK_NONBLOCK,
+ s->backlog,
+ s->bind_ipv6_only,
+ s->bind_to_device,
+ s->reuse_port,
+ s->free_bind,
+ s->transparent,
+ s->directory_mode,
+ s->socket_mode,
+ label);
+}
+
+#define log_address_error_errno(u, address, error, fmt) \
+ ({ \
+ _cleanup_free_ char *_t = NULL; \
+ \
+ (void) socket_address_print(address, &_t); \
+ log_unit_error_errno(u, error, fmt, strna(_t)); \
+ })
+
+static int fork_needed(const SocketAddress *address, const ExecContext *context) {
+ int r;
+
+ assert(address);
+ assert(context);
+
+ /* Check if we need to do the cgroup or netns stuff. If not we can do things much simpler. */
+
+ if (IN_SET(address->sockaddr.sa.sa_family, AF_INET, AF_INET6)) {
+ r = bpf_firewall_supported();
+ if (r < 0)
+ return r;
+ if (r != BPF_FIREWALL_UNSUPPORTED) /* If BPF firewalling isn't supported anyway — there's no point in this forking complexity */
+ return true;
+ }
+
+ return context->private_network || context->network_namespace_path;
+}
+
+static int socket_address_listen_in_cgroup(
+ Socket *s,
+ const SocketAddress *address,
+ const char *label) {
+
+ _cleanup_close_pair_ int pair[2] = { -1, -1 };
+ int fd, r;
+ pid_t pid;
+
+ assert(s);
+ assert(address);
+
+ /* This is a wrapper around socket_address_listen(), that forks off a helper process inside the
+ * socket's cgroup and network namespace in which the socket is actually created. This way we ensure
+ * the socket is actually properly attached to the unit's cgroup for the purpose of BPF filtering and
+ * such. */
+
+ r = fork_needed(address, &s->exec_context);
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ /* Shortcut things... */
+ fd = socket_address_listen_do(s, address, label);
+ if (fd < 0)
+ return log_address_error_errno(UNIT(s), address, fd, "Failed to create listening socket (%s): %m");
+
+ return fd;
+ }
+
+ r = unit_setup_exec_runtime(UNIT(s));
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r, "Failed acquire runtime: %m");
+
+ if (s->exec_context.network_namespace_path &&
+ s->exec_runtime &&
+ s->exec_runtime->netns_storage_socket[0] >= 0) {
+ r = open_shareable_ns_path(s->exec_runtime->netns_storage_socket, s->exec_context.network_namespace_path, CLONE_NEWNET);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r, "Failed to open network namespace path %s: %m", s->exec_context.network_namespace_path);
+ }
+
+ if (s->exec_context.ipc_namespace_path &&
+ s->exec_runtime &&
+ s->exec_runtime->ipcns_storage_socket[0] >= 0) {
+ r = open_shareable_ns_path(s->exec_runtime->ipcns_storage_socket, s->exec_context.ipc_namespace_path, CLONE_NEWIPC);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r, "Failed to open IPC namespace path %s: %m", s->exec_context.ipc_namespace_path);
+ }
+
+ if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pair) < 0)
+ return log_unit_error_errno(UNIT(s), errno, "Failed to create communication channel: %m");
+
+ r = unit_fork_helper_process(UNIT(s), "(sd-listen)", &pid);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r, "Failed to fork off listener stub process: %m");
+ if (r == 0) {
+ /* Child */
+
+ pair[0] = safe_close(pair[0]);
+
+ if ((s->exec_context.private_network || s->exec_context.network_namespace_path) &&
+ s->exec_runtime &&
+ s->exec_runtime->netns_storage_socket[0] >= 0) {
+
+ if (ns_type_supported(NAMESPACE_NET)) {
+ r = setup_shareable_ns(s->exec_runtime->netns_storage_socket, CLONE_NEWNET);
+ if (r < 0) {
+ log_unit_error_errno(UNIT(s), r, "Failed to join network namespace: %m");
+ _exit(EXIT_NETWORK);
+ }
+ } else if (s->exec_context.network_namespace_path) {
+ log_unit_error(UNIT(s), "Network namespace path configured but network namespaces not supported.");
+ _exit(EXIT_NETWORK);
+ } else
+ log_unit_warning(UNIT(s), "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
+ }
+
+ fd = socket_address_listen_do(s, address, label);
+ if (fd < 0) {
+ log_address_error_errno(UNIT(s), address, fd, "Failed to create listening socket (%s): %m");
+ _exit(EXIT_FAILURE);
+ }
+
+ r = send_one_fd(pair[1], fd, 0);
+ if (r < 0) {
+ log_address_error_errno(UNIT(s), address, r, "Failed to send listening socket (%s) to parent: %m");
+ _exit(EXIT_FAILURE);
+ }
+
+ _exit(EXIT_SUCCESS);
+ }
+
+ pair[1] = safe_close(pair[1]);
+ fd = receive_one_fd(pair[0], 0);
+
+ /* We synchronously wait for the helper, as it shouldn't be slow */
+ r = wait_for_terminate_and_check("(sd-listen)", pid, WAIT_LOG_ABNORMAL);
+ if (r < 0) {
+ safe_close(fd);
+ return r;
+ }
+
+ if (fd < 0)
+ return log_address_error_errno(UNIT(s), address, fd, "Failed to receive listening socket (%s): %m");
+
+ return fd;
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(Socket *, socket_close_fds, NULL);
+
+static int socket_open_fds(Socket *orig_s) {
+ _cleanup_(socket_close_fdsp) Socket *s = orig_s;
+ _cleanup_(mac_selinux_freep) char *label = NULL;
+ bool know_label = false;
+ int r;
+
+ assert(s);
+
+ LIST_FOREACH(port, p, s->ports) {
+
+ if (p->fd >= 0)
+ continue;
+
+ switch (p->type) {
+
+ case SOCKET_SOCKET:
+
+ if (!know_label) {
+ /* Figure out the label, if we don't it know yet. We do it once for the first
+ * socket where we need this and remember it for the rest. */
+
+ r = socket_determine_selinux_label(s, &label);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r, "Failed to determine SELinux label: %m");
+
+ know_label = true;
+ }
+
+ /* Apply the socket protocol */
+ switch (p->address.type) {
+
+ case SOCK_STREAM:
+ case SOCK_SEQPACKET:
+ if (s->socket_protocol == IPPROTO_SCTP)
+ p->address.protocol = s->socket_protocol;
+ break;
+
+ case SOCK_DGRAM:
+ if (s->socket_protocol == IPPROTO_UDPLITE)
+ p->address.protocol = s->socket_protocol;
+ break;
+ }
+
+ p->fd = socket_address_listen_in_cgroup(s, &p->address, label);
+ if (p->fd < 0)
+ return p->fd;
+
+ socket_apply_socket_options(s, p, p->fd);
+ socket_symlink(s);
+ break;
+
+ case SOCKET_SPECIAL:
+
+ p->fd = special_address_create(p->path, s->writable);
+ if (p->fd < 0)
+ return log_unit_error_errno(UNIT(s), p->fd, "Failed to open special file %s: %m", p->path);
+ break;
+
+ case SOCKET_FIFO:
+
+ p->fd = fifo_address_create(
+ p->path,
+ s->directory_mode,
+ s->socket_mode);
+ if (p->fd < 0)
+ return log_unit_error_errno(UNIT(s), p->fd, "Failed to open FIFO %s: %m", p->path);
+
+ socket_apply_fifo_options(s, p->fd);
+ socket_symlink(s);
+ break;
+
+ case SOCKET_MQUEUE:
+
+ p->fd = mq_address_create(
+ p->path,
+ s->socket_mode,
+ s->mq_maxmsg,
+ s->mq_msgsize);
+ if (p->fd < 0)
+ return log_unit_error_errno(UNIT(s), p->fd, "Failed to open message queue %s: %m", p->path);
+ break;
+
+ case SOCKET_USB_FUNCTION: {
+ _cleanup_free_ char *ep = NULL;
+
+ ep = path_make_absolute("ep0", p->path);
+ if (!ep)
+ return -ENOMEM;
+
+ p->fd = usbffs_address_create(ep);
+ if (p->fd < 0)
+ return p->fd;
+
+ r = usbffs_write_descs(p->fd, SERVICE(UNIT_DEREF(s->service)));
+ if (r < 0)
+ return r;
+
+ r = usbffs_dispatch_eps(p);
+ if (r < 0)
+ return r;
+
+ break;
+ }
+ default:
+ assert_not_reached();
+ }
+ }
+
+ s = NULL;
+ return 0;
+}
+
+static void socket_unwatch_fds(Socket *s) {
+ int r;
+
+ assert(s);
+
+ LIST_FOREACH(port, p, s->ports) {
+ if (p->fd < 0)
+ continue;
+
+ if (!p->event_source)
+ continue;
+
+ r = sd_event_source_set_enabled(p->event_source, SD_EVENT_OFF);
+ if (r < 0)
+ log_unit_debug_errno(UNIT(s), r, "Failed to disable event source: %m");
+ }
+}
+
+static int socket_watch_fds(Socket *s) {
+ int r;
+
+ assert(s);
+
+ LIST_FOREACH(port, p, s->ports) {
+ if (p->fd < 0)
+ continue;
+
+ if (p->event_source) {
+ r = sd_event_source_set_enabled(p->event_source, SD_EVENT_ON);
+ if (r < 0)
+ goto fail;
+ } else {
+ r = sd_event_add_io(UNIT(s)->manager->event, &p->event_source, p->fd, EPOLLIN, socket_dispatch_io, p);
+ if (r < 0)
+ goto fail;
+
+ (void) sd_event_source_set_description(p->event_source, "socket-port-io");
+ }
+ }
+
+ return 0;
+
+fail:
+ log_unit_warning_errno(UNIT(s), r, "Failed to watch listening fds: %m");
+ socket_unwatch_fds(s);
+ return r;
+}
+
+enum {
+ SOCKET_OPEN_NONE,
+ SOCKET_OPEN_SOME,
+ SOCKET_OPEN_ALL,
+};
+
+static int socket_check_open(Socket *s) {
+ bool have_open = false, have_closed = false;
+
+ assert(s);
+
+ LIST_FOREACH(port, p, s->ports) {
+ if (p->fd < 0)
+ have_closed = true;
+ else
+ have_open = true;
+
+ if (have_open && have_closed)
+ return SOCKET_OPEN_SOME;
+ }
+
+ if (have_open)
+ return SOCKET_OPEN_ALL;
+
+ return SOCKET_OPEN_NONE;
+}
+
+static void socket_set_state(Socket *s, SocketState state) {
+ SocketState old_state;
+ assert(s);
+
+ if (s->state != state)
+ bus_unit_send_pending_change_signal(UNIT(s), false);
+
+ old_state = s->state;
+ s->state = state;
+
+ if (!IN_SET(state,
+ SOCKET_START_PRE,
+ SOCKET_START_CHOWN,
+ SOCKET_START_POST,
+ SOCKET_STOP_PRE,
+ SOCKET_STOP_PRE_SIGTERM,
+ SOCKET_STOP_PRE_SIGKILL,
+ SOCKET_STOP_POST,
+ SOCKET_FINAL_SIGTERM,
+ SOCKET_FINAL_SIGKILL,
+ SOCKET_CLEANING)) {
+
+ s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+ socket_unwatch_control_pid(s);
+ s->control_command = NULL;
+ s->control_command_id = _SOCKET_EXEC_COMMAND_INVALID;
+ }
+
+ if (state != SOCKET_LISTENING)
+ socket_unwatch_fds(s);
+
+ if (!IN_SET(state,
+ SOCKET_START_CHOWN,
+ SOCKET_START_POST,
+ SOCKET_LISTENING,
+ SOCKET_RUNNING,
+ SOCKET_STOP_PRE,
+ SOCKET_STOP_PRE_SIGTERM,
+ SOCKET_STOP_PRE_SIGKILL,
+ SOCKET_CLEANING))
+ socket_close_fds(s);
+
+ if (state != old_state)
+ log_unit_debug(UNIT(s), "Changed %s -> %s", socket_state_to_string(old_state), socket_state_to_string(state));
+
+ unit_notify(UNIT(s), state_translation_table[old_state], state_translation_table[state], 0);
+}
+
+static int socket_coldplug(Unit *u) {
+ Socket *s = SOCKET(u);
+ int r;
+
+ assert(s);
+ assert(s->state == SOCKET_DEAD);
+
+ if (s->deserialized_state == s->state)
+ return 0;
+
+ if (s->control_pid > 0 &&
+ pid_is_unwaited(s->control_pid) &&
+ IN_SET(s->deserialized_state,
+ SOCKET_START_PRE,
+ SOCKET_START_CHOWN,
+ SOCKET_START_POST,
+ SOCKET_STOP_PRE,
+ SOCKET_STOP_PRE_SIGTERM,
+ SOCKET_STOP_PRE_SIGKILL,
+ SOCKET_STOP_POST,
+ SOCKET_FINAL_SIGTERM,
+ SOCKET_FINAL_SIGKILL,
+ SOCKET_CLEANING)) {
+
+ r = unit_watch_pid(UNIT(s), s->control_pid, false);
+ if (r < 0)
+ return r;
+
+ r = socket_arm_timer(s, usec_add(u->state_change_timestamp.monotonic, s->timeout_usec));
+ if (r < 0)
+ return r;
+ }
+
+ if (IN_SET(s->deserialized_state,
+ SOCKET_START_CHOWN,
+ SOCKET_START_POST,
+ SOCKET_LISTENING,
+ SOCKET_RUNNING)) {
+
+ /* Originally, we used to simply reopen all sockets here that we didn't have file descriptors
+ * for. However, this is problematic, as we won't traverse through the SOCKET_START_CHOWN state for
+ * them, and thus the UID/GID wouldn't be right. Hence, instead simply check if we have all fds open,
+ * and if there's a mismatch, warn loudly. */
+
+ r = socket_check_open(s);
+ if (r == SOCKET_OPEN_NONE)
+ log_unit_warning(UNIT(s),
+ "Socket unit configuration has changed while unit has been running, "
+ "no open socket file descriptor left. "
+ "The socket unit is not functional until restarted.");
+ else if (r == SOCKET_OPEN_SOME)
+ log_unit_warning(UNIT(s),
+ "Socket unit configuration has changed while unit has been running, "
+ "and some socket file descriptors have not been opened yet. "
+ "The socket unit is not fully functional until restarted.");
+ }
+
+ if (s->deserialized_state == SOCKET_LISTENING) {
+ r = socket_watch_fds(s);
+ if (r < 0)
+ return r;
+ }
+
+ if (!IN_SET(s->deserialized_state, SOCKET_DEAD, SOCKET_FAILED, SOCKET_CLEANING)) {
+ (void) unit_setup_dynamic_creds(u);
+ (void) unit_setup_exec_runtime(u);
+ }
+
+ socket_set_state(s, s->deserialized_state);
+ return 0;
+}
+
+static int socket_spawn(Socket *s, ExecCommand *c, pid_t *_pid) {
+
+ _cleanup_(exec_params_clear) ExecParameters exec_params = {
+ .flags = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN,
+ .stdin_fd = -1,
+ .stdout_fd = -1,
+ .stderr_fd = -1,
+ .exec_fd = -1,
+ };
+ pid_t pid;
+ int r;
+
+ assert(s);
+ assert(c);
+ assert(_pid);
+
+ r = unit_prepare_exec(UNIT(s));
+ if (r < 0)
+ return r;
+
+ r = socket_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->timeout_usec));
+ if (r < 0)
+ return r;
+
+ r = unit_set_exec_params(UNIT(s), &exec_params);
+ if (r < 0)
+ return r;
+
+ r = exec_spawn(UNIT(s),
+ c,
+ &s->exec_context,
+ &exec_params,
+ s->exec_runtime,
+ &s->dynamic_creds,
+ &pid);
+ if (r < 0)
+ return r;
+
+ r = unit_watch_pid(UNIT(s), pid, true);
+ if (r < 0)
+ return r;
+
+ *_pid = pid;
+
+ return 0;
+}
+
+static int socket_chown(Socket *s, pid_t *_pid) {
+ pid_t pid;
+ int r;
+
+ r = socket_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->timeout_usec));
+ if (r < 0)
+ goto fail;
+
+ /* We have to resolve the user names out-of-process, hence
+ * let's fork here. It's messy, but well, what can we do? */
+
+ r = unit_fork_helper_process(UNIT(s), "(sd-chown)", &pid);
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ uid_t uid = UID_INVALID;
+ gid_t gid = GID_INVALID;
+
+ /* Child */
+
+ if (!isempty(s->user)) {
+ const char *user = s->user;
+
+ r = get_user_creds(&user, &uid, &gid, NULL, NULL, 0);
+ if (r < 0) {
+ log_unit_error_errno(UNIT(s), r, "Failed to resolve user %s: %m", user);
+ _exit(EXIT_USER);
+ }
+ }
+
+ if (!isempty(s->group)) {
+ const char *group = s->group;
+
+ r = get_group_creds(&group, &gid, 0);
+ if (r < 0) {
+ log_unit_error_errno(UNIT(s), r, "Failed to resolve group %s: %m", group);
+ _exit(EXIT_GROUP);
+ }
+ }
+
+ LIST_FOREACH(port, p, s->ports) {
+ const char *path = NULL;
+
+ if (p->type == SOCKET_SOCKET)
+ path = socket_address_get_path(&p->address);
+ else if (p->type == SOCKET_FIFO)
+ path = p->path;
+
+ if (!path)
+ continue;
+
+ if (chown(path, uid, gid) < 0) {
+ log_unit_error_errno(UNIT(s), errno, "Failed to chown(): %m");
+ _exit(EXIT_CHOWN);
+ }
+ }
+
+ _exit(EXIT_SUCCESS);
+ }
+
+ r = unit_watch_pid(UNIT(s), pid, true);
+ if (r < 0)
+ goto fail;
+
+ *_pid = pid;
+ return 0;
+
+fail:
+ s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+ return r;
+}
+
+static void socket_enter_dead(Socket *s, SocketResult f) {
+ assert(s);
+
+ if (s->result == SOCKET_SUCCESS)
+ s->result = f;
+
+ if (s->result == SOCKET_SUCCESS)
+ unit_log_success(UNIT(s));
+ else
+ unit_log_failure(UNIT(s), socket_result_to_string(s->result));
+
+ unit_warn_leftover_processes(UNIT(s), unit_log_leftover_process_stop);
+
+ socket_set_state(s, s->result != SOCKET_SUCCESS ? SOCKET_FAILED : SOCKET_DEAD);
+
+ s->exec_runtime = exec_runtime_unref(s->exec_runtime, true);
+
+ unit_destroy_runtime_data(UNIT(s), &s->exec_context);
+
+ unit_unref_uid_gid(UNIT(s), true);
+
+ dynamic_creds_destroy(&s->dynamic_creds);
+}
+
+static void socket_enter_signal(Socket *s, SocketState state, SocketResult f);
+
+static void socket_enter_stop_post(Socket *s, SocketResult f) {
+ int r;
+ assert(s);
+
+ if (s->result == SOCKET_SUCCESS)
+ s->result = f;
+
+ socket_unwatch_control_pid(s);
+ s->control_command_id = SOCKET_EXEC_STOP_POST;
+ s->control_command = s->exec_command[SOCKET_EXEC_STOP_POST];
+
+ if (s->control_command) {
+ r = socket_spawn(s, s->control_command, &s->control_pid);
+ if (r < 0)
+ goto fail;
+
+ socket_set_state(s, SOCKET_STOP_POST);
+ } else
+ socket_enter_signal(s, SOCKET_FINAL_SIGTERM, SOCKET_SUCCESS);
+
+ return;
+
+fail:
+ log_unit_warning_errno(UNIT(s), r, "Failed to run 'stop-post' task: %m");
+ socket_enter_signal(s, SOCKET_FINAL_SIGTERM, SOCKET_FAILURE_RESOURCES);
+}
+
+static int state_to_kill_operation(Socket *s, SocketState state) {
+ if (state == SOCKET_STOP_PRE_SIGTERM && unit_has_job_type(UNIT(s), JOB_RESTART))
+ return KILL_RESTART;
+
+ if (state == SOCKET_FINAL_SIGTERM)
+ return KILL_TERMINATE;
+
+ return KILL_KILL;
+}
+
+static void socket_enter_signal(Socket *s, SocketState state, SocketResult f) {
+ int r;
+
+ assert(s);
+
+ if (s->result == SOCKET_SUCCESS)
+ s->result = f;
+
+ r = unit_kill_context(
+ UNIT(s),
+ &s->kill_context,
+ state_to_kill_operation(s, state),
+ -1,
+ s->control_pid,
+ false);
+ if (r < 0)
+ goto fail;
+
+ if (r > 0) {
+ r = socket_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->timeout_usec));
+ if (r < 0)
+ goto fail;
+
+ socket_set_state(s, state);
+ } else if (state == SOCKET_STOP_PRE_SIGTERM)
+ socket_enter_signal(s, SOCKET_STOP_PRE_SIGKILL, SOCKET_SUCCESS);
+ else if (state == SOCKET_STOP_PRE_SIGKILL)
+ socket_enter_stop_post(s, SOCKET_SUCCESS);
+ else if (state == SOCKET_FINAL_SIGTERM)
+ socket_enter_signal(s, SOCKET_FINAL_SIGKILL, SOCKET_SUCCESS);
+ else
+ socket_enter_dead(s, SOCKET_SUCCESS);
+
+ return;
+
+fail:
+ log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m");
+
+ if (IN_SET(state, SOCKET_STOP_PRE_SIGTERM, SOCKET_STOP_PRE_SIGKILL))
+ socket_enter_stop_post(s, SOCKET_FAILURE_RESOURCES);
+ else
+ socket_enter_dead(s, SOCKET_FAILURE_RESOURCES);
+}
+
+static void socket_enter_stop_pre(Socket *s, SocketResult f) {
+ int r;
+ assert(s);
+
+ if (s->result == SOCKET_SUCCESS)
+ s->result = f;
+
+ socket_unwatch_control_pid(s);
+ s->control_command_id = SOCKET_EXEC_STOP_PRE;
+ s->control_command = s->exec_command[SOCKET_EXEC_STOP_PRE];
+
+ if (s->control_command) {
+ r = socket_spawn(s, s->control_command, &s->control_pid);
+ if (r < 0)
+ goto fail;
+
+ socket_set_state(s, SOCKET_STOP_PRE);
+ } else
+ socket_enter_stop_post(s, SOCKET_SUCCESS);
+
+ return;
+
+fail:
+ log_unit_warning_errno(UNIT(s), r, "Failed to run 'stop-pre' task: %m");
+ socket_enter_stop_post(s, SOCKET_FAILURE_RESOURCES);
+}
+
+static void socket_enter_listening(Socket *s) {
+ int r;
+ assert(s);
+
+ if (!s->accept && s->flush_pending) {
+ log_unit_debug(UNIT(s), "Flushing socket before listening.");
+ flush_ports(s);
+ }
+
+ r = socket_watch_fds(s);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to watch sockets: %m");
+ goto fail;
+ }
+
+ socket_set_state(s, SOCKET_LISTENING);
+ return;
+
+fail:
+ socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES);
+}
+
+static void socket_enter_start_post(Socket *s) {
+ int r;
+ assert(s);
+
+ socket_unwatch_control_pid(s);
+ s->control_command_id = SOCKET_EXEC_START_POST;
+ s->control_command = s->exec_command[SOCKET_EXEC_START_POST];
+
+ if (s->control_command) {
+ r = socket_spawn(s, s->control_command, &s->control_pid);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to run 'start-post' task: %m");
+ goto fail;
+ }
+
+ socket_set_state(s, SOCKET_START_POST);
+ } else
+ socket_enter_listening(s);
+
+ return;
+
+fail:
+ socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES);
+}
+
+static void socket_enter_start_chown(Socket *s) {
+ int r;
+
+ assert(s);
+
+ r = socket_open_fds(s);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to listen on sockets: %m");
+ goto fail;
+ }
+
+ if (!isempty(s->user) || !isempty(s->group)) {
+
+ socket_unwatch_control_pid(s);
+ s->control_command_id = SOCKET_EXEC_START_CHOWN;
+ s->control_command = NULL;
+
+ r = socket_chown(s, &s->control_pid);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to fork 'start-chown' task: %m");
+ goto fail;
+ }
+
+ socket_set_state(s, SOCKET_START_CHOWN);
+ } else
+ socket_enter_start_post(s);
+
+ return;
+
+fail:
+ socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES);
+}
+
+static void socket_enter_start_pre(Socket *s) {
+ int r;
+ assert(s);
+
+ socket_unwatch_control_pid(s);
+
+ unit_warn_leftover_processes(UNIT(s), unit_log_leftover_process_start);
+
+ s->control_command_id = SOCKET_EXEC_START_PRE;
+ s->control_command = s->exec_command[SOCKET_EXEC_START_PRE];
+
+ if (s->control_command) {
+ r = socket_spawn(s, s->control_command, &s->control_pid);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to run 'start-pre' task: %m");
+ goto fail;
+ }
+
+ socket_set_state(s, SOCKET_START_PRE);
+ } else
+ socket_enter_start_chown(s);
+
+ return;
+
+fail:
+ socket_enter_dead(s, SOCKET_FAILURE_RESOURCES);
+}
+
+static void flush_ports(Socket *s) {
+ assert(s);
+
+ /* Flush all incoming traffic, regardless if actual bytes or new connections, so that this socket isn't busy
+ * anymore */
+
+ LIST_FOREACH(port, p, s->ports) {
+ if (p->fd < 0)
+ continue;
+
+ (void) flush_accept(p->fd);
+ (void) flush_fd(p->fd);
+ }
+}
+
+static void socket_enter_running(Socket *s, int cfd_in) {
+ /* Note that this call takes possession of the connection fd passed. It either has to assign it
+ * somewhere or close it. */
+ _cleanup_close_ int cfd = cfd_in;
+
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ int r;
+
+ assert(s);
+
+ /* We don't take connections anymore if we are supposed to shut down anyway */
+ if (unit_stop_pending(UNIT(s))) {
+
+ log_unit_debug(UNIT(s), "Suppressing connection request since unit stop is scheduled.");
+
+ if (cfd >= 0)
+ goto refuse;
+
+ flush_ports(s);
+ return;
+ }
+
+ if (!ratelimit_below(&s->trigger_limit)) {
+ log_unit_warning(UNIT(s), "Trigger limit hit, refusing further activation.");
+ socket_enter_stop_pre(s, SOCKET_FAILURE_TRIGGER_LIMIT_HIT);
+ goto refuse;
+ }
+
+ if (cfd < 0) {
+ bool pending = false;
+ Unit *other;
+
+ /* If there's already a start pending don't bother to do anything */
+ UNIT_FOREACH_DEPENDENCY(other, UNIT(s), UNIT_ATOM_TRIGGERS)
+ if (unit_active_or_pending(other)) {
+ pending = true;
+ break;
+ }
+
+ if (!pending) {
+ if (!UNIT_ISSET(s->service)) {
+ r = log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOENT),
+ "Service to activate vanished, refusing activation.");
+ goto fail;
+ }
+
+ r = manager_add_job(UNIT(s)->manager, JOB_START, UNIT_DEREF(s->service), JOB_REPLACE, NULL, &error, NULL);
+ if (r < 0)
+ goto fail;
+ }
+
+ socket_set_state(s, SOCKET_RUNNING);
+ } else {
+ _cleanup_(socket_peer_unrefp) SocketPeer *p = NULL;
+ Unit *service;
+
+ if (s->n_connections >= s->max_connections) {
+ log_unit_warning(UNIT(s), "Too many incoming connections (%u), dropping connection.",
+ s->n_connections);
+ goto refuse;
+ }
+
+ if (s->max_connections_per_source > 0) {
+ r = socket_acquire_peer(s, cfd, &p);
+ if (r < 0) {
+ if (ERRNO_IS_DISCONNECT(r))
+ return;
+ /* We didn't have enough resources to acquire peer information, let's fail. */
+ goto fail;
+ }
+ if (r > 0 && p->n_ref > s->max_connections_per_source) {
+ _cleanup_free_ char *t = NULL;
+
+ (void) sockaddr_pretty(&p->peer.sa, p->peer_salen, true, false, &t);
+
+ log_unit_warning(UNIT(s),
+ "Too many incoming connections (%u) from source %s, dropping connection.",
+ p->n_ref, strnull(t));
+ goto refuse;
+ }
+ }
+
+ r = socket_load_service_unit(s, cfd, &service);
+ if (ERRNO_IS_DISCONNECT(r))
+ return;
+ if (r < 0)
+ goto fail;
+
+ r = unit_add_two_dependencies(UNIT(s), UNIT_BEFORE, UNIT_TRIGGERS, service,
+ false, UNIT_DEPENDENCY_IMPLICIT);
+ if (r < 0)
+ goto fail;
+
+ s->n_accepted++;
+
+ r = service_set_socket_fd(SERVICE(service), cfd, s, p, s->selinux_context_from_net);
+ if (ERRNO_IS_DISCONNECT(r))
+ return;
+ if (r < 0)
+ goto fail;
+
+ TAKE_FD(cfd); /* We passed ownership of the fd to the service now. Forget it here. */
+ s->n_connections++;
+
+ r = manager_add_job(UNIT(s)->manager, JOB_START, service, JOB_REPLACE, NULL, &error, NULL);
+ if (r < 0) {
+ /* We failed to activate the new service, but it still exists. Let's make sure the
+ * service closes and forgets the connection fd again, immediately. */
+ service_close_socket_fd(SERVICE(service));
+ goto fail;
+ }
+
+ /* Notify clients about changed counters */
+ unit_add_to_dbus_queue(UNIT(s));
+ }
+
+ TAKE_FD(cfd);
+ return;
+
+refuse:
+ s->n_refused++;
+ return;
+
+fail:
+ if (ERRNO_IS_RESOURCE(r))
+ log_unit_warning(UNIT(s), "Failed to queue service startup job: %s",
+ bus_error_message(&error, r));
+ else
+ log_unit_warning(UNIT(s), "Failed to queue service startup job (Maybe the service file is missing or not a %s unit?): %s",
+ cfd >= 0 ? "template" : "non-template",
+ bus_error_message(&error, r));
+
+ socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES);
+}
+
+static void socket_run_next(Socket *s) {
+ int r;
+
+ assert(s);
+ assert(s->control_command);
+ assert(s->control_command->command_next);
+
+ socket_unwatch_control_pid(s);
+
+ s->control_command = s->control_command->command_next;
+
+ r = socket_spawn(s, s->control_command, &s->control_pid);
+ if (r < 0)
+ goto fail;
+
+ return;
+
+fail:
+ log_unit_warning_errno(UNIT(s), r, "Failed to run next task: %m");
+
+ if (s->state == SOCKET_START_POST)
+ socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES);
+ else if (s->state == SOCKET_STOP_POST)
+ socket_enter_dead(s, SOCKET_FAILURE_RESOURCES);
+ else
+ socket_enter_signal(s, SOCKET_FINAL_SIGTERM, SOCKET_FAILURE_RESOURCES);
+}
+
+static int socket_start(Unit *u) {
+ Socket *s = SOCKET(u);
+ int r;
+
+ assert(s);
+
+ /* We cannot fulfill this request right now, try again later
+ * please! */
+ if (IN_SET(s->state,
+ SOCKET_STOP_PRE,
+ SOCKET_STOP_PRE_SIGKILL,
+ SOCKET_STOP_PRE_SIGTERM,
+ SOCKET_STOP_POST,
+ SOCKET_FINAL_SIGTERM,
+ SOCKET_FINAL_SIGKILL,
+ SOCKET_CLEANING))
+ return -EAGAIN;
+
+ /* Already on it! */
+ if (IN_SET(s->state,
+ SOCKET_START_PRE,
+ SOCKET_START_CHOWN,
+ SOCKET_START_POST))
+ return 0;
+
+ /* Cannot run this without the service being around */
+ if (UNIT_ISSET(s->service)) {
+ Service *service;
+
+ service = SERVICE(UNIT_DEREF(s->service));
+
+ if (UNIT(service)->load_state != UNIT_LOADED)
+ return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOENT), "Socket service %s not loaded, refusing.", UNIT(service)->id);
+
+ /* If the service is already active we cannot start the
+ * socket */
+ if (!IN_SET(service->state, SERVICE_DEAD, SERVICE_FAILED, SERVICE_AUTO_RESTART))
+ return log_unit_error_errno(u, SYNTHETIC_ERRNO(EBUSY), "Socket service %s already active, refusing.", UNIT(service)->id);
+ }
+
+ assert(IN_SET(s->state, SOCKET_DEAD, SOCKET_FAILED));
+
+ r = unit_acquire_invocation_id(u);
+ if (r < 0)
+ return r;
+
+ s->result = SOCKET_SUCCESS;
+ exec_command_reset_status_list_array(s->exec_command, _SOCKET_EXEC_COMMAND_MAX);
+
+ u->reset_accounting = true;
+
+ socket_enter_start_pre(s);
+ return 1;
+}
+
+static int socket_stop(Unit *u) {
+ Socket *s = SOCKET(u);
+
+ assert(s);
+
+ /* Already on it */
+ if (IN_SET(s->state,
+ SOCKET_STOP_PRE,
+ SOCKET_STOP_PRE_SIGTERM,
+ SOCKET_STOP_PRE_SIGKILL,
+ SOCKET_STOP_POST,
+ SOCKET_FINAL_SIGTERM,
+ SOCKET_FINAL_SIGKILL))
+ return 0;
+
+ /* If there's already something running we go directly into
+ * kill mode. */
+ if (IN_SET(s->state,
+ SOCKET_START_PRE,
+ SOCKET_START_CHOWN,
+ SOCKET_START_POST)) {
+ socket_enter_signal(s, SOCKET_STOP_PRE_SIGTERM, SOCKET_SUCCESS);
+ return -EAGAIN;
+ }
+
+ /* If we are currently cleaning, then abort it, brutally. */
+ if (s->state == SOCKET_CLEANING) {
+ socket_enter_signal(s, SOCKET_FINAL_SIGKILL, SOCKET_SUCCESS);
+ return 0;
+ }
+
+ assert(IN_SET(s->state, SOCKET_LISTENING, SOCKET_RUNNING));
+
+ socket_enter_stop_pre(s, SOCKET_SUCCESS);
+ return 1;
+}
+
+static int socket_serialize(Unit *u, FILE *f, FDSet *fds) {
+ Socket *s = SOCKET(u);
+ int r;
+
+ assert(u);
+ assert(f);
+ assert(fds);
+
+ (void) serialize_item(f, "state", socket_state_to_string(s->state));
+ (void) serialize_item(f, "result", socket_result_to_string(s->result));
+ (void) serialize_item_format(f, "n-accepted", "%u", s->n_accepted);
+ (void) serialize_item_format(f, "n-refused", "%u", s->n_refused);
+
+ if (s->control_pid > 0)
+ (void) serialize_item_format(f, "control-pid", PID_FMT, s->control_pid);
+
+ if (s->control_command_id >= 0)
+ (void) serialize_item(f, "control-command", socket_exec_command_to_string(s->control_command_id));
+
+ LIST_FOREACH(port, p, s->ports) {
+ int copy;
+
+ if (p->fd < 0)
+ continue;
+
+ copy = fdset_put_dup(fds, p->fd);
+ if (copy < 0)
+ return log_unit_warning_errno(u, copy, "Failed to serialize socket fd: %m");
+
+ if (p->type == SOCKET_SOCKET) {
+ _cleanup_free_ char *t = NULL;
+
+ r = socket_address_print(&p->address, &t);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to format socket address: %m");
+
+ if (socket_address_family(&p->address) == AF_NETLINK)
+ (void) serialize_item_format(f, "netlink", "%i %s", copy, t);
+ else
+ (void) serialize_item_format(f, "socket", "%i %i %s", copy, p->address.type, t);
+ } else if (p->type == SOCKET_SPECIAL)
+ (void) serialize_item_format(f, "special", "%i %s", copy, p->path);
+ else if (p->type == SOCKET_MQUEUE)
+ (void) serialize_item_format(f, "mqueue", "%i %s", copy, p->path);
+ else if (p->type == SOCKET_USB_FUNCTION)
+ (void) serialize_item_format(f, "ffs", "%i %s", copy, p->path);
+ else {
+ assert(p->type == SOCKET_FIFO);
+ (void) serialize_item_format(f, "fifo", "%i %s", copy, p->path);
+ }
+ }
+
+ return 0;
+}
+
+static int socket_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
+ Socket *s = SOCKET(u);
+
+ assert(u);
+ assert(key);
+ assert(value);
+
+ if (streq(key, "state")) {
+ SocketState state;
+
+ state = socket_state_from_string(value);
+ if (state < 0)
+ log_unit_debug(u, "Failed to parse state value: %s", value);
+ else
+ s->deserialized_state = state;
+ } else if (streq(key, "result")) {
+ SocketResult f;
+
+ f = socket_result_from_string(value);
+ if (f < 0)
+ log_unit_debug(u, "Failed to parse result value: %s", value);
+ else if (f != SOCKET_SUCCESS)
+ s->result = f;
+
+ } else if (streq(key, "n-accepted")) {
+ unsigned k;
+
+ if (safe_atou(value, &k) < 0)
+ log_unit_debug(u, "Failed to parse n-accepted value: %s", value);
+ else
+ s->n_accepted += k;
+ } else if (streq(key, "n-refused")) {
+ unsigned k;
+
+ if (safe_atou(value, &k) < 0)
+ log_unit_debug(u, "Failed to parse n-refused value: %s", value);
+ else
+ s->n_refused += k;
+ } else if (streq(key, "control-pid")) {
+ pid_t pid;
+
+ if (parse_pid(value, &pid) < 0)
+ log_unit_debug(u, "Failed to parse control-pid value: %s", value);
+ else
+ s->control_pid = pid;
+ } else if (streq(key, "control-command")) {
+ SocketExecCommand id;
+
+ id = socket_exec_command_from_string(value);
+ if (id < 0)
+ log_unit_debug(u, "Failed to parse exec-command value: %s", value);
+ else {
+ s->control_command_id = id;
+ s->control_command = s->exec_command[id];
+ }
+ } else if (streq(key, "fifo")) {
+ int fd, skip = 0;
+
+ if (sscanf(value, "%i %n", &fd, &skip) < 1 || fd < 0 || !fdset_contains(fds, fd))
+ log_unit_debug(u, "Failed to parse fifo value: %s", value);
+ else {
+ bool found = false;
+
+ LIST_FOREACH(port, p, s->ports)
+ if (p->fd < 0 &&
+ p->type == SOCKET_FIFO &&
+ path_equal_or_files_same(p->path, value+skip, 0)) {
+ p->fd = fdset_remove(fds, fd);
+ found = true;
+ break;
+ }
+ if (!found)
+ log_unit_debug(u, "No matching fifo socket found: %s", value+skip);
+ }
+
+ } else if (streq(key, "special")) {
+ int fd, skip = 0;
+
+ if (sscanf(value, "%i %n", &fd, &skip) < 1 || fd < 0 || !fdset_contains(fds, fd))
+ log_unit_debug(u, "Failed to parse special value: %s", value);
+ else {
+ bool found = false;
+
+ LIST_FOREACH(port, p, s->ports)
+ if (p->fd < 0 &&
+ p->type == SOCKET_SPECIAL &&
+ path_equal_or_files_same(p->path, value+skip, 0)) {
+ p->fd = fdset_remove(fds, fd);
+ found = true;
+ break;
+ }
+ if (!found)
+ log_unit_debug(u, "No matching special socket found: %s", value+skip);
+ }
+
+ } else if (streq(key, "mqueue")) {
+ int fd, skip = 0;
+
+ if (sscanf(value, "%i %n", &fd, &skip) < 1 || fd < 0 || !fdset_contains(fds, fd))
+ log_unit_debug(u, "Failed to parse mqueue value: %s", value);
+ else {
+ bool found = false;
+
+ LIST_FOREACH(port, p, s->ports)
+ if (p->fd < 0 &&
+ p->type == SOCKET_MQUEUE &&
+ streq(p->path, value+skip)) {
+ p->fd = fdset_remove(fds, fd);
+ found = true;
+ break;
+ }
+ if (!found)
+ log_unit_debug(u, "No matching mqueue socket found: %s", value+skip);
+ }
+
+ } else if (streq(key, "socket")) {
+ int fd, type, skip = 0;
+
+ if (sscanf(value, "%i %i %n", &fd, &type, &skip) < 2 || fd < 0 || type < 0 || !fdset_contains(fds, fd))
+ log_unit_debug(u, "Failed to parse socket value: %s", value);
+ else {
+ bool found = false;
+
+ LIST_FOREACH(port, p, s->ports)
+ if (p->fd < 0 &&
+ socket_address_is(&p->address, value+skip, type)) {
+ p->fd = fdset_remove(fds, fd);
+ found = true;
+ break;
+ }
+ if (!found)
+ log_unit_debug(u, "No matching %s socket found: %s",
+ socket_address_type_to_string(type), value+skip);
+ }
+
+ } else if (streq(key, "netlink")) {
+ int fd, skip = 0;
+
+ if (sscanf(value, "%i %n", &fd, &skip) < 1 || fd < 0 || !fdset_contains(fds, fd))
+ log_unit_debug(u, "Failed to parse socket value: %s", value);
+ else {
+ bool found = false;
+
+ LIST_FOREACH(port, p, s->ports)
+ if (p->fd < 0 &&
+ socket_address_is_netlink(&p->address, value+skip)) {
+ p->fd = fdset_remove(fds, fd);
+ found = true;
+ break;
+ }
+ if (!found)
+ log_unit_debug(u, "No matching netlink socket found: %s", value+skip);
+ }
+
+ } else if (streq(key, "ffs")) {
+ int fd, skip = 0;
+
+ if (sscanf(value, "%i %n", &fd, &skip) < 1 || fd < 0 || !fdset_contains(fds, fd))
+ log_unit_debug(u, "Failed to parse ffs value: %s", value);
+ else {
+ bool found = false;
+
+ LIST_FOREACH(port, p, s->ports)
+ if (p->fd < 0 &&
+ p->type == SOCKET_USB_FUNCTION &&
+ path_equal_or_files_same(p->path, value+skip, 0)) {
+ p->fd = fdset_remove(fds, fd);
+ found = true;
+ break;
+ }
+ if (!found)
+ log_unit_debug(u, "No matching ffs socket found: %s", value+skip);
+ }
+
+ } else
+ log_unit_debug(UNIT(s), "Unknown serialization key: %s", key);
+
+ return 0;
+}
+
+static void socket_distribute_fds(Unit *u, FDSet *fds) {
+ Socket *s = SOCKET(u);
+
+ assert(u);
+
+ LIST_FOREACH(port, p, s->ports) {
+ int fd;
+
+ if (p->type != SOCKET_SOCKET)
+ continue;
+
+ if (p->fd >= 0)
+ continue;
+
+ FDSET_FOREACH(fd, fds) {
+ if (socket_address_matches_fd(&p->address, fd)) {
+ p->fd = fdset_remove(fds, fd);
+ s->deserialized_state = SOCKET_LISTENING;
+ break;
+ }
+ }
+ }
+}
+
+_pure_ static UnitActiveState socket_active_state(Unit *u) {
+ assert(u);
+
+ return state_translation_table[SOCKET(u)->state];
+}
+
+_pure_ static const char *socket_sub_state_to_string(Unit *u) {
+ assert(u);
+
+ return socket_state_to_string(SOCKET(u)->state);
+}
+
+const char* socket_port_type_to_string(SocketPort *p) {
+
+ assert(p);
+
+ switch (p->type) {
+
+ case SOCKET_SOCKET:
+
+ switch (p->address.type) {
+
+ case SOCK_STREAM:
+ return "Stream";
+
+ case SOCK_DGRAM:
+ return "Datagram";
+
+ case SOCK_SEQPACKET:
+ return "SequentialPacket";
+
+ case SOCK_RAW:
+ if (socket_address_family(&p->address) == AF_NETLINK)
+ return "Netlink";
+
+ _fallthrough_;
+ default:
+ return NULL;
+ }
+
+ case SOCKET_SPECIAL:
+ return "Special";
+
+ case SOCKET_MQUEUE:
+ return "MessageQueue";
+
+ case SOCKET_FIFO:
+ return "FIFO";
+
+ case SOCKET_USB_FUNCTION:
+ return "USBFunction";
+
+ default:
+ return NULL;
+ }
+}
+
+SocketType socket_port_type_from_string(const char *s) {
+ assert(s);
+
+ if (STR_IN_SET(s, "Stream", "Datagram", "SequentialPacket", "Netlink"))
+ return SOCKET_SOCKET;
+ else if (streq(s, "Special"))
+ return SOCKET_SPECIAL;
+ else if (streq(s, "MessageQueue"))
+ return SOCKET_MQUEUE;
+ else if (streq(s, "FIFO"))
+ return SOCKET_FIFO;
+ else if (streq(s, "USBFunction"))
+ return SOCKET_USB_FUNCTION;
+ else
+ return _SOCKET_TYPE_INVALID;
+}
+
+_pure_ static bool socket_may_gc(Unit *u) {
+ Socket *s = SOCKET(u);
+
+ assert(u);
+
+ return s->n_connections == 0;
+}
+
+static int socket_accept_do(Socket *s, int fd) {
+ int cfd;
+
+ assert(s);
+ assert(fd >= 0);
+
+ cfd = accept4(fd, NULL, NULL, SOCK_NONBLOCK|SOCK_CLOEXEC);
+ if (cfd < 0)
+ /* Convert transient network errors into clean and well-defined EAGAIN */
+ return ERRNO_IS_ACCEPT_AGAIN(errno) ? -EAGAIN : -errno;
+
+ return cfd;
+}
+
+static int socket_accept_in_cgroup(Socket *s, SocketPort *p, int fd) {
+ _cleanup_close_pair_ int pair[2] = { -1, -1 };
+ int cfd, r;
+ pid_t pid;
+
+ assert(s);
+ assert(p);
+ assert(fd >= 0);
+
+ /* Similar to socket_address_listen_in_cgroup(), but for accept() rather than socket(): make sure that any
+ * connection socket is also properly associated with the cgroup. */
+
+ if (!IN_SET(p->address.sockaddr.sa.sa_family, AF_INET, AF_INET6))
+ goto shortcut;
+
+ r = bpf_firewall_supported();
+ if (r < 0)
+ return r;
+ if (r == BPF_FIREWALL_UNSUPPORTED)
+ goto shortcut;
+
+ if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pair) < 0)
+ return log_unit_error_errno(UNIT(s), errno, "Failed to create communication channel: %m");
+
+ r = unit_fork_helper_process(UNIT(s), "(sd-accept)", &pid);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r, "Failed to fork off accept stub process: %m");
+ if (r == 0) {
+ /* Child */
+
+ pair[0] = safe_close(pair[0]);
+
+ cfd = socket_accept_do(s, fd);
+ if (cfd == -EAGAIN) /* spurious accept() */
+ _exit(EXIT_SUCCESS);
+ if (cfd < 0) {
+ log_unit_error_errno(UNIT(s), cfd, "Failed to accept connection socket: %m");
+ _exit(EXIT_FAILURE);
+ }
+
+ r = send_one_fd(pair[1], cfd, 0);
+ if (r < 0) {
+ log_unit_error_errno(UNIT(s), r, "Failed to send connection socket to parent: %m");
+ _exit(EXIT_FAILURE);
+ }
+
+ _exit(EXIT_SUCCESS);
+ }
+
+ pair[1] = safe_close(pair[1]);
+ cfd = receive_one_fd(pair[0], 0);
+
+ /* We synchronously wait for the helper, as it shouldn't be slow */
+ r = wait_for_terminate_and_check("(sd-accept)", pid, WAIT_LOG_ABNORMAL);
+ if (r < 0) {
+ safe_close(cfd);
+ return r;
+ }
+
+ /* If we received no fd, we got EIO here. If this happens with a process exit code of EXIT_SUCCESS
+ * this is a spurious accept(), let's convert that back to EAGAIN here. */
+ if (cfd == -EIO)
+ return -EAGAIN;
+ if (cfd < 0)
+ return log_unit_error_errno(UNIT(s), cfd, "Failed to receive connection socket: %m");
+
+ return cfd;
+
+shortcut:
+ cfd = socket_accept_do(s, fd);
+ if (cfd == -EAGAIN) /* spurious accept(), skip it silently */
+ return -EAGAIN;
+ if (cfd < 0)
+ return log_unit_error_errno(UNIT(s), cfd, "Failed to accept connection socket: %m");
+
+ return cfd;
+}
+
+static int socket_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+ SocketPort *p = ASSERT_PTR(userdata);
+ int cfd = -1;
+
+ assert(fd >= 0);
+
+ if (p->socket->state != SOCKET_LISTENING)
+ return 0;
+
+ log_unit_debug(UNIT(p->socket), "Incoming traffic");
+
+ if (revents != EPOLLIN) {
+ if (revents & EPOLLHUP)
+ log_unit_error(UNIT(p->socket), "Got POLLHUP on a listening socket. The service probably invoked shutdown() on it, and should better not do that.");
+ else
+ log_unit_error(UNIT(p->socket), "Got unexpected poll event (0x%x) on socket.", revents);
+ goto fail;
+ }
+
+ if (p->socket->accept &&
+ p->type == SOCKET_SOCKET &&
+ socket_address_can_accept(&p->address)) {
+
+ cfd = socket_accept_in_cgroup(p->socket, p, fd);
+ if (cfd == -EAGAIN) /* Spurious accept() */
+ return 0;
+ if (cfd < 0)
+ goto fail;
+
+ socket_apply_socket_options(p->socket, p, cfd);
+ }
+
+ socket_enter_running(p->socket, cfd);
+ return 0;
+
+fail:
+ socket_enter_stop_pre(p->socket, SOCKET_FAILURE_RESOURCES);
+ return 0;
+}
+
+static void socket_sigchld_event(Unit *u, pid_t pid, int code, int status) {
+ Socket *s = SOCKET(u);
+ SocketResult f;
+
+ assert(s);
+ assert(pid >= 0);
+
+ if (pid != s->control_pid)
+ return;
+
+ s->control_pid = 0;
+
+ if (is_clean_exit(code, status, EXIT_CLEAN_COMMAND, NULL))
+ f = SOCKET_SUCCESS;
+ else if (code == CLD_EXITED)
+ f = SOCKET_FAILURE_EXIT_CODE;
+ else if (code == CLD_KILLED)
+ f = SOCKET_FAILURE_SIGNAL;
+ else if (code == CLD_DUMPED)
+ f = SOCKET_FAILURE_CORE_DUMP;
+ else
+ assert_not_reached();
+
+ if (s->control_command) {
+ exec_status_exit(&s->control_command->exec_status, &s->exec_context, pid, code, status);
+
+ if (s->control_command->flags & EXEC_COMMAND_IGNORE_FAILURE)
+ f = SOCKET_SUCCESS;
+ }
+
+ unit_log_process_exit(
+ u,
+ "Control process",
+ socket_exec_command_to_string(s->control_command_id),
+ f == SOCKET_SUCCESS,
+ code, status);
+
+ if (s->result == SOCKET_SUCCESS)
+ s->result = f;
+
+ if (s->control_command &&
+ s->control_command->command_next &&
+ f == SOCKET_SUCCESS) {
+
+ log_unit_debug(u, "Running next command for state %s", socket_state_to_string(s->state));
+ socket_run_next(s);
+ } else {
+ s->control_command = NULL;
+ s->control_command_id = _SOCKET_EXEC_COMMAND_INVALID;
+
+ /* No further commands for this step, so let's figure
+ * out what to do next */
+
+ log_unit_debug(u, "Got final SIGCHLD for state %s", socket_state_to_string(s->state));
+
+ switch (s->state) {
+
+ case SOCKET_START_PRE:
+ if (f == SOCKET_SUCCESS)
+ socket_enter_start_chown(s);
+ else
+ socket_enter_signal(s, SOCKET_FINAL_SIGTERM, f);
+ break;
+
+ case SOCKET_START_CHOWN:
+ if (f == SOCKET_SUCCESS)
+ socket_enter_start_post(s);
+ else
+ socket_enter_stop_pre(s, f);
+ break;
+
+ case SOCKET_START_POST:
+ if (f == SOCKET_SUCCESS)
+ socket_enter_listening(s);
+ else
+ socket_enter_stop_pre(s, f);
+ break;
+
+ case SOCKET_STOP_PRE:
+ case SOCKET_STOP_PRE_SIGTERM:
+ case SOCKET_STOP_PRE_SIGKILL:
+ socket_enter_stop_post(s, f);
+ break;
+
+ case SOCKET_STOP_POST:
+ case SOCKET_FINAL_SIGTERM:
+ case SOCKET_FINAL_SIGKILL:
+ socket_enter_dead(s, f);
+ break;
+
+ case SOCKET_CLEANING:
+
+ if (s->clean_result == SOCKET_SUCCESS)
+ s->clean_result = f;
+
+ socket_enter_dead(s, SOCKET_SUCCESS);
+ break;
+
+ default:
+ assert_not_reached();
+ }
+ }
+
+ /* Notify clients about changed exit status */
+ unit_add_to_dbus_queue(u);
+}
+
+static int socket_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) {
+ Socket *s = SOCKET(userdata);
+
+ assert(s);
+ assert(s->timer_event_source == source);
+
+ switch (s->state) {
+
+ case SOCKET_START_PRE:
+ log_unit_warning(UNIT(s), "Starting timed out. Terminating.");
+ socket_enter_signal(s, SOCKET_FINAL_SIGTERM, SOCKET_FAILURE_TIMEOUT);
+ break;
+
+ case SOCKET_START_CHOWN:
+ case SOCKET_START_POST:
+ log_unit_warning(UNIT(s), "Starting timed out. Stopping.");
+ socket_enter_stop_pre(s, SOCKET_FAILURE_TIMEOUT);
+ break;
+
+ case SOCKET_STOP_PRE:
+ log_unit_warning(UNIT(s), "Stopping timed out. Terminating.");
+ socket_enter_signal(s, SOCKET_STOP_PRE_SIGTERM, SOCKET_FAILURE_TIMEOUT);
+ break;
+
+ case SOCKET_STOP_PRE_SIGTERM:
+ if (s->kill_context.send_sigkill) {
+ log_unit_warning(UNIT(s), "Stopping timed out. Killing.");
+ socket_enter_signal(s, SOCKET_STOP_PRE_SIGKILL, SOCKET_FAILURE_TIMEOUT);
+ } else {
+ log_unit_warning(UNIT(s), "Stopping timed out. Skipping SIGKILL. Ignoring.");
+ socket_enter_stop_post(s, SOCKET_FAILURE_TIMEOUT);
+ }
+ break;
+
+ case SOCKET_STOP_PRE_SIGKILL:
+ log_unit_warning(UNIT(s), "Processes still around after SIGKILL. Ignoring.");
+ socket_enter_stop_post(s, SOCKET_FAILURE_TIMEOUT);
+ break;
+
+ case SOCKET_STOP_POST:
+ log_unit_warning(UNIT(s), "Stopping timed out (2). Terminating.");
+ socket_enter_signal(s, SOCKET_FINAL_SIGTERM, SOCKET_FAILURE_TIMEOUT);
+ break;
+
+ case SOCKET_FINAL_SIGTERM:
+ if (s->kill_context.send_sigkill) {
+ log_unit_warning(UNIT(s), "Stopping timed out (2). Killing.");
+ socket_enter_signal(s, SOCKET_FINAL_SIGKILL, SOCKET_FAILURE_TIMEOUT);
+ } else {
+ log_unit_warning(UNIT(s), "Stopping timed out (2). Skipping SIGKILL. Ignoring.");
+ socket_enter_dead(s, SOCKET_FAILURE_TIMEOUT);
+ }
+ break;
+
+ case SOCKET_FINAL_SIGKILL:
+ log_unit_warning(UNIT(s), "Still around after SIGKILL (2). Entering failed mode.");
+ socket_enter_dead(s, SOCKET_FAILURE_TIMEOUT);
+ break;
+
+ case SOCKET_CLEANING:
+ log_unit_warning(UNIT(s), "Cleaning timed out. killing.");
+
+ if (s->clean_result == SOCKET_SUCCESS)
+ s->clean_result = SOCKET_FAILURE_TIMEOUT;
+
+ socket_enter_signal(s, SOCKET_FINAL_SIGKILL, 0);
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ return 0;
+}
+
+int socket_collect_fds(Socket *s, int **fds) {
+ size_t k = 0, n = 0;
+ int *rfds;
+
+ assert(s);
+ assert(fds);
+
+ /* Called from the service code for requesting our fds */
+
+ LIST_FOREACH(port, p, s->ports) {
+ if (p->fd >= 0)
+ n++;
+ n += p->n_auxiliary_fds;
+ }
+
+ if (n <= 0) {
+ *fds = NULL;
+ return 0;
+ }
+
+ rfds = new(int, n);
+ if (!rfds)
+ return -ENOMEM;
+
+ LIST_FOREACH(port, p, s->ports) {
+ if (p->fd >= 0)
+ rfds[k++] = p->fd;
+ for (size_t i = 0; i < p->n_auxiliary_fds; ++i)
+ rfds[k++] = p->auxiliary_fds[i];
+ }
+
+ assert(k == n);
+
+ *fds = rfds;
+ return (int) n;
+}
+
+static void socket_reset_failed(Unit *u) {
+ Socket *s = SOCKET(u);
+
+ assert(s);
+
+ if (s->state == SOCKET_FAILED)
+ socket_set_state(s, SOCKET_DEAD);
+
+ s->result = SOCKET_SUCCESS;
+ s->clean_result = SOCKET_SUCCESS;
+}
+
+void socket_connection_unref(Socket *s) {
+ assert(s);
+
+ /* The service is dead. Yay!
+ *
+ * This is strictly for one-instance-per-connection
+ * services. */
+
+ assert(s->n_connections > 0);
+ s->n_connections--;
+
+ log_unit_debug(UNIT(s), "One connection closed, %u left.", s->n_connections);
+}
+
+static void socket_trigger_notify(Unit *u, Unit *other) {
+ Socket *s = SOCKET(u);
+
+ assert(u);
+ assert(other);
+
+ /* Filter out invocations with bogus state */
+ assert(UNIT_IS_LOAD_COMPLETE(other->load_state));
+ assert(other->type == UNIT_SERVICE);
+
+ /* Don't propagate state changes from the service if we are already down */
+ if (!IN_SET(s->state, SOCKET_RUNNING, SOCKET_LISTENING))
+ return;
+
+ /* We don't care for the service state if we are in Accept=yes mode */
+ if (s->accept)
+ return;
+
+ /* Propagate start limit hit state */
+ if (other->start_limit_hit) {
+ socket_enter_stop_pre(s, SOCKET_FAILURE_SERVICE_START_LIMIT_HIT);
+ return;
+ }
+
+ /* Don't propagate anything if there's still a job queued */
+ if (other->job)
+ return;
+
+ if (IN_SET(SERVICE(other)->state,
+ SERVICE_DEAD, SERVICE_FAILED,
+ SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL,
+ SERVICE_AUTO_RESTART))
+ socket_enter_listening(s);
+
+ if (SERVICE(other)->state == SERVICE_RUNNING)
+ socket_set_state(s, SOCKET_RUNNING);
+}
+
+static int socket_kill(Unit *u, KillWho who, int signo, sd_bus_error *error) {
+ return unit_kill_common(u, who, signo, -1, SOCKET(u)->control_pid, error);
+}
+
+static int socket_get_timeout(Unit *u, usec_t *timeout) {
+ Socket *s = SOCKET(u);
+ usec_t t;
+ int r;
+
+ if (!s->timer_event_source)
+ return 0;
+
+ r = sd_event_source_get_time(s->timer_event_source, &t);
+ if (r < 0)
+ return r;
+ if (t == USEC_INFINITY)
+ return 0;
+
+ *timeout = t;
+ return 1;
+}
+
+char *socket_fdname(Socket *s) {
+ assert(s);
+
+ /* Returns the name to use for $LISTEN_NAMES. If the user
+ * didn't specify anything specifically, use the socket unit's
+ * name as fallback. */
+
+ return s->fdname ?: UNIT(s)->id;
+}
+
+static int socket_control_pid(Unit *u) {
+ Socket *s = SOCKET(u);
+
+ assert(s);
+
+ return s->control_pid;
+}
+
+static int socket_clean(Unit *u, ExecCleanMask mask) {
+ _cleanup_strv_free_ char **l = NULL;
+ Socket *s = SOCKET(u);
+ int r;
+
+ assert(s);
+ assert(mask != 0);
+
+ if (s->state != SOCKET_DEAD)
+ return -EBUSY;
+
+ r = exec_context_get_clean_directories(&s->exec_context, u->manager->prefix, mask, &l);
+ if (r < 0)
+ return r;
+
+ if (strv_isempty(l))
+ return -EUNATCH;
+
+ socket_unwatch_control_pid(s);
+ s->clean_result = SOCKET_SUCCESS;
+ s->control_command = NULL;
+ s->control_command_id = _SOCKET_EXEC_COMMAND_INVALID;
+
+ r = socket_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->exec_context.timeout_clean_usec));
+ if (r < 0)
+ goto fail;
+
+ r = unit_fork_and_watch_rm_rf(u, l, &s->control_pid);
+ if (r < 0)
+ goto fail;
+
+ socket_set_state(s, SOCKET_CLEANING);
+
+ return 0;
+
+fail:
+ log_unit_warning_errno(u, r, "Failed to initiate cleaning: %m");
+ s->clean_result = SOCKET_FAILURE_RESOURCES;
+ s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+ return r;
+}
+
+static int socket_can_clean(Unit *u, ExecCleanMask *ret) {
+ Socket *s = SOCKET(u);
+
+ assert(s);
+
+ return exec_context_get_clean_mask(&s->exec_context, ret);
+}
+
+static int socket_can_start(Unit *u) {
+ Socket *s = SOCKET(u);
+ int r;
+
+ assert(s);
+
+ r = unit_test_start_limit(u);
+ if (r < 0) {
+ socket_enter_dead(s, SOCKET_FAILURE_START_LIMIT_HIT);
+ return r;
+ }
+
+ return 1;
+}
+
+static const char* const socket_exec_command_table[_SOCKET_EXEC_COMMAND_MAX] = {
+ [SOCKET_EXEC_START_PRE] = "ExecStartPre",
+ [SOCKET_EXEC_START_CHOWN] = "ExecStartChown",
+ [SOCKET_EXEC_START_POST] = "ExecStartPost",
+ [SOCKET_EXEC_STOP_PRE] = "ExecStopPre",
+ [SOCKET_EXEC_STOP_POST] = "ExecStopPost"
+};
+
+DEFINE_STRING_TABLE_LOOKUP(socket_exec_command, SocketExecCommand);
+
+static const char* const socket_result_table[_SOCKET_RESULT_MAX] = {
+ [SOCKET_SUCCESS] = "success",
+ [SOCKET_FAILURE_RESOURCES] = "resources",
+ [SOCKET_FAILURE_TIMEOUT] = "timeout",
+ [SOCKET_FAILURE_EXIT_CODE] = "exit-code",
+ [SOCKET_FAILURE_SIGNAL] = "signal",
+ [SOCKET_FAILURE_CORE_DUMP] = "core-dump",
+ [SOCKET_FAILURE_START_LIMIT_HIT] = "start-limit-hit",
+ [SOCKET_FAILURE_TRIGGER_LIMIT_HIT] = "trigger-limit-hit",
+ [SOCKET_FAILURE_SERVICE_START_LIMIT_HIT] = "service-start-limit-hit"
+};
+
+DEFINE_STRING_TABLE_LOOKUP(socket_result, SocketResult);
+
+static const char* const socket_timestamping_table[_SOCKET_TIMESTAMPING_MAX] = {
+ [SOCKET_TIMESTAMPING_OFF] = "off",
+ [SOCKET_TIMESTAMPING_US] = "us",
+ [SOCKET_TIMESTAMPING_NS] = "ns",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(socket_timestamping, SocketTimestamping);
+
+SocketTimestamping socket_timestamping_from_string_harder(const char *p) {
+ SocketTimestamping t;
+ int r;
+
+ if (!p)
+ return _SOCKET_TIMESTAMPING_INVALID;
+
+ t = socket_timestamping_from_string(p);
+ if (t >= 0)
+ return t;
+
+ /* Let's alternatively support the various other aliases parse_time() accepts for ns and µs here,
+ * too. */
+ if (streq(p, "nsec"))
+ return SOCKET_TIMESTAMPING_NS;
+ if (STR_IN_SET(p, "usec", "µs", "μs")) /* Accept both small greek letter mu + micro sign unicode codepoints */
+ return SOCKET_TIMESTAMPING_US;
+
+ r = parse_boolean(p);
+ if (r < 0)
+ return _SOCKET_TIMESTAMPING_INVALID;
+
+ return r ? SOCKET_TIMESTAMPING_NS : SOCKET_TIMESTAMPING_OFF; /* If boolean yes, default to ns accuracy */
+}
+
+const UnitVTable socket_vtable = {
+ .object_size = sizeof(Socket),
+ .exec_context_offset = offsetof(Socket, exec_context),
+ .cgroup_context_offset = offsetof(Socket, cgroup_context),
+ .kill_context_offset = offsetof(Socket, kill_context),
+ .exec_runtime_offset = offsetof(Socket, exec_runtime),
+ .dynamic_creds_offset = offsetof(Socket, dynamic_creds),
+
+ .sections =
+ "Unit\0"
+ "Socket\0"
+ "Install\0",
+ .private_section = "Socket",
+
+ .can_transient = true,
+ .can_trigger = true,
+ .can_fail = true,
+
+ .init = socket_init,
+ .done = socket_done,
+ .load = socket_load,
+
+ .coldplug = socket_coldplug,
+
+ .dump = socket_dump,
+
+ .start = socket_start,
+ .stop = socket_stop,
+
+ .kill = socket_kill,
+ .clean = socket_clean,
+ .can_clean = socket_can_clean,
+
+ .get_timeout = socket_get_timeout,
+
+ .serialize = socket_serialize,
+ .deserialize_item = socket_deserialize_item,
+ .distribute_fds = socket_distribute_fds,
+
+ .active_state = socket_active_state,
+ .sub_state_to_string = socket_sub_state_to_string,
+
+ .will_restart = unit_will_restart_default,
+
+ .may_gc = socket_may_gc,
+
+ .sigchld_event = socket_sigchld_event,
+
+ .trigger_notify = socket_trigger_notify,
+
+ .reset_failed = socket_reset_failed,
+
+ .control_pid = socket_control_pid,
+
+ .bus_set_property = bus_socket_set_property,
+ .bus_commit_properties = bus_socket_commit_properties,
+
+ .status_message_formats = {
+ .finished_start_job = {
+ [JOB_DONE] = "Listening on %s.",
+ [JOB_FAILED] = "Failed to listen on %s.",
+ [JOB_TIMEOUT] = "Timed out starting %s.",
+ },
+ .finished_stop_job = {
+ [JOB_DONE] = "Closed %s.",
+ [JOB_FAILED] = "Failed stopping %s.",
+ [JOB_TIMEOUT] = "Timed out stopping %s.",
+ },
+ },
+
+ .can_start = socket_can_start,
+};
diff --git a/src/core/socket.h b/src/core/socket.h
new file mode 100644
index 0000000..17e912a
--- /dev/null
+++ b/src/core/socket.h
@@ -0,0 +1,200 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct Socket Socket;
+typedef struct SocketPeer SocketPeer;
+
+#include "mount.h"
+#include "socket-util.h"
+#include "unit.h"
+
+typedef enum SocketExecCommand {
+ SOCKET_EXEC_START_PRE,
+ SOCKET_EXEC_START_CHOWN,
+ SOCKET_EXEC_START_POST,
+ SOCKET_EXEC_STOP_PRE,
+ SOCKET_EXEC_STOP_POST,
+ _SOCKET_EXEC_COMMAND_MAX,
+ _SOCKET_EXEC_COMMAND_INVALID = -EINVAL,
+} SocketExecCommand;
+
+typedef enum SocketType {
+ SOCKET_SOCKET,
+ SOCKET_FIFO,
+ SOCKET_SPECIAL,
+ SOCKET_MQUEUE,
+ SOCKET_USB_FUNCTION,
+ _SOCKET_TYPE_MAX,
+ _SOCKET_TYPE_INVALID = -EINVAL,
+} SocketType;
+
+typedef enum SocketResult {
+ SOCKET_SUCCESS,
+ SOCKET_FAILURE_RESOURCES,
+ SOCKET_FAILURE_TIMEOUT,
+ SOCKET_FAILURE_EXIT_CODE,
+ SOCKET_FAILURE_SIGNAL,
+ SOCKET_FAILURE_CORE_DUMP,
+ SOCKET_FAILURE_START_LIMIT_HIT,
+ SOCKET_FAILURE_TRIGGER_LIMIT_HIT,
+ SOCKET_FAILURE_SERVICE_START_LIMIT_HIT,
+ _SOCKET_RESULT_MAX,
+ _SOCKET_RESULT_INVALID = -EINVAL,
+} SocketResult;
+
+typedef struct SocketPort {
+ Socket *socket;
+
+ SocketType type;
+ int fd;
+ int *auxiliary_fds;
+ size_t n_auxiliary_fds;
+
+ SocketAddress address;
+ char *path;
+ sd_event_source *event_source;
+
+ LIST_FIELDS(struct SocketPort, port);
+} SocketPort;
+
+typedef enum SocketTimestamping {
+ SOCKET_TIMESTAMPING_OFF,
+ SOCKET_TIMESTAMPING_US, /* SO_TIMESTAMP */
+ SOCKET_TIMESTAMPING_NS, /* SO_TIMESTAMPNS */
+ _SOCKET_TIMESTAMPING_MAX,
+ _SOCKET_TIMESTAMPING_INVALID = -EINVAL,
+} SocketTimestamping;
+
+struct Socket {
+ Unit meta;
+
+ LIST_HEAD(SocketPort, ports);
+
+ Set *peers_by_address;
+
+ unsigned n_accepted;
+ unsigned n_connections;
+ unsigned n_refused;
+ unsigned max_connections;
+ unsigned max_connections_per_source;
+
+ unsigned backlog;
+ unsigned keep_alive_cnt;
+ usec_t timeout_usec;
+ usec_t keep_alive_time;
+ usec_t keep_alive_interval;
+ usec_t defer_accept;
+
+ ExecCommand* exec_command[_SOCKET_EXEC_COMMAND_MAX];
+ ExecContext exec_context;
+ KillContext kill_context;
+ CGroupContext cgroup_context;
+
+ ExecRuntime *exec_runtime;
+ DynamicCreds dynamic_creds;
+
+ /* For Accept=no sockets refers to the one service we'll
+ * activate. For Accept=yes sockets is either NULL, or filled
+ * to refer to the next service we spawn. */
+ UnitRef service;
+
+ SocketState state, deserialized_state;
+
+ sd_event_source *timer_event_source;
+
+ ExecCommand* control_command;
+ SocketExecCommand control_command_id;
+ pid_t control_pid;
+
+ mode_t directory_mode;
+ mode_t socket_mode;
+
+ SocketResult result;
+ SocketResult clean_result;
+
+ char **symlinks;
+
+ bool accept;
+ bool remove_on_stop;
+ bool writable;
+ bool flush_pending;
+
+ int socket_protocol;
+
+ /* Socket options */
+ bool keep_alive;
+ bool no_delay;
+ bool free_bind;
+ bool transparent;
+ bool broadcast;
+ bool pass_cred;
+ bool pass_sec;
+ bool pass_pktinfo;
+ SocketTimestamping timestamping;
+
+ /* Only for INET6 sockets: issue IPV6_V6ONLY sockopt */
+ SocketAddressBindIPv6Only bind_ipv6_only;
+
+ int priority;
+ int mark;
+ size_t receive_buffer;
+ size_t send_buffer;
+ int ip_tos;
+ int ip_ttl;
+ size_t pipe_size;
+ char *bind_to_device;
+ char *tcp_congestion;
+ bool reuse_port;
+ long mq_maxmsg;
+ long mq_msgsize;
+
+ char *smack;
+ char *smack_ip_in;
+ char *smack_ip_out;
+
+ bool selinux_context_from_net;
+
+ char *user, *group;
+
+ char *fdname;
+
+ RateLimit trigger_limit;
+};
+
+SocketPeer *socket_peer_ref(SocketPeer *p);
+SocketPeer *socket_peer_unref(SocketPeer *p);
+int socket_acquire_peer(Socket *s, int fd, SocketPeer **p);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(SocketPeer*, socket_peer_unref);
+
+/* Called from the service code when collecting fds */
+int socket_collect_fds(Socket *s, int **fds);
+
+/* Called from the service code when a per-connection service ended */
+void socket_connection_unref(Socket *s);
+
+SocketPort *socket_port_free(SocketPort *p);
+DEFINE_TRIVIAL_CLEANUP_FUNC(SocketPort*, socket_port_free);
+
+void socket_free_ports(Socket *s);
+
+int socket_load_service_unit(Socket *s, int cfd, Unit **ret);
+
+char *socket_fdname(Socket *s);
+
+extern const UnitVTable socket_vtable;
+
+const char* socket_exec_command_to_string(SocketExecCommand i) _const_;
+SocketExecCommand socket_exec_command_from_string(const char *s) _pure_;
+
+const char* socket_result_to_string(SocketResult i) _const_;
+SocketResult socket_result_from_string(const char *s) _pure_;
+
+const char* socket_port_type_to_string(SocketPort *p) _pure_;
+SocketType socket_port_type_from_string(const char *p) _pure_;
+
+const char* socket_timestamping_to_string(SocketTimestamping p) _const_;
+SocketTimestamping socket_timestamping_from_string(const char *p) _pure_;
+SocketTimestamping socket_timestamping_from_string_harder(const char *p) _pure_;
+
+DEFINE_CAST(SOCKET, Socket);
diff --git a/src/core/swap.c b/src/core/swap.c
new file mode 100644
index 0000000..5c83c47
--- /dev/null
+++ b/src/core/swap.c
@@ -0,0 +1,1693 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <sys/epoll.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "sd-device.h"
+
+#include "alloc-util.h"
+#include "dbus-swap.h"
+#include "dbus-unit.h"
+#include "device-util.h"
+#include "device.h"
+#include "escape.h"
+#include "exit-status.h"
+#include "fd-util.h"
+#include "format-util.h"
+#include "fstab-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "serialize.h"
+#include "special.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "swap.h"
+#include "unit-name.h"
+#include "unit.h"
+#include "virt.h"
+
+static const UnitActiveState state_translation_table[_SWAP_STATE_MAX] = {
+ [SWAP_DEAD] = UNIT_INACTIVE,
+ [SWAP_ACTIVATING] = UNIT_ACTIVATING,
+ [SWAP_ACTIVATING_DONE] = UNIT_ACTIVE,
+ [SWAP_ACTIVE] = UNIT_ACTIVE,
+ [SWAP_DEACTIVATING] = UNIT_DEACTIVATING,
+ [SWAP_DEACTIVATING_SIGTERM] = UNIT_DEACTIVATING,
+ [SWAP_DEACTIVATING_SIGKILL] = UNIT_DEACTIVATING,
+ [SWAP_FAILED] = UNIT_FAILED,
+ [SWAP_CLEANING] = UNIT_MAINTENANCE,
+};
+
+static int swap_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata);
+static int swap_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static int swap_process_proc_swaps(Manager *m);
+
+static bool SWAP_STATE_WITH_PROCESS(SwapState state) {
+ return IN_SET(state,
+ SWAP_ACTIVATING,
+ SWAP_ACTIVATING_DONE,
+ SWAP_DEACTIVATING,
+ SWAP_DEACTIVATING_SIGTERM,
+ SWAP_DEACTIVATING_SIGKILL,
+ SWAP_CLEANING);
+}
+
+_pure_ static UnitActiveState swap_active_state(Unit *u) {
+ assert(u);
+
+ return state_translation_table[SWAP(u)->state];
+}
+
+_pure_ static const char *swap_sub_state_to_string(Unit *u) {
+ assert(u);
+
+ return swap_state_to_string(SWAP(u)->state);
+}
+
+_pure_ static bool swap_may_gc(Unit *u) {
+ Swap *s = SWAP(u);
+
+ assert(s);
+
+ if (s->from_proc_swaps)
+ return false;
+
+ return true;
+}
+
+_pure_ static bool swap_is_extrinsic(Unit *u) {
+ assert(SWAP(u));
+
+ return MANAGER_IS_USER(u->manager);
+}
+
+static void swap_unset_proc_swaps(Swap *s) {
+ assert(s);
+
+ if (!s->from_proc_swaps)
+ return;
+
+ s->parameters_proc_swaps.what = mfree(s->parameters_proc_swaps.what);
+ s->from_proc_swaps = false;
+}
+
+static int swap_set_devnode(Swap *s, const char *devnode) {
+ Hashmap *swaps;
+ Swap *first;
+ int r;
+
+ assert(s);
+
+ r = hashmap_ensure_allocated(&UNIT(s)->manager->swaps_by_devnode, &path_hash_ops);
+ if (r < 0)
+ return r;
+
+ swaps = UNIT(s)->manager->swaps_by_devnode;
+
+ if (s->devnode) {
+ first = hashmap_get(swaps, s->devnode);
+
+ LIST_REMOVE(same_devnode, first, s);
+ if (first)
+ hashmap_replace(swaps, first->devnode, first);
+ else
+ hashmap_remove(swaps, s->devnode);
+
+ s->devnode = mfree(s->devnode);
+ }
+
+ if (devnode) {
+ s->devnode = strdup(devnode);
+ if (!s->devnode)
+ return -ENOMEM;
+
+ first = hashmap_get(swaps, s->devnode);
+ LIST_PREPEND(same_devnode, first, s);
+
+ return hashmap_replace(swaps, first->devnode, first);
+ }
+
+ return 0;
+}
+
+static void swap_init(Unit *u) {
+ Swap *s = SWAP(u);
+
+ assert(s);
+ assert(UNIT(s)->load_state == UNIT_STUB);
+
+ s->timeout_usec = u->manager->default_timeout_start_usec;
+
+ s->exec_context.std_output = u->manager->default_std_output;
+ s->exec_context.std_error = u->manager->default_std_error;
+
+ s->control_command_id = _SWAP_EXEC_COMMAND_INVALID;
+
+ u->ignore_on_isolate = true;
+}
+
+static void swap_unwatch_control_pid(Swap *s) {
+ assert(s);
+
+ if (s->control_pid <= 0)
+ return;
+
+ unit_unwatch_pid(UNIT(s), TAKE_PID(s->control_pid));
+}
+
+static void swap_done(Unit *u) {
+ Swap *s = SWAP(u);
+
+ assert(s);
+
+ swap_unset_proc_swaps(s);
+ swap_set_devnode(s, NULL);
+
+ s->what = mfree(s->what);
+ s->parameters_fragment.what = mfree(s->parameters_fragment.what);
+ s->parameters_fragment.options = mfree(s->parameters_fragment.options);
+
+ s->exec_runtime = exec_runtime_unref(s->exec_runtime, false);
+ exec_command_done_array(s->exec_command, _SWAP_EXEC_COMMAND_MAX);
+ s->control_command = NULL;
+
+ dynamic_creds_unref(&s->dynamic_creds);
+
+ swap_unwatch_control_pid(s);
+
+ s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+}
+
+static int swap_arm_timer(Swap *s, usec_t usec) {
+ int r;
+
+ assert(s);
+
+ if (s->timer_event_source) {
+ r = sd_event_source_set_time(s->timer_event_source, usec);
+ if (r < 0)
+ return r;
+
+ return sd_event_source_set_enabled(s->timer_event_source, SD_EVENT_ONESHOT);
+ }
+
+ if (usec == USEC_INFINITY)
+ return 0;
+
+ r = sd_event_add_time(
+ UNIT(s)->manager->event,
+ &s->timer_event_source,
+ CLOCK_MONOTONIC,
+ usec, 0,
+ swap_dispatch_timer, s);
+ if (r < 0)
+ return r;
+
+ (void) sd_event_source_set_description(s->timer_event_source, "swap-timer");
+
+ return 0;
+}
+
+static SwapParameters* swap_get_parameters(Swap *s) {
+ assert(s);
+
+ if (s->from_proc_swaps)
+ return &s->parameters_proc_swaps;
+
+ if (s->from_fragment)
+ return &s->parameters_fragment;
+
+ return NULL;
+}
+
+static int swap_add_device_dependencies(Swap *s) {
+ UnitDependencyMask mask;
+ SwapParameters *p;
+ int r;
+
+ assert(s);
+
+ if (!s->what)
+ return 0;
+
+ p = swap_get_parameters(s);
+ if (!p || !p->what)
+ return 0;
+
+ mask = s->from_proc_swaps ? UNIT_DEPENDENCY_PROC_SWAP : UNIT_DEPENDENCY_FILE;
+
+ if (is_device_path(p->what)) {
+ r = unit_add_node_dependency(UNIT(s), p->what, UNIT_REQUIRES, mask);
+ if (r < 0)
+ return r;
+
+ return unit_add_blockdev_dependency(UNIT(s), p->what, mask);
+ }
+
+ /* File based swap devices need to be ordered after systemd-remount-fs.service, since they might need
+ * a writable file system. */
+ return unit_add_dependency_by_name(UNIT(s), UNIT_AFTER, SPECIAL_REMOUNT_FS_SERVICE, true, mask);
+}
+
+static int swap_add_default_dependencies(Swap *s) {
+ int r;
+
+ assert(s);
+
+ if (!UNIT(s)->default_dependencies)
+ return 0;
+
+ if (!MANAGER_IS_SYSTEM(UNIT(s)->manager))
+ return 0;
+
+ if (detect_container() > 0)
+ return 0;
+
+ /* swap units generated for the swap dev links are missing the
+ * ordering dep against the swap target. */
+ r = unit_add_dependency_by_name(UNIT(s), UNIT_BEFORE, SPECIAL_SWAP_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+ if (r < 0)
+ return r;
+
+ return unit_add_two_dependencies_by_name(UNIT(s), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_UMOUNT_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+}
+
+static int swap_verify(Swap *s) {
+ _cleanup_free_ char *e = NULL;
+ int r;
+
+ assert(UNIT(s)->load_state == UNIT_LOADED);
+
+ r = unit_name_from_path(s->what, ".swap", &e);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r, "Failed to generate unit name from path: %m");
+
+ if (!unit_has_name(UNIT(s), e))
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Value of What= and unit name do not match, not loading.");
+
+ if (s->exec_context.pam_name && s->kill_context.kill_mode != KILL_CONTROL_GROUP)
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Unit has PAM enabled. Kill mode must be set to 'control-group'. Refusing to load.");
+
+ return 0;
+}
+
+static int swap_load_devnode(Swap *s) {
+ _cleanup_free_ char *p = NULL;
+ struct stat st;
+ int r;
+
+ assert(s);
+
+ if (stat(s->what, &st) < 0 || !S_ISBLK(st.st_mode))
+ return 0;
+
+ r = devname_from_stat_rdev(&st, &p);
+ if (r < 0) {
+ log_unit_full_errno(UNIT(s), r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to get device node for swap %s: %m", s->what);
+ return 0;
+ }
+
+ return swap_set_devnode(s, p);
+}
+
+static int swap_add_extras(Swap *s) {
+ int r;
+
+ assert(s);
+
+ if (UNIT(s)->fragment_path)
+ s->from_fragment = true;
+
+ if (!s->what) {
+ if (s->parameters_fragment.what)
+ s->what = strdup(s->parameters_fragment.what);
+ else if (s->parameters_proc_swaps.what)
+ s->what = strdup(s->parameters_proc_swaps.what);
+ else {
+ r = unit_name_to_path(UNIT(s)->id, &s->what);
+ if (r < 0)
+ return r;
+ }
+
+ if (!s->what)
+ return -ENOMEM;
+ }
+
+ path_simplify(s->what);
+
+ if (!UNIT(s)->description) {
+ r = unit_set_description(UNIT(s), s->what);
+ if (r < 0)
+ return r;
+ }
+
+ r = unit_require_mounts_for(UNIT(s), s->what, UNIT_DEPENDENCY_IMPLICIT);
+ if (r < 0)
+ return r;
+
+ r = swap_add_device_dependencies(s);
+ if (r < 0)
+ return r;
+
+ r = swap_load_devnode(s);
+ if (r < 0)
+ return r;
+
+ r = unit_patch_contexts(UNIT(s));
+ if (r < 0)
+ return r;
+
+ r = unit_add_exec_dependencies(UNIT(s), &s->exec_context);
+ if (r < 0)
+ return r;
+
+ r = unit_set_default_slice(UNIT(s));
+ if (r < 0)
+ return r;
+
+ r = swap_add_default_dependencies(s);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+static int swap_load(Unit *u) {
+ Swap *s = SWAP(u);
+ int r, q = 0;
+
+ assert(s);
+ assert(u->load_state == UNIT_STUB);
+
+ /* Load a .swap file */
+ bool fragment_optional = s->from_proc_swaps;
+ r = unit_load_fragment_and_dropin(u, !fragment_optional);
+
+ /* Add in some extras, and do so either when we successfully loaded something or when /proc/swaps is
+ * already active. */
+ if (u->load_state == UNIT_LOADED || s->from_proc_swaps)
+ q = swap_add_extras(s);
+
+ if (r < 0)
+ return r;
+ if (q < 0)
+ return q;
+ if (u->load_state != UNIT_LOADED)
+ return 0;
+
+ return swap_verify(s);
+}
+
+static int swap_setup_unit(
+ Manager *m,
+ const char *what,
+ const char *what_proc_swaps,
+ int priority,
+ bool set_flags) {
+
+ _cleanup_free_ char *e = NULL;
+ bool delete = false;
+ Unit *u = NULL;
+ int r;
+ SwapParameters *p;
+
+ assert(m);
+ assert(what);
+ assert(what_proc_swaps);
+
+ r = unit_name_from_path(what, ".swap", &e);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to generate unit name from path: %m");
+
+ u = manager_get_unit(m, e);
+ if (u &&
+ SWAP(u)->from_proc_swaps &&
+ !path_equal(SWAP(u)->parameters_proc_swaps.what, what_proc_swaps))
+ return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
+ "Swap %s appeared twice with different device paths %s and %s",
+ e, SWAP(u)->parameters_proc_swaps.what, what_proc_swaps);
+
+ if (!u) {
+ delete = true;
+
+ r = unit_new_for_name(m, sizeof(Swap), e, &u);
+ if (r < 0)
+ goto fail;
+
+ SWAP(u)->what = strdup(what);
+ if (!SWAP(u)->what) {
+ r = -ENOMEM;
+ goto fail;
+ }
+
+ unit_add_to_load_queue(u);
+ } else
+ delete = false;
+
+ p = &SWAP(u)->parameters_proc_swaps;
+
+ if (!p->what) {
+ p->what = strdup(what_proc_swaps);
+ if (!p->what) {
+ r = -ENOMEM;
+ goto fail;
+ }
+ }
+
+ /* The unit is definitely around now, mark it as loaded if it was previously referenced but could not be
+ * loaded. After all we can load it now, from the data in /proc/swaps. */
+ if (IN_SET(u->load_state, UNIT_NOT_FOUND, UNIT_BAD_SETTING, UNIT_ERROR)) {
+ u->load_state = UNIT_LOADED;
+ u->load_error = 0;
+ }
+
+ if (set_flags) {
+ SWAP(u)->is_active = true;
+ SWAP(u)->just_activated = !SWAP(u)->from_proc_swaps;
+ }
+
+ SWAP(u)->from_proc_swaps = true;
+
+ p->priority = priority;
+ p->priority_set = true;
+
+ unit_add_to_dbus_queue(u);
+ return 0;
+
+fail:
+ log_unit_warning_errno(u, r, "Failed to load swap unit: %m");
+
+ if (delete)
+ unit_free(u);
+
+ return r;
+}
+
+static void swap_process_new(Manager *m, const char *device, int prio, bool set_flags) {
+ _cleanup_(sd_device_unrefp) sd_device *d = NULL;
+ const char *dn, *devlink;
+ struct stat st, st_link;
+ int r;
+
+ assert(m);
+
+ if (swap_setup_unit(m, device, device, prio, set_flags) < 0)
+ return;
+
+ /* If this is a block device, then let's add duplicates for
+ * all other names of this block device */
+ if (stat(device, &st) < 0 || !S_ISBLK(st.st_mode))
+ return;
+
+ r = sd_device_new_from_stat_rdev(&d, &st);
+ if (r < 0)
+ return (void) log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to allocate device for swap %s: %m", device);
+
+ /* Add the main device node */
+ if (sd_device_get_devname(d, &dn) >= 0 && !streq(dn, device))
+ (void) swap_setup_unit(m, dn, device, prio, set_flags);
+
+ /* Add additional units for all symlinks */
+ FOREACH_DEVICE_DEVLINK(d, devlink) {
+
+ /* Don't bother with the /dev/block links */
+ if (streq(devlink, device))
+ continue;
+
+ if (path_startswith(devlink, "/dev/block/"))
+ continue;
+
+ if (stat(devlink, &st_link) >= 0 &&
+ (!S_ISBLK(st_link.st_mode) ||
+ st_link.st_rdev != st.st_rdev))
+ continue;
+
+ (void) swap_setup_unit(m, devlink, device, prio, set_flags);
+ }
+}
+
+static void swap_set_state(Swap *s, SwapState state) {
+ SwapState old_state;
+
+ assert(s);
+
+ if (s->state != state)
+ bus_unit_send_pending_change_signal(UNIT(s), false);
+
+ old_state = s->state;
+ s->state = state;
+
+ if (!SWAP_STATE_WITH_PROCESS(state)) {
+ s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+ swap_unwatch_control_pid(s);
+ s->control_command = NULL;
+ s->control_command_id = _SWAP_EXEC_COMMAND_INVALID;
+ }
+
+ if (state != old_state)
+ log_unit_debug(UNIT(s), "Changed %s -> %s", swap_state_to_string(old_state), swap_state_to_string(state));
+
+ unit_notify(UNIT(s), state_translation_table[old_state], state_translation_table[state], 0);
+
+ /* If there other units for the same device node have a job
+ queued it might be worth checking again if it is runnable
+ now. This is necessary, since swap_start() refuses
+ operation with EAGAIN if there's already another job for
+ the same device node queued. */
+ LIST_FOREACH_OTHERS(same_devnode, other, s)
+ if (UNIT(other)->job)
+ job_add_to_run_queue(UNIT(other)->job);
+}
+
+static int swap_coldplug(Unit *u) {
+ Swap *s = SWAP(u);
+ SwapState new_state = SWAP_DEAD;
+ int r;
+
+ assert(s);
+ assert(s->state == SWAP_DEAD);
+
+ if (s->deserialized_state != s->state)
+ new_state = s->deserialized_state;
+ else if (s->from_proc_swaps)
+ new_state = SWAP_ACTIVE;
+
+ if (new_state == s->state)
+ return 0;
+
+ if (s->control_pid > 0 &&
+ pid_is_unwaited(s->control_pid) &&
+ SWAP_STATE_WITH_PROCESS(new_state)) {
+
+ r = unit_watch_pid(UNIT(s), s->control_pid, false);
+ if (r < 0)
+ return r;
+
+ r = swap_arm_timer(s, usec_add(u->state_change_timestamp.monotonic, s->timeout_usec));
+ if (r < 0)
+ return r;
+ }
+
+ if (!IN_SET(new_state, SWAP_DEAD, SWAP_FAILED)) {
+ (void) unit_setup_dynamic_creds(u);
+ (void) unit_setup_exec_runtime(u);
+ }
+
+ swap_set_state(s, new_state);
+ return 0;
+}
+
+static void swap_dump(Unit *u, FILE *f, const char *prefix) {
+ Swap *s = SWAP(u);
+ SwapParameters *p;
+
+ assert(s);
+ assert(f);
+
+ if (s->from_proc_swaps)
+ p = &s->parameters_proc_swaps;
+ else if (s->from_fragment)
+ p = &s->parameters_fragment;
+ else
+ p = NULL;
+
+ fprintf(f,
+ "%sSwap State: %s\n"
+ "%sResult: %s\n"
+ "%sClean Result: %s\n"
+ "%sWhat: %s\n"
+ "%sFrom /proc/swaps: %s\n"
+ "%sFrom fragment: %s\n"
+ "%sExtrinsic: %s\n",
+ prefix, swap_state_to_string(s->state),
+ prefix, swap_result_to_string(s->result),
+ prefix, swap_result_to_string(s->clean_result),
+ prefix, s->what,
+ prefix, yes_no(s->from_proc_swaps),
+ prefix, yes_no(s->from_fragment),
+ prefix, yes_no(swap_is_extrinsic(u)));
+
+ if (s->devnode)
+ fprintf(f, "%sDevice Node: %s\n", prefix, s->devnode);
+
+ if (p)
+ fprintf(f,
+ "%sPriority: %i\n"
+ "%sOptions: %s\n",
+ prefix, p->priority,
+ prefix, strempty(p->options));
+
+ fprintf(f,
+ "%sTimeoutSec: %s\n",
+ prefix, FORMAT_TIMESPAN(s->timeout_usec, USEC_PER_SEC));
+
+ if (s->control_pid > 0)
+ fprintf(f,
+ "%sControl PID: "PID_FMT"\n",
+ prefix, s->control_pid);
+
+ exec_context_dump(&s->exec_context, f, prefix);
+ kill_context_dump(&s->kill_context, f, prefix);
+ cgroup_context_dump(UNIT(s), f, prefix);
+}
+
+static int swap_spawn(Swap *s, ExecCommand *c, pid_t *_pid) {
+
+ _cleanup_(exec_params_clear) ExecParameters exec_params = {
+ .flags = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN,
+ .stdin_fd = -1,
+ .stdout_fd = -1,
+ .stderr_fd = -1,
+ .exec_fd = -1,
+ };
+ pid_t pid;
+ int r;
+
+ assert(s);
+ assert(c);
+ assert(_pid);
+
+ r = unit_prepare_exec(UNIT(s));
+ if (r < 0)
+ return r;
+
+ r = swap_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->timeout_usec));
+ if (r < 0)
+ goto fail;
+
+ r = unit_set_exec_params(UNIT(s), &exec_params);
+ if (r < 0)
+ goto fail;
+
+ r = exec_spawn(UNIT(s),
+ c,
+ &s->exec_context,
+ &exec_params,
+ s->exec_runtime,
+ &s->dynamic_creds,
+ &pid);
+ if (r < 0)
+ goto fail;
+
+ r = unit_watch_pid(UNIT(s), pid, true);
+ if (r < 0)
+ goto fail;
+
+ *_pid = pid;
+
+ return 0;
+
+fail:
+ s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+
+ return r;
+}
+
+static void swap_enter_dead(Swap *s, SwapResult f) {
+ assert(s);
+
+ if (s->result == SWAP_SUCCESS)
+ s->result = f;
+
+ unit_log_result(UNIT(s), s->result == SWAP_SUCCESS, swap_result_to_string(s->result));
+ unit_warn_leftover_processes(UNIT(s), unit_log_leftover_process_stop);
+ swap_set_state(s, s->result != SWAP_SUCCESS ? SWAP_FAILED : SWAP_DEAD);
+
+ s->exec_runtime = exec_runtime_unref(s->exec_runtime, true);
+
+ unit_destroy_runtime_data(UNIT(s), &s->exec_context);
+
+ unit_unref_uid_gid(UNIT(s), true);
+
+ dynamic_creds_destroy(&s->dynamic_creds);
+}
+
+static void swap_enter_active(Swap *s, SwapResult f) {
+ assert(s);
+
+ if (s->result == SWAP_SUCCESS)
+ s->result = f;
+
+ swap_set_state(s, SWAP_ACTIVE);
+}
+
+static void swap_enter_dead_or_active(Swap *s, SwapResult f) {
+ assert(s);
+
+ if (s->from_proc_swaps) {
+ swap_enter_active(s, f);
+
+ LIST_FOREACH_OTHERS(same_devnode, other, s)
+ if (UNIT(other)->job)
+ swap_enter_dead_or_active(other, f);
+ } else
+ swap_enter_dead(s, f);
+}
+
+static int state_to_kill_operation(Swap *s, SwapState state) {
+ if (state == SWAP_DEACTIVATING_SIGTERM) {
+ if (unit_has_job_type(UNIT(s), JOB_RESTART))
+ return KILL_RESTART;
+ else
+ return KILL_TERMINATE;
+ }
+
+ return KILL_KILL;
+}
+
+static void swap_enter_signal(Swap *s, SwapState state, SwapResult f) {
+ int r;
+
+ assert(s);
+
+ if (s->result == SWAP_SUCCESS)
+ s->result = f;
+
+ r = unit_kill_context(UNIT(s),
+ &s->kill_context,
+ state_to_kill_operation(s, state),
+ -1,
+ s->control_pid,
+ false);
+ if (r < 0)
+ goto fail;
+
+ if (r > 0) {
+ r = swap_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->timeout_usec));
+ if (r < 0)
+ goto fail;
+
+ swap_set_state(s, state);
+ } else if (state == SWAP_DEACTIVATING_SIGTERM && s->kill_context.send_sigkill)
+ swap_enter_signal(s, SWAP_DEACTIVATING_SIGKILL, SWAP_SUCCESS);
+ else
+ swap_enter_dead_or_active(s, SWAP_SUCCESS);
+
+ return;
+
+fail:
+ log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m");
+ swap_enter_dead_or_active(s, SWAP_FAILURE_RESOURCES);
+}
+
+static void swap_enter_activating(Swap *s) {
+ _cleanup_free_ char *opts = NULL;
+ int r;
+
+ assert(s);
+
+ unit_warn_leftover_processes(UNIT(s), unit_log_leftover_process_start);
+
+ s->control_command_id = SWAP_EXEC_ACTIVATE;
+ s->control_command = s->exec_command + SWAP_EXEC_ACTIVATE;
+
+ if (s->from_fragment) {
+ int priority = 0;
+
+ r = fstab_find_pri(s->parameters_fragment.options, &priority);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "Failed to parse swap priority \"%s\", ignoring: %m", s->parameters_fragment.options);
+ else if (r > 0 && s->parameters_fragment.priority_set)
+ log_unit_warning(UNIT(s), "Duplicate swap priority configuration by Priority= and Options= fields.");
+
+ if (r <= 0 && s->parameters_fragment.priority_set) {
+ if (s->parameters_fragment.options)
+ r = asprintf(&opts, "%s,pri=%i", s->parameters_fragment.options, s->parameters_fragment.priority);
+ else
+ r = asprintf(&opts, "pri=%i", s->parameters_fragment.priority);
+ if (r < 0) {
+ r = -ENOMEM;
+ goto fail;
+ }
+ }
+ }
+
+ r = exec_command_set(s->control_command, "/sbin/swapon", "--fixpgsz", NULL);
+ if (r < 0)
+ goto fail;
+
+ if (s->parameters_fragment.options || opts) {
+ r = exec_command_append(s->control_command, "-o",
+ opts ?: s->parameters_fragment.options, NULL);
+ if (r < 0)
+ goto fail;
+ }
+
+ r = exec_command_append(s->control_command, s->what, NULL);
+ if (r < 0)
+ goto fail;
+
+ swap_unwatch_control_pid(s);
+
+ r = swap_spawn(s, s->control_command, &s->control_pid);
+ if (r < 0)
+ goto fail;
+
+ swap_set_state(s, SWAP_ACTIVATING);
+ return;
+
+fail:
+ log_unit_warning_errno(UNIT(s), r, "Failed to run 'swapon' task: %m");
+ swap_enter_dead_or_active(s, SWAP_FAILURE_RESOURCES);
+}
+
+static void swap_enter_deactivating(Swap *s) {
+ int r;
+
+ assert(s);
+
+ s->control_command_id = SWAP_EXEC_DEACTIVATE;
+ s->control_command = s->exec_command + SWAP_EXEC_DEACTIVATE;
+
+ r = exec_command_set(s->control_command,
+ "/sbin/swapoff",
+ s->what,
+ NULL);
+ if (r < 0)
+ goto fail;
+
+ swap_unwatch_control_pid(s);
+
+ r = swap_spawn(s, s->control_command, &s->control_pid);
+ if (r < 0)
+ goto fail;
+
+ swap_set_state(s, SWAP_DEACTIVATING);
+
+ return;
+
+fail:
+ log_unit_warning_errno(UNIT(s), r, "Failed to run 'swapoff' task: %m");
+ swap_enter_dead_or_active(s, SWAP_FAILURE_RESOURCES);
+}
+
+static void swap_cycle_clear(Swap *s) {
+ assert(s);
+
+ s->result = SWAP_SUCCESS;
+ exec_command_reset_status_array(s->exec_command, _SWAP_EXEC_COMMAND_MAX);
+ UNIT(s)->reset_accounting = true;
+}
+
+static int swap_start(Unit *u) {
+ Swap *s = SWAP(u);
+ int r;
+
+ assert(s);
+
+ /* We cannot fulfill this request right now, try again later please! */
+ if (IN_SET(s->state,
+ SWAP_DEACTIVATING,
+ SWAP_DEACTIVATING_SIGTERM,
+ SWAP_DEACTIVATING_SIGKILL,
+ SWAP_CLEANING))
+ return -EAGAIN;
+
+ /* Already on it! */
+ if (s->state == SWAP_ACTIVATING)
+ return 0;
+
+ assert(IN_SET(s->state, SWAP_DEAD, SWAP_FAILED));
+
+ if (detect_container() > 0)
+ return -EPERM;
+
+ /* If there's a job for another swap unit for the same node
+ * running, then let's not dispatch this one for now, and wait
+ * until that other job has finished. */
+ LIST_FOREACH_OTHERS(same_devnode, other, s)
+ if (UNIT(other)->job && UNIT(other)->job->state == JOB_RUNNING)
+ return -EAGAIN;
+
+ r = unit_acquire_invocation_id(u);
+ if (r < 0)
+ return r;
+
+ swap_cycle_clear(s);
+ swap_enter_activating(s);
+ return 1;
+}
+
+static int swap_stop(Unit *u) {
+ Swap *s = SWAP(u);
+
+ assert(s);
+
+ switch (s->state) {
+
+ case SWAP_DEACTIVATING:
+ case SWAP_DEACTIVATING_SIGTERM:
+ case SWAP_DEACTIVATING_SIGKILL:
+ /* Already on it */
+ return 0;
+
+ case SWAP_ACTIVATING:
+ case SWAP_ACTIVATING_DONE:
+ /* There's a control process pending, directly enter kill mode */
+ swap_enter_signal(s, SWAP_DEACTIVATING_SIGTERM, SWAP_SUCCESS);
+ return 0;
+
+ case SWAP_ACTIVE:
+ if (detect_container() > 0)
+ return -EPERM;
+
+ swap_enter_deactivating(s);
+ return 1;
+
+ case SWAP_CLEANING:
+ /* If we are currently cleaning, then abort it, brutally. */
+ swap_enter_signal(s, SWAP_DEACTIVATING_SIGKILL, SWAP_SUCCESS);
+ return 0;
+
+ default:
+ assert_not_reached();
+ }
+}
+
+static int swap_serialize(Unit *u, FILE *f, FDSet *fds) {
+ Swap *s = SWAP(u);
+
+ assert(s);
+ assert(f);
+ assert(fds);
+
+ (void) serialize_item(f, "state", swap_state_to_string(s->state));
+ (void) serialize_item(f, "result", swap_result_to_string(s->result));
+
+ if (s->control_pid > 0)
+ (void) serialize_item_format(f, "control-pid", PID_FMT, s->control_pid);
+
+ if (s->control_command_id >= 0)
+ (void) serialize_item(f, "control-command", swap_exec_command_to_string(s->control_command_id));
+
+ return 0;
+}
+
+static int swap_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
+ Swap *s = SWAP(u);
+
+ assert(s);
+ assert(fds);
+
+ if (streq(key, "state")) {
+ SwapState state;
+
+ state = swap_state_from_string(value);
+ if (state < 0)
+ log_unit_debug(u, "Failed to parse state value: %s", value);
+ else
+ s->deserialized_state = state;
+ } else if (streq(key, "result")) {
+ SwapResult f;
+
+ f = swap_result_from_string(value);
+ if (f < 0)
+ log_unit_debug(u, "Failed to parse result value: %s", value);
+ else if (f != SWAP_SUCCESS)
+ s->result = f;
+ } else if (streq(key, "control-pid")) {
+ pid_t pid;
+
+ if (parse_pid(value, &pid) < 0)
+ log_unit_debug(u, "Failed to parse control-pid value: %s", value);
+ else
+ s->control_pid = pid;
+
+ } else if (streq(key, "control-command")) {
+ SwapExecCommand id;
+
+ id = swap_exec_command_from_string(value);
+ if (id < 0)
+ log_unit_debug(u, "Failed to parse exec-command value: %s", value);
+ else {
+ s->control_command_id = id;
+ s->control_command = s->exec_command + id;
+ }
+ } else
+ log_unit_debug(u, "Unknown serialization key: %s", key);
+
+ return 0;
+}
+
+static void swap_sigchld_event(Unit *u, pid_t pid, int code, int status) {
+ Swap *s = SWAP(u);
+ SwapResult f;
+
+ assert(s);
+ assert(pid >= 0);
+
+ if (pid != s->control_pid)
+ return;
+
+ /* Let's scan /proc/swaps before we process SIGCHLD. For the reasoning see the similar code in
+ * mount.c */
+ (void) swap_process_proc_swaps(u->manager);
+
+ s->control_pid = 0;
+
+ if (is_clean_exit(code, status, EXIT_CLEAN_COMMAND, NULL))
+ f = SWAP_SUCCESS;
+ else if (code == CLD_EXITED)
+ f = SWAP_FAILURE_EXIT_CODE;
+ else if (code == CLD_KILLED)
+ f = SWAP_FAILURE_SIGNAL;
+ else if (code == CLD_DUMPED)
+ f = SWAP_FAILURE_CORE_DUMP;
+ else
+ assert_not_reached();
+
+ if (s->result == SWAP_SUCCESS)
+ s->result = f;
+
+ if (s->control_command) {
+ exec_status_exit(&s->control_command->exec_status, &s->exec_context, pid, code, status);
+
+ s->control_command = NULL;
+ s->control_command_id = _SWAP_EXEC_COMMAND_INVALID;
+ }
+
+ unit_log_process_exit(
+ u,
+ "Swap process",
+ swap_exec_command_to_string(s->control_command_id),
+ f == SWAP_SUCCESS,
+ code, status);
+
+ switch (s->state) {
+
+ case SWAP_ACTIVATING:
+ case SWAP_ACTIVATING_DONE:
+
+ if (f == SWAP_SUCCESS || s->from_proc_swaps)
+ swap_enter_active(s, f);
+ else
+ swap_enter_dead(s, f);
+ break;
+
+ case SWAP_DEACTIVATING:
+ case SWAP_DEACTIVATING_SIGKILL:
+ case SWAP_DEACTIVATING_SIGTERM:
+
+ swap_enter_dead_or_active(s, f);
+ break;
+
+ case SWAP_CLEANING:
+ if (s->clean_result == SWAP_SUCCESS)
+ s->clean_result = f;
+
+ swap_enter_dead(s, SWAP_SUCCESS);
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ /* Notify clients about changed exit status */
+ unit_add_to_dbus_queue(u);
+}
+
+static int swap_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) {
+ Swap *s = SWAP(userdata);
+
+ assert(s);
+ assert(s->timer_event_source == source);
+
+ switch (s->state) {
+
+ case SWAP_ACTIVATING:
+ case SWAP_ACTIVATING_DONE:
+ log_unit_warning(UNIT(s), "Activation timed out. Stopping.");
+ swap_enter_signal(s, SWAP_DEACTIVATING_SIGTERM, SWAP_FAILURE_TIMEOUT);
+ break;
+
+ case SWAP_DEACTIVATING:
+ log_unit_warning(UNIT(s), "Deactivation timed out. Stopping.");
+ swap_enter_signal(s, SWAP_DEACTIVATING_SIGTERM, SWAP_FAILURE_TIMEOUT);
+ break;
+
+ case SWAP_DEACTIVATING_SIGTERM:
+ if (s->kill_context.send_sigkill) {
+ log_unit_warning(UNIT(s), "Swap process timed out. Killing.");
+ swap_enter_signal(s, SWAP_DEACTIVATING_SIGKILL, SWAP_FAILURE_TIMEOUT);
+ } else {
+ log_unit_warning(UNIT(s), "Swap process timed out. Skipping SIGKILL. Ignoring.");
+ swap_enter_dead_or_active(s, SWAP_FAILURE_TIMEOUT);
+ }
+ break;
+
+ case SWAP_DEACTIVATING_SIGKILL:
+ log_unit_warning(UNIT(s), "Swap process still around after SIGKILL. Ignoring.");
+ swap_enter_dead_or_active(s, SWAP_FAILURE_TIMEOUT);
+ break;
+
+ case SWAP_CLEANING:
+ log_unit_warning(UNIT(s), "Cleaning timed out. killing.");
+
+ if (s->clean_result == SWAP_SUCCESS)
+ s->clean_result = SWAP_FAILURE_TIMEOUT;
+
+ swap_enter_signal(s, SWAP_DEACTIVATING_SIGKILL, 0);
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ return 0;
+}
+
+static int swap_load_proc_swaps(Manager *m, bool set_flags) {
+ assert(m);
+
+ rewind(m->proc_swaps);
+
+ (void) fscanf(m->proc_swaps, "%*s %*s %*s %*s %*s\n");
+
+ for (unsigned i = 1;; i++) {
+ _cleanup_free_ char *dev = NULL, *d = NULL;
+ int prio = 0, k;
+
+ k = fscanf(m->proc_swaps,
+ "%ms " /* device/file */
+ "%*s " /* type of swap */
+ "%*s " /* swap size */
+ "%*s " /* used */
+ "%i\n", /* priority */
+ &dev, &prio);
+ if (k != 2) {
+ if (k == EOF)
+ break;
+
+ log_warning("Failed to parse /proc/swaps:%u.", i);
+ continue;
+ }
+
+ ssize_t l = cunescape(dev, UNESCAPE_RELAX, &d);
+ if (l < 0)
+ return log_error_errno(l, "Failed to unescape device path: %m");
+
+ device_found_node(m, d, DEVICE_FOUND_SWAP, DEVICE_FOUND_SWAP);
+
+ (void) swap_process_new(m, d, prio, set_flags);
+ }
+
+ return 0;
+}
+
+static int swap_process_proc_swaps(Manager *m) {
+ int r;
+
+ assert(m);
+
+ r = swap_load_proc_swaps(m, true);
+ if (r < 0) {
+ log_error_errno(r, "Failed to reread /proc/swaps: %m");
+
+ /* Reset flags, just in case, for late calls */
+ LIST_FOREACH(units_by_type, u, m->units_by_type[UNIT_SWAP]) {
+ Swap *swap = SWAP(u);
+
+ assert(swap);
+
+ swap->is_active = swap->just_activated = false;
+ }
+
+ return 0;
+ }
+
+ manager_dispatch_load_queue(m);
+
+ LIST_FOREACH(units_by_type, u, m->units_by_type[UNIT_SWAP]) {
+ Swap *swap = SWAP(u);
+
+ assert(swap);
+
+ if (!swap->is_active) {
+
+ swap_unset_proc_swaps(swap);
+
+ switch (swap->state) {
+
+ case SWAP_ACTIVE:
+ /* This has just been deactivated */
+ swap_enter_dead(swap, SWAP_SUCCESS);
+ break;
+
+ default:
+ /* Fire again */
+ swap_set_state(swap, swap->state);
+ break;
+ }
+
+ if (swap->what)
+ device_found_node(m, swap->what, DEVICE_NOT_FOUND, DEVICE_FOUND_SWAP);
+
+ } else if (swap->just_activated) {
+
+ /* New swap entry */
+
+ switch (swap->state) {
+
+ case SWAP_DEAD:
+ case SWAP_FAILED:
+ (void) unit_acquire_invocation_id(u);
+ swap_cycle_clear(swap);
+ swap_enter_active(swap, SWAP_SUCCESS);
+ break;
+
+ case SWAP_ACTIVATING:
+ swap_set_state(swap, SWAP_ACTIVATING_DONE);
+ break;
+
+ default:
+ /* Nothing really changed, but let's
+ * issue an notification call
+ * nonetheless, in case somebody is
+ * waiting for this. */
+ swap_set_state(swap, swap->state);
+ break;
+ }
+ }
+
+ /* Reset the flags for later calls */
+ swap->is_active = swap->just_activated = false;
+ }
+
+ return 1;
+}
+
+static int swap_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+ Manager *m = ASSERT_PTR(userdata);
+
+ assert(revents & EPOLLPRI);
+
+ return swap_process_proc_swaps(m);
+}
+
+static Unit *swap_following(Unit *u) {
+ Swap *s = SWAP(u);
+ Swap *first = NULL;
+
+ assert(s);
+
+ /* If the user configured the swap through /etc/fstab or
+ * a device unit, follow that. */
+
+ if (s->from_fragment)
+ return NULL;
+
+ LIST_FOREACH_OTHERS(same_devnode, other, s)
+ if (other->from_fragment)
+ return UNIT(other);
+
+ /* Otherwise, make everybody follow the unit that's named after
+ * the swap device in the kernel */
+
+ if (streq_ptr(s->what, s->devnode))
+ return NULL;
+
+ LIST_FOREACH(same_devnode, other, s->same_devnode_next)
+ if (streq_ptr(other->what, other->devnode))
+ return UNIT(other);
+
+ LIST_FOREACH_BACKWARDS(same_devnode, other, s->same_devnode_prev) {
+ if (streq_ptr(other->what, other->devnode))
+ return UNIT(other);
+
+ first = other;
+ }
+
+ /* Fall back to the first on the list */
+ return UNIT(first);
+}
+
+static int swap_following_set(Unit *u, Set **_set) {
+ Swap *s = SWAP(u);
+ _cleanup_set_free_ Set *set = NULL;
+ int r;
+
+ assert(s);
+ assert(_set);
+
+ if (LIST_JUST_US(same_devnode, s)) {
+ *_set = NULL;
+ return 0;
+ }
+
+ set = set_new(NULL);
+ if (!set)
+ return -ENOMEM;
+
+ LIST_FOREACH_OTHERS(same_devnode, other, s) {
+ r = set_put(set, other);
+ if (r < 0)
+ return r;
+ }
+
+ *_set = TAKE_PTR(set);
+ return 1;
+}
+
+static void swap_shutdown(Manager *m) {
+ assert(m);
+
+ m->swap_event_source = sd_event_source_disable_unref(m->swap_event_source);
+ m->proc_swaps = safe_fclose(m->proc_swaps);
+ m->swaps_by_devnode = hashmap_free(m->swaps_by_devnode);
+}
+
+static void swap_enumerate(Manager *m) {
+ int r;
+
+ assert(m);
+
+ if (!m->proc_swaps) {
+ m->proc_swaps = fopen("/proc/swaps", "re");
+ if (!m->proc_swaps) {
+ if (errno == ENOENT)
+ log_debug_errno(errno, "Not swap enabled, skipping enumeration.");
+ else
+ log_warning_errno(errno, "Failed to open /proc/swaps, ignoring: %m");
+
+ return;
+ }
+
+ r = sd_event_add_io(m->event, &m->swap_event_source, fileno(m->proc_swaps), EPOLLPRI, swap_dispatch_io, m);
+ if (r < 0) {
+ log_error_errno(r, "Failed to watch /proc/swaps: %m");
+ goto fail;
+ }
+
+ /* Dispatch this before we dispatch SIGCHLD, so that
+ * we always get the events from /proc/swaps before
+ * the SIGCHLD of /sbin/swapon. */
+ r = sd_event_source_set_priority(m->swap_event_source, SD_EVENT_PRIORITY_NORMAL-10);
+ if (r < 0) {
+ log_error_errno(r, "Failed to change /proc/swaps priority: %m");
+ goto fail;
+ }
+
+ (void) sd_event_source_set_description(m->swap_event_source, "swap-proc");
+ }
+
+ r = swap_load_proc_swaps(m, false);
+ if (r < 0)
+ goto fail;
+
+ return;
+
+fail:
+ swap_shutdown(m);
+}
+
+int swap_process_device_new(Manager *m, sd_device *dev) {
+ _cleanup_free_ char *e = NULL;
+ const char *dn, *devlink;
+ Unit *u;
+ int r;
+
+ assert(m);
+ assert(dev);
+
+ if (sd_device_get_devname(dev, &dn) < 0)
+ return 0;
+
+ r = unit_name_from_path(dn, ".swap", &e);
+ if (r < 0) {
+ log_debug_errno(r, "Cannot convert device name '%s' to unit name, ignoring: %m", dn);
+ return 0;
+ }
+
+ u = manager_get_unit(m, e);
+ if (u)
+ r = swap_set_devnode(SWAP(u), dn);
+
+ FOREACH_DEVICE_DEVLINK(dev, devlink) {
+ _cleanup_free_ char *n = NULL;
+ int q;
+
+ q = unit_name_from_path(devlink, ".swap", &n);
+ if (q == -EINVAL) /* If the name is not convertible to unit name, we can't manage it */
+ continue;
+ if (q < 0)
+ return q;
+
+ u = manager_get_unit(m, n);
+ if (u) {
+ q = swap_set_devnode(SWAP(u), dn);
+ if (q < 0)
+ r = q;
+ }
+ }
+
+ return r;
+}
+
+int swap_process_device_remove(Manager *m, sd_device *dev) {
+ const char *dn;
+ int r;
+ Swap *s;
+
+ r = sd_device_get_devname(dev, &dn);
+ if (r < 0)
+ return 0;
+
+ while ((s = hashmap_get(m->swaps_by_devnode, dn))) {
+ int q;
+
+ q = swap_set_devnode(s, NULL);
+ if (q < 0)
+ r = q;
+ }
+
+ return r;
+}
+
+static void swap_reset_failed(Unit *u) {
+ Swap *s = SWAP(u);
+
+ assert(s);
+
+ if (s->state == SWAP_FAILED)
+ swap_set_state(s, SWAP_DEAD);
+
+ s->result = SWAP_SUCCESS;
+ s->clean_result = SWAP_SUCCESS;
+}
+
+static int swap_kill(Unit *u, KillWho who, int signo, sd_bus_error *error) {
+ return unit_kill_common(u, who, signo, -1, SWAP(u)->control_pid, error);
+}
+
+static int swap_get_timeout(Unit *u, usec_t *timeout) {
+ Swap *s = SWAP(u);
+ usec_t t;
+ int r;
+
+ assert(s);
+ assert(u);
+
+ if (!s->timer_event_source)
+ return 0;
+
+ r = sd_event_source_get_time(s->timer_event_source, &t);
+ if (r < 0)
+ return r;
+ if (t == USEC_INFINITY)
+ return 0;
+
+ *timeout = t;
+ return 1;
+}
+
+static bool swap_supported(void) {
+ static int supported = -1;
+
+ /* If swap support is not available in the kernel, or we are
+ * running in a container we don't support swap units, and any
+ * attempts to starting one should fail immediately. */
+
+ if (supported < 0)
+ supported =
+ access("/proc/swaps", F_OK) >= 0 &&
+ detect_container() <= 0;
+
+ return supported;
+}
+
+static int swap_control_pid(Unit *u) {
+ Swap *s = SWAP(u);
+
+ assert(s);
+
+ return s->control_pid;
+}
+
+static int swap_clean(Unit *u, ExecCleanMask mask) {
+ _cleanup_strv_free_ char **l = NULL;
+ Swap *s = SWAP(u);
+ int r;
+
+ assert(s);
+ assert(mask != 0);
+
+ if (s->state != SWAP_DEAD)
+ return -EBUSY;
+
+ r = exec_context_get_clean_directories(&s->exec_context, u->manager->prefix, mask, &l);
+ if (r < 0)
+ return r;
+
+ if (strv_isempty(l))
+ return -EUNATCH;
+
+ swap_unwatch_control_pid(s);
+ s->clean_result = SWAP_SUCCESS;
+ s->control_command = NULL;
+ s->control_command_id = _SWAP_EXEC_COMMAND_INVALID;
+
+ r = swap_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->exec_context.timeout_clean_usec));
+ if (r < 0)
+ goto fail;
+
+ r = unit_fork_and_watch_rm_rf(u, l, &s->control_pid);
+ if (r < 0)
+ goto fail;
+
+ swap_set_state(s, SWAP_CLEANING);
+
+ return 0;
+
+fail:
+ log_unit_warning_errno(u, r, "Failed to initiate cleaning: %m");
+ s->clean_result = SWAP_FAILURE_RESOURCES;
+ s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+ return r;
+}
+
+static int swap_can_clean(Unit *u, ExecCleanMask *ret) {
+ Swap *s = SWAP(u);
+
+ assert(s);
+
+ return exec_context_get_clean_mask(&s->exec_context, ret);
+}
+
+static int swap_can_start(Unit *u) {
+ Swap *s = SWAP(u);
+ int r;
+
+ assert(s);
+
+ r = unit_test_start_limit(u);
+ if (r < 0) {
+ swap_enter_dead(s, SWAP_FAILURE_START_LIMIT_HIT);
+ return r;
+ }
+
+ return 1;
+}
+
+static const char* const swap_exec_command_table[_SWAP_EXEC_COMMAND_MAX] = {
+ [SWAP_EXEC_ACTIVATE] = "ExecActivate",
+ [SWAP_EXEC_DEACTIVATE] = "ExecDeactivate",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(swap_exec_command, SwapExecCommand);
+
+static const char* const swap_result_table[_SWAP_RESULT_MAX] = {
+ [SWAP_SUCCESS] = "success",
+ [SWAP_FAILURE_RESOURCES] = "resources",
+ [SWAP_FAILURE_TIMEOUT] = "timeout",
+ [SWAP_FAILURE_EXIT_CODE] = "exit-code",
+ [SWAP_FAILURE_SIGNAL] = "signal",
+ [SWAP_FAILURE_CORE_DUMP] = "core-dump",
+ [SWAP_FAILURE_START_LIMIT_HIT] = "start-limit-hit",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(swap_result, SwapResult);
+
+const UnitVTable swap_vtable = {
+ .object_size = sizeof(Swap),
+ .exec_context_offset = offsetof(Swap, exec_context),
+ .cgroup_context_offset = offsetof(Swap, cgroup_context),
+ .kill_context_offset = offsetof(Swap, kill_context),
+ .exec_runtime_offset = offsetof(Swap, exec_runtime),
+ .dynamic_creds_offset = offsetof(Swap, dynamic_creds),
+
+ .sections =
+ "Unit\0"
+ "Swap\0"
+ "Install\0",
+ .private_section = "Swap",
+
+ .can_fail = true,
+
+ .init = swap_init,
+ .load = swap_load,
+ .done = swap_done,
+
+ .coldplug = swap_coldplug,
+
+ .dump = swap_dump,
+
+ .start = swap_start,
+ .stop = swap_stop,
+
+ .kill = swap_kill,
+ .clean = swap_clean,
+ .can_clean = swap_can_clean,
+
+ .get_timeout = swap_get_timeout,
+
+ .serialize = swap_serialize,
+ .deserialize_item = swap_deserialize_item,
+
+ .active_state = swap_active_state,
+ .sub_state_to_string = swap_sub_state_to_string,
+
+ .will_restart = unit_will_restart_default,
+
+ .may_gc = swap_may_gc,
+ .is_extrinsic = swap_is_extrinsic,
+
+ .sigchld_event = swap_sigchld_event,
+
+ .reset_failed = swap_reset_failed,
+
+ .control_pid = swap_control_pid,
+
+ .bus_set_property = bus_swap_set_property,
+ .bus_commit_properties = bus_swap_commit_properties,
+
+ .following = swap_following,
+ .following_set = swap_following_set,
+
+ .enumerate = swap_enumerate,
+ .shutdown = swap_shutdown,
+ .supported = swap_supported,
+
+ .status_message_formats = {
+ .starting_stopping = {
+ [0] = "Activating swap %s...",
+ [1] = "Deactivating swap %s...",
+ },
+ .finished_start_job = {
+ [JOB_DONE] = "Activated swap %s.",
+ [JOB_FAILED] = "Failed to activate swap %s.",
+ [JOB_TIMEOUT] = "Timed out activating swap %s.",
+ },
+ .finished_stop_job = {
+ [JOB_DONE] = "Deactivated swap %s.",
+ [JOB_FAILED] = "Failed deactivating swap %s.",
+ [JOB_TIMEOUT] = "Timed out deactivating swap %s.",
+ },
+ },
+
+ .can_start = swap_can_start,
+};
diff --git a/src/core/swap.h b/src/core/swap.h
new file mode 100644
index 0000000..c0e3f11
--- /dev/null
+++ b/src/core/swap.h
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+/***
+ Copyright © 2010 Maarten Lankhorst
+***/
+
+#include "sd-device.h"
+#include "unit.h"
+
+typedef struct Swap Swap;
+
+typedef enum SwapExecCommand {
+ SWAP_EXEC_ACTIVATE,
+ SWAP_EXEC_DEACTIVATE,
+ _SWAP_EXEC_COMMAND_MAX,
+ _SWAP_EXEC_COMMAND_INVALID = -EINVAL,
+} SwapExecCommand;
+
+typedef enum SwapResult {
+ SWAP_SUCCESS,
+ SWAP_FAILURE_RESOURCES,
+ SWAP_FAILURE_TIMEOUT,
+ SWAP_FAILURE_EXIT_CODE,
+ SWAP_FAILURE_SIGNAL,
+ SWAP_FAILURE_CORE_DUMP,
+ SWAP_FAILURE_START_LIMIT_HIT,
+ _SWAP_RESULT_MAX,
+ _SWAP_RESULT_INVALID = -EINVAL,
+} SwapResult;
+
+typedef struct SwapParameters {
+ char *what;
+ char *options;
+ int priority;
+ bool priority_set;
+} SwapParameters;
+
+struct Swap {
+ Unit meta;
+
+ char *what;
+
+ /* If the device has already shown up, this is the device
+ * node, which might be different from what, due to
+ * symlinks */
+ char *devnode;
+
+ SwapParameters parameters_proc_swaps;
+ SwapParameters parameters_fragment;
+
+ bool from_proc_swaps:1;
+ bool from_fragment:1;
+
+ /* Used while looking for swaps that vanished or got added
+ * from/to /proc/swaps */
+ bool is_active:1;
+ bool just_activated:1;
+
+ SwapResult result;
+ SwapResult clean_result;
+
+ usec_t timeout_usec;
+
+ ExecCommand exec_command[_SWAP_EXEC_COMMAND_MAX];
+ ExecContext exec_context;
+ KillContext kill_context;
+ CGroupContext cgroup_context;
+
+ ExecRuntime *exec_runtime;
+ DynamicCreds dynamic_creds;
+
+ SwapState state, deserialized_state;
+
+ ExecCommand* control_command;
+ SwapExecCommand control_command_id;
+ pid_t control_pid;
+
+ sd_event_source *timer_event_source;
+
+ /* In order to be able to distinguish dependencies on
+ different device nodes we might end up creating multiple
+ devices for the same swap. We chain them up here. */
+
+ LIST_FIELDS(struct Swap, same_devnode);
+};
+
+extern const UnitVTable swap_vtable;
+
+int swap_process_device_new(Manager *m, sd_device *dev);
+int swap_process_device_remove(Manager *m, sd_device *dev);
+
+const char* swap_exec_command_to_string(SwapExecCommand i) _const_;
+SwapExecCommand swap_exec_command_from_string(const char *s) _pure_;
+
+const char* swap_result_to_string(SwapResult i) _const_;
+SwapResult swap_result_from_string(const char *s) _pure_;
+
+DEFINE_CAST(SWAP, Swap);
diff --git a/src/core/system.conf.in b/src/core/system.conf.in
new file mode 100644
index 0000000..a839d0a
--- /dev/null
+++ b/src/core/system.conf.in
@@ -0,0 +1,77 @@
+# This file is part of systemd.
+#
+# systemd is free software; you can redistribute it and/or modify it under the
+# terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation; either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# Entries in this file show the compile time defaults. Local configuration
+# should be created by either modifying this file, or by creating "drop-ins" in
+# the system.conf.d/ subdirectory. The latter is generally recommended.
+# Defaults can be restored by simply deleting this file and all drop-ins.
+#
+# Use 'systemd-analyze cat-config systemd/system.conf' to display the full config.
+#
+# See systemd-system.conf(5) for details.
+
+[Manager]
+#LogLevel=info
+#LogTarget=journal-or-kmsg
+#LogColor=yes
+#LogLocation=no
+#LogTime=no
+#DumpCore=yes
+#ShowStatus=yes
+#CrashChangeVT=no
+#CrashShell=no
+#CrashReboot=no
+#CtrlAltDelBurstAction=reboot-force
+#CPUAffinity=
+#NUMAPolicy=default
+#NUMAMask=
+#RuntimeWatchdogSec=off
+#RuntimeWatchdogPreSec=off
+#RuntimeWatchdogPreGovernor=
+#RebootWatchdogSec=10min
+#KExecWatchdogSec=off
+#WatchdogDevice=
+#CapabilityBoundingSet=
+#NoNewPrivileges=no
+#SystemCallArchitectures=
+#TimerSlackNSec=
+#StatusUnitFormat={{STATUS_UNIT_FORMAT_DEFAULT_STR}}
+#DefaultTimerAccuracySec=1min
+#DefaultStandardOutput=journal
+#DefaultStandardError=inherit
+#DefaultTimeoutStartSec=90s
+#DefaultTimeoutStopSec=90s
+#DefaultTimeoutAbortSec=
+#DefaultDeviceTimeoutSec=90s
+#DefaultRestartSec=100ms
+#DefaultStartLimitIntervalSec=10s
+#DefaultStartLimitBurst=5
+#DefaultEnvironment=
+#DefaultCPUAccounting=yes
+#DefaultIOAccounting=no
+#DefaultIPAccounting=no
+#DefaultMemoryAccounting={{ 'yes' if MEMORY_ACCOUNTING_DEFAULT else 'no' }}
+#DefaultTasksAccounting=yes
+#DefaultTasksMax=15%
+#DefaultLimitCPU=
+#DefaultLimitFSIZE=
+#DefaultLimitDATA=
+#DefaultLimitSTACK=
+#DefaultLimitCORE=
+#DefaultLimitRSS=
+#DefaultLimitNOFILE=1024:{{HIGH_RLIMIT_NOFILE}}
+#DefaultLimitAS=
+#DefaultLimitNPROC=
+#DefaultLimitMEMLOCK=8M
+#DefaultLimitLOCKS=
+#DefaultLimitSIGPENDING=
+#DefaultLimitMSGQUEUE=
+#DefaultLimitNICE=
+#DefaultLimitRTPRIO=
+#DefaultLimitRTTIME=
+#DefaultOOMPolicy=stop
+#DefaultSmackProcessLabel=
diff --git a/src/core/systemd.pc.in b/src/core/systemd.pc.in
new file mode 100644
index 0000000..693433b
--- /dev/null
+++ b/src/core/systemd.pc.in
@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+#
+# This file is part of systemd.
+#
+# systemd is free software; you can redistribute it and/or modify it
+# under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or
+# (at your option) any later version.
+
+# Names with prefixes are preferred, and the run-together names should be
+# considered deprecated (though there is no plan to remove them). New names
+# shall have underscores.
+
+prefix=/usr
+root_prefix={{ROOTPREFIX_NOSLASH}}
+rootprefix=${root_prefix}
+sysconf_dir={{SYSCONF_DIR}}
+sysconfdir=${sysconf_dir}
+
+systemd_util_dir=${root_prefix}/lib/systemd
+systemdutildir=${systemd_util_dir}
+
+systemd_system_unit_dir=${rootprefix}/lib/systemd/system
+systemdsystemunitdir=${systemd_system_unit_dir}
+
+systemd_system_preset_dir=${rootprefix}/lib/systemd/system-preset
+systemdsystempresetdir=${systemd_system_preset_dir}
+
+systemd_user_unit_dir=${prefix}/lib/systemd/user
+systemduserunitdir=${systemd_user_unit_dir}
+
+systemd_user_preset_dir=${prefix}/lib/systemd/user-preset
+systemduserpresetdir=${systemd_user_preset_dir}
+
+systemd_system_conf_dir=${sysconfdir}/systemd/system
+systemdsystemconfdir=${systemd_system_conf_dir}
+
+systemd_user_conf_dir=${sysconfdir}/systemd/user
+systemduserconfdir=${systemd_user_conf_dir}
+
+systemd_system_unit_path=${systemd_system_conf_dir}:/etc/systemd/system:/run/systemd/system:/usr/local/lib/systemd/system:${systemd_system_unit_dir}:/usr/lib/systemd/system:/lib/systemd/system
+systemdsystemunitpath=${systemd_system_unit_path}
+
+systemd_user_unit_path=${systemd_user_conf_dir}:/etc/systemd/user:/run/systemd/user:/usr/local/lib/systemd/user:/usr/local/share/systemd/user:${systemd_user_unit_dir}:/usr/lib/systemd/user:/usr/share/systemd/user
+systemduserunitpath=${systemd_user_unit_path}
+
+systemd_system_generator_dir=${root_prefix}/lib/systemd/system-generators
+systemdsystemgeneratordir=${systemd_system_generator_dir}
+
+systemd_user_generator_dir=${prefix}/lib/systemd/user-generators
+systemdusergeneratordir=${systemd_user_generator_dir}
+
+systemd_system_generator_path=/run/systemd/system-generators:/etc/systemd/system-generators:/usr/local/lib/systemd/system-generators:${systemd_system_generator_dir}
+systemdsystemgeneratorpath=${systemd_system_generator_path}
+
+systemd_user_generator_path=/run/systemd/user-generators:/etc/systemd/user-generators:/usr/local/lib/systemd/user-generators:${systemd_user_generator_dir}
+systemdusergeneratorpath=${systemd_user_generator_path}
+
+systemd_sleep_dir=${root_prefix}/lib/systemd/system-sleep
+systemdsleepdir=${systemd_sleep_dir}
+
+systemd_shutdown_dir=${root_prefix}/lib/systemd/system-shutdown
+systemdshutdowndir=${systemd_shutdown_dir}
+
+tmpfiles_dir=${prefix}/lib/tmpfiles.d
+tmpfilesdir=${tmpfiles_dir}
+
+user_tmpfiles_dir=${prefix}/share/user-tmpfiles.d
+
+sysusers_dir=${rootprefix}/lib/sysusers.d
+sysusersdir=${sysusers_dir}
+
+sysctl_dir=${rootprefix}/lib/sysctl.d
+sysctldir=${sysctl_dir}
+
+binfmt_dir=${rootprefix}/lib/binfmt.d
+binfmtdir=${binfmt_dir}
+
+modules_load_dir=${rootprefix}/lib/modules-load.d
+modulesloaddir=${modules_load_dir}
+
+catalog_dir=${prefix}/lib/systemd/catalog
+catalogdir=${catalog_dir}
+
+system_uid_max={{SYSTEM_UID_MAX}}
+systemuidmax=${system_uid_max}
+system_gid_max={{SYSTEM_GID_MAX}}
+systemgidmax=${system_gid_max}
+
+dynamic_uid_min={{DYNAMIC_UID_MIN}}
+dynamicuidmin=${dynamic_uid_min}
+dynamic_uid_max={{DYNAMIC_UID_MAX}}
+dynamicuidmax=${dynamic_uid_max}
+
+container_uid_base_min={{CONTAINER_UID_BASE_MIN}}
+containeruidbasemin=${container_uid_base_min}
+container_uid_base_max={{CONTAINER_UID_BASE_MAX}}
+containeruidbasemax=${container_uid_base_max}
+
+Name: systemd
+Description: systemd System and Service Manager
+URL: {{PROJECT_URL}}
+Version: {{PROJECT_VERSION}}
diff --git a/src/core/target.c b/src/core/target.c
new file mode 100644
index 0000000..6225df5
--- /dev/null
+++ b/src/core/target.c
@@ -0,0 +1,216 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "dbus-target.h"
+#include "dbus-unit.h"
+#include "log.h"
+#include "serialize.h"
+#include "special.h"
+#include "string-util.h"
+#include "target.h"
+#include "unit-name.h"
+#include "unit.h"
+
+static const UnitActiveState state_translation_table[_TARGET_STATE_MAX] = {
+ [TARGET_DEAD] = UNIT_INACTIVE,
+ [TARGET_ACTIVE] = UNIT_ACTIVE
+};
+
+static void target_set_state(Target *t, TargetState state) {
+ TargetState old_state;
+ assert(t);
+
+ if (t->state != state)
+ bus_unit_send_pending_change_signal(UNIT(t), false);
+
+ old_state = t->state;
+ t->state = state;
+
+ if (state != old_state)
+ log_debug("%s changed %s -> %s",
+ UNIT(t)->id,
+ target_state_to_string(old_state),
+ target_state_to_string(state));
+
+ unit_notify(UNIT(t), state_translation_table[old_state], state_translation_table[state], 0);
+}
+
+static int target_add_default_dependencies(Target *t) {
+ _cleanup_free_ Unit **others = NULL;
+ int r, n_others;
+
+ assert(t);
+
+ if (!UNIT(t)->default_dependencies)
+ return 0;
+
+ /* Imply ordering for requirement dependencies on target units. Note that when the user created a
+ * contradicting ordering manually we won't add anything in here to make sure we don't create a
+ * loop.
+ *
+ * Note that quite likely iterating through these dependencies will add new dependencies, which
+ * conflicts with the hashmap-based iteration logic. Hence, instead of iterating through the
+ * dependencies and acting on them as we go, first take an "atomic snapshot" of sorts and iterate
+ * through that. */
+
+ n_others = unit_get_dependency_array(UNIT(t), UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE, &others);
+ if (n_others < 0)
+ return n_others;
+
+ for (int i = 0; i < n_others; i++) {
+ r = unit_add_default_target_dependency(others[i], UNIT(t));
+ if (r < 0)
+ return r;
+ }
+
+ if (unit_has_name(UNIT(t), SPECIAL_SHUTDOWN_TARGET))
+ return 0;
+
+ /* Make sure targets are unloaded on shutdown */
+ return unit_add_two_dependencies_by_name(UNIT(t), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+}
+
+static int target_load(Unit *u) {
+ Target *t = TARGET(u);
+ int r;
+
+ assert(t);
+
+ r = unit_load_fragment_and_dropin(u, true);
+ if (r < 0)
+ return r;
+
+ if (u->load_state != UNIT_LOADED)
+ return 0;
+
+ /* This is a new unit? Then let's add in some extras */
+ return target_add_default_dependencies(t);
+}
+
+static int target_coldplug(Unit *u) {
+ Target *t = TARGET(u);
+
+ assert(t);
+ assert(t->state == TARGET_DEAD);
+
+ if (t->deserialized_state != t->state)
+ target_set_state(t, t->deserialized_state);
+
+ return 0;
+}
+
+static void target_dump(Unit *u, FILE *f, const char *prefix) {
+ Target *t = TARGET(u);
+
+ assert(t);
+ assert(f);
+
+ fprintf(f,
+ "%sTarget State: %s\n",
+ prefix, target_state_to_string(t->state));
+}
+
+static int target_start(Unit *u) {
+ Target *t = TARGET(u);
+ int r;
+
+ assert(t);
+ assert(t->state == TARGET_DEAD);
+
+ r = unit_acquire_invocation_id(u);
+ if (r < 0)
+ return r;
+
+ target_set_state(t, TARGET_ACTIVE);
+ return 1;
+}
+
+static int target_stop(Unit *u) {
+ Target *t = TARGET(u);
+
+ assert(t);
+ assert(t->state == TARGET_ACTIVE);
+
+ target_set_state(t, TARGET_DEAD);
+ return 1;
+}
+
+static int target_serialize(Unit *u, FILE *f, FDSet *fds) {
+ Target *s = TARGET(u);
+
+ assert(s);
+ assert(f);
+ assert(fds);
+
+ (void) serialize_item(f, "state", target_state_to_string(s->state));
+ return 0;
+}
+
+static int target_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
+ Target *s = TARGET(u);
+
+ assert(s);
+ assert(u);
+ assert(key);
+ assert(value);
+ assert(fds);
+
+ if (streq(key, "state")) {
+ TargetState state;
+
+ state = target_state_from_string(value);
+ if (state < 0)
+ log_debug("Failed to parse state value %s", value);
+ else
+ s->deserialized_state = state;
+
+ } else
+ log_debug("Unknown serialization key '%s'", key);
+
+ return 0;
+}
+
+_pure_ static UnitActiveState target_active_state(Unit *u) {
+ assert(u);
+
+ return state_translation_table[TARGET(u)->state];
+}
+
+_pure_ static const char *target_sub_state_to_string(Unit *u) {
+ assert(u);
+
+ return target_state_to_string(TARGET(u)->state);
+}
+
+const UnitVTable target_vtable = {
+ .object_size = sizeof(Target),
+
+ .sections =
+ "Unit\0"
+ "Target\0"
+ "Install\0",
+
+ .can_fail = true,
+
+ .load = target_load,
+ .coldplug = target_coldplug,
+
+ .dump = target_dump,
+
+ .start = target_start,
+ .stop = target_stop,
+
+ .serialize = target_serialize,
+ .deserialize_item = target_deserialize_item,
+
+ .active_state = target_active_state,
+ .sub_state_to_string = target_sub_state_to_string,
+
+ .status_message_formats = {
+ .finished_start_job = {
+ [JOB_DONE] = "Reached target %s.",
+ },
+ .finished_stop_job = {
+ [JOB_DONE] = "Stopped target %s.",
+ },
+ },
+};
diff --git a/src/core/target.h b/src/core/target.h
new file mode 100644
index 0000000..bb909d6
--- /dev/null
+++ b/src/core/target.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "unit.h"
+
+typedef struct Target Target;
+
+struct Target {
+ Unit meta;
+
+ TargetState state, deserialized_state;
+};
+
+extern const UnitVTable target_vtable;
+
+DEFINE_CAST(TARGET, Target);
diff --git a/src/core/timer.c b/src/core/timer.c
new file mode 100644
index 0000000..60e8fea
--- /dev/null
+++ b/src/core/timer.c
@@ -0,0 +1,1060 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <errno.h>
+
+#include "alloc-util.h"
+#include "bus-error.h"
+#include "bus-util.h"
+#include "dbus-timer.h"
+#include "dbus-unit.h"
+#include "fs-util.h"
+#include "parse-util.h"
+#include "random-util.h"
+#include "serialize.h"
+#include "special.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "timer.h"
+#include "unit-name.h"
+#include "unit.h"
+#include "user-util.h"
+#include "virt.h"
+
+static const UnitActiveState state_translation_table[_TIMER_STATE_MAX] = {
+ [TIMER_DEAD] = UNIT_INACTIVE,
+ [TIMER_WAITING] = UNIT_ACTIVE,
+ [TIMER_RUNNING] = UNIT_ACTIVE,
+ [TIMER_ELAPSED] = UNIT_ACTIVE,
+ [TIMER_FAILED] = UNIT_FAILED
+};
+
+static int timer_dispatch(sd_event_source *s, uint64_t usec, void *userdata);
+
+static void timer_init(Unit *u) {
+ Timer *t = TIMER(u);
+
+ assert(u);
+ assert(u->load_state == UNIT_STUB);
+
+ t->next_elapse_monotonic_or_boottime = USEC_INFINITY;
+ t->next_elapse_realtime = USEC_INFINITY;
+ t->accuracy_usec = u->manager->default_timer_accuracy_usec;
+ t->remain_after_elapse = true;
+}
+
+void timer_free_values(Timer *t) {
+ TimerValue *v;
+
+ assert(t);
+
+ while ((v = t->values)) {
+ LIST_REMOVE(value, t->values, v);
+ calendar_spec_free(v->calendar_spec);
+ free(v);
+ }
+}
+
+static void timer_done(Unit *u) {
+ Timer *t = TIMER(u);
+
+ assert(t);
+
+ timer_free_values(t);
+
+ t->monotonic_event_source = sd_event_source_disable_unref(t->monotonic_event_source);
+ t->realtime_event_source = sd_event_source_disable_unref(t->realtime_event_source);
+
+ t->stamp_path = mfree(t->stamp_path);
+}
+
+static int timer_verify(Timer *t) {
+ assert(t);
+ assert(UNIT(t)->load_state == UNIT_LOADED);
+
+ if (!t->values && !t->on_clock_change && !t->on_timezone_change)
+ return log_unit_error_errno(UNIT(t), SYNTHETIC_ERRNO(ENOEXEC), "Timer unit lacks value setting. Refusing.");
+
+ return 0;
+}
+
+static int timer_add_default_dependencies(Timer *t) {
+ int r;
+
+ assert(t);
+
+ if (!UNIT(t)->default_dependencies)
+ return 0;
+
+ r = unit_add_dependency_by_name(UNIT(t), UNIT_BEFORE, SPECIAL_TIMERS_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+ if (r < 0)
+ return r;
+
+ if (MANAGER_IS_SYSTEM(UNIT(t)->manager)) {
+ r = unit_add_two_dependencies_by_name(UNIT(t), UNIT_AFTER, UNIT_REQUIRES, SPECIAL_SYSINIT_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(value, v, t->values) {
+ if (v->base != TIMER_CALENDAR)
+ continue;
+
+ FOREACH_STRING(target, SPECIAL_TIME_SYNC_TARGET, SPECIAL_TIME_SET_TARGET) {
+ r = unit_add_dependency_by_name(UNIT(t), UNIT_AFTER, target, true, UNIT_DEPENDENCY_DEFAULT);
+ if (r < 0)
+ return r;
+ }
+
+ break;
+ }
+ }
+
+ return unit_add_two_dependencies_by_name(UNIT(t), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+}
+
+static int timer_add_trigger_dependencies(Timer *t) {
+ Unit *x;
+ int r;
+
+ assert(t);
+
+ if (UNIT_TRIGGER(UNIT(t)))
+ return 0;
+
+ r = unit_load_related_unit(UNIT(t), ".service", &x);
+ if (r < 0)
+ return r;
+
+ return unit_add_two_dependencies(UNIT(t), UNIT_BEFORE, UNIT_TRIGGERS, x, true, UNIT_DEPENDENCY_IMPLICIT);
+}
+
+static int timer_setup_persistent(Timer *t) {
+ _cleanup_free_ char *stamp_path = NULL;
+ int r;
+
+ assert(t);
+
+ if (!t->persistent)
+ return 0;
+
+ if (MANAGER_IS_SYSTEM(UNIT(t)->manager)) {
+
+ r = unit_require_mounts_for(UNIT(t), "/var/lib/systemd/timers", UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+
+ stamp_path = strjoin("/var/lib/systemd/timers/stamp-", UNIT(t)->id);
+ } else {
+ const char *e;
+
+ e = getenv("XDG_DATA_HOME");
+ if (e)
+ stamp_path = strjoin(e, "/systemd/timers/stamp-", UNIT(t)->id);
+ else {
+
+ _cleanup_free_ char *h = NULL;
+
+ r = get_home_dir(&h);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(t), r, "Failed to determine home directory: %m");
+
+ stamp_path = strjoin(h, "/.local/share/systemd/timers/stamp-", UNIT(t)->id);
+ }
+ }
+
+ if (!stamp_path)
+ return log_oom();
+
+ return free_and_replace(t->stamp_path, stamp_path);
+}
+
+static uint64_t timer_get_fixed_delay_hash(Timer *t) {
+ static const uint8_t hash_key[] = {
+ 0x51, 0x0a, 0xdb, 0x76, 0x29, 0x51, 0x42, 0xc2,
+ 0x80, 0x35, 0xea, 0xe6, 0x8e, 0x3a, 0x37, 0xbd
+ };
+
+ struct siphash state;
+ sd_id128_t machine_id;
+ uid_t uid;
+ int r;
+
+ assert(t);
+
+ uid = getuid();
+ r = sd_id128_get_machine(&machine_id);
+ if (r < 0) {
+ log_unit_debug_errno(UNIT(t), r,
+ "Failed to get machine ID for the fixed delay calculation, proceeding with 0: %m");
+ machine_id = SD_ID128_NULL;
+ }
+
+ siphash24_init(&state, hash_key);
+ siphash24_compress(&machine_id, sizeof(sd_id128_t), &state);
+ siphash24_compress_boolean(MANAGER_IS_SYSTEM(UNIT(t)->manager), &state);
+ siphash24_compress(&uid, sizeof(uid_t), &state);
+ siphash24_compress_string(UNIT(t)->id, &state);
+
+ return siphash24_finalize(&state);
+}
+
+static int timer_load(Unit *u) {
+ Timer *t = TIMER(u);
+ int r;
+
+ assert(u);
+ assert(u->load_state == UNIT_STUB);
+
+ r = unit_load_fragment_and_dropin(u, true);
+ if (r < 0)
+ return r;
+
+ if (u->load_state != UNIT_LOADED)
+ return 0;
+
+ /* This is a new unit? Then let's add in some extras */
+ r = timer_add_trigger_dependencies(t);
+ if (r < 0)
+ return r;
+
+ r = timer_setup_persistent(t);
+ if (r < 0)
+ return r;
+
+ r = timer_add_default_dependencies(t);
+ if (r < 0)
+ return r;
+
+ return timer_verify(t);
+}
+
+static void timer_dump(Unit *u, FILE *f, const char *prefix) {
+ Timer *t = TIMER(u);
+ Unit *trigger;
+
+ trigger = UNIT_TRIGGER(u);
+
+ fprintf(f,
+ "%sTimer State: %s\n"
+ "%sResult: %s\n"
+ "%sUnit: %s\n"
+ "%sPersistent: %s\n"
+ "%sWakeSystem: %s\n"
+ "%sAccuracy: %s\n"
+ "%sRemainAfterElapse: %s\n"
+ "%sFixedRandomDelay: %s\n"
+ "%sOnClockChange: %s\n"
+ "%sOnTimeZoneChange: %s\n",
+ prefix, timer_state_to_string(t->state),
+ prefix, timer_result_to_string(t->result),
+ prefix, trigger ? trigger->id : "n/a",
+ prefix, yes_no(t->persistent),
+ prefix, yes_no(t->wake_system),
+ prefix, FORMAT_TIMESPAN(t->accuracy_usec, 1),
+ prefix, yes_no(t->remain_after_elapse),
+ prefix, yes_no(t->fixed_random_delay),
+ prefix, yes_no(t->on_clock_change),
+ prefix, yes_no(t->on_timezone_change));
+
+ LIST_FOREACH(value, v, t->values)
+ if (v->base == TIMER_CALENDAR) {
+ _cleanup_free_ char *p = NULL;
+
+ (void) calendar_spec_to_string(v->calendar_spec, &p);
+
+ fprintf(f,
+ "%s%s: %s\n",
+ prefix,
+ timer_base_to_string(v->base),
+ strna(p));
+ } else
+ fprintf(f,
+ "%s%s: %s\n",
+ prefix,
+ timer_base_to_string(v->base),
+ FORMAT_TIMESPAN(v->value, 0));
+}
+
+static void timer_set_state(Timer *t, TimerState state) {
+ TimerState old_state;
+ assert(t);
+
+ if (t->state != state)
+ bus_unit_send_pending_change_signal(UNIT(t), false);
+
+ old_state = t->state;
+ t->state = state;
+
+ if (state != TIMER_WAITING) {
+ t->monotonic_event_source = sd_event_source_disable_unref(t->monotonic_event_source);
+ t->realtime_event_source = sd_event_source_disable_unref(t->realtime_event_source);
+ t->next_elapse_monotonic_or_boottime = USEC_INFINITY;
+ t->next_elapse_realtime = USEC_INFINITY;
+ }
+
+ if (state != old_state)
+ log_unit_debug(UNIT(t), "Changed %s -> %s", timer_state_to_string(old_state), timer_state_to_string(state));
+
+ unit_notify(UNIT(t), state_translation_table[old_state], state_translation_table[state], 0);
+}
+
+static void timer_enter_waiting(Timer *t, bool time_change);
+
+static int timer_coldplug(Unit *u) {
+ Timer *t = TIMER(u);
+
+ assert(t);
+ assert(t->state == TIMER_DEAD);
+
+ if (t->deserialized_state == t->state)
+ return 0;
+
+ if (t->deserialized_state == TIMER_WAITING)
+ timer_enter_waiting(t, false);
+ else
+ timer_set_state(t, t->deserialized_state);
+
+ return 0;
+}
+
+static void timer_enter_dead(Timer *t, TimerResult f) {
+ assert(t);
+
+ if (t->result == TIMER_SUCCESS)
+ t->result = f;
+
+ unit_log_result(UNIT(t), t->result == TIMER_SUCCESS, timer_result_to_string(t->result));
+ timer_set_state(t, t->result != TIMER_SUCCESS ? TIMER_FAILED : TIMER_DEAD);
+}
+
+static void timer_enter_elapsed(Timer *t, bool leave_around) {
+ assert(t);
+
+ /* If a unit is marked with RemainAfterElapse=yes we leave it
+ * around even after it elapsed once, so that starting it
+ * later again does not necessarily mean immediate
+ * retriggering. We unconditionally leave units with
+ * TIMER_UNIT_ACTIVE or TIMER_UNIT_INACTIVE triggers around,
+ * since they might be restarted automatically at any time
+ * later on. */
+
+ if (t->remain_after_elapse || leave_around)
+ timer_set_state(t, TIMER_ELAPSED);
+ else
+ timer_enter_dead(t, TIMER_SUCCESS);
+}
+
+static void add_random(Timer *t, usec_t *v) {
+ usec_t add;
+
+ assert(t);
+ assert(v);
+
+ if (t->random_usec == 0)
+ return;
+ if (*v == USEC_INFINITY)
+ return;
+
+ add = (t->fixed_random_delay ? timer_get_fixed_delay_hash(t) : random_u64()) % t->random_usec;
+
+ if (*v + add < *v) /* overflow */
+ *v = (usec_t) -2; /* Highest possible value, that is not USEC_INFINITY */
+ else
+ *v += add;
+
+ log_unit_debug(UNIT(t), "Adding %s random time.", FORMAT_TIMESPAN(add, 0));
+}
+
+static void timer_enter_waiting(Timer *t, bool time_change) {
+ bool found_monotonic = false, found_realtime = false;
+ bool leave_around = false;
+ triple_timestamp ts;
+ Unit *trigger;
+ int r;
+
+ assert(t);
+
+ trigger = UNIT_TRIGGER(UNIT(t));
+ if (!trigger) {
+ log_unit_error(UNIT(t), "Unit to trigger vanished.");
+ timer_enter_dead(t, TIMER_FAILURE_RESOURCES);
+ return;
+ }
+
+ triple_timestamp_get(&ts);
+ t->next_elapse_monotonic_or_boottime = t->next_elapse_realtime = 0;
+
+ LIST_FOREACH(value, v, t->values) {
+ if (v->disabled)
+ continue;
+
+ if (v->base == TIMER_CALENDAR) {
+ usec_t b, rebased;
+
+ /* If we know the last time this was
+ * triggered, schedule the job based relative
+ * to that. If we don't, just start from
+ * the activation time. */
+
+ if (dual_timestamp_is_set(&t->last_trigger))
+ b = t->last_trigger.realtime;
+ else if (dual_timestamp_is_set(&UNIT(t)->inactive_exit_timestamp))
+ b = UNIT(t)->inactive_exit_timestamp.realtime;
+ else
+ b = ts.realtime;
+
+ r = calendar_spec_next_usec(v->calendar_spec, b, &v->next_elapse);
+ if (r < 0)
+ continue;
+
+ /* To make the delay due to RandomizedDelaySec= work even at boot, if the scheduled
+ * time has already passed, set the time when systemd first started as the scheduled
+ * time. Note that we base this on the monotonic timestamp of the boot, not the
+ * realtime one, since the wallclock might have been off during boot. */
+ rebased = map_clock_usec(UNIT(t)->manager->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic,
+ CLOCK_MONOTONIC, CLOCK_REALTIME);
+ if (v->next_elapse < rebased)
+ v->next_elapse = rebased;
+
+ if (!found_realtime)
+ t->next_elapse_realtime = v->next_elapse;
+ else
+ t->next_elapse_realtime = MIN(t->next_elapse_realtime, v->next_elapse);
+
+ found_realtime = true;
+
+ } else {
+ usec_t base;
+
+ switch (v->base) {
+
+ case TIMER_ACTIVE:
+ if (state_translation_table[t->state] == UNIT_ACTIVE)
+ base = UNIT(t)->inactive_exit_timestamp.monotonic;
+ else
+ base = ts.monotonic;
+ break;
+
+ case TIMER_BOOT:
+ if (detect_container() <= 0) {
+ /* CLOCK_MONOTONIC equals the uptime on Linux */
+ base = 0;
+ break;
+ }
+ /* In a container we don't want to include the time the host
+ * was already up when the container started, so count from
+ * our own startup. */
+ _fallthrough_;
+ case TIMER_STARTUP:
+ base = UNIT(t)->manager->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic;
+ break;
+
+ case TIMER_UNIT_ACTIVE:
+ leave_around = true;
+ base = MAX(trigger->inactive_exit_timestamp.monotonic, t->last_trigger.monotonic);
+ if (base <= 0)
+ continue;
+ break;
+
+ case TIMER_UNIT_INACTIVE:
+ leave_around = true;
+ base = MAX(trigger->inactive_enter_timestamp.monotonic, t->last_trigger.monotonic);
+ if (base <= 0)
+ continue;
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ v->next_elapse = usec_add(usec_shift_clock(base, CLOCK_MONOTONIC, TIMER_MONOTONIC_CLOCK(t)), v->value);
+
+ if (dual_timestamp_is_set(&t->last_trigger) &&
+ !time_change &&
+ v->next_elapse < triple_timestamp_by_clock(&ts, TIMER_MONOTONIC_CLOCK(t)) &&
+ IN_SET(v->base, TIMER_ACTIVE, TIMER_BOOT, TIMER_STARTUP)) {
+ /* This is a one time trigger, disable it now */
+ v->disabled = true;
+ continue;
+ }
+
+ if (!found_monotonic)
+ t->next_elapse_monotonic_or_boottime = v->next_elapse;
+ else
+ t->next_elapse_monotonic_or_boottime = MIN(t->next_elapse_monotonic_or_boottime, v->next_elapse);
+
+ found_monotonic = true;
+ }
+ }
+
+ if (!found_monotonic && !found_realtime && !t->on_timezone_change && !t->on_clock_change) {
+ log_unit_debug(UNIT(t), "Timer is elapsed.");
+ timer_enter_elapsed(t, leave_around);
+ return;
+ }
+
+ if (found_monotonic) {
+ usec_t left;
+
+ add_random(t, &t->next_elapse_monotonic_or_boottime);
+
+ left = usec_sub_unsigned(t->next_elapse_monotonic_or_boottime, triple_timestamp_by_clock(&ts, TIMER_MONOTONIC_CLOCK(t)));
+ log_unit_debug(UNIT(t), "Monotonic timer elapses in %s.", FORMAT_TIMESPAN(left, 0));
+
+ if (t->monotonic_event_source) {
+ r = sd_event_source_set_time(t->monotonic_event_source, t->next_elapse_monotonic_or_boottime);
+ if (r < 0)
+ goto fail;
+
+ r = sd_event_source_set_enabled(t->monotonic_event_source, SD_EVENT_ONESHOT);
+ if (r < 0)
+ goto fail;
+ } else {
+
+ r = sd_event_add_time(
+ UNIT(t)->manager->event,
+ &t->monotonic_event_source,
+ t->wake_system ? CLOCK_BOOTTIME_ALARM : CLOCK_MONOTONIC,
+ t->next_elapse_monotonic_or_boottime, t->accuracy_usec,
+ timer_dispatch, t);
+ if (r < 0)
+ goto fail;
+
+ (void) sd_event_source_set_description(t->monotonic_event_source, "timer-monotonic");
+ }
+
+ } else if (t->monotonic_event_source) {
+
+ r = sd_event_source_set_enabled(t->monotonic_event_source, SD_EVENT_OFF);
+ if (r < 0)
+ goto fail;
+ }
+
+ if (found_realtime) {
+ add_random(t, &t->next_elapse_realtime);
+
+ log_unit_debug(UNIT(t), "Realtime timer elapses at %s.", FORMAT_TIMESTAMP(t->next_elapse_realtime));
+
+ if (t->realtime_event_source) {
+ r = sd_event_source_set_time(t->realtime_event_source, t->next_elapse_realtime);
+ if (r < 0)
+ goto fail;
+
+ r = sd_event_source_set_enabled(t->realtime_event_source, SD_EVENT_ONESHOT);
+ if (r < 0)
+ goto fail;
+ } else {
+ r = sd_event_add_time(
+ UNIT(t)->manager->event,
+ &t->realtime_event_source,
+ t->wake_system ? CLOCK_REALTIME_ALARM : CLOCK_REALTIME,
+ t->next_elapse_realtime, t->accuracy_usec,
+ timer_dispatch, t);
+ if (r < 0)
+ goto fail;
+
+ (void) sd_event_source_set_description(t->realtime_event_source, "timer-realtime");
+ }
+
+ } else if (t->realtime_event_source) {
+
+ r = sd_event_source_set_enabled(t->realtime_event_source, SD_EVENT_OFF);
+ if (r < 0)
+ goto fail;
+ }
+
+ timer_set_state(t, TIMER_WAITING);
+ return;
+
+fail:
+ log_unit_warning_errno(UNIT(t), r, "Failed to enter waiting state: %m");
+ timer_enter_dead(t, TIMER_FAILURE_RESOURCES);
+}
+
+static void timer_enter_running(Timer *t) {
+ _cleanup_(activation_details_unrefp) ActivationDetails *details = NULL;
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ Unit *trigger;
+ Job *job;
+ int r;
+
+ assert(t);
+
+ /* Don't start job if we are supposed to go down */
+ if (unit_stop_pending(UNIT(t)))
+ return;
+
+ trigger = UNIT_TRIGGER(UNIT(t));
+ if (!trigger) {
+ log_unit_error(UNIT(t), "Unit to trigger vanished.");
+ timer_enter_dead(t, TIMER_FAILURE_RESOURCES);
+ return;
+ }
+
+ details = activation_details_new(UNIT(t));
+ if (!details) {
+ r = -ENOMEM;
+ goto fail;
+ }
+
+ r = manager_add_job(UNIT(t)->manager, JOB_START, trigger, JOB_REPLACE, NULL, &error, &job);
+ if (r < 0)
+ goto fail;
+
+ dual_timestamp_get(&t->last_trigger);
+ ACTIVATION_DETAILS_TIMER(details)->last_trigger = t->last_trigger;
+
+ job_set_activation_details(job, details);
+
+ if (t->stamp_path)
+ touch_file(t->stamp_path, true, t->last_trigger.realtime, UID_INVALID, GID_INVALID, MODE_INVALID);
+
+ timer_set_state(t, TIMER_RUNNING);
+ return;
+
+fail:
+ log_unit_warning(UNIT(t), "Failed to queue unit startup job: %s", bus_error_message(&error, r));
+ timer_enter_dead(t, TIMER_FAILURE_RESOURCES);
+}
+
+static int timer_start(Unit *u) {
+ Timer *t = TIMER(u);
+ int r;
+
+ assert(t);
+ assert(IN_SET(t->state, TIMER_DEAD, TIMER_FAILED));
+
+ r = unit_test_trigger_loaded(u);
+ if (r < 0)
+ return r;
+
+ r = unit_acquire_invocation_id(u);
+ if (r < 0)
+ return r;
+
+ t->last_trigger = DUAL_TIMESTAMP_NULL;
+
+ /* Reenable all timers that depend on unit activation time */
+ LIST_FOREACH(value, v, t->values)
+ if (v->base == TIMER_ACTIVE)
+ v->disabled = false;
+
+ if (t->stamp_path) {
+ struct stat st;
+
+ if (stat(t->stamp_path, &st) >= 0) {
+ usec_t ft;
+
+ /* Load the file timestamp, but only if it is actually in the past. If it is in the future,
+ * something is wrong with the system clock. */
+
+ ft = timespec_load(&st.st_mtim);
+ if (ft < now(CLOCK_REALTIME))
+ t->last_trigger.realtime = ft;
+ else
+ log_unit_warning(u, "Not using persistent file timestamp %s as it is in the future.",
+ FORMAT_TIMESTAMP(ft));
+
+ } else if (errno == ENOENT)
+ /* The timer has never run before, make sure a stamp file exists. */
+ (void) touch_file(t->stamp_path, true, USEC_INFINITY, UID_INVALID, GID_INVALID, MODE_INVALID);
+ }
+
+ t->result = TIMER_SUCCESS;
+ timer_enter_waiting(t, false);
+ return 1;
+}
+
+static int timer_stop(Unit *u) {
+ Timer *t = TIMER(u);
+
+ assert(t);
+ assert(IN_SET(t->state, TIMER_WAITING, TIMER_RUNNING, TIMER_ELAPSED));
+
+ timer_enter_dead(t, TIMER_SUCCESS);
+ return 1;
+}
+
+static int timer_serialize(Unit *u, FILE *f, FDSet *fds) {
+ Timer *t = TIMER(u);
+
+ assert(u);
+ assert(f);
+ assert(fds);
+
+ (void) serialize_item(f, "state", timer_state_to_string(t->state));
+ (void) serialize_item(f, "result", timer_result_to_string(t->result));
+
+ if (t->last_trigger.realtime > 0)
+ (void) serialize_usec(f, "last-trigger-realtime", t->last_trigger.realtime);
+
+ if (t->last_trigger.monotonic > 0)
+ (void) serialize_usec(f, "last-trigger-monotonic", t->last_trigger.monotonic);
+
+ return 0;
+}
+
+static int timer_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
+ Timer *t = TIMER(u);
+
+ assert(u);
+ assert(key);
+ assert(value);
+ assert(fds);
+
+ if (streq(key, "state")) {
+ TimerState state;
+
+ state = timer_state_from_string(value);
+ if (state < 0)
+ log_unit_debug(u, "Failed to parse state value: %s", value);
+ else
+ t->deserialized_state = state;
+
+ } else if (streq(key, "result")) {
+ TimerResult f;
+
+ f = timer_result_from_string(value);
+ if (f < 0)
+ log_unit_debug(u, "Failed to parse result value: %s", value);
+ else if (f != TIMER_SUCCESS)
+ t->result = f;
+
+ } else if (streq(key, "last-trigger-realtime"))
+ (void) deserialize_usec(value, &t->last_trigger.realtime);
+ else if (streq(key, "last-trigger-monotonic"))
+ (void) deserialize_usec(value, &t->last_trigger.monotonic);
+ else
+ log_unit_debug(u, "Unknown serialization key: %s", key);
+
+ return 0;
+}
+
+_pure_ static UnitActiveState timer_active_state(Unit *u) {
+ assert(u);
+
+ return state_translation_table[TIMER(u)->state];
+}
+
+_pure_ static const char *timer_sub_state_to_string(Unit *u) {
+ assert(u);
+
+ return timer_state_to_string(TIMER(u)->state);
+}
+
+static int timer_dispatch(sd_event_source *s, uint64_t usec, void *userdata) {
+ Timer *t = TIMER(userdata);
+
+ assert(t);
+
+ if (t->state != TIMER_WAITING)
+ return 0;
+
+ log_unit_debug(UNIT(t), "Timer elapsed.");
+ timer_enter_running(t);
+ return 0;
+}
+
+static void timer_trigger_notify(Unit *u, Unit *other) {
+ Timer *t = TIMER(u);
+
+ assert(u);
+ assert(other);
+
+ /* Filter out invocations with bogus state */
+ assert(UNIT_IS_LOAD_COMPLETE(other->load_state));
+
+ /* Reenable all timers that depend on unit state */
+ LIST_FOREACH(value, v, t->values)
+ if (IN_SET(v->base, TIMER_UNIT_ACTIVE, TIMER_UNIT_INACTIVE))
+ v->disabled = false;
+
+ switch (t->state) {
+
+ case TIMER_WAITING:
+ case TIMER_ELAPSED:
+
+ /* Recalculate sleep time */
+ timer_enter_waiting(t, false);
+ break;
+
+ case TIMER_RUNNING:
+
+ if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(other))) {
+ log_unit_debug(UNIT(t), "Got notified about unit deactivation.");
+ timer_enter_waiting(t, false);
+ }
+ break;
+
+ case TIMER_DEAD:
+ case TIMER_FAILED:
+ break;
+
+ default:
+ assert_not_reached();
+ }
+}
+
+static void timer_reset_failed(Unit *u) {
+ Timer *t = TIMER(u);
+
+ assert(t);
+
+ if (t->state == TIMER_FAILED)
+ timer_set_state(t, TIMER_DEAD);
+
+ t->result = TIMER_SUCCESS;
+}
+
+static void timer_time_change(Unit *u) {
+ Timer *t = TIMER(u);
+ usec_t ts;
+
+ assert(u);
+
+ if (t->state != TIMER_WAITING)
+ return;
+
+ /* If we appear to have triggered in the future, the system clock must
+ * have been set backwards. So let's rewind our own clock and allow
+ * the future triggers to happen again :). Exactly the same as when
+ * you start a timer unit with Persistent=yes. */
+ ts = now(CLOCK_REALTIME);
+ if (t->last_trigger.realtime > ts)
+ t->last_trigger.realtime = ts;
+
+ if (t->on_clock_change) {
+ log_unit_debug(u, "Time change, triggering activation.");
+ timer_enter_running(t);
+ } else {
+ log_unit_debug(u, "Time change, recalculating next elapse.");
+ timer_enter_waiting(t, true);
+ }
+}
+
+static void timer_timezone_change(Unit *u) {
+ Timer *t = TIMER(u);
+
+ assert(u);
+
+ if (t->state != TIMER_WAITING)
+ return;
+
+ if (t->on_timezone_change) {
+ log_unit_debug(u, "Timezone change, triggering activation.");
+ timer_enter_running(t);
+ } else {
+ log_unit_debug(u, "Timezone change, recalculating next elapse.");
+ timer_enter_waiting(t, false);
+ }
+}
+
+static int timer_clean(Unit *u, ExecCleanMask mask) {
+ Timer *t = TIMER(u);
+ int r;
+
+ assert(t);
+ assert(mask != 0);
+
+ if (t->state != TIMER_DEAD)
+ return -EBUSY;
+
+ if (!IN_SET(mask, EXEC_CLEAN_STATE))
+ return -EUNATCH;
+
+ r = timer_setup_persistent(t);
+ if (r < 0)
+ return r;
+
+ if (!t->stamp_path)
+ return -EUNATCH;
+
+ if (unlink(t->stamp_path) && errno != ENOENT)
+ return log_unit_error_errno(u, errno, "Failed to clean stamp file of timer: %m");
+
+ return 0;
+}
+
+static int timer_can_clean(Unit *u, ExecCleanMask *ret) {
+ Timer *t = TIMER(u);
+
+ assert(t);
+
+ *ret = t->persistent ? EXEC_CLEAN_STATE : 0;
+ return 0;
+}
+
+static int timer_can_start(Unit *u) {
+ Timer *t = TIMER(u);
+ int r;
+
+ assert(t);
+
+ r = unit_test_start_limit(u);
+ if (r < 0) {
+ timer_enter_dead(t, TIMER_FAILURE_START_LIMIT_HIT);
+ return r;
+ }
+
+ return 1;
+}
+
+static void activation_details_timer_serialize(ActivationDetails *details, FILE *f) {
+ ActivationDetailsTimer *t = ACTIVATION_DETAILS_TIMER(details);
+
+ assert(details);
+ assert(f);
+ assert(t);
+
+ (void) serialize_dual_timestamp(f, "activation-details-timer-last-trigger", &t->last_trigger);
+}
+
+static int activation_details_timer_deserialize(const char *key, const char *value, ActivationDetails **details) {
+ int r;
+
+ assert(key);
+ assert(value);
+
+ if (!details || !*details)
+ return -EINVAL;
+
+ ActivationDetailsTimer *t = ACTIVATION_DETAILS_TIMER(*details);
+ if (!t)
+ return -EINVAL;
+
+ if (!streq(key, "activation-details-timer-last-trigger"))
+ return -EINVAL;
+
+ r = deserialize_dual_timestamp(value, &t->last_trigger);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+static int activation_details_timer_append_env(ActivationDetails *details, char ***strv) {
+ ActivationDetailsTimer *t = ACTIVATION_DETAILS_TIMER(details);
+ int r;
+
+ assert(details);
+ assert(strv);
+ assert(t);
+
+ if (!dual_timestamp_is_set(&t->last_trigger))
+ return 0;
+
+ r = strv_extendf(strv, "TRIGGER_TIMER_REALTIME_USEC=" USEC_FMT, t->last_trigger.realtime);
+ if (r < 0)
+ return r;
+
+ r = strv_extendf(strv, "TRIGGER_TIMER_MONOTONIC_USEC=" USEC_FMT, t->last_trigger.monotonic);
+ if (r < 0)
+ return r;
+
+ return 2; /* Return the number of variables added to the env block */
+}
+
+static int activation_details_timer_append_pair(ActivationDetails *details, char ***strv) {
+ ActivationDetailsTimer *t = ACTIVATION_DETAILS_TIMER(details);
+ int r;
+
+ assert(details);
+ assert(strv);
+ assert(t);
+
+ if (!dual_timestamp_is_set(&t->last_trigger))
+ return 0;
+
+ r = strv_extend(strv, "trigger_timer_realtime_usec");
+ if (r < 0)
+ return r;
+
+ r = strv_extendf(strv, USEC_FMT, t->last_trigger.realtime);
+ if (r < 0)
+ return r;
+
+ r = strv_extend(strv, "trigger_timer_monotonic_usec");
+ if (r < 0)
+ return r;
+
+ r = strv_extendf(strv, USEC_FMT, t->last_trigger.monotonic);
+ if (r < 0)
+ return r;
+
+ return 2; /* Return the number of pairs added to the env block */
+}
+
+static const char* const timer_base_table[_TIMER_BASE_MAX] = {
+ [TIMER_ACTIVE] = "OnActiveSec",
+ [TIMER_BOOT] = "OnBootSec",
+ [TIMER_STARTUP] = "OnStartupSec",
+ [TIMER_UNIT_ACTIVE] = "OnUnitActiveSec",
+ [TIMER_UNIT_INACTIVE] = "OnUnitInactiveSec",
+ [TIMER_CALENDAR] = "OnCalendar"
+};
+
+DEFINE_STRING_TABLE_LOOKUP(timer_base, TimerBase);
+
+static const char* const timer_result_table[_TIMER_RESULT_MAX] = {
+ [TIMER_SUCCESS] = "success",
+ [TIMER_FAILURE_RESOURCES] = "resources",
+ [TIMER_FAILURE_START_LIMIT_HIT] = "start-limit-hit",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(timer_result, TimerResult);
+
+const UnitVTable timer_vtable = {
+ .object_size = sizeof(Timer),
+
+ .sections =
+ "Unit\0"
+ "Timer\0"
+ "Install\0",
+ .private_section = "Timer",
+
+ .can_transient = true,
+ .can_fail = true,
+ .can_trigger = true,
+
+ .init = timer_init,
+ .done = timer_done,
+ .load = timer_load,
+
+ .coldplug = timer_coldplug,
+
+ .dump = timer_dump,
+
+ .start = timer_start,
+ .stop = timer_stop,
+
+ .clean = timer_clean,
+ .can_clean = timer_can_clean,
+
+ .serialize = timer_serialize,
+ .deserialize_item = timer_deserialize_item,
+
+ .active_state = timer_active_state,
+ .sub_state_to_string = timer_sub_state_to_string,
+
+ .trigger_notify = timer_trigger_notify,
+
+ .reset_failed = timer_reset_failed,
+ .time_change = timer_time_change,
+ .timezone_change = timer_timezone_change,
+
+ .bus_set_property = bus_timer_set_property,
+
+ .can_start = timer_can_start,
+};
+
+const ActivationDetailsVTable activation_details_timer_vtable = {
+ .object_size = sizeof(ActivationDetailsTimer),
+
+ .serialize = activation_details_timer_serialize,
+ .deserialize = activation_details_timer_deserialize,
+ .append_env = activation_details_timer_append_env,
+ .append_pair = activation_details_timer_append_pair,
+};
diff --git a/src/core/timer.h b/src/core/timer.h
new file mode 100644
index 0000000..914e425
--- /dev/null
+++ b/src/core/timer.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct Timer Timer;
+typedef struct ActivationDetailsTimer ActivationDetailsTimer;
+
+#include "calendarspec.h"
+#include "unit.h"
+
+typedef enum TimerBase {
+ TIMER_ACTIVE,
+ TIMER_BOOT,
+ TIMER_STARTUP,
+ TIMER_UNIT_ACTIVE,
+ TIMER_UNIT_INACTIVE,
+ TIMER_CALENDAR,
+ _TIMER_BASE_MAX,
+ _TIMER_BASE_INVALID = -EINVAL,
+} TimerBase;
+
+typedef struct TimerValue {
+ TimerBase base;
+ bool disabled;
+
+ usec_t value; /* only for monotonic events */
+ CalendarSpec *calendar_spec; /* only for calendar events */
+ usec_t next_elapse;
+
+ LIST_FIELDS(struct TimerValue, value);
+} TimerValue;
+
+typedef enum TimerResult {
+ TIMER_SUCCESS,
+ TIMER_FAILURE_RESOURCES,
+ TIMER_FAILURE_START_LIMIT_HIT,
+ _TIMER_RESULT_MAX,
+ _TIMER_RESULT_INVALID = -EINVAL,
+} TimerResult;
+
+struct Timer {
+ Unit meta;
+
+ usec_t accuracy_usec;
+ usec_t random_usec;
+
+ LIST_HEAD(TimerValue, values);
+ usec_t next_elapse_realtime;
+ usec_t next_elapse_monotonic_or_boottime;
+ dual_timestamp last_trigger;
+
+ TimerState state, deserialized_state;
+
+ sd_event_source *monotonic_event_source;
+ sd_event_source *realtime_event_source;
+
+ TimerResult result;
+
+ bool persistent;
+ bool wake_system;
+ bool remain_after_elapse;
+ bool on_clock_change;
+ bool on_timezone_change;
+ bool fixed_random_delay;
+
+ char *stamp_path;
+};
+
+struct ActivationDetailsTimer {
+ ActivationDetails meta;
+ dual_timestamp last_trigger;
+};
+
+#define TIMER_MONOTONIC_CLOCK(t) ((t)->wake_system ? CLOCK_BOOTTIME_ALARM : CLOCK_MONOTONIC)
+
+void timer_free_values(Timer *t);
+
+extern const UnitVTable timer_vtable;
+extern const ActivationDetailsVTable activation_details_timer_vtable;
+
+const char *timer_base_to_string(TimerBase i) _const_;
+TimerBase timer_base_from_string(const char *s) _pure_;
+
+const char* timer_result_to_string(TimerResult i) _const_;
+TimerResult timer_result_from_string(const char *s) _pure_;
+
+DEFINE_CAST(TIMER, Timer);
+DEFINE_ACTIVATION_DETAILS_CAST(ACTIVATION_DETAILS_TIMER, ActivationDetailsTimer, TIMER);
diff --git a/src/core/transaction.c b/src/core/transaction.c
new file mode 100644
index 0000000..f8886fe
--- /dev/null
+++ b/src/core/transaction.c
@@ -0,0 +1,1200 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fcntl.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "bus-common-errors.h"
+#include "bus-error.h"
+#include "dbus-unit.h"
+#include "strv.h"
+#include "terminal-util.h"
+#include "transaction.h"
+
+static void transaction_unlink_job(Transaction *tr, Job *j, bool delete_dependencies);
+
+static void transaction_delete_job(Transaction *tr, Job *j, bool delete_dependencies) {
+ assert(tr);
+ assert(j);
+
+ /* Deletes one job from the transaction */
+
+ transaction_unlink_job(tr, j, delete_dependencies);
+
+ job_free(j);
+}
+
+static void transaction_delete_unit(Transaction *tr, Unit *u) {
+ Job *j;
+
+ /* Deletes all jobs associated with a certain unit from the
+ * transaction */
+
+ while ((j = hashmap_get(tr->jobs, u)))
+ transaction_delete_job(tr, j, true);
+}
+
+void transaction_abort(Transaction *tr) {
+ Job *j;
+
+ assert(tr);
+
+ while ((j = hashmap_first(tr->jobs)))
+ transaction_delete_job(tr, j, false);
+
+ assert(hashmap_isempty(tr->jobs));
+}
+
+static void transaction_find_jobs_that_matter_to_anchor(Job *j, unsigned generation) {
+ assert(j);
+
+ /* A recursive sweep through the graph that marks all units
+ * that matter to the anchor job, i.e. are directly or
+ * indirectly a dependency of the anchor job via paths that
+ * are fully marked as mattering. */
+
+ j->matters_to_anchor = true;
+ j->generation = generation;
+
+ LIST_FOREACH(subject, l, j->subject_list) {
+
+ /* This link does not matter */
+ if (!l->matters)
+ continue;
+
+ /* This unit has already been marked */
+ if (l->object->generation == generation)
+ continue;
+
+ transaction_find_jobs_that_matter_to_anchor(l->object, generation);
+ }
+}
+
+static void transaction_merge_and_delete_job(Transaction *tr, Job *j, Job *other, JobType t) {
+ JobDependency *last;
+
+ assert(j);
+ assert(other);
+ assert(j->unit == other->unit);
+ assert(!j->installed);
+
+ /* Merges 'other' into 'j' and then deletes 'other'. */
+
+ j->type = t;
+ j->state = JOB_WAITING;
+ j->irreversible = j->irreversible || other->irreversible;
+ j->matters_to_anchor = j->matters_to_anchor || other->matters_to_anchor;
+
+ /* Patch us in as new owner of the JobDependency objects */
+ last = NULL;
+ LIST_FOREACH(subject, l, other->subject_list) {
+ assert(l->subject == other);
+ l->subject = j;
+ last = l;
+ }
+
+ /* Merge both lists */
+ if (last) {
+ last->subject_next = j->subject_list;
+ if (j->subject_list)
+ j->subject_list->subject_prev = last;
+ j->subject_list = other->subject_list;
+ }
+
+ /* Patch us in as new owner of the JobDependency objects */
+ last = NULL;
+ LIST_FOREACH(object, l, other->object_list) {
+ assert(l->object == other);
+ l->object = j;
+ last = l;
+ }
+
+ /* Merge both lists */
+ if (last) {
+ last->object_next = j->object_list;
+ if (j->object_list)
+ j->object_list->object_prev = last;
+ j->object_list = other->object_list;
+ }
+
+ /* Kill the other job */
+ other->subject_list = NULL;
+ other->object_list = NULL;
+ transaction_delete_job(tr, other, true);
+}
+
+_pure_ static bool job_is_conflicted_by(Job *j) {
+ assert(j);
+
+ /* Returns true if this job is pulled in by a least one
+ * ConflictedBy dependency. */
+
+ LIST_FOREACH(object, l, j->object_list)
+ if (l->conflicts)
+ return true;
+
+ return false;
+}
+
+static int delete_one_unmergeable_job(Transaction *tr, Job *job) {
+ assert(job);
+
+ /* Tries to delete one item in the linked list
+ * j->transaction_next->transaction_next->... that conflicts
+ * with another one, in an attempt to make an inconsistent
+ * transaction work. */
+
+ /* We rely here on the fact that if a merged with b does not
+ * merge with c, either a or b merge with c neither */
+ LIST_FOREACH(transaction, j, job)
+ LIST_FOREACH(transaction, k, j->transaction_next) {
+ Job *d;
+
+ /* Is this one mergeable? Then skip it */
+ if (job_type_is_mergeable(j->type, k->type))
+ continue;
+
+ /* Ok, we found two that conflict, let's see if we can
+ * drop one of them */
+ if (!j->matters_to_anchor && !k->matters_to_anchor) {
+
+ /* Both jobs don't matter, so let's
+ * find the one that is smarter to
+ * remove. Let's think positive and
+ * rather remove stops then starts --
+ * except if something is being
+ * stopped because it is conflicted by
+ * another unit in which case we
+ * rather remove the start. */
+
+ log_unit_debug(j->unit,
+ "Looking at job %s/%s conflicted_by=%s",
+ j->unit->id, job_type_to_string(j->type),
+ yes_no(j->type == JOB_STOP && job_is_conflicted_by(j)));
+ log_unit_debug(k->unit,
+ "Looking at job %s/%s conflicted_by=%s",
+ k->unit->id, job_type_to_string(k->type),
+ yes_no(k->type == JOB_STOP && job_is_conflicted_by(k)));
+
+ if (j->type == JOB_STOP) {
+
+ if (job_is_conflicted_by(j))
+ d = k;
+ else
+ d = j;
+
+ } else if (k->type == JOB_STOP) {
+
+ if (job_is_conflicted_by(k))
+ d = j;
+ else
+ d = k;
+ } else
+ d = j;
+
+ } else if (!j->matters_to_anchor)
+ d = j;
+ else if (!k->matters_to_anchor)
+ d = k;
+ else
+ return -ENOEXEC;
+
+ /* Ok, we can drop one, so let's do so. */
+ log_unit_debug(d->unit,
+ "Fixing conflicting jobs %s/%s,%s/%s by deleting job %s/%s",
+ j->unit->id, job_type_to_string(j->type),
+ k->unit->id, job_type_to_string(k->type),
+ d->unit->id, job_type_to_string(d->type));
+ transaction_delete_job(tr, d, true);
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static int transaction_merge_jobs(Transaction *tr, sd_bus_error *e) {
+ Job *j;
+ int r;
+
+ assert(tr);
+
+ /* First step, check whether any of the jobs for one specific
+ * task conflict. If so, try to drop one of them. */
+ HASHMAP_FOREACH(j, tr->jobs) {
+ JobType t;
+
+ t = j->type;
+ LIST_FOREACH(transaction, k, j->transaction_next) {
+ if (job_type_merge_and_collapse(&t, k->type, j->unit) >= 0)
+ continue;
+
+ /* OK, we could not merge all jobs for this
+ * action. Let's see if we can get rid of one
+ * of them */
+
+ r = delete_one_unmergeable_job(tr, j);
+ if (r >= 0)
+ /* Ok, we managed to drop one, now
+ * let's ask our callers to call us
+ * again after garbage collecting */
+ return -EAGAIN;
+
+ /* We couldn't merge anything. Failure */
+ return sd_bus_error_setf(e, BUS_ERROR_TRANSACTION_JOBS_CONFLICTING,
+ "Transaction contains conflicting jobs '%s' and '%s' for %s. "
+ "Probably contradicting requirement dependencies configured.",
+ job_type_to_string(t),
+ job_type_to_string(k->type),
+ k->unit->id);
+ }
+ }
+
+ /* Second step, merge the jobs. */
+ HASHMAP_FOREACH(j, tr->jobs) {
+ JobType t = j->type;
+
+ /* Merge all transaction jobs for j->unit */
+ LIST_FOREACH(transaction, k, j->transaction_next)
+ assert_se(job_type_merge_and_collapse(&t, k->type, j->unit) == 0);
+
+ Job *k;
+ while ((k = j->transaction_next)) {
+ if (tr->anchor_job == k) {
+ transaction_merge_and_delete_job(tr, k, j, t);
+ j = k;
+ } else
+ transaction_merge_and_delete_job(tr, j, k, t);
+ }
+
+ assert(!j->transaction_next);
+ assert(!j->transaction_prev);
+ }
+
+ return 0;
+}
+
+static void transaction_drop_redundant(Transaction *tr) {
+ bool again;
+
+ /* Goes through the transaction and removes all jobs of the units whose jobs are all noops. If not
+ * all of a unit's jobs are redundant, they are kept. */
+
+ assert(tr);
+
+ do {
+ Job *j;
+
+ again = false;
+
+ HASHMAP_FOREACH(j, tr->jobs) {
+ bool keep = false;
+
+ LIST_FOREACH(transaction, k, j)
+ if (tr->anchor_job == k ||
+ !job_type_is_redundant(k->type, unit_active_state(k->unit)) ||
+ (k->unit->job && job_type_is_conflicting(k->type, k->unit->job->type))) {
+ keep = true;
+ break;
+ }
+
+ if (!keep) {
+ log_trace("Found redundant job %s/%s, dropping from transaction.",
+ j->unit->id, job_type_to_string(j->type));
+ transaction_delete_job(tr, j, false);
+ again = true;
+ break;
+ }
+ }
+ } while (again);
+}
+
+_pure_ static bool unit_matters_to_anchor(Unit *u, Job *job) {
+ assert(u);
+ assert(job);
+ assert(!job->transaction_prev);
+
+ /* Checks whether at least one of the jobs for this unit
+ * matters to the anchor. */
+
+ LIST_FOREACH(transaction, j, job)
+ if (j->matters_to_anchor)
+ return true;
+
+ return false;
+}
+
+static char* merge_unit_ids(const char* unit_log_field, char * const* pairs) {
+ _cleanup_free_ char *ans = NULL;
+ size_t size = 0;
+
+ STRV_FOREACH_PAIR(unit_id, job_type, pairs) {
+ size_t next;
+
+ if (size > 0)
+ ans[size - 1] = '\n';
+
+ next = strlen(unit_log_field) + strlen(*unit_id);
+ if (!GREEDY_REALLOC(ans, size + next + 1))
+ return NULL;
+
+ sprintf(ans + size, "%s%s", unit_log_field, *unit_id);
+ size += next + 1;
+ }
+
+ if (!ans)
+ return strdup("");
+
+ return TAKE_PTR(ans);
+}
+
+static int transaction_verify_order_one(Transaction *tr, Job *j, Job *from, unsigned generation, sd_bus_error *e) {
+
+ static const UnitDependencyAtom directions[] = {
+ UNIT_ATOM_BEFORE,
+ UNIT_ATOM_AFTER,
+ };
+
+ int r;
+
+ assert(tr);
+ assert(j);
+ assert(!j->transaction_prev);
+
+ /* Does a recursive sweep through the ordering graph, looking for a cycle. If we find a cycle we try
+ * to break it. */
+
+ /* Have we seen this before? */
+ if (j->generation == generation) {
+ Job *k, *delete = NULL;
+ _cleanup_free_ char **array = NULL, *unit_ids = NULL;
+
+ /* If the marker is NULL we have been here already and decided the job was loop-free from
+ * here. Hence shortcut things and return right-away. */
+ if (!j->marker)
+ return 0;
+
+ /* So, the marker is not NULL and we already have been here. We have a cycle. Let's try to
+ * break it. We go backwards in our path and try to find a suitable job to remove. We use the
+ * marker to find our way back, since smart how we are we stored our way back in there. */
+ for (k = from; k; k = ((k->generation == generation && k->marker != k) ? k->marker : NULL)) {
+
+ /* For logging below */
+ if (strv_push_pair(&array, k->unit->id, (char*) job_type_to_string(k->type)) < 0)
+ log_oom();
+
+ if (!delete && hashmap_get(tr->jobs, k->unit) && !unit_matters_to_anchor(k->unit, k))
+ /* Ok, we can drop this one, so let's do so. */
+ delete = k;
+
+ /* Check if this in fact was the beginning of the cycle */
+ if (k == j)
+ break;
+ }
+
+ unit_ids = merge_unit_ids(j->manager->unit_log_field, array); /* ignore error */
+
+ STRV_FOREACH_PAIR(unit_id, job_type, array)
+ /* logging for j not k here to provide a consistent narrative */
+ log_struct(LOG_WARNING,
+ LOG_UNIT_MESSAGE(j->unit,
+ "Found %s on %s/%s",
+ unit_id == array ? "ordering cycle" : "dependency",
+ *unit_id, *job_type),
+ "%s", strna(unit_ids));
+
+ if (delete) {
+ const char *status;
+ /* logging for j not k here to provide a consistent narrative */
+ log_struct(LOG_ERR,
+ LOG_UNIT_MESSAGE(j->unit,
+ "Job %s/%s deleted to break ordering cycle starting with %s/%s",
+ delete->unit->id, job_type_to_string(delete->type),
+ j->unit->id, job_type_to_string(j->type)),
+ "%s", strna(unit_ids));
+
+ if (log_get_show_color())
+ status = ANSI_HIGHLIGHT_RED " SKIP " ANSI_NORMAL;
+ else
+ status = " SKIP ";
+
+ unit_status_printf(delete->unit,
+ STATUS_TYPE_NOTICE,
+ status,
+ "Ordering cycle found, skipping %s",
+ unit_status_string(delete->unit, NULL));
+ transaction_delete_unit(tr, delete->unit);
+ return -EAGAIN;
+ }
+
+ log_struct(LOG_ERR,
+ LOG_UNIT_MESSAGE(j->unit, "Unable to break cycle starting with %s/%s",
+ j->unit->id, job_type_to_string(j->type)),
+ "%s", strna(unit_ids));
+
+ return sd_bus_error_setf(e, BUS_ERROR_TRANSACTION_ORDER_IS_CYCLIC,
+ "Transaction order is cyclic. See system logs for details.");
+ }
+
+ /* Make the marker point to where we come from, so that we can
+ * find our way backwards if we want to break a cycle. We use
+ * a special marker for the beginning: we point to
+ * ourselves. */
+ j->marker = from ? from : j;
+ j->generation = generation;
+
+ /* Actual ordering of jobs depends on the unit ordering dependency and job types. We need to traverse
+ * the graph over 'before' edges in the actual job execution order. We traverse over both unit
+ * ordering dependencies and we test with job_compare() whether it is the 'before' edge in the job
+ * execution ordering. */
+ for (size_t d = 0; d < ELEMENTSOF(directions); d++) {
+ Unit *u;
+
+ UNIT_FOREACH_DEPENDENCY(u, j->unit, directions[d]) {
+ Job *o;
+
+ /* Is there a job for this unit? */
+ o = hashmap_get(tr->jobs, u);
+ if (!o) {
+ /* Ok, there is no job for this in the transaction, but maybe there is
+ * already one running? */
+ o = u->job;
+ if (!o)
+ continue;
+ }
+
+ /* Cut traversing if the job j is not really *before* o. */
+ if (job_compare(j, o, directions[d]) >= 0)
+ continue;
+
+ r = transaction_verify_order_one(tr, o, j, generation, e);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ /* Ok, let's backtrack, and remember that this entry is not on
+ * our path anymore. */
+ j->marker = NULL;
+
+ return 0;
+}
+
+static int transaction_verify_order(Transaction *tr, unsigned *generation, sd_bus_error *e) {
+ Job *j;
+ int r;
+ unsigned g;
+
+ assert(tr);
+ assert(generation);
+
+ /* Check if the ordering graph is cyclic. If it is, try to fix
+ * that up by dropping one of the jobs. */
+
+ g = (*generation)++;
+
+ HASHMAP_FOREACH(j, tr->jobs) {
+ r = transaction_verify_order_one(tr, j, NULL, g, e);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+static void transaction_collect_garbage(Transaction *tr) {
+ bool again;
+
+ assert(tr);
+
+ /* Drop jobs that are not required by any other job */
+
+ do {
+ Job *j;
+
+ again = false;
+
+ HASHMAP_FOREACH(j, tr->jobs) {
+ if (tr->anchor_job == j)
+ continue;
+
+ if (!j->object_list) {
+ log_trace("Garbage collecting job %s/%s", j->unit->id, job_type_to_string(j->type));
+ transaction_delete_job(tr, j, true);
+ again = true;
+ break;
+ }
+
+ log_trace("Keeping job %s/%s because of %s/%s",
+ j->unit->id, job_type_to_string(j->type),
+ j->object_list->subject ? j->object_list->subject->unit->id : "root",
+ j->object_list->subject ? job_type_to_string(j->object_list->subject->type) : "root");
+ }
+
+ } while (again);
+}
+
+static int transaction_is_destructive(Transaction *tr, JobMode mode, sd_bus_error *e) {
+ Job *j;
+
+ assert(tr);
+
+ /* Checks whether applying this transaction means that
+ * existing jobs would be replaced */
+
+ HASHMAP_FOREACH(j, tr->jobs) {
+
+ /* Assume merged */
+ assert(!j->transaction_prev);
+ assert(!j->transaction_next);
+
+ if (j->unit->job && (mode == JOB_FAIL || j->unit->job->irreversible) &&
+ job_type_is_conflicting(j->unit->job->type, j->type))
+ return sd_bus_error_setf(e, BUS_ERROR_TRANSACTION_IS_DESTRUCTIVE,
+ "Transaction for %s/%s is destructive (%s has '%s' job queued, but '%s' is included in transaction).",
+ tr->anchor_job->unit->id, job_type_to_string(tr->anchor_job->type),
+ j->unit->id, job_type_to_string(j->unit->job->type), job_type_to_string(j->type));
+ }
+
+ return 0;
+}
+
+static void transaction_minimize_impact(Transaction *tr) {
+ Job *head;
+
+ assert(tr);
+
+ /* Drops all unnecessary jobs that reverse already active jobs
+ * or that stop a running service. */
+
+rescan:
+ HASHMAP_FOREACH(head, tr->jobs) {
+ LIST_FOREACH(transaction, j, head) {
+ bool stops_running_service, changes_existing_job;
+
+ /* If it matters, we shouldn't drop it */
+ if (j->matters_to_anchor)
+ continue;
+
+ /* Would this stop a running service?
+ * Would this change an existing job?
+ * If so, let's drop this entry */
+
+ stops_running_service =
+ j->type == JOB_STOP && UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(j->unit));
+
+ changes_existing_job =
+ j->unit->job &&
+ job_type_is_conflicting(j->type, j->unit->job->type);
+
+ if (!stops_running_service && !changes_existing_job)
+ continue;
+
+ if (stops_running_service)
+ log_unit_debug(j->unit,
+ "%s/%s would stop a running service.",
+ j->unit->id, job_type_to_string(j->type));
+
+ if (changes_existing_job)
+ log_unit_debug(j->unit,
+ "%s/%s would change existing job.",
+ j->unit->id, job_type_to_string(j->type));
+
+ /* Ok, let's get rid of this */
+ log_unit_debug(j->unit,
+ "Deleting %s/%s to minimize impact.",
+ j->unit->id, job_type_to_string(j->type));
+
+ transaction_delete_job(tr, j, true);
+ goto rescan;
+ }
+ }
+}
+
+static int transaction_apply(
+ Transaction *tr,
+ Manager *m,
+ JobMode mode,
+ Set *affected_jobs) {
+
+ Job *j;
+ int r;
+
+ /* Moves the transaction jobs to the set of active jobs */
+
+ if (IN_SET(mode, JOB_ISOLATE, JOB_FLUSH)) {
+
+ /* When isolating first kill all installed jobs which
+ * aren't part of the new transaction */
+ HASHMAP_FOREACH(j, m->jobs) {
+ assert(j->installed);
+
+ if (j->unit->ignore_on_isolate)
+ continue;
+
+ if (hashmap_get(tr->jobs, j->unit))
+ continue;
+
+ /* Not invalidating recursively. Avoids triggering
+ * OnFailure= actions of dependent jobs. Also avoids
+ * invalidating our iterator. */
+ job_finish_and_invalidate(j, JOB_CANCELED, false, false);
+ }
+ }
+
+ HASHMAP_FOREACH(j, tr->jobs) {
+ /* Assume merged */
+ assert(!j->transaction_prev);
+ assert(!j->transaction_next);
+
+ r = hashmap_ensure_put(&m->jobs, NULL, UINT32_TO_PTR(j->id), j);
+ if (r < 0)
+ goto rollback;
+ }
+
+ while ((j = hashmap_steal_first(tr->jobs))) {
+ Job *installed_job;
+
+ /* Clean the job dependencies */
+ transaction_unlink_job(tr, j, false);
+
+ installed_job = job_install(j);
+ if (installed_job != j) {
+ /* j has been merged into a previously installed job */
+ if (tr->anchor_job == j)
+ tr->anchor_job = installed_job;
+ hashmap_remove_value(m->jobs, UINT32_TO_PTR(j->id), j);
+ job_free(j);
+ j = installed_job;
+ }
+
+ job_add_to_run_queue(j);
+ job_add_to_dbus_queue(j);
+ job_start_timer(j, false);
+ job_shutdown_magic(j);
+
+ /* When 'affected' is specified, let's track all in it all jobs that were touched because of
+ * this transaction. */
+ if (affected_jobs)
+ (void) set_put(affected_jobs, j);
+ }
+
+ return 0;
+
+rollback:
+
+ HASHMAP_FOREACH(j, tr->jobs)
+ hashmap_remove_value(m->jobs, UINT32_TO_PTR(j->id), j);
+
+ return r;
+}
+
+int transaction_activate(
+ Transaction *tr,
+ Manager *m,
+ JobMode mode,
+ Set *affected_jobs,
+ sd_bus_error *e) {
+
+ Job *j;
+ int r;
+ unsigned generation = 1;
+
+ assert(tr);
+
+ /* This applies the changes recorded in tr->jobs to
+ * the actual list of jobs, if possible. */
+
+ /* Reset the generation counter of all installed jobs. The detection of cycles
+ * looks at installed jobs. If they had a non-zero generation from some previous
+ * walk of the graph, the algorithm would break. */
+ HASHMAP_FOREACH(j, m->jobs)
+ j->generation = 0;
+
+ /* First step: figure out which jobs matter */
+ transaction_find_jobs_that_matter_to_anchor(tr->anchor_job, generation++);
+
+ /* Second step: Try not to stop any running services if
+ * we don't have to. Don't try to reverse running
+ * jobs if we don't have to. */
+ if (mode == JOB_FAIL)
+ transaction_minimize_impact(tr);
+
+ /* Third step: Drop redundant jobs */
+ transaction_drop_redundant(tr);
+
+ for (;;) {
+ /* Fourth step: Let's remove unneeded jobs that might
+ * be lurking. */
+ if (mode != JOB_ISOLATE)
+ transaction_collect_garbage(tr);
+
+ /* Fifth step: verify order makes sense and correct
+ * cycles if necessary and possible */
+ r = transaction_verify_order(tr, &generation, e);
+ if (r >= 0)
+ break;
+
+ if (r != -EAGAIN)
+ return log_warning_errno(r, "Requested transaction contains an unfixable cyclic ordering dependency: %s", bus_error_message(e, r));
+
+ /* Let's see if the resulting transaction ordering
+ * graph is still cyclic... */
+ }
+
+ for (;;) {
+ /* Sixth step: let's drop unmergeable entries if
+ * necessary and possible, merge entries we can
+ * merge */
+ r = transaction_merge_jobs(tr, e);
+ if (r >= 0)
+ break;
+
+ if (r != -EAGAIN)
+ return log_warning_errno(r, "Requested transaction contains unmergeable jobs: %s", bus_error_message(e, r));
+
+ /* Seventh step: an entry got dropped, let's garbage
+ * collect its dependencies. */
+ if (mode != JOB_ISOLATE)
+ transaction_collect_garbage(tr);
+
+ /* Let's see if the resulting transaction still has
+ * unmergeable entries ... */
+ }
+
+ /* Eights step: Drop redundant jobs again, if the merging now allows us to drop more. */
+ transaction_drop_redundant(tr);
+
+ /* Ninth step: check whether we can actually apply this */
+ r = transaction_is_destructive(tr, mode, e);
+ if (r < 0)
+ return log_notice_errno(r, "Requested transaction contradicts existing jobs: %s", bus_error_message(e, r));
+
+ /* Tenth step: apply changes */
+ r = transaction_apply(tr, m, mode, affected_jobs);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to apply transaction: %m");
+
+ assert(hashmap_isempty(tr->jobs));
+
+ if (!hashmap_isempty(m->jobs)) {
+ /* Are there any jobs now? Then make sure we have the
+ * idle pipe around. We don't really care too much
+ * whether this works or not, as the idle pipe is a
+ * feature for cosmetics, not actually useful for
+ * anything beyond that. */
+
+ if (m->idle_pipe[0] < 0 && m->idle_pipe[1] < 0 &&
+ m->idle_pipe[2] < 0 && m->idle_pipe[3] < 0) {
+ (void) pipe2(m->idle_pipe, O_NONBLOCK|O_CLOEXEC);
+ (void) pipe2(m->idle_pipe + 2, O_NONBLOCK|O_CLOEXEC);
+ }
+ }
+
+ return 0;
+}
+
+static Job* transaction_add_one_job(Transaction *tr, JobType type, Unit *unit, bool *is_new) {
+ Job *j, *f;
+
+ assert(tr);
+ assert(unit);
+
+ /* Looks for an existing prospective job and returns that. If
+ * it doesn't exist it is created and added to the prospective
+ * jobs list. */
+
+ f = hashmap_get(tr->jobs, unit);
+
+ LIST_FOREACH(transaction, i, f) {
+ assert(i->unit == unit);
+
+ if (i->type == type) {
+ if (is_new)
+ *is_new = false;
+ return i;
+ }
+ }
+
+ j = job_new(unit, type);
+ if (!j)
+ return NULL;
+
+ j->generation = 0;
+ j->marker = NULL;
+ j->matters_to_anchor = false;
+ j->irreversible = tr->irreversible;
+
+ LIST_PREPEND(transaction, f, j);
+
+ if (hashmap_replace(tr->jobs, unit, f) < 0) {
+ LIST_REMOVE(transaction, f, j);
+ job_free(j);
+ return NULL;
+ }
+
+ if (is_new)
+ *is_new = true;
+
+ log_trace("Added job %s/%s to transaction.", unit->id, job_type_to_string(type));
+
+ return j;
+}
+
+static void transaction_unlink_job(Transaction *tr, Job *j, bool delete_dependencies) {
+ assert(tr);
+ assert(j);
+
+ if (j->transaction_prev)
+ j->transaction_prev->transaction_next = j->transaction_next;
+ else if (j->transaction_next)
+ hashmap_replace(tr->jobs, j->unit, j->transaction_next);
+ else
+ hashmap_remove_value(tr->jobs, j->unit, j);
+
+ if (j->transaction_next)
+ j->transaction_next->transaction_prev = j->transaction_prev;
+
+ j->transaction_prev = j->transaction_next = NULL;
+
+ while (j->subject_list)
+ job_dependency_free(j->subject_list);
+
+ while (j->object_list) {
+ Job *other = j->object_list->matters ? j->object_list->subject : NULL;
+
+ job_dependency_free(j->object_list);
+
+ if (other && delete_dependencies) {
+ log_unit_debug(other->unit,
+ "Deleting job %s/%s as dependency of job %s/%s",
+ other->unit->id, job_type_to_string(other->type),
+ j->unit->id, job_type_to_string(j->type));
+ transaction_delete_job(tr, other, delete_dependencies);
+ }
+ }
+}
+
+void transaction_add_propagate_reload_jobs(Transaction *tr, Unit *unit, Job *by, bool ignore_order, sd_bus_error *e) {
+ JobType nt;
+ Unit *dep;
+ int r;
+
+ assert(tr);
+ assert(unit);
+
+ UNIT_FOREACH_DEPENDENCY(dep, unit, UNIT_ATOM_PROPAGATES_RELOAD_TO) {
+ nt = job_type_collapse(JOB_TRY_RELOAD, dep);
+ if (nt == JOB_NOP)
+ continue;
+
+ r = transaction_add_job_and_dependencies(tr, nt, dep, by, false, false, false, ignore_order, e);
+ if (r < 0) {
+ log_unit_warning(dep,
+ "Cannot add dependency reload job, ignoring: %s",
+ bus_error_message(e, r));
+ sd_bus_error_free(e);
+ }
+ }
+}
+
+int transaction_add_job_and_dependencies(
+ Transaction *tr,
+ JobType type,
+ Unit *unit,
+ Job *by,
+ bool matters,
+ bool conflicts,
+ bool ignore_requirements,
+ bool ignore_order,
+ sd_bus_error *e) {
+
+ bool is_new;
+ Unit *dep;
+ Job *ret;
+ int r;
+
+ assert(tr);
+ assert(type < _JOB_TYPE_MAX);
+ assert(type < _JOB_TYPE_MAX_IN_TRANSACTION);
+ assert(unit);
+
+ /* Before adding jobs for this unit, let's ensure that its state has been loaded
+ * This matters when jobs are spawned as part of coldplugging itself (see e. g. path_coldplug()).
+ * This way, we "recursively" coldplug units, ensuring that we do not look at state of
+ * not-yet-coldplugged units. */
+ if (MANAGER_IS_RELOADING(unit->manager))
+ unit_coldplug(unit);
+
+ if (by)
+ log_trace("Pulling in %s/%s from %s/%s", unit->id, job_type_to_string(type), by->unit->id, job_type_to_string(by->type));
+
+ /* Safety check that the unit is a valid state, i.e. not in UNIT_STUB or UNIT_MERGED which should only be set
+ * temporarily. */
+ if (!UNIT_IS_LOAD_COMPLETE(unit->load_state))
+ return sd_bus_error_setf(e, BUS_ERROR_LOAD_FAILED, "Unit %s is not loaded properly.", unit->id);
+
+ if (type != JOB_STOP) {
+ r = bus_unit_validate_load_state(unit, e);
+ /* The time-based cache allows to start new units without daemon-reload,
+ * but if they are already referenced (because of dependencies or ordering)
+ * then we have to force a load of the fragment. As an optimization, check
+ * first if anything in the usual paths was modified since the last time
+ * the cache was loaded. Also check if the last time an attempt to load the
+ * unit was made was before the most recent cache refresh, so that we know
+ * we need to try again — even if the cache is current, it might have been
+ * updated in a different context before we had a chance to retry loading
+ * this particular unit.
+ *
+ * Given building up the transaction is a synchronous operation, attempt
+ * to load the unit immediately. */
+ if (r < 0 && manager_unit_cache_should_retry_load(unit)) {
+ sd_bus_error_free(e);
+ unit->load_state = UNIT_STUB;
+ r = unit_load(unit);
+ if (r < 0 || unit->load_state == UNIT_STUB)
+ unit->load_state = UNIT_NOT_FOUND;
+ r = bus_unit_validate_load_state(unit, e);
+ }
+ if (r < 0)
+ return r;
+ }
+
+ if (!unit_job_is_applicable(unit, type))
+ return sd_bus_error_setf(e, BUS_ERROR_JOB_TYPE_NOT_APPLICABLE,
+ "Job type %s is not applicable for unit %s.",
+ job_type_to_string(type), unit->id);
+
+ /* First add the job. */
+ ret = transaction_add_one_job(tr, type, unit, &is_new);
+ if (!ret)
+ return -ENOMEM;
+
+ ret->ignore_order = ret->ignore_order || ignore_order;
+
+ /* Then, add a link to the job. */
+ if (by) {
+ if (!job_dependency_new(by, ret, matters, conflicts))
+ return -ENOMEM;
+ } else {
+ /* If the job has no parent job, it is the anchor job. */
+ assert(!tr->anchor_job);
+ tr->anchor_job = ret;
+ }
+
+ if (is_new && !ignore_requirements && type != JOB_NOP) {
+ Set *following;
+
+ /* If we are following some other unit, make sure we
+ * add all dependencies of everybody following. */
+ if (unit_following_set(ret->unit, &following) > 0) {
+ SET_FOREACH(dep, following) {
+ r = transaction_add_job_and_dependencies(tr, type, dep, ret, false, false, false, ignore_order, e);
+ if (r < 0) {
+ log_unit_full_errno(dep, r == -ERFKILL ? LOG_INFO : LOG_WARNING, r,
+ "Cannot add dependency job, ignoring: %s",
+ bus_error_message(e, r));
+ sd_bus_error_free(e);
+ }
+ }
+
+ set_free(following);
+ }
+
+ /* Finally, recursively add in all dependencies. */
+ if (IN_SET(type, JOB_START, JOB_RESTART)) {
+ UNIT_FOREACH_DEPENDENCY(dep, ret->unit, UNIT_ATOM_PULL_IN_START) {
+ r = transaction_add_job_and_dependencies(tr, JOB_START, dep, ret, true, false, false, ignore_order, e);
+ if (r < 0) {
+ if (r != -EBADR) /* job type not applicable */
+ goto fail;
+
+ sd_bus_error_free(e);
+ }
+ }
+
+ UNIT_FOREACH_DEPENDENCY(dep, ret->unit, UNIT_ATOM_PULL_IN_START_IGNORED) {
+ r = transaction_add_job_and_dependencies(tr, JOB_START, dep, ret, false, false, false, ignore_order, e);
+ if (r < 0) {
+ /* unit masked, job type not applicable and unit not found are not considered as errors. */
+ log_unit_full_errno(dep,
+ IN_SET(r, -ERFKILL, -EBADR, -ENOENT) ? LOG_DEBUG : LOG_WARNING,
+ r, "Cannot add dependency job, ignoring: %s",
+ bus_error_message(e, r));
+ sd_bus_error_free(e);
+ }
+ }
+
+ UNIT_FOREACH_DEPENDENCY(dep, ret->unit, UNIT_ATOM_PULL_IN_VERIFY) {
+ r = transaction_add_job_and_dependencies(tr, JOB_VERIFY_ACTIVE, dep, ret, true, false, false, ignore_order, e);
+ if (r < 0) {
+ if (r != -EBADR) /* job type not applicable */
+ goto fail;
+
+ sd_bus_error_free(e);
+ }
+ }
+
+ UNIT_FOREACH_DEPENDENCY(dep, ret->unit, UNIT_ATOM_PULL_IN_STOP) {
+ r = transaction_add_job_and_dependencies(tr, JOB_STOP, dep, ret, true, true, false, ignore_order, e);
+ if (r < 0) {
+ if (r != -EBADR) /* job type not applicable */
+ goto fail;
+
+ sd_bus_error_free(e);
+ }
+ }
+
+ UNIT_FOREACH_DEPENDENCY(dep, ret->unit, UNIT_ATOM_PULL_IN_STOP_IGNORED) {
+ r = transaction_add_job_and_dependencies(tr, JOB_STOP, dep, ret, false, false, false, ignore_order, e);
+ if (r < 0) {
+ log_unit_warning(dep,
+ "Cannot add dependency job, ignoring: %s",
+ bus_error_message(e, r));
+ sd_bus_error_free(e);
+ }
+ }
+ }
+
+ if (IN_SET(type, JOB_STOP, JOB_RESTART)) {
+ UnitDependencyAtom atom;
+ JobType ptype;
+
+ /* We propagate STOP as STOP, but RESTART only as TRY_RESTART, in order not to start
+ * dependencies that are not around. */
+ if (type == JOB_RESTART) {
+ atom = UNIT_ATOM_PROPAGATE_RESTART;
+ ptype = JOB_TRY_RESTART;
+ } else {
+ ptype = JOB_STOP;
+ atom = UNIT_ATOM_PROPAGATE_STOP;
+ }
+
+ UNIT_FOREACH_DEPENDENCY(dep, ret->unit, atom) {
+ JobType nt;
+
+ nt = job_type_collapse(ptype, dep);
+ if (nt == JOB_NOP)
+ continue;
+
+ r = transaction_add_job_and_dependencies(tr, nt, dep, ret, true, false, false, ignore_order, e);
+ if (r < 0) {
+ if (r != -EBADR) /* job type not applicable */
+ goto fail;
+
+ sd_bus_error_free(e);
+ }
+ }
+ }
+
+ if (type == JOB_RELOAD)
+ transaction_add_propagate_reload_jobs(tr, ret->unit, ret, ignore_order, e);
+
+ /* JOB_VERIFY_ACTIVE requires no dependency handling */
+ }
+
+ return 0;
+
+fail:
+ return r;
+}
+
+static bool shall_stop_on_isolate(Transaction *tr, Unit *u) {
+ assert(tr);
+ assert(u);
+
+ if (u->ignore_on_isolate)
+ return false;
+
+ /* Is there already something listed for this? */
+ if (hashmap_get(tr->jobs, u))
+ return false;
+
+ return true;
+}
+
+int transaction_add_isolate_jobs(Transaction *tr, Manager *m) {
+ Unit *u;
+ char *k;
+ int r;
+
+ assert(tr);
+ assert(m);
+
+ HASHMAP_FOREACH_KEY(u, k, m->units) {
+ Unit *o;
+
+ /* Ignore aliases */
+ if (u->id != k)
+ continue;
+
+ /* No need to stop inactive units */
+ if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u)) && !u->job)
+ continue;
+
+ if (!shall_stop_on_isolate(tr, u))
+ continue;
+
+ /* Keep units that are triggered by units we want to keep around. */
+ bool keep = false;
+ UNIT_FOREACH_DEPENDENCY(o, u, UNIT_ATOM_TRIGGERED_BY)
+ if (!shall_stop_on_isolate(tr, o)) {
+ keep = true;
+ break;
+ }
+ if (keep)
+ continue;
+
+ r = transaction_add_job_and_dependencies(tr, JOB_STOP, u, tr->anchor_job, true, false, false, false, NULL);
+ if (r < 0)
+ log_unit_warning_errno(u, r, "Cannot add isolate job, ignoring: %m");
+ }
+
+ return 0;
+}
+
+int transaction_add_triggering_jobs(Transaction *tr, Unit *u) {
+ Unit *trigger;
+ int r;
+
+ assert(tr);
+ assert(u);
+
+ UNIT_FOREACH_DEPENDENCY(trigger, u, UNIT_ATOM_TRIGGERED_BY) {
+
+ /* No need to stop inactive jobs */
+ if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(trigger)) && !trigger->job)
+ continue;
+
+ /* Is there already something listed for this? */
+ if (hashmap_get(tr->jobs, trigger))
+ continue;
+
+ r = transaction_add_job_and_dependencies(tr, JOB_STOP, trigger, tr->anchor_job, true, false, false, false, NULL);
+ if (r < 0)
+ log_unit_warning_errno(u, r, "Cannot add triggered by job, ignoring: %m");
+ }
+
+ return 0;
+}
+
+Transaction *transaction_new(bool irreversible) {
+ Transaction *tr;
+
+ tr = new0(Transaction, 1);
+ if (!tr)
+ return NULL;
+
+ tr->jobs = hashmap_new(NULL);
+ if (!tr->jobs)
+ return mfree(tr);
+
+ tr->irreversible = irreversible;
+
+ return tr;
+}
+
+void transaction_free(Transaction *tr) {
+ assert(hashmap_isempty(tr->jobs));
+ hashmap_free(tr->jobs);
+ free(tr);
+}
diff --git a/src/core/transaction.h b/src/core/transaction.h
new file mode 100644
index 0000000..c431271
--- /dev/null
+++ b/src/core/transaction.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct Transaction Transaction;
+
+#include "hashmap.h"
+#include "job.h"
+#include "manager.h"
+#include "unit.h"
+
+struct Transaction {
+ /* Jobs to be added */
+ Hashmap *jobs; /* Unit object => Job object list 1:1 */
+ Job *anchor_job; /* the job the user asked for */
+ bool irreversible;
+};
+
+Transaction *transaction_new(bool irreversible);
+void transaction_free(Transaction *tr);
+
+void transaction_add_propagate_reload_jobs(Transaction *tr, Unit *unit, Job *by, bool ignore_order, sd_bus_error *e);
+int transaction_add_job_and_dependencies(
+ Transaction *tr,
+ JobType type,
+ Unit *unit,
+ Job *by,
+ bool matters,
+ bool conflicts,
+ bool ignore_requirements,
+ bool ignore_order,
+ sd_bus_error *e);
+int transaction_activate(Transaction *tr, Manager *m, JobMode mode, Set *affected, sd_bus_error *e);
+int transaction_add_isolate_jobs(Transaction *tr, Manager *m);
+int transaction_add_triggering_jobs(Transaction *tr, Unit *u);
+void transaction_abort(Transaction *tr);
diff --git a/src/core/unit-dependency-atom.c b/src/core/unit-dependency-atom.c
new file mode 100644
index 0000000..8133352
--- /dev/null
+++ b/src/core/unit-dependency-atom.c
@@ -0,0 +1,246 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "unit-dependency-atom.h"
+
+static const UnitDependencyAtom atom_map[_UNIT_DEPENDENCY_MAX] = {
+ /* A table that maps high-level dependency types to low-level dependency "atoms". The latter actually
+ * describe specific facets of dependency behaviour. The former combine them into one user-facing
+ * concept. Atoms are a bit mask, though a bunch of dependency types have only a single bit set.
+ *
+ * Typically when the user configures a dependency they go via dependency type, but when we act on
+ * them we go by atom.
+ *
+ * NB: when you add a new dependency type here, make sure to also add one to the (best-effort)
+ * reverse table in unit_dependency_from_unique_atom() further down. */
+
+ [UNIT_REQUIRES] = UNIT_ATOM_PULL_IN_START |
+ UNIT_ATOM_RETROACTIVE_START_REPLACE |
+ UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE |
+ UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE,
+
+ [UNIT_REQUISITE] = UNIT_ATOM_PULL_IN_VERIFY |
+ UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE |
+ UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE,
+
+ [UNIT_WANTS] = UNIT_ATOM_PULL_IN_START_IGNORED |
+ UNIT_ATOM_RETROACTIVE_START_FAIL |
+ UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE |
+ UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE,
+
+ [UNIT_BINDS_TO] = UNIT_ATOM_PULL_IN_START |
+ UNIT_ATOM_RETROACTIVE_START_REPLACE |
+ UNIT_ATOM_CANNOT_BE_ACTIVE_WITHOUT |
+ UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE |
+ UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE,
+
+ [UNIT_PART_OF] = UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE,
+
+ [UNIT_UPHOLDS] = UNIT_ATOM_PULL_IN_START_IGNORED |
+ UNIT_ATOM_RETROACTIVE_START_REPLACE |
+ UNIT_ATOM_ADD_START_WHEN_UPHELD_QUEUE |
+ UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE |
+ UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE,
+
+ [UNIT_REQUIRED_BY] = UNIT_ATOM_PROPAGATE_STOP |
+ UNIT_ATOM_PROPAGATE_RESTART |
+ UNIT_ATOM_PROPAGATE_START_FAILURE |
+ UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED |
+ UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES,
+
+ [UNIT_REQUISITE_OF] = UNIT_ATOM_PROPAGATE_STOP |
+ UNIT_ATOM_PROPAGATE_RESTART |
+ UNIT_ATOM_PROPAGATE_START_FAILURE |
+ UNIT_ATOM_PROPAGATE_INACTIVE_START_AS_FAILURE |
+ UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED |
+ UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES,
+
+ [UNIT_WANTED_BY] = UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES |
+ UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED,
+
+ [UNIT_BOUND_BY] = UNIT_ATOM_RETROACTIVE_STOP_ON_STOP |
+ UNIT_ATOM_PROPAGATE_STOP |
+ UNIT_ATOM_PROPAGATE_RESTART |
+ UNIT_ATOM_PROPAGATE_START_FAILURE |
+ UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED |
+ UNIT_ATOM_ADD_CANNOT_BE_ACTIVE_WITHOUT_QUEUE |
+ UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES,
+
+ [UNIT_UPHELD_BY] = UNIT_ATOM_START_STEADILY |
+ UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES |
+ UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED,
+
+ [UNIT_CONSISTS_OF] = UNIT_ATOM_PROPAGATE_STOP |
+ UNIT_ATOM_PROPAGATE_RESTART,
+
+ [UNIT_CONFLICTS] = UNIT_ATOM_PULL_IN_STOP |
+ UNIT_ATOM_RETROACTIVE_STOP_ON_START,
+
+ [UNIT_CONFLICTED_BY] = UNIT_ATOM_PULL_IN_STOP_IGNORED |
+ UNIT_ATOM_RETROACTIVE_STOP_ON_START |
+ UNIT_ATOM_PROPAGATE_STOP_FAILURE,
+
+ [UNIT_PROPAGATES_STOP_TO] = UNIT_ATOM_RETROACTIVE_STOP_ON_STOP |
+ UNIT_ATOM_PROPAGATE_STOP,
+
+ /* These are simple dependency types: they consist of a single atom only */
+ [UNIT_ON_FAILURE] = UNIT_ATOM_ON_FAILURE,
+ [UNIT_ON_SUCCESS] = UNIT_ATOM_ON_SUCCESS,
+ [UNIT_ON_FAILURE_OF] = UNIT_ATOM_ON_FAILURE_OF,
+ [UNIT_ON_SUCCESS_OF] = UNIT_ATOM_ON_SUCCESS_OF,
+ [UNIT_BEFORE] = UNIT_ATOM_BEFORE,
+ [UNIT_AFTER] = UNIT_ATOM_AFTER,
+ [UNIT_TRIGGERS] = UNIT_ATOM_TRIGGERS,
+ [UNIT_TRIGGERED_BY] = UNIT_ATOM_TRIGGERED_BY,
+ [UNIT_PROPAGATES_RELOAD_TO] = UNIT_ATOM_PROPAGATES_RELOAD_TO,
+ [UNIT_JOINS_NAMESPACE_OF] = UNIT_ATOM_JOINS_NAMESPACE_OF,
+ [UNIT_REFERENCES] = UNIT_ATOM_REFERENCES,
+ [UNIT_REFERENCED_BY] = UNIT_ATOM_REFERENCED_BY,
+ [UNIT_IN_SLICE] = UNIT_ATOM_IN_SLICE,
+ [UNIT_SLICE_OF] = UNIT_ATOM_SLICE_OF,
+
+ /* These are dependency types without effect on our state engine. We maintain them only to make
+ * things discoverable/debuggable as they are the inverse dependencies to some of the above. As they
+ * have no effect of their own, they all map to no atoms at all, i.e. the value 0. */
+ [UNIT_RELOAD_PROPAGATED_FROM] = 0,
+ [UNIT_STOP_PROPAGATED_FROM] = 0,
+};
+
+UnitDependencyAtom unit_dependency_to_atom(UnitDependency d) {
+ if (d < 0)
+ return _UNIT_DEPENDENCY_ATOM_INVALID;
+
+ assert(d < _UNIT_DEPENDENCY_MAX);
+
+ return atom_map[d];
+}
+
+UnitDependency unit_dependency_from_unique_atom(UnitDependencyAtom atom) {
+
+ /* This is a "best-effort" function that maps the specified 'atom' mask to a dependency type that is
+ * is equal to or has a superset of bits set if that's uniquely possible. The idea is that this
+ * function is used when iterating through deps that have a specific atom: if there's exactly one
+ * dependency type of the specific atom we don't need iterate through all deps a unit has, but can
+ * pinpoint things directly.
+ *
+ * This function will return _UNIT_DEPENDENCY_INVALID in case the specified value is not known or not
+ * uniquely defined, i.e. there are multiple dependencies with the atom or the combination set. */
+
+ switch ((int64_t) atom) {
+
+ /* Note that we can't list UNIT_REQUIRES here since it's a true subset of UNIT_BINDS_TO, and
+ * hence its atom bits not uniquely mappable. */
+
+ case UNIT_ATOM_PULL_IN_VERIFY |
+ UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE |
+ UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE:
+ case UNIT_ATOM_PULL_IN_VERIFY: /* a single dep type uses this atom */
+ return UNIT_REQUISITE;
+
+ case UNIT_ATOM_PULL_IN_START_IGNORED |
+ UNIT_ATOM_RETROACTIVE_START_FAIL |
+ UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE |
+ UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE:
+ case UNIT_ATOM_RETROACTIVE_START_FAIL:
+ return UNIT_WANTS;
+
+ case UNIT_ATOM_PULL_IN_START |
+ UNIT_ATOM_RETROACTIVE_START_REPLACE |
+ UNIT_ATOM_CANNOT_BE_ACTIVE_WITHOUT |
+ UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE |
+ UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE:
+ case UNIT_ATOM_CANNOT_BE_ACTIVE_WITHOUT:
+ return UNIT_BINDS_TO;
+
+ case UNIT_ATOM_PULL_IN_START_IGNORED |
+ UNIT_ATOM_RETROACTIVE_START_REPLACE |
+ UNIT_ATOM_ADD_START_WHEN_UPHELD_QUEUE |
+ UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE |
+ UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE:
+ case UNIT_ATOM_ADD_START_WHEN_UPHELD_QUEUE:
+ return UNIT_UPHOLDS;
+
+ case UNIT_ATOM_PROPAGATE_STOP |
+ UNIT_ATOM_PROPAGATE_RESTART |
+ UNIT_ATOM_PROPAGATE_START_FAILURE |
+ UNIT_ATOM_PROPAGATE_INACTIVE_START_AS_FAILURE |
+ UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED |
+ UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES:
+ case UNIT_ATOM_PROPAGATE_INACTIVE_START_AS_FAILURE:
+ return UNIT_REQUISITE_OF;
+
+ case UNIT_ATOM_RETROACTIVE_STOP_ON_STOP |
+ UNIT_ATOM_PROPAGATE_STOP |
+ UNIT_ATOM_PROPAGATE_RESTART |
+ UNIT_ATOM_PROPAGATE_START_FAILURE |
+ UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED |
+ UNIT_ATOM_ADD_CANNOT_BE_ACTIVE_WITHOUT_QUEUE |
+ UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES:
+ case UNIT_ATOM_ADD_CANNOT_BE_ACTIVE_WITHOUT_QUEUE:
+ return UNIT_BOUND_BY;
+
+ case UNIT_ATOM_START_STEADILY |
+ UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES |
+ UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED:
+ case UNIT_ATOM_START_STEADILY:
+ return UNIT_UPHELD_BY;
+
+ case UNIT_ATOM_PULL_IN_STOP |
+ UNIT_ATOM_RETROACTIVE_STOP_ON_START:
+ case UNIT_ATOM_PULL_IN_STOP:
+ return UNIT_CONFLICTS;
+
+ case UNIT_ATOM_PULL_IN_STOP_IGNORED |
+ UNIT_ATOM_RETROACTIVE_STOP_ON_START |
+ UNIT_ATOM_PROPAGATE_STOP_FAILURE:
+ case UNIT_ATOM_PULL_IN_STOP_IGNORED:
+ case UNIT_ATOM_PROPAGATE_STOP_FAILURE:
+ return UNIT_CONFLICTED_BY;
+
+ /* And now, the simple ones */
+
+ case UNIT_ATOM_ON_FAILURE:
+ return UNIT_ON_FAILURE;
+
+ case UNIT_ATOM_ON_SUCCESS:
+ return UNIT_ON_SUCCESS;
+
+ case UNIT_ATOM_ON_SUCCESS_OF:
+ return UNIT_ON_SUCCESS_OF;
+
+ case UNIT_ATOM_ON_FAILURE_OF:
+ return UNIT_ON_FAILURE_OF;
+
+ case UNIT_ATOM_BEFORE:
+ return UNIT_BEFORE;
+
+ case UNIT_ATOM_AFTER:
+ return UNIT_AFTER;
+
+ case UNIT_ATOM_TRIGGERS:
+ return UNIT_TRIGGERS;
+
+ case UNIT_ATOM_TRIGGERED_BY:
+ return UNIT_TRIGGERED_BY;
+
+ case UNIT_ATOM_PROPAGATES_RELOAD_TO:
+ return UNIT_PROPAGATES_RELOAD_TO;
+
+ case UNIT_ATOM_JOINS_NAMESPACE_OF:
+ return UNIT_JOINS_NAMESPACE_OF;
+
+ case UNIT_ATOM_REFERENCES:
+ return UNIT_REFERENCES;
+
+ case UNIT_ATOM_REFERENCED_BY:
+ return UNIT_REFERENCED_BY;
+
+ case UNIT_ATOM_IN_SLICE:
+ return UNIT_IN_SLICE;
+
+ case UNIT_ATOM_SLICE_OF:
+ return UNIT_SLICE_OF;
+
+ default:
+ return _UNIT_DEPENDENCY_INVALID;
+ }
+}
diff --git a/src/core/unit-dependency-atom.h b/src/core/unit-dependency-atom.h
new file mode 100644
index 0000000..02532e5
--- /dev/null
+++ b/src/core/unit-dependency-atom.h
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <errno.h>
+
+#include "unit-def.h"
+
+/* Flags that identify the various "atomic" behaviours a specific dependency type implies. Each dependency is
+ * a combination of one or more of these flags that define what they actually entail. */
+typedef enum UnitDependencyAtom {
+
+ /* This unit pulls in the other unit as JOB_START job into the transaction, and if that doesn't work
+ * the transaction fails. */
+ UNIT_ATOM_PULL_IN_START = UINT64_C(1) << 0,
+ /* Similar, but if it doesn't work, ignore. */
+ UNIT_ATOM_PULL_IN_START_IGNORED = UINT64_C(1) << 1,
+ /* Pull in a JOB_VERIFY job into the transaction, i.e. pull in JOB_VERIFY rather than
+ * JOB_START. i.e. check the unit is started but don't pull it in. */
+ UNIT_ATOM_PULL_IN_VERIFY = UINT64_C(1) << 2,
+
+ /* Pull in a JOB_STOP job for the other job into transactions, and fail if that doesn't work. */
+ UNIT_ATOM_PULL_IN_STOP = UINT64_C(1) << 3,
+ /* Same, but don't fail, ignore it. */
+ UNIT_ATOM_PULL_IN_STOP_IGNORED = UINT64_C(1) << 4,
+
+ /* If our enters inactive state, add the other unit to the StopWhenUneeded= queue */
+ UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE = UINT64_C(1) << 5,
+ /* Pin the other unit i.e. ensure StopWhenUneeded= won't trigger for the other unit as long as we are
+ * not in inactive state */
+ UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED = UINT64_C(1) << 6,
+
+ /* Stop our unit if the other unit happens to inactive */
+ UNIT_ATOM_CANNOT_BE_ACTIVE_WITHOUT = UINT64_C(1) << 7,
+ /* If our unit enters inactive state, add the other unit to the BoundBy= queue */
+ UNIT_ATOM_ADD_CANNOT_BE_ACTIVE_WITHOUT_QUEUE = UINT64_C(1) << 8,
+
+ /* Start this unit whenever we find it inactive and the other unit active */
+ UNIT_ATOM_START_STEADILY = UINT64_C(1) << 9,
+ /* Whenever our unit becomes active, add other unit to start_when_upheld_queue */
+ UNIT_ATOM_ADD_START_WHEN_UPHELD_QUEUE = UINT64_C(1) << 10,
+
+ /* If our unit unexpectedly becomes active, retroactively start the other unit too, in "replace" job
+ * mode */
+ UNIT_ATOM_RETROACTIVE_START_REPLACE = UINT64_C(1) << 11,
+ /* Similar, but in "fail" job mode */
+ UNIT_ATOM_RETROACTIVE_START_FAIL = UINT64_C(1) << 12,
+ /* If our unit unexpectedly becomes active, retroactively stop the other unit too */
+ UNIT_ATOM_RETROACTIVE_STOP_ON_START = UINT64_C(1) << 13,
+ /* If our unit unexpectedly becomes inactive, retroactively stop the other unit too */
+ UNIT_ATOM_RETROACTIVE_STOP_ON_STOP = UINT64_C(1) << 14,
+
+ /* If a start job for this unit fails, propagate the failure to start job of other unit too */
+ UNIT_ATOM_PROPAGATE_START_FAILURE = UINT64_C(1) << 15,
+ /* If a stop job for this unit fails, propagate the failure to any stop job of the other unit too */
+ UNIT_ATOM_PROPAGATE_STOP_FAILURE = UINT64_C(1) << 16,
+ /* If our start job succeeded but the unit is inactive then (think: oneshot units), propagate this as
+ * failure to the other unit. */
+ UNIT_ATOM_PROPAGATE_INACTIVE_START_AS_FAILURE = UINT64_C(1) << 17,
+ /* When putting together a transaction, propagate JOB_STOP from our unit to the other. */
+ UNIT_ATOM_PROPAGATE_STOP = UINT64_C(1) << 18,
+ /* When putting together a transaction, propagate JOB_RESTART from our unit to the other. */
+ UNIT_ATOM_PROPAGATE_RESTART = UINT64_C(1) << 19,
+
+ /* Add the other unit to the default target dependency queue */
+ UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE = UINT64_C(1) << 20,
+ /* Recheck default target deps on other units (which are target units) */
+ UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES = UINT64_C(1) << 21,
+
+ /* The remaining atoms map 1:1 to the equally named high-level deps */
+ UNIT_ATOM_ON_FAILURE = UINT64_C(1) << 22,
+ UNIT_ATOM_ON_SUCCESS = UINT64_C(1) << 23,
+ UNIT_ATOM_ON_FAILURE_OF = UINT64_C(1) << 24,
+ UNIT_ATOM_ON_SUCCESS_OF = UINT64_C(1) << 25,
+ UNIT_ATOM_BEFORE = UINT64_C(1) << 26,
+ UNIT_ATOM_AFTER = UINT64_C(1) << 27,
+ UNIT_ATOM_TRIGGERS = UINT64_C(1) << 28,
+ UNIT_ATOM_TRIGGERED_BY = UINT64_C(1) << 29,
+ UNIT_ATOM_PROPAGATES_RELOAD_TO = UINT64_C(1) << 30,
+ UNIT_ATOM_JOINS_NAMESPACE_OF = UINT64_C(1) << 31,
+ UNIT_ATOM_REFERENCES = UINT64_C(1) << 32,
+ UNIT_ATOM_REFERENCED_BY = UINT64_C(1) << 33,
+ UNIT_ATOM_IN_SLICE = UINT64_C(1) << 34,
+ UNIT_ATOM_SLICE_OF = UINT64_C(1) << 35,
+ _UNIT_DEPENDENCY_ATOM_MAX = (UINT64_C(1) << 36) - 1,
+ _UNIT_DEPENDENCY_ATOM_INVALID = -EINVAL,
+} UnitDependencyAtom;
+
+UnitDependencyAtom unit_dependency_to_atom(UnitDependency d);
+UnitDependency unit_dependency_from_unique_atom(UnitDependencyAtom atom);
diff --git a/src/core/unit-printf.c b/src/core/unit-printf.c
new file mode 100644
index 0000000..e454df0
--- /dev/null
+++ b/src/core/unit-printf.c
@@ -0,0 +1,263 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "cgroup-util.h"
+#include "format-util.h"
+#include "macro.h"
+#include "specifier.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit-name.h"
+#include "unit-printf.h"
+#include "unit.h"
+#include "user-util.h"
+
+static int specifier_prefix_and_instance(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+ const Unit *u = ASSERT_PTR(userdata);
+
+ return unit_name_to_prefix_and_instance(u->id, ret);
+}
+
+static int specifier_prefix(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+ const Unit *u = ASSERT_PTR(userdata);
+
+ return unit_name_to_prefix(u->id, ret);
+}
+
+static int specifier_prefix_unescaped(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+ _cleanup_free_ char *p = NULL;
+ const Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ r = unit_name_to_prefix(u->id, &p);
+ if (r < 0)
+ return r;
+
+ return unit_name_unescape(p, ret);
+}
+
+static int specifier_instance_unescaped(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+ const Unit *u = ASSERT_PTR(userdata);
+
+ return unit_name_unescape(strempty(u->instance), ret);
+}
+
+static int specifier_last_component(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+ const Unit *u = ASSERT_PTR(userdata);
+ _cleanup_free_ char *prefix = NULL;
+ char *dash;
+ int r;
+
+ r = unit_name_to_prefix(u->id, &prefix);
+ if (r < 0)
+ return r;
+
+ dash = strrchr(prefix, '-');
+ if (dash)
+ return specifier_string(specifier, dash + 1, root, userdata, ret);
+
+ *ret = TAKE_PTR(prefix);
+ return 0;
+}
+
+static int specifier_last_component_unescaped(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+ _cleanup_free_ char *p = NULL;
+ int r;
+
+ r = specifier_last_component(specifier, data, root, userdata, &p);
+ if (r < 0)
+ return r;
+
+ return unit_name_unescape(p, ret);
+}
+
+static int specifier_filename(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+ const Unit *u = ASSERT_PTR(userdata);
+
+ if (u->instance)
+ return unit_name_path_unescape(u->instance, ret);
+ else
+ return unit_name_to_path(u->id, ret);
+}
+
+static void bad_specifier(const Unit *u, char specifier) {
+ log_unit_warning(u, "Specifier '%%%c' used in unit configuration, which is deprecated. Please update your unit file, as it does not work as intended.", specifier);
+}
+
+static int specifier_cgroup(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+ const Unit *u = ASSERT_PTR(userdata);
+ char *n;
+
+ bad_specifier(u, specifier);
+
+ if (u->cgroup_path)
+ n = strdup(u->cgroup_path);
+ else
+ n = unit_default_cgroup_path(u);
+ if (!n)
+ return -ENOMEM;
+
+ *ret = n;
+ return 0;
+}
+
+static int specifier_cgroup_root(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+ const Unit *u = ASSERT_PTR(userdata);
+ char *n;
+
+ bad_specifier(u, specifier);
+
+ n = strdup(u->manager->cgroup_root);
+ if (!n)
+ return -ENOMEM;
+
+ *ret = n;
+ return 0;
+}
+
+static int specifier_cgroup_slice(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+ const Unit *u = ASSERT_PTR(userdata), *slice;
+ char *n;
+
+ bad_specifier(u, specifier);
+
+ slice = UNIT_GET_SLICE(u);
+ if (slice) {
+ if (slice->cgroup_path)
+ n = strdup(slice->cgroup_path);
+ else
+ n = unit_default_cgroup_path(slice);
+ } else
+ n = strdup(u->manager->cgroup_root);
+ if (!n)
+ return -ENOMEM;
+
+ *ret = n;
+ return 0;
+}
+
+static int specifier_special_directory(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+ const Unit *u = ASSERT_PTR(userdata);
+ char *n;
+
+ n = strdup(u->manager->prefix[PTR_TO_UINT(data)]);
+ if (!n)
+ return -ENOMEM;
+
+ *ret = n;
+ return 0;
+}
+
+static int specifier_credentials_dir(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+ const Unit *u = ASSERT_PTR(userdata);
+ char *d;
+
+ assert(ret);
+
+ d = strjoin(u->manager->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
+ if (!d)
+ return -ENOMEM;
+
+ *ret = d;
+ return 0;
+}
+
+int unit_name_printf(const Unit *u, const char* format, char **ret) {
+ /*
+ * This will use the passed string as format string and replace the following specifiers (which should all be
+ * safe for inclusion in unit names):
+ *
+ * %n: the full id of the unit (foo-aaa@bar.waldo)
+ * %N: the id of the unit without the suffix (foo-aaa@bar)
+ * %p: the prefix (foo-aaa)
+ * %i: the instance (bar)
+ * %j: the last component of the prefix (aaa)
+ */
+
+ const Specifier table[] = {
+ { 'i', specifier_string, u->instance },
+ { 'j', specifier_last_component, NULL },
+ { 'n', specifier_string, u->id },
+ { 'N', specifier_prefix_and_instance, NULL },
+ { 'p', specifier_prefix, NULL },
+
+ COMMON_SYSTEM_SPECIFIERS,
+
+ COMMON_CREDS_SPECIFIERS(u->manager->unit_file_scope),
+ {}
+ };
+
+ assert(u);
+ assert(format);
+ assert(ret);
+
+ return specifier_printf(format, UNIT_NAME_MAX, table, NULL, u, ret);
+}
+
+int unit_full_printf_full(const Unit *u, const char *format, size_t max_length, char **ret) {
+ /* This is similar to unit_name_printf() but also supports unescaping. Also, adds a couple of
+ * additional codes (which are likely not suitable for unescaped inclusion in unit names):
+ *
+ * %f: the unescaped instance if set, otherwise the id unescaped as path
+ *
+ * %c: cgroup path of unit (deprecated)
+ * %r: where units in this slice are placed in the cgroup tree (deprecated)
+ * %R: the root of this systemd's instance tree (deprecated)
+ *
+ * %C: the cache directory root (e.g. /var/cache or $XDG_CACHE_HOME)
+ * %d: the credentials directory ($CREDENTIALS_DIRECTORY)
+ * %E: the configuration directory root (e.g. /etc or $XDG_CONFIG_HOME)
+ * %L: the log directory root (e.g. /var/log or $XDG_CONFIG_HOME/log)
+ * %S: the state directory root (e.g. /var/lib or $XDG_CONFIG_HOME)
+ * %t: the runtime directory root (e.g. /run or $XDG_RUNTIME_DIR)
+ *
+ * %h: the homedir of the running user
+ * %s: the shell of the running user
+ *
+ * NOTICE: When you add new entries here, please be careful: specifiers which depend on settings of
+ * the unit file itself are broken by design, as they would resolve differently depending on whether
+ * they are used before or after the relevant configuration setting. Hence: don't add them.
+ */
+
+ assert(u);
+ assert(format);
+ assert(ret);
+
+ const Specifier table[] = {
+ { 'i', specifier_string, u->instance },
+ { 'I', specifier_instance_unescaped, NULL },
+ { 'j', specifier_last_component, NULL },
+ { 'J', specifier_last_component_unescaped, NULL },
+ { 'n', specifier_string, u->id },
+ { 'N', specifier_prefix_and_instance, NULL },
+ { 'p', specifier_prefix, NULL },
+ { 'P', specifier_prefix_unescaped, NULL },
+
+ { 'f', specifier_filename, NULL },
+ { 'y', specifier_real_path, u->fragment_path },
+ { 'Y', specifier_real_directory, u->fragment_path },
+
+ { 'c', specifier_cgroup, NULL }, /* deprecated, see 1b89b0c499cd4bf0ff389caab4ecaae6e75f9d4e */
+ { 'r', specifier_cgroup_slice, NULL }, /* deprecated, see 1b89b0c499cd4bf0ff389caab4ecaae6e75f9d4e */
+ { 'R', specifier_cgroup_root, NULL }, /* deprecated, see 1b89b0c499cd4bf0ff389caab4ecaae6e75f9d4e */
+
+ { 'C', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_CACHE) },
+ { 'd', specifier_credentials_dir, NULL },
+ { 'E', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_CONFIGURATION) },
+ { 'L', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_LOGS) },
+ { 'S', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_STATE) },
+ { 't', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_RUNTIME) },
+
+ { 'h', specifier_user_home, NULL },
+ { 's', specifier_user_shell, NULL },
+
+ COMMON_SYSTEM_SPECIFIERS,
+
+ COMMON_CREDS_SPECIFIERS(u->manager->unit_file_scope),
+
+ COMMON_TMP_SPECIFIERS,
+ {}
+ };
+
+ return specifier_printf(format, max_length, table, NULL, u, ret);
+}
diff --git a/src/core/unit-printf.h b/src/core/unit-printf.h
new file mode 100644
index 0000000..2df07db
--- /dev/null
+++ b/src/core/unit-printf.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "creds-util.h"
+#include "env-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "unit.h"
+
+int unit_name_printf(const Unit *u, const char* text, char **ret);
+int unit_full_printf_full(const Unit *u, const char *text, size_t max_length, char **ret);
+static inline int unit_full_printf(const Unit *u, const char *text, char **ret) {
+ return unit_full_printf_full(u, text, LONG_LINE_MAX, ret);
+}
+static inline int unit_path_printf(const Unit *u, const char *text, char **ret) {
+ return unit_full_printf_full(u, text, PATH_MAX-1, ret);
+}
+static inline int unit_fd_printf(const Unit *u, const char *text, char **ret) {
+ return unit_full_printf_full(u, text, FDNAME_MAX, ret);
+}
+static inline int unit_cred_printf(const Unit *u, const char *text, char **ret) {
+ return unit_full_printf_full(u, text, CREDENTIAL_NAME_MAX, ret);
+}
+static inline int unit_env_printf(const Unit *u, const char *text, char **ret) {
+ return unit_full_printf_full(u, text, sc_arg_max(), ret);
+}
diff --git a/src/core/unit-serialize.c b/src/core/unit-serialize.c
new file mode 100644
index 0000000..21457dc
--- /dev/null
+++ b/src/core/unit-serialize.c
@@ -0,0 +1,864 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bpf-socket-bind.h"
+#include "bus-util.h"
+#include "dbus.h"
+#include "fileio-label.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "parse-util.h"
+#include "restrict-ifaces.h"
+#include "serialize.h"
+#include "string-table.h"
+#include "unit-serialize.h"
+#include "user-util.h"
+
+static int serialize_cgroup_mask(FILE *f, const char *key, CGroupMask mask) {
+ _cleanup_free_ char *s = NULL;
+ int r;
+
+ assert(f);
+ assert(key);
+
+ if (mask == 0)
+ return 0;
+
+ r = cg_mask_to_string(mask, &s);
+ if (r < 0)
+ return log_error_errno(r, "Failed to format cgroup mask: %m");
+
+ return serialize_item(f, key, s);
+}
+
+/* Make sure out values fit in the bitfield. */
+assert_cc(_UNIT_MARKER_MAX <= sizeof(((Unit){}).markers) * 8);
+
+static int serialize_markers(FILE *f, unsigned markers) {
+ assert(f);
+
+ if (markers == 0)
+ return 0;
+
+ fputs("markers=", f);
+ for (UnitMarker m = 0; m < _UNIT_MARKER_MAX; m++)
+ if (FLAGS_SET(markers, 1u << m))
+ fputs(unit_marker_to_string(m), f);
+ fputc('\n', f);
+ return 0;
+}
+
+static int deserialize_markers(Unit *u, const char *value) {
+ assert(u);
+ assert(value);
+ int r;
+
+ for (const char *p = value;;) {
+ _cleanup_free_ char *word = NULL;
+
+ r = extract_first_word(&p, &word, NULL, 0);
+ if (r <= 0)
+ return r;
+
+ UnitMarker m = unit_marker_from_string(word);
+ if (m < 0) {
+ log_unit_debug_errno(u, m, "Unknown unit marker \"%s\", ignoring.", word);
+ continue;
+ }
+
+ u->markers |= 1u << m;
+ }
+}
+
+static const char *const ip_accounting_metric_field[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = {
+ [CGROUP_IP_INGRESS_BYTES] = "ip-accounting-ingress-bytes",
+ [CGROUP_IP_INGRESS_PACKETS] = "ip-accounting-ingress-packets",
+ [CGROUP_IP_EGRESS_BYTES] = "ip-accounting-egress-bytes",
+ [CGROUP_IP_EGRESS_PACKETS] = "ip-accounting-egress-packets",
+};
+
+static const char *const io_accounting_metric_field_base[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
+ [CGROUP_IO_READ_BYTES] = "io-accounting-read-bytes-base",
+ [CGROUP_IO_WRITE_BYTES] = "io-accounting-write-bytes-base",
+ [CGROUP_IO_READ_OPERATIONS] = "io-accounting-read-operations-base",
+ [CGROUP_IO_WRITE_OPERATIONS] = "io-accounting-write-operations-base",
+};
+
+static const char *const io_accounting_metric_field_last[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
+ [CGROUP_IO_READ_BYTES] = "io-accounting-read-bytes-last",
+ [CGROUP_IO_WRITE_BYTES] = "io-accounting-write-bytes-last",
+ [CGROUP_IO_READ_OPERATIONS] = "io-accounting-read-operations-last",
+ [CGROUP_IO_WRITE_OPERATIONS] = "io-accounting-write-operations-last",
+};
+
+int unit_serialize(Unit *u, FILE *f, FDSet *fds, bool switching_root) {
+ int r;
+
+ assert(u);
+ assert(f);
+ assert(fds);
+
+ if (switching_root && UNIT_VTABLE(u)->exclude_from_switch_root_serialization) {
+ /* In the new root, paths for mounts and automounts will be different, so it doesn't make
+ * much sense to serialize things. API file systems will be moved to the new root, but we
+ * don't have mount units for those. */
+ log_unit_debug(u, "not serializing before switch-root");
+ return 0;
+ }
+
+ /* Start marker */
+ fputs(u->id, f);
+ fputc('\n', f);
+
+ assert(!!UNIT_VTABLE(u)->serialize == !!UNIT_VTABLE(u)->deserialize_item);
+
+ if (UNIT_VTABLE(u)->serialize) {
+ r = UNIT_VTABLE(u)->serialize(u, f, fds);
+ if (r < 0)
+ return r;
+ }
+
+ (void) serialize_dual_timestamp(f, "state-change-timestamp", &u->state_change_timestamp);
+
+ (void) serialize_dual_timestamp(f, "inactive-exit-timestamp", &u->inactive_exit_timestamp);
+ (void) serialize_dual_timestamp(f, "active-enter-timestamp", &u->active_enter_timestamp);
+ (void) serialize_dual_timestamp(f, "active-exit-timestamp", &u->active_exit_timestamp);
+ (void) serialize_dual_timestamp(f, "inactive-enter-timestamp", &u->inactive_enter_timestamp);
+
+ (void) serialize_dual_timestamp(f, "condition-timestamp", &u->condition_timestamp);
+ (void) serialize_dual_timestamp(f, "assert-timestamp", &u->assert_timestamp);
+
+ if (dual_timestamp_is_set(&u->condition_timestamp))
+ (void) serialize_bool(f, "condition-result", u->condition_result);
+
+ if (dual_timestamp_is_set(&u->assert_timestamp))
+ (void) serialize_bool(f, "assert-result", u->assert_result);
+
+ (void) serialize_bool(f, "transient", u->transient);
+ (void) serialize_bool(f, "in-audit", u->in_audit);
+
+ (void) serialize_bool(f, "exported-invocation-id", u->exported_invocation_id);
+ (void) serialize_bool(f, "exported-log-level-max", u->exported_log_level_max);
+ (void) serialize_bool(f, "exported-log-extra-fields", u->exported_log_extra_fields);
+ (void) serialize_bool(f, "exported-log-rate-limit-interval", u->exported_log_ratelimit_interval);
+ (void) serialize_bool(f, "exported-log-rate-limit-burst", u->exported_log_ratelimit_burst);
+
+ (void) serialize_item_format(f, "cpu-usage-base", "%" PRIu64, u->cpu_usage_base);
+ if (u->cpu_usage_last != NSEC_INFINITY)
+ (void) serialize_item_format(f, "cpu-usage-last", "%" PRIu64, u->cpu_usage_last);
+
+ if (u->managed_oom_kill_last > 0)
+ (void) serialize_item_format(f, "managed-oom-kill-last", "%" PRIu64, u->managed_oom_kill_last);
+
+ if (u->oom_kill_last > 0)
+ (void) serialize_item_format(f, "oom-kill-last", "%" PRIu64, u->oom_kill_last);
+
+ for (CGroupIOAccountingMetric im = 0; im < _CGROUP_IO_ACCOUNTING_METRIC_MAX; im++) {
+ (void) serialize_item_format(f, io_accounting_metric_field_base[im], "%" PRIu64, u->io_accounting_base[im]);
+
+ if (u->io_accounting_last[im] != UINT64_MAX)
+ (void) serialize_item_format(f, io_accounting_metric_field_last[im], "%" PRIu64, u->io_accounting_last[im]);
+ }
+
+ if (u->cgroup_path)
+ (void) serialize_item(f, "cgroup", u->cgroup_path);
+
+ (void) serialize_bool(f, "cgroup-realized", u->cgroup_realized);
+ (void) serialize_cgroup_mask(f, "cgroup-realized-mask", u->cgroup_realized_mask);
+ (void) serialize_cgroup_mask(f, "cgroup-enabled-mask", u->cgroup_enabled_mask);
+ (void) serialize_cgroup_mask(f, "cgroup-invalidated-mask", u->cgroup_invalidated_mask);
+
+ (void) bpf_serialize_socket_bind(u, f, fds);
+
+ (void) bpf_program_serialize_attachment(f, fds, "ip-bpf-ingress-installed", u->ip_bpf_ingress_installed);
+ (void) bpf_program_serialize_attachment(f, fds, "ip-bpf-egress-installed", u->ip_bpf_egress_installed);
+ (void) bpf_program_serialize_attachment(f, fds, "bpf-device-control-installed", u->bpf_device_control_installed);
+ (void) bpf_program_serialize_attachment_set(f, fds, "ip-bpf-custom-ingress-installed", u->ip_bpf_custom_ingress_installed);
+ (void) bpf_program_serialize_attachment_set(f, fds, "ip-bpf-custom-egress-installed", u->ip_bpf_custom_egress_installed);
+
+ (void) serialize_restrict_network_interfaces(u, f, fds);
+
+ if (uid_is_valid(u->ref_uid))
+ (void) serialize_item_format(f, "ref-uid", UID_FMT, u->ref_uid);
+ if (gid_is_valid(u->ref_gid))
+ (void) serialize_item_format(f, "ref-gid", GID_FMT, u->ref_gid);
+
+ if (!sd_id128_is_null(u->invocation_id))
+ (void) serialize_item_format(f, "invocation-id", SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id));
+
+ (void) serialize_item_format(f, "freezer-state", "%s", freezer_state_to_string(unit_freezer_state(u)));
+ (void) serialize_markers(f, u->markers);
+
+ bus_track_serialize(u->bus_track, f, "ref");
+
+ for (CGroupIPAccountingMetric m = 0; m < _CGROUP_IP_ACCOUNTING_METRIC_MAX; m++) {
+ uint64_t v;
+
+ r = unit_get_ip_accounting(u, m, &v);
+ if (r >= 0)
+ (void) serialize_item_format(f, ip_accounting_metric_field[m], "%" PRIu64, v);
+ }
+
+ if (!switching_root) {
+ if (u->job) {
+ fputs("job\n", f);
+ job_serialize(u->job, f);
+ }
+
+ if (u->nop_job) {
+ fputs("job\n", f);
+ job_serialize(u->nop_job, f);
+ }
+ }
+
+ /* End marker */
+ fputc('\n', f);
+ return 0;
+}
+
+static int unit_deserialize_job(Unit *u, FILE *f) {
+ _cleanup_(job_freep) Job *j = NULL;
+ int r;
+
+ assert(u);
+ assert(f);
+
+ j = job_new_raw(u);
+ if (!j)
+ return log_oom();
+
+ r = job_deserialize(j, f);
+ if (r < 0)
+ return r;
+
+ r = job_install_deserialized(j);
+ if (r < 0)
+ return r;
+
+ TAKE_PTR(j);
+ return 0;
+}
+
+#define MATCH_DESERIALIZE(key, l, v, parse_func, target) \
+ ({ \
+ bool _deserialize_matched = streq(l, key); \
+ if (_deserialize_matched) { \
+ int _deserialize_r = parse_func(v); \
+ if (_deserialize_r < 0) \
+ log_unit_debug_errno(u, _deserialize_r, \
+ "Failed to parse \"%s=%s\", ignoring.", l, v); \
+ else \
+ target = _deserialize_r; \
+ }; \
+ _deserialize_matched; \
+ })
+
+#define MATCH_DESERIALIZE_IMMEDIATE(key, l, v, parse_func, target) \
+ ({ \
+ bool _deserialize_matched = streq(l, key); \
+ if (_deserialize_matched) { \
+ int _deserialize_r = parse_func(v, &target); \
+ if (_deserialize_r < 0) \
+ log_unit_debug_errno(u, _deserialize_r, \
+ "Failed to parse \"%s=%s\", ignoring", l, v); \
+ }; \
+ _deserialize_matched; \
+ })
+
+int unit_deserialize(Unit *u, FILE *f, FDSet *fds) {
+ int r;
+
+ assert(u);
+ assert(f);
+ assert(fds);
+
+ for (;;) {
+ _cleanup_free_ char *line = NULL;
+ char *l, *v;
+ ssize_t m;
+ size_t k;
+
+ r = read_line(f, LONG_LINE_MAX, &line);
+ if (r < 0)
+ return log_error_errno(r, "Failed to read serialization line: %m");
+ if (r == 0) /* eof */
+ break;
+
+ l = strstrip(line);
+ if (isempty(l)) /* End marker */
+ break;
+
+ k = strcspn(l, "=");
+
+ if (l[k] == '=') {
+ l[k] = 0;
+ v = l+k+1;
+ } else
+ v = l+k;
+
+ if (streq(l, "job")) {
+ if (v[0] == '\0') {
+ /* New-style serialized job */
+ r = unit_deserialize_job(u, f);
+ if (r < 0)
+ return r;
+ } else /* Legacy for pre-44 */
+ log_unit_warning(u, "Update from too old systemd versions are unsupported, cannot deserialize job: %s", v);
+ continue;
+ } else if (streq(l, "state-change-timestamp")) {
+ (void) deserialize_dual_timestamp(v, &u->state_change_timestamp);
+ continue;
+ } else if (streq(l, "inactive-exit-timestamp")) {
+ (void) deserialize_dual_timestamp(v, &u->inactive_exit_timestamp);
+ continue;
+ } else if (streq(l, "active-enter-timestamp")) {
+ (void) deserialize_dual_timestamp(v, &u->active_enter_timestamp);
+ continue;
+ } else if (streq(l, "active-exit-timestamp")) {
+ (void) deserialize_dual_timestamp(v, &u->active_exit_timestamp);
+ continue;
+ } else if (streq(l, "inactive-enter-timestamp")) {
+ (void) deserialize_dual_timestamp(v, &u->inactive_enter_timestamp);
+ continue;
+ } else if (streq(l, "condition-timestamp")) {
+ (void) deserialize_dual_timestamp(v, &u->condition_timestamp);
+ continue;
+ } else if (streq(l, "assert-timestamp")) {
+ (void) deserialize_dual_timestamp(v, &u->assert_timestamp);
+ continue;
+
+ } else if (MATCH_DESERIALIZE("condition-result", l, v, parse_boolean, u->condition_result))
+ continue;
+
+ else if (MATCH_DESERIALIZE("assert-result", l, v, parse_boolean, u->assert_result))
+ continue;
+
+ else if (MATCH_DESERIALIZE("transient", l, v, parse_boolean, u->transient))
+ continue;
+
+ else if (MATCH_DESERIALIZE("in-audit", l, v, parse_boolean, u->in_audit))
+ continue;
+
+ else if (MATCH_DESERIALIZE("exported-invocation-id", l, v, parse_boolean, u->exported_invocation_id))
+ continue;
+
+ else if (MATCH_DESERIALIZE("exported-log-level-max", l, v, parse_boolean, u->exported_log_level_max))
+ continue;
+
+ else if (MATCH_DESERIALIZE("exported-log-extra-fields", l, v, parse_boolean, u->exported_log_extra_fields))
+ continue;
+
+ else if (MATCH_DESERIALIZE("exported-log-rate-limit-interval", l, v, parse_boolean, u->exported_log_ratelimit_interval))
+ continue;
+
+ else if (MATCH_DESERIALIZE("exported-log-rate-limit-burst", l, v, parse_boolean, u->exported_log_ratelimit_burst))
+ continue;
+
+ else if (MATCH_DESERIALIZE_IMMEDIATE("cpu-usage-base", l, v, safe_atou64, u->cpu_usage_base) ||
+ MATCH_DESERIALIZE_IMMEDIATE("cpuacct-usage-base", l, v, safe_atou64, u->cpu_usage_base))
+ continue;
+
+ else if (MATCH_DESERIALIZE_IMMEDIATE("cpu-usage-last", l, v, safe_atou64, u->cpu_usage_last))
+ continue;
+
+ else if (MATCH_DESERIALIZE_IMMEDIATE("managed-oom-kill-last", l, v, safe_atou64, u->managed_oom_kill_last))
+ continue;
+
+ else if (MATCH_DESERIALIZE_IMMEDIATE("oom-kill-last", l, v, safe_atou64, u->oom_kill_last))
+ continue;
+
+ else if (streq(l, "cgroup")) {
+ r = unit_set_cgroup_path(u, v);
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to set cgroup path %s, ignoring: %m", v);
+
+ (void) unit_watch_cgroup(u);
+ (void) unit_watch_cgroup_memory(u);
+
+ continue;
+
+ } else if (MATCH_DESERIALIZE("cgroup-realized", l, v, parse_boolean, u->cgroup_realized))
+ continue;
+
+ else if (MATCH_DESERIALIZE_IMMEDIATE("cgroup-realized-mask", l, v, cg_mask_from_string, u->cgroup_realized_mask))
+ continue;
+
+ else if (MATCH_DESERIALIZE_IMMEDIATE("cgroup-enabled-mask", l, v, cg_mask_from_string, u->cgroup_enabled_mask))
+ continue;
+
+ else if (MATCH_DESERIALIZE_IMMEDIATE("cgroup-invalidated-mask", l, v, cg_mask_from_string, u->cgroup_invalidated_mask))
+ continue;
+
+ else if (STR_IN_SET(l, "ipv4-socket-bind-bpf-link-fd", "ipv6-socket-bind-bpf-link-fd")) {
+ int fd;
+
+ if (safe_atoi(v, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd))
+ log_unit_debug(u, "Failed to parse %s value: %s, ignoring.", l, v);
+ else {
+ if (fdset_remove(fds, fd) < 0) {
+ log_unit_debug(u, "Failed to remove %s value=%d from fdset", l, fd);
+ continue;
+ }
+
+ (void) bpf_socket_bind_add_initial_link_fd(u, fd);
+ }
+ continue;
+
+ } else if (streq(l, "ip-bpf-ingress-installed")) {
+ (void) bpf_program_deserialize_attachment(v, fds, &u->ip_bpf_ingress_installed);
+ continue;
+ } else if (streq(l, "ip-bpf-egress-installed")) {
+ (void) bpf_program_deserialize_attachment(v, fds, &u->ip_bpf_egress_installed);
+ continue;
+ } else if (streq(l, "bpf-device-control-installed")) {
+ (void) bpf_program_deserialize_attachment(v, fds, &u->bpf_device_control_installed);
+ continue;
+
+ } else if (streq(l, "ip-bpf-custom-ingress-installed")) {
+ (void) bpf_program_deserialize_attachment_set(v, fds, &u->ip_bpf_custom_ingress_installed);
+ continue;
+ } else if (streq(l, "ip-bpf-custom-egress-installed")) {
+ (void) bpf_program_deserialize_attachment_set(v, fds, &u->ip_bpf_custom_egress_installed);
+ continue;
+
+ } else if (streq(l, "restrict-ifaces-bpf-fd")) {
+ int fd;
+
+ if (safe_atoi(v, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd)) {
+ log_unit_debug(u, "Failed to parse restrict-ifaces-bpf-fd value: %s", v);
+ continue;
+ }
+ if (fdset_remove(fds, fd) < 0) {
+ log_unit_debug(u, "Failed to remove restrict-ifaces-bpf-fd %d from fdset", fd);
+ continue;
+ }
+
+ (void) restrict_network_interfaces_add_initial_link_fd(u, fd);
+ continue;
+
+ } else if (streq(l, "ref-uid")) {
+ uid_t uid;
+
+ r = parse_uid(v, &uid);
+ if (r < 0)
+ log_unit_debug(u, "Failed to parse \"%s=%s\", ignoring.", l, v);
+ else
+ unit_ref_uid_gid(u, uid, GID_INVALID);
+ continue;
+
+ } else if (streq(l, "ref-gid")) {
+ gid_t gid;
+
+ r = parse_gid(v, &gid);
+ if (r < 0)
+ log_unit_debug(u, "Failed to parse \"%s=%s\", ignoring.", l, v);
+ else
+ unit_ref_uid_gid(u, UID_INVALID, gid);
+ continue;
+
+ } else if (streq(l, "ref")) {
+ r = strv_extend(&u->deserialized_refs, v);
+ if (r < 0)
+ return log_oom();
+ continue;
+
+ } else if (streq(l, "invocation-id")) {
+ sd_id128_t id;
+
+ r = sd_id128_from_string(v, &id);
+ if (r < 0)
+ log_unit_debug(u, "Failed to parse \"%s=%s\", ignoring.", l, v);
+ else {
+ r = unit_set_invocation_id(u, id);
+ if (r < 0)
+ log_unit_warning_errno(u, r, "Failed to set invocation ID for unit: %m");
+ }
+
+ continue;
+
+ } else if (MATCH_DESERIALIZE("freezer-state", l, v, freezer_state_from_string, u->freezer_state))
+ continue;
+
+ else if (streq(l, "markers")) {
+ r = deserialize_markers(u, v);
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to deserialize \"%s=%s\", ignoring: %m", l, v);
+ continue;
+ }
+
+ /* Check if this is an IP accounting metric serialization field */
+ m = string_table_lookup(ip_accounting_metric_field, ELEMENTSOF(ip_accounting_metric_field), l);
+ if (m >= 0) {
+ uint64_t c;
+
+ r = safe_atou64(v, &c);
+ if (r < 0)
+ log_unit_debug(u, "Failed to parse IP accounting value %s, ignoring.", v);
+ else
+ u->ip_accounting_extra[m] = c;
+ continue;
+ }
+
+ m = string_table_lookup(io_accounting_metric_field_base, ELEMENTSOF(io_accounting_metric_field_base), l);
+ if (m >= 0) {
+ uint64_t c;
+
+ r = safe_atou64(v, &c);
+ if (r < 0)
+ log_unit_debug(u, "Failed to parse IO accounting base value %s, ignoring.", v);
+ else
+ u->io_accounting_base[m] = c;
+ continue;
+ }
+
+ m = string_table_lookup(io_accounting_metric_field_last, ELEMENTSOF(io_accounting_metric_field_last), l);
+ if (m >= 0) {
+ uint64_t c;
+
+ r = safe_atou64(v, &c);
+ if (r < 0)
+ log_unit_debug(u, "Failed to parse IO accounting last value %s, ignoring.", v);
+ else
+ u->io_accounting_last[m] = c;
+ continue;
+ }
+
+ r = exec_runtime_deserialize_compat(u, l, v, fds);
+ if (r < 0) {
+ log_unit_warning(u, "Failed to deserialize runtime parameter '%s', ignoring.", l);
+ continue;
+ } else if (r > 0)
+ /* Returns positive if key was handled by the call */
+ continue;
+
+ if (UNIT_VTABLE(u)->deserialize_item) {
+ r = UNIT_VTABLE(u)->deserialize_item(u, l, v, fds);
+ if (r < 0)
+ log_unit_warning(u, "Failed to deserialize unit parameter '%s', ignoring.", l);
+ }
+ }
+
+ /* Versions before 228 did not carry a state change timestamp. In this case, take the current
+ * time. This is useful, so that timeouts based on this timestamp don't trigger too early, and is
+ * in-line with the logic from before 228 where the base for timeouts was not persistent across
+ * reboots. */
+
+ if (!dual_timestamp_is_set(&u->state_change_timestamp))
+ dual_timestamp_get(&u->state_change_timestamp);
+
+ /* Let's make sure that everything that is deserialized also gets any potential new cgroup settings
+ * applied after we are done. For that we invalidate anything already realized, so that we can
+ * realize it again. */
+ if (u->cgroup_realized) {
+ unit_invalidate_cgroup(u, _CGROUP_MASK_ALL);
+ unit_invalidate_cgroup_bpf(u);
+ }
+
+ return 0;
+}
+
+int unit_deserialize_skip(FILE *f) {
+ int r;
+ assert(f);
+
+ /* Skip serialized data for this unit. We don't know what it is. */
+
+ for (;;) {
+ _cleanup_free_ char *line = NULL;
+ char *l;
+
+ r = read_line(f, LONG_LINE_MAX, &line);
+ if (r < 0)
+ return log_error_errno(r, "Failed to read serialization line: %m");
+ if (r == 0)
+ return 0;
+
+ l = strstrip(line);
+
+ /* End marker */
+ if (isempty(l))
+ return 1;
+ }
+}
+
+static void print_unit_dependency_mask(FILE *f, const char *kind, UnitDependencyMask mask, bool *space) {
+ const struct {
+ UnitDependencyMask mask;
+ const char *name;
+ } table[] = {
+ { UNIT_DEPENDENCY_FILE, "file" },
+ { UNIT_DEPENDENCY_IMPLICIT, "implicit" },
+ { UNIT_DEPENDENCY_DEFAULT, "default" },
+ { UNIT_DEPENDENCY_UDEV, "udev" },
+ { UNIT_DEPENDENCY_PATH, "path" },
+ { UNIT_DEPENDENCY_MOUNT_FILE, "mount-file" },
+ { UNIT_DEPENDENCY_MOUNTINFO, "mountinfo" },
+ { UNIT_DEPENDENCY_PROC_SWAP, "proc-swap" },
+ { UNIT_DEPENDENCY_SLICE_PROPERTY, "slice-property" },
+ };
+
+ assert(f);
+ assert(kind);
+ assert(space);
+
+ for (size_t i = 0; i < ELEMENTSOF(table); i++) {
+
+ if (mask == 0)
+ break;
+
+ if (FLAGS_SET(mask, table[i].mask)) {
+ if (*space)
+ fputc(' ', f);
+ else
+ *space = true;
+
+ fputs(kind, f);
+ fputs("-", f);
+ fputs(table[i].name, f);
+
+ mask &= ~table[i].mask;
+ }
+ }
+
+ assert(mask == 0);
+}
+
+void unit_dump(Unit *u, FILE *f, const char *prefix) {
+ char *t;
+ const char *prefix2;
+ Unit *following;
+ _cleanup_set_free_ Set *following_set = NULL;
+ CGroupMask m;
+ int r;
+
+ assert(u);
+ assert(u->type >= 0);
+
+ prefix = strempty(prefix);
+ prefix2 = strjoina(prefix, "\t");
+
+ fprintf(f,
+ "%s-> Unit %s:\n",
+ prefix, u->id);
+
+ SET_FOREACH(t, u->aliases)
+ fprintf(f, "%s\tAlias: %s\n", prefix, t);
+
+ fprintf(f,
+ "%s\tDescription: %s\n"
+ "%s\tInstance: %s\n"
+ "%s\tUnit Load State: %s\n"
+ "%s\tUnit Active State: %s\n"
+ "%s\tState Change Timestamp: %s\n"
+ "%s\tInactive Exit Timestamp: %s\n"
+ "%s\tActive Enter Timestamp: %s\n"
+ "%s\tActive Exit Timestamp: %s\n"
+ "%s\tInactive Enter Timestamp: %s\n"
+ "%s\tMay GC: %s\n"
+ "%s\tNeed Daemon Reload: %s\n"
+ "%s\tTransient: %s\n"
+ "%s\tPerpetual: %s\n"
+ "%s\tGarbage Collection Mode: %s\n",
+ prefix, unit_description(u),
+ prefix, strna(u->instance),
+ prefix, unit_load_state_to_string(u->load_state),
+ prefix, unit_active_state_to_string(unit_active_state(u)),
+ prefix, strna(FORMAT_TIMESTAMP(u->state_change_timestamp.realtime)),
+ prefix, strna(FORMAT_TIMESTAMP(u->inactive_exit_timestamp.realtime)),
+ prefix, strna(FORMAT_TIMESTAMP(u->active_enter_timestamp.realtime)),
+ prefix, strna(FORMAT_TIMESTAMP(u->active_exit_timestamp.realtime)),
+ prefix, strna(FORMAT_TIMESTAMP(u->inactive_enter_timestamp.realtime)),
+ prefix, yes_no(unit_may_gc(u)),
+ prefix, yes_no(unit_need_daemon_reload(u)),
+ prefix, yes_no(u->transient),
+ prefix, yes_no(u->perpetual),
+ prefix, collect_mode_to_string(u->collect_mode));
+
+ if (u->markers != 0) {
+ fprintf(f, "%s\tMarkers:", prefix);
+
+ for (UnitMarker marker = 0; marker < _UNIT_MARKER_MAX; marker++)
+ if (FLAGS_SET(u->markers, 1u << marker))
+ fprintf(f, " %s", unit_marker_to_string(marker));
+ fputs("\n", f);
+ }
+
+ if (UNIT_HAS_CGROUP_CONTEXT(u)) {
+ fprintf(f,
+ "%s\tSlice: %s\n"
+ "%s\tCGroup: %s\n"
+ "%s\tCGroup realized: %s\n",
+ prefix, strna(unit_slice_name(u)),
+ prefix, strna(u->cgroup_path),
+ prefix, yes_no(u->cgroup_realized));
+
+ if (u->cgroup_realized_mask != 0) {
+ _cleanup_free_ char *s = NULL;
+ (void) cg_mask_to_string(u->cgroup_realized_mask, &s);
+ fprintf(f, "%s\tCGroup realized mask: %s\n", prefix, strnull(s));
+ }
+
+ if (u->cgroup_enabled_mask != 0) {
+ _cleanup_free_ char *s = NULL;
+ (void) cg_mask_to_string(u->cgroup_enabled_mask, &s);
+ fprintf(f, "%s\tCGroup enabled mask: %s\n", prefix, strnull(s));
+ }
+
+ m = unit_get_own_mask(u);
+ if (m != 0) {
+ _cleanup_free_ char *s = NULL;
+ (void) cg_mask_to_string(m, &s);
+ fprintf(f, "%s\tCGroup own mask: %s\n", prefix, strnull(s));
+ }
+
+ m = unit_get_members_mask(u);
+ if (m != 0) {
+ _cleanup_free_ char *s = NULL;
+ (void) cg_mask_to_string(m, &s);
+ fprintf(f, "%s\tCGroup members mask: %s\n", prefix, strnull(s));
+ }
+
+ m = unit_get_delegate_mask(u);
+ if (m != 0) {
+ _cleanup_free_ char *s = NULL;
+ (void) cg_mask_to_string(m, &s);
+ fprintf(f, "%s\tCGroup delegate mask: %s\n", prefix, strnull(s));
+ }
+ }
+
+ if (!sd_id128_is_null(u->invocation_id))
+ fprintf(f, "%s\tInvocation ID: " SD_ID128_FORMAT_STR "\n",
+ prefix, SD_ID128_FORMAT_VAL(u->invocation_id));
+
+ STRV_FOREACH(j, u->documentation)
+ fprintf(f, "%s\tDocumentation: %s\n", prefix, *j);
+
+ if (u->access_selinux_context)
+ fprintf(f, "%s\tAccess SELinux Context: %s\n", prefix, u->access_selinux_context);
+
+ following = unit_following(u);
+ if (following)
+ fprintf(f, "%s\tFollowing: %s\n", prefix, following->id);
+
+ r = unit_following_set(u, &following_set);
+ if (r >= 0) {
+ Unit *other;
+
+ SET_FOREACH(other, following_set)
+ fprintf(f, "%s\tFollowing Set Member: %s\n", prefix, other->id);
+ }
+
+ if (u->fragment_path)
+ fprintf(f, "%s\tFragment Path: %s\n", prefix, u->fragment_path);
+
+ if (u->source_path)
+ fprintf(f, "%s\tSource Path: %s\n", prefix, u->source_path);
+
+ STRV_FOREACH(j, u->dropin_paths)
+ fprintf(f, "%s\tDropIn Path: %s\n", prefix, *j);
+
+ if (u->failure_action != EMERGENCY_ACTION_NONE)
+ fprintf(f, "%s\tFailure Action: %s\n", prefix, emergency_action_to_string(u->failure_action));
+ if (u->failure_action_exit_status >= 0)
+ fprintf(f, "%s\tFailure Action Exit Status: %i\n", prefix, u->failure_action_exit_status);
+ if (u->success_action != EMERGENCY_ACTION_NONE)
+ fprintf(f, "%s\tSuccess Action: %s\n", prefix, emergency_action_to_string(u->success_action));
+ if (u->success_action_exit_status >= 0)
+ fprintf(f, "%s\tSuccess Action Exit Status: %i\n", prefix, u->success_action_exit_status);
+
+ if (u->job_timeout != USEC_INFINITY)
+ fprintf(f, "%s\tJob Timeout: %s\n", prefix, FORMAT_TIMESPAN(u->job_timeout, 0));
+
+ if (u->job_timeout_action != EMERGENCY_ACTION_NONE)
+ fprintf(f, "%s\tJob Timeout Action: %s\n", prefix, emergency_action_to_string(u->job_timeout_action));
+
+ if (u->job_timeout_reboot_arg)
+ fprintf(f, "%s\tJob Timeout Reboot Argument: %s\n", prefix, u->job_timeout_reboot_arg);
+
+ condition_dump_list(u->conditions, f, prefix, condition_type_to_string);
+ condition_dump_list(u->asserts, f, prefix, assert_type_to_string);
+
+ if (dual_timestamp_is_set(&u->condition_timestamp))
+ fprintf(f,
+ "%s\tCondition Timestamp: %s\n"
+ "%s\tCondition Result: %s\n",
+ prefix, strna(FORMAT_TIMESTAMP(u->condition_timestamp.realtime)),
+ prefix, yes_no(u->condition_result));
+
+ if (dual_timestamp_is_set(&u->assert_timestamp))
+ fprintf(f,
+ "%s\tAssert Timestamp: %s\n"
+ "%s\tAssert Result: %s\n",
+ prefix, strna(FORMAT_TIMESTAMP(u->assert_timestamp.realtime)),
+ prefix, yes_no(u->assert_result));
+
+ for (UnitDependency d = 0; d < _UNIT_DEPENDENCY_MAX; d++) {
+ UnitDependencyInfo di;
+ Unit *other;
+
+ HASHMAP_FOREACH_KEY(di.data, other, unit_get_dependencies(u, d)) {
+ bool space = false;
+
+ fprintf(f, "%s\t%s: %s (", prefix, unit_dependency_to_string(d), other->id);
+
+ print_unit_dependency_mask(f, "origin", di.origin_mask, &space);
+ print_unit_dependency_mask(f, "destination", di.destination_mask, &space);
+
+ fputs(")\n", f);
+ }
+ }
+
+ if (!hashmap_isempty(u->requires_mounts_for)) {
+ UnitDependencyInfo di;
+ const char *path;
+
+ HASHMAP_FOREACH_KEY(di.data, path, u->requires_mounts_for) {
+ bool space = false;
+
+ fprintf(f, "%s\tRequiresMountsFor: %s (", prefix, path);
+
+ print_unit_dependency_mask(f, "origin", di.origin_mask, &space);
+ print_unit_dependency_mask(f, "destination", di.destination_mask, &space);
+
+ fputs(")\n", f);
+ }
+ }
+
+ if (u->load_state == UNIT_LOADED) {
+
+ fprintf(f,
+ "%s\tStopWhenUnneeded: %s\n"
+ "%s\tRefuseManualStart: %s\n"
+ "%s\tRefuseManualStop: %s\n"
+ "%s\tDefaultDependencies: %s\n"
+ "%s\tOnSuccessJobMode: %s\n"
+ "%s\tOnFailureJobMode: %s\n"
+ "%s\tIgnoreOnIsolate: %s\n",
+ prefix, yes_no(u->stop_when_unneeded),
+ prefix, yes_no(u->refuse_manual_start),
+ prefix, yes_no(u->refuse_manual_stop),
+ prefix, yes_no(u->default_dependencies),
+ prefix, job_mode_to_string(u->on_success_job_mode),
+ prefix, job_mode_to_string(u->on_failure_job_mode),
+ prefix, yes_no(u->ignore_on_isolate));
+
+ if (UNIT_VTABLE(u)->dump)
+ UNIT_VTABLE(u)->dump(u, f, prefix2);
+
+ } else if (u->load_state == UNIT_MERGED)
+ fprintf(f,
+ "%s\tMerged into: %s\n",
+ prefix, u->merged_into->id);
+ else if (u->load_state == UNIT_ERROR) {
+ errno = abs(u->load_error);
+ fprintf(f, "%s\tLoad Error Code: %m\n", prefix);
+ }
+
+ for (const char *n = sd_bus_track_first(u->bus_track); n; n = sd_bus_track_next(u->bus_track))
+ fprintf(f, "%s\tBus Ref: %s\n", prefix, n);
+
+ if (u->job)
+ job_dump(u->job, f, prefix2);
+
+ if (u->nop_job)
+ job_dump(u->nop_job, f, prefix2);
+}
diff --git a/src/core/unit-serialize.h b/src/core/unit-serialize.h
new file mode 100644
index 0000000..599d883
--- /dev/null
+++ b/src/core/unit-serialize.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdio.h>
+
+#include "unit.h"
+#include "fdset.h"
+
+int unit_serialize(Unit *u, FILE *f, FDSet *fds, bool serialize_jobs);
+int unit_deserialize(Unit *u, FILE *f, FDSet *fds);
+int unit_deserialize_skip(FILE *f);
+
+void unit_dump(Unit *u, FILE *f, const char *prefix);
diff --git a/src/core/unit.c b/src/core/unit.c
new file mode 100644
index 0000000..52df7ce
--- /dev/null
+++ b/src/core/unit.c
@@ -0,0 +1,6111 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/prctl.h>
+#include <unistd.h>
+
+#include "sd-id128.h"
+#include "sd-messages.h"
+
+#include "all-units.h"
+#include "alloc-util.h"
+#include "bpf-firewall.h"
+#include "bpf-foreign.h"
+#include "bpf-socket-bind.h"
+#include "bus-common-errors.h"
+#include "bus-util.h"
+#include "cgroup-setup.h"
+#include "cgroup-util.h"
+#include "chase-symlinks.h"
+#include "core-varlink.h"
+#include "dbus-unit.h"
+#include "dbus.h"
+#include "dropin.h"
+#include "escape.h"
+#include "execute.h"
+#include "fd-util.h"
+#include "fileio-label.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "id128-util.h"
+#include "install.h"
+#include "io-util.h"
+#include "label.h"
+#include "load-dropin.h"
+#include "load-fragment.h"
+#include "log.h"
+#include "macro.h"
+#include "missing_audit.h"
+#include "mkdir-label.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "rm-rf.h"
+#include "serialize.h"
+#include "set.h"
+#include "signal-util.h"
+#include "sparse-endian.h"
+#include "special.h"
+#include "specifier.h"
+#include "stat-util.h"
+#include "stdio-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "terminal-util.h"
+#include "tmpfile-util.h"
+#include "umask-util.h"
+#include "unit-name.h"
+#include "unit.h"
+#include "user-util.h"
+#include "virt.h"
+#if BPF_FRAMEWORK
+#include "bpf-link.h"
+#endif
+
+/* Thresholds for logging at INFO level about resource consumption */
+#define MENTIONWORTHY_CPU_NSEC (1 * NSEC_PER_SEC)
+#define MENTIONWORTHY_IO_BYTES (1024 * 1024ULL)
+#define MENTIONWORTHY_IP_BYTES (0ULL)
+
+/* Thresholds for logging at INFO level about resource consumption */
+#define NOTICEWORTHY_CPU_NSEC (10*60 * NSEC_PER_SEC) /* 10 minutes */
+#define NOTICEWORTHY_IO_BYTES (10 * 1024 * 1024ULL) /* 10 MB */
+#define NOTICEWORTHY_IP_BYTES (128 * 1024 * 1024ULL) /* 128 MB */
+
+const UnitVTable * const unit_vtable[_UNIT_TYPE_MAX] = {
+ [UNIT_SERVICE] = &service_vtable,
+ [UNIT_SOCKET] = &socket_vtable,
+ [UNIT_TARGET] = &target_vtable,
+ [UNIT_DEVICE] = &device_vtable,
+ [UNIT_MOUNT] = &mount_vtable,
+ [UNIT_AUTOMOUNT] = &automount_vtable,
+ [UNIT_SWAP] = &swap_vtable,
+ [UNIT_TIMER] = &timer_vtable,
+ [UNIT_PATH] = &path_vtable,
+ [UNIT_SLICE] = &slice_vtable,
+ [UNIT_SCOPE] = &scope_vtable,
+};
+
+Unit* unit_new(Manager *m, size_t size) {
+ Unit *u;
+
+ assert(m);
+ assert(size >= sizeof(Unit));
+
+ u = malloc0(size);
+ if (!u)
+ return NULL;
+
+ u->manager = m;
+ u->type = _UNIT_TYPE_INVALID;
+ u->default_dependencies = true;
+ u->unit_file_state = _UNIT_FILE_STATE_INVALID;
+ u->unit_file_preset = -1;
+ u->on_failure_job_mode = JOB_REPLACE;
+ u->on_success_job_mode = JOB_FAIL;
+ u->cgroup_control_inotify_wd = -1;
+ u->cgroup_memory_inotify_wd = -1;
+ u->job_timeout = USEC_INFINITY;
+ u->job_running_timeout = USEC_INFINITY;
+ u->ref_uid = UID_INVALID;
+ u->ref_gid = GID_INVALID;
+ u->cpu_usage_last = NSEC_INFINITY;
+ u->cgroup_invalidated_mask |= CGROUP_MASK_BPF_FIREWALL;
+ u->failure_action_exit_status = u->success_action_exit_status = -1;
+
+ u->ip_accounting_ingress_map_fd = -1;
+ u->ip_accounting_egress_map_fd = -1;
+ for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++)
+ u->io_accounting_last[i] = UINT64_MAX;
+
+ u->ipv4_allow_map_fd = -1;
+ u->ipv6_allow_map_fd = -1;
+ u->ipv4_deny_map_fd = -1;
+ u->ipv6_deny_map_fd = -1;
+
+ u->last_section_private = -1;
+
+ u->start_ratelimit = (RateLimit) { m->default_start_limit_interval, m->default_start_limit_burst };
+ u->auto_start_stop_ratelimit = (RateLimit) { 10 * USEC_PER_SEC, 16 };
+
+ return u;
+}
+
+int unit_new_for_name(Manager *m, size_t size, const char *name, Unit **ret) {
+ _cleanup_(unit_freep) Unit *u = NULL;
+ int r;
+
+ u = unit_new(m, size);
+ if (!u)
+ return -ENOMEM;
+
+ r = unit_add_name(u, name);
+ if (r < 0)
+ return r;
+
+ *ret = TAKE_PTR(u);
+
+ return r;
+}
+
+bool unit_has_name(const Unit *u, const char *name) {
+ assert(u);
+ assert(name);
+
+ return streq_ptr(name, u->id) ||
+ set_contains(u->aliases, name);
+}
+
+static void unit_init(Unit *u) {
+ CGroupContext *cc;
+ ExecContext *ec;
+ KillContext *kc;
+
+ assert(u);
+ assert(u->manager);
+ assert(u->type >= 0);
+
+ cc = unit_get_cgroup_context(u);
+ if (cc) {
+ cgroup_context_init(cc);
+
+ /* Copy in the manager defaults into the cgroup
+ * context, _before_ the rest of the settings have
+ * been initialized */
+
+ cc->cpu_accounting = u->manager->default_cpu_accounting;
+ cc->io_accounting = u->manager->default_io_accounting;
+ cc->blockio_accounting = u->manager->default_blockio_accounting;
+ cc->memory_accounting = u->manager->default_memory_accounting;
+ cc->tasks_accounting = u->manager->default_tasks_accounting;
+ cc->ip_accounting = u->manager->default_ip_accounting;
+
+ if (u->type != UNIT_SLICE)
+ cc->tasks_max = u->manager->default_tasks_max;
+ }
+
+ ec = unit_get_exec_context(u);
+ if (ec) {
+ exec_context_init(ec);
+
+ if (u->manager->default_oom_score_adjust_set) {
+ ec->oom_score_adjust = u->manager->default_oom_score_adjust;
+ ec->oom_score_adjust_set = true;
+ }
+
+ if (MANAGER_IS_SYSTEM(u->manager))
+ ec->keyring_mode = EXEC_KEYRING_SHARED;
+ else {
+ ec->keyring_mode = EXEC_KEYRING_INHERIT;
+
+ /* User manager might have its umask redefined by PAM or UMask=. In this
+ * case let the units it manages inherit this value by default. They can
+ * still tune this value through their own unit file */
+ (void) get_process_umask(getpid_cached(), &ec->umask);
+ }
+ }
+
+ kc = unit_get_kill_context(u);
+ if (kc)
+ kill_context_init(kc);
+
+ if (UNIT_VTABLE(u)->init)
+ UNIT_VTABLE(u)->init(u);
+}
+
+static int unit_add_alias(Unit *u, char *donated_name) {
+ int r;
+
+ /* Make sure that u->names is allocated. We may leave u->names
+ * empty if we fail later, but this is not a problem. */
+ r = set_ensure_put(&u->aliases, &string_hash_ops, donated_name);
+ if (r < 0)
+ return r;
+ assert(r > 0);
+
+ return 0;
+}
+
+int unit_add_name(Unit *u, const char *text) {
+ _cleanup_free_ char *name = NULL, *instance = NULL;
+ UnitType t;
+ int r;
+
+ assert(u);
+ assert(text);
+
+ if (unit_name_is_valid(text, UNIT_NAME_TEMPLATE)) {
+ if (!u->instance)
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL),
+ "instance is not set when adding name '%s': %m", text);
+
+ r = unit_name_replace_instance(text, u->instance, &name);
+ if (r < 0)
+ return log_unit_debug_errno(u, r,
+ "failed to build instance name from '%s': %m", text);
+ } else {
+ name = strdup(text);
+ if (!name)
+ return -ENOMEM;
+ }
+
+ if (unit_has_name(u, name))
+ return 0;
+
+ if (hashmap_contains(u->manager->units, name))
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EEXIST),
+ "unit already exist when adding name '%s': %m", name);
+
+ if (!unit_name_is_valid(name, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL),
+ "name '%s' is invalid: %m", name);
+
+ t = unit_name_to_type(name);
+ if (t < 0)
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL),
+ "failed to derive unit type from name '%s': %m", name);
+
+ if (u->type != _UNIT_TYPE_INVALID && t != u->type)
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL),
+ "unit type is illegal: u->type(%d) and t(%d) for name '%s': %m",
+ u->type, t, name);
+
+ r = unit_name_to_instance(name, &instance);
+ if (r < 0)
+ return log_unit_debug_errno(u, r, "failed to extract instance from name '%s': %m", name);
+
+ if (instance && !unit_type_may_template(t))
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL), "templates are not allowed for name '%s': %m", name);
+
+ /* Ensure that this unit either has no instance, or that the instance matches. */
+ if (u->type != _UNIT_TYPE_INVALID && !streq_ptr(u->instance, instance))
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL),
+ "cannot add name %s, the instances don't match (\"%s\" != \"%s\").",
+ name, instance, u->instance);
+
+ if (u->id && !unit_type_may_alias(t))
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EEXIST),
+ "cannot add name %s, aliases are not allowed for %s units.",
+ name, unit_type_to_string(t));
+
+ if (hashmap_size(u->manager->units) >= MANAGER_MAX_NAMES)
+ return log_unit_warning_errno(u, SYNTHETIC_ERRNO(E2BIG), "cannot add name, manager has too many units: %m");
+
+ /* Add name to the global hashmap first, because that's easier to undo */
+ r = hashmap_put(u->manager->units, name, u);
+ if (r < 0)
+ return log_unit_debug_errno(u, r, "add unit to hashmap failed for name '%s': %m", text);
+
+ if (u->id) {
+ r = unit_add_alias(u, name); /* unit_add_alias() takes ownership of the name on success */
+ if (r < 0) {
+ hashmap_remove(u->manager->units, name);
+ return r;
+ }
+ TAKE_PTR(name);
+
+ } else {
+ /* A new name, we don't need the set yet. */
+ assert(u->type == _UNIT_TYPE_INVALID);
+ assert(!u->instance);
+
+ u->type = t;
+ u->id = TAKE_PTR(name);
+ u->instance = TAKE_PTR(instance);
+
+ LIST_PREPEND(units_by_type, u->manager->units_by_type[t], u);
+ unit_init(u);
+ }
+
+ unit_add_to_dbus_queue(u);
+ return 0;
+}
+
+int unit_choose_id(Unit *u, const char *name) {
+ _cleanup_free_ char *t = NULL;
+ char *s;
+ int r;
+
+ assert(u);
+ assert(name);
+
+ if (unit_name_is_valid(name, UNIT_NAME_TEMPLATE)) {
+ if (!u->instance)
+ return -EINVAL;
+
+ r = unit_name_replace_instance(name, u->instance, &t);
+ if (r < 0)
+ return r;
+
+ name = t;
+ }
+
+ if (streq_ptr(u->id, name))
+ return 0; /* Nothing to do. */
+
+ /* Selects one of the aliases of this unit as the id */
+ s = set_get(u->aliases, (char*) name);
+ if (!s)
+ return -ENOENT;
+
+ if (u->id) {
+ r = set_remove_and_put(u->aliases, name, u->id);
+ if (r < 0)
+ return r;
+ } else
+ assert_se(set_remove(u->aliases, name)); /* see set_get() above… */
+
+ u->id = s; /* Old u->id is now stored in the set, and s is not stored anywhere */
+ unit_add_to_dbus_queue(u);
+
+ return 0;
+}
+
+int unit_set_description(Unit *u, const char *description) {
+ int r;
+
+ assert(u);
+
+ r = free_and_strdup(&u->description, empty_to_null(description));
+ if (r < 0)
+ return r;
+ if (r > 0)
+ unit_add_to_dbus_queue(u);
+
+ return 0;
+}
+
+static bool unit_success_failure_handler_has_jobs(Unit *unit) {
+ Unit *other;
+
+ UNIT_FOREACH_DEPENDENCY(other, unit, UNIT_ATOM_ON_SUCCESS)
+ if (other->job || other->nop_job)
+ return true;
+
+ UNIT_FOREACH_DEPENDENCY(other, unit, UNIT_ATOM_ON_FAILURE)
+ if (other->job || other->nop_job)
+ return true;
+
+ return false;
+}
+
+bool unit_may_gc(Unit *u) {
+ UnitActiveState state;
+ int r;
+
+ assert(u);
+
+ /* Checks whether the unit is ready to be unloaded for garbage collection.
+ * Returns true when the unit may be collected, and false if there's some
+ * reason to keep it loaded.
+ *
+ * References from other units are *not* checked here. Instead, this is done
+ * in unit_gc_sweep(), but using markers to properly collect dependency loops.
+ */
+
+ if (u->job || u->nop_job)
+ return false;
+
+ state = unit_active_state(u);
+
+ /* If the unit is inactive and failed and no job is queued for it, then release its runtime resources */
+ if (UNIT_IS_INACTIVE_OR_FAILED(state) &&
+ UNIT_VTABLE(u)->release_resources)
+ UNIT_VTABLE(u)->release_resources(u);
+
+ if (u->perpetual)
+ return false;
+
+ if (sd_bus_track_count(u->bus_track) > 0)
+ return false;
+
+ /* But we keep the unit object around for longer when it is referenced or configured to not be gc'ed */
+ switch (u->collect_mode) {
+
+ case COLLECT_INACTIVE:
+ if (state != UNIT_INACTIVE)
+ return false;
+
+ break;
+
+ case COLLECT_INACTIVE_OR_FAILED:
+ if (!IN_SET(state, UNIT_INACTIVE, UNIT_FAILED))
+ return false;
+
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ /* Check if any OnFailure= or on Success= jobs may be pending */
+ if (unit_success_failure_handler_has_jobs(u))
+ return false;
+
+ if (u->cgroup_path) {
+ /* If the unit has a cgroup, then check whether there's anything in it. If so, we should stay
+ * around. Units with active processes should never be collected. */
+
+ r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", empty_to_root(u->cgroup_path));
+ if (r <= 0)
+ return false;
+ }
+
+ if (UNIT_VTABLE(u)->may_gc && !UNIT_VTABLE(u)->may_gc(u))
+ return false;
+
+ return true;
+}
+
+void unit_add_to_load_queue(Unit *u) {
+ assert(u);
+ assert(u->type != _UNIT_TYPE_INVALID);
+
+ if (u->load_state != UNIT_STUB || u->in_load_queue)
+ return;
+
+ LIST_PREPEND(load_queue, u->manager->load_queue, u);
+ u->in_load_queue = true;
+}
+
+void unit_add_to_cleanup_queue(Unit *u) {
+ assert(u);
+
+ if (u->in_cleanup_queue)
+ return;
+
+ LIST_PREPEND(cleanup_queue, u->manager->cleanup_queue, u);
+ u->in_cleanup_queue = true;
+}
+
+void unit_add_to_gc_queue(Unit *u) {
+ assert(u);
+
+ if (u->in_gc_queue || u->in_cleanup_queue)
+ return;
+
+ if (!unit_may_gc(u))
+ return;
+
+ LIST_PREPEND(gc_queue, u->manager->gc_unit_queue, u);
+ u->in_gc_queue = true;
+}
+
+void unit_add_to_dbus_queue(Unit *u) {
+ assert(u);
+ assert(u->type != _UNIT_TYPE_INVALID);
+
+ if (u->load_state == UNIT_STUB || u->in_dbus_queue)
+ return;
+
+ /* Shortcut things if nobody cares */
+ if (sd_bus_track_count(u->manager->subscribed) <= 0 &&
+ sd_bus_track_count(u->bus_track) <= 0 &&
+ set_isempty(u->manager->private_buses)) {
+ u->sent_dbus_new_signal = true;
+ return;
+ }
+
+ LIST_PREPEND(dbus_queue, u->manager->dbus_unit_queue, u);
+ u->in_dbus_queue = true;
+}
+
+void unit_submit_to_stop_when_unneeded_queue(Unit *u) {
+ assert(u);
+
+ if (u->in_stop_when_unneeded_queue)
+ return;
+
+ if (!u->stop_when_unneeded)
+ return;
+
+ if (!UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u)))
+ return;
+
+ LIST_PREPEND(stop_when_unneeded_queue, u->manager->stop_when_unneeded_queue, u);
+ u->in_stop_when_unneeded_queue = true;
+}
+
+void unit_submit_to_start_when_upheld_queue(Unit *u) {
+ assert(u);
+
+ if (u->in_start_when_upheld_queue)
+ return;
+
+ if (!UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u)))
+ return;
+
+ if (!unit_has_dependency(u, UNIT_ATOM_START_STEADILY, NULL))
+ return;
+
+ LIST_PREPEND(start_when_upheld_queue, u->manager->start_when_upheld_queue, u);
+ u->in_start_when_upheld_queue = true;
+}
+
+void unit_submit_to_stop_when_bound_queue(Unit *u) {
+ assert(u);
+
+ if (u->in_stop_when_bound_queue)
+ return;
+
+ if (!UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u)))
+ return;
+
+ if (!unit_has_dependency(u, UNIT_ATOM_CANNOT_BE_ACTIVE_WITHOUT, NULL))
+ return;
+
+ LIST_PREPEND(stop_when_bound_queue, u->manager->stop_when_bound_queue, u);
+ u->in_stop_when_bound_queue = true;
+}
+
+static void unit_clear_dependencies(Unit *u) {
+ assert(u);
+
+ /* Removes all dependencies configured on u and their reverse dependencies. */
+
+ for (Hashmap *deps; (deps = hashmap_steal_first(u->dependencies));) {
+
+ for (Unit *other; (other = hashmap_steal_first_key(deps));) {
+ Hashmap *other_deps;
+
+ HASHMAP_FOREACH(other_deps, other->dependencies)
+ hashmap_remove(other_deps, u);
+
+ unit_add_to_gc_queue(other);
+ }
+
+ hashmap_free(deps);
+ }
+
+ u->dependencies = hashmap_free(u->dependencies);
+}
+
+static void unit_remove_transient(Unit *u) {
+ assert(u);
+
+ if (!u->transient)
+ return;
+
+ if (u->fragment_path)
+ (void) unlink(u->fragment_path);
+
+ STRV_FOREACH(i, u->dropin_paths) {
+ _cleanup_free_ char *p = NULL, *pp = NULL;
+
+ if (path_extract_directory(*i, &p) < 0) /* Get the drop-in directory from the drop-in file */
+ continue;
+
+ if (path_extract_directory(p, &pp) < 0) /* Get the config directory from the drop-in directory */
+ continue;
+
+ /* Only drop transient drop-ins */
+ if (!path_equal(u->manager->lookup_paths.transient, pp))
+ continue;
+
+ (void) unlink(*i);
+ (void) rmdir(p);
+ }
+}
+
+static void unit_free_requires_mounts_for(Unit *u) {
+ assert(u);
+
+ for (;;) {
+ _cleanup_free_ char *path = NULL;
+
+ path = hashmap_steal_first_key(u->requires_mounts_for);
+ if (!path)
+ break;
+ else {
+ char s[strlen(path) + 1];
+
+ PATH_FOREACH_PREFIX_MORE(s, path) {
+ char *y;
+ Set *x;
+
+ x = hashmap_get2(u->manager->units_requiring_mounts_for, s, (void**) &y);
+ if (!x)
+ continue;
+
+ (void) set_remove(x, u);
+
+ if (set_isempty(x)) {
+ (void) hashmap_remove(u->manager->units_requiring_mounts_for, y);
+ free(y);
+ set_free(x);
+ }
+ }
+ }
+ }
+
+ u->requires_mounts_for = hashmap_free(u->requires_mounts_for);
+}
+
+static void unit_done(Unit *u) {
+ ExecContext *ec;
+ CGroupContext *cc;
+
+ assert(u);
+
+ if (u->type < 0)
+ return;
+
+ if (UNIT_VTABLE(u)->done)
+ UNIT_VTABLE(u)->done(u);
+
+ ec = unit_get_exec_context(u);
+ if (ec)
+ exec_context_done(ec);
+
+ cc = unit_get_cgroup_context(u);
+ if (cc)
+ cgroup_context_done(cc);
+}
+
+Unit* unit_free(Unit *u) {
+ Unit *slice;
+ char *t;
+
+ if (!u)
+ return NULL;
+
+ sd_event_source_disable_unref(u->auto_start_stop_event_source);
+
+ u->transient_file = safe_fclose(u->transient_file);
+
+ if (!MANAGER_IS_RELOADING(u->manager))
+ unit_remove_transient(u);
+
+ bus_unit_send_removed_signal(u);
+
+ unit_done(u);
+
+ unit_dequeue_rewatch_pids(u);
+
+ u->match_bus_slot = sd_bus_slot_unref(u->match_bus_slot);
+ u->bus_track = sd_bus_track_unref(u->bus_track);
+ u->deserialized_refs = strv_free(u->deserialized_refs);
+ u->pending_freezer_message = sd_bus_message_unref(u->pending_freezer_message);
+
+ unit_free_requires_mounts_for(u);
+
+ SET_FOREACH(t, u->aliases)
+ hashmap_remove_value(u->manager->units, t, u);
+ if (u->id)
+ hashmap_remove_value(u->manager->units, u->id, u);
+
+ if (!sd_id128_is_null(u->invocation_id))
+ hashmap_remove_value(u->manager->units_by_invocation_id, &u->invocation_id, u);
+
+ if (u->job) {
+ Job *j = u->job;
+ job_uninstall(j);
+ job_free(j);
+ }
+
+ if (u->nop_job) {
+ Job *j = u->nop_job;
+ job_uninstall(j);
+ job_free(j);
+ }
+
+ /* A unit is being dropped from the tree, make sure our family is realized properly. Do this after we
+ * detach the unit from slice tree in order to eliminate its effect on controller masks. */
+ slice = UNIT_GET_SLICE(u);
+ unit_clear_dependencies(u);
+ if (slice)
+ unit_add_family_to_cgroup_realize_queue(slice);
+
+ if (u->on_console)
+ manager_unref_console(u->manager);
+
+
+ fdset_free(u->initial_socket_bind_link_fds);
+#if BPF_FRAMEWORK
+ bpf_link_free(u->ipv4_socket_bind_link);
+ bpf_link_free(u->ipv6_socket_bind_link);
+#endif
+
+ unit_release_cgroup(u);
+
+ if (!MANAGER_IS_RELOADING(u->manager))
+ unit_unlink_state_files(u);
+
+ unit_unref_uid_gid(u, false);
+
+ (void) manager_update_failed_units(u->manager, u, false);
+ set_remove(u->manager->startup_units, u);
+
+ unit_unwatch_all_pids(u);
+
+ while (u->refs_by_target)
+ unit_ref_unset(u->refs_by_target);
+
+ if (u->type != _UNIT_TYPE_INVALID)
+ LIST_REMOVE(units_by_type, u->manager->units_by_type[u->type], u);
+
+ if (u->in_load_queue)
+ LIST_REMOVE(load_queue, u->manager->load_queue, u);
+
+ if (u->in_dbus_queue)
+ LIST_REMOVE(dbus_queue, u->manager->dbus_unit_queue, u);
+
+ if (u->in_cleanup_queue)
+ LIST_REMOVE(cleanup_queue, u->manager->cleanup_queue, u);
+
+ if (u->in_gc_queue)
+ LIST_REMOVE(gc_queue, u->manager->gc_unit_queue, u);
+
+ if (u->in_cgroup_realize_queue)
+ LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
+
+ if (u->in_cgroup_empty_queue)
+ LIST_REMOVE(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
+
+ if (u->in_cgroup_oom_queue)
+ LIST_REMOVE(cgroup_oom_queue, u->manager->cgroup_oom_queue, u);
+
+ if (u->in_target_deps_queue)
+ LIST_REMOVE(target_deps_queue, u->manager->target_deps_queue, u);
+
+ if (u->in_stop_when_unneeded_queue)
+ LIST_REMOVE(stop_when_unneeded_queue, u->manager->stop_when_unneeded_queue, u);
+
+ if (u->in_start_when_upheld_queue)
+ LIST_REMOVE(start_when_upheld_queue, u->manager->start_when_upheld_queue, u);
+
+ if (u->in_stop_when_bound_queue)
+ LIST_REMOVE(stop_when_bound_queue, u->manager->stop_when_bound_queue, u);
+
+ bpf_firewall_close(u);
+
+ hashmap_free(u->bpf_foreign_by_key);
+
+ bpf_program_free(u->bpf_device_control_installed);
+
+#if BPF_FRAMEWORK
+ bpf_link_free(u->restrict_ifaces_ingress_bpf_link);
+ bpf_link_free(u->restrict_ifaces_egress_bpf_link);
+#endif
+ fdset_free(u->initial_restric_ifaces_link_fds);
+
+ condition_free_list(u->conditions);
+ condition_free_list(u->asserts);
+
+ free(u->description);
+ strv_free(u->documentation);
+ free(u->fragment_path);
+ free(u->source_path);
+ strv_free(u->dropin_paths);
+ free(u->instance);
+
+ free(u->job_timeout_reboot_arg);
+ free(u->reboot_arg);
+
+ free(u->access_selinux_context);
+
+ set_free_free(u->aliases);
+ free(u->id);
+
+ activation_details_unref(u->activation_details);
+
+ return mfree(u);
+}
+
+FreezerState unit_freezer_state(Unit *u) {
+ assert(u);
+
+ return u->freezer_state;
+}
+
+int unit_freezer_state_kernel(Unit *u, FreezerState *ret) {
+ char *values[1] = {};
+ int r;
+
+ assert(u);
+
+ r = cg_get_keyed_attribute(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events",
+ STRV_MAKE("frozen"), values);
+ if (r < 0)
+ return r;
+
+ r = _FREEZER_STATE_INVALID;
+
+ if (values[0]) {
+ if (streq(values[0], "0"))
+ r = FREEZER_RUNNING;
+ else if (streq(values[0], "1"))
+ r = FREEZER_FROZEN;
+ }
+
+ free(values[0]);
+ *ret = r;
+
+ return 0;
+}
+
+UnitActiveState unit_active_state(Unit *u) {
+ assert(u);
+
+ if (u->load_state == UNIT_MERGED)
+ return unit_active_state(unit_follow_merge(u));
+
+ /* After a reload it might happen that a unit is not correctly
+ * loaded but still has a process around. That's why we won't
+ * shortcut failed loading to UNIT_INACTIVE_FAILED. */
+
+ return UNIT_VTABLE(u)->active_state(u);
+}
+
+const char* unit_sub_state_to_string(Unit *u) {
+ assert(u);
+
+ return UNIT_VTABLE(u)->sub_state_to_string(u);
+}
+
+static int unit_merge_names(Unit *u, Unit *other) {
+ char *name;
+ int r;
+
+ assert(u);
+ assert(other);
+
+ r = unit_add_alias(u, other->id);
+ if (r < 0)
+ return r;
+
+ r = set_move(u->aliases, other->aliases);
+ if (r < 0) {
+ set_remove(u->aliases, other->id);
+ return r;
+ }
+
+ TAKE_PTR(other->id);
+ other->aliases = set_free_free(other->aliases);
+
+ SET_FOREACH(name, u->aliases)
+ assert_se(hashmap_replace(u->manager->units, name, u) == 0);
+
+ return 0;
+}
+
+static int unit_reserve_dependencies(Unit *u, Unit *other) {
+ size_t n_reserve;
+ Hashmap* deps;
+ void *d;
+ int r;
+
+ assert(u);
+ assert(other);
+
+ /* Let's reserve some space in the dependency hashmaps so that later on merging the units cannot
+ * fail.
+ *
+ * First make some room in the per dependency type hashmaps. Using the summed size of both units'
+ * hashmaps is an estimate that is likely too high since they probably use some of the same
+ * types. But it's never too low, and that's all we need. */
+
+ n_reserve = MIN(hashmap_size(other->dependencies), LESS_BY((size_t) _UNIT_DEPENDENCY_MAX, hashmap_size(u->dependencies)));
+ if (n_reserve > 0) {
+ r = hashmap_ensure_allocated(&u->dependencies, NULL);
+ if (r < 0)
+ return r;
+
+ r = hashmap_reserve(u->dependencies, n_reserve);
+ if (r < 0)
+ return r;
+ }
+
+ /* Now, enlarge our per dependency type hashmaps by the number of entries in the same hashmap of the
+ * other unit's dependencies.
+ *
+ * NB: If u does not have a dependency set allocated for some dependency type, there is no need to
+ * reserve anything for. In that case other's set will be transferred as a whole to u by
+ * complete_move(). */
+
+ HASHMAP_FOREACH_KEY(deps, d, u->dependencies) {
+ Hashmap *other_deps;
+
+ other_deps = hashmap_get(other->dependencies, d);
+
+ r = hashmap_reserve(deps, hashmap_size(other_deps));
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+static bool unit_should_warn_about_dependency(UnitDependency dependency) {
+ /* Only warn about some unit types */
+ return IN_SET(dependency,
+ UNIT_CONFLICTS,
+ UNIT_CONFLICTED_BY,
+ UNIT_BEFORE,
+ UNIT_AFTER,
+ UNIT_ON_SUCCESS,
+ UNIT_ON_FAILURE,
+ UNIT_TRIGGERS,
+ UNIT_TRIGGERED_BY);
+}
+
+static int unit_per_dependency_type_hashmap_update(
+ Hashmap *per_type,
+ Unit *other,
+ UnitDependencyMask origin_mask,
+ UnitDependencyMask destination_mask) {
+
+ UnitDependencyInfo info;
+ int r;
+
+ assert(other);
+ assert_cc(sizeof(void*) == sizeof(info));
+
+ /* Acquire the UnitDependencyInfo entry for the Unit* we are interested in, and update it if it
+ * exists, or insert it anew if not. */
+
+ info.data = hashmap_get(per_type, other);
+ if (info.data) {
+ /* Entry already exists. Add in our mask. */
+
+ if (FLAGS_SET(origin_mask, info.origin_mask) &&
+ FLAGS_SET(destination_mask, info.destination_mask))
+ return 0; /* NOP */
+
+ info.origin_mask |= origin_mask;
+ info.destination_mask |= destination_mask;
+
+ r = hashmap_update(per_type, other, info.data);
+ } else {
+ info = (UnitDependencyInfo) {
+ .origin_mask = origin_mask,
+ .destination_mask = destination_mask,
+ };
+
+ r = hashmap_put(per_type, other, info.data);
+ }
+ if (r < 0)
+ return r;
+
+ return 1;
+}
+
+static int unit_add_dependency_hashmap(
+ Hashmap **dependencies,
+ UnitDependency d,
+ Unit *other,
+ UnitDependencyMask origin_mask,
+ UnitDependencyMask destination_mask) {
+
+ Hashmap *per_type;
+ int r;
+
+ assert(dependencies);
+ assert(other);
+ assert(origin_mask < _UNIT_DEPENDENCY_MASK_FULL);
+ assert(destination_mask < _UNIT_DEPENDENCY_MASK_FULL);
+ assert(origin_mask > 0 || destination_mask > 0);
+
+ /* Ensure the top-level dependency hashmap exists that maps UnitDependency → Hashmap(Unit* →
+ * UnitDependencyInfo) */
+ r = hashmap_ensure_allocated(dependencies, NULL);
+ if (r < 0)
+ return r;
+
+ /* Acquire the inner hashmap, that maps Unit* → UnitDependencyInfo, for the specified dependency
+ * type, and if it's missing allocate it and insert it. */
+ per_type = hashmap_get(*dependencies, UNIT_DEPENDENCY_TO_PTR(d));
+ if (!per_type) {
+ per_type = hashmap_new(NULL);
+ if (!per_type)
+ return -ENOMEM;
+
+ r = hashmap_put(*dependencies, UNIT_DEPENDENCY_TO_PTR(d), per_type);
+ if (r < 0) {
+ hashmap_free(per_type);
+ return r;
+ }
+ }
+
+ return unit_per_dependency_type_hashmap_update(per_type, other, origin_mask, destination_mask);
+}
+
+static void unit_merge_dependencies(Unit *u, Unit *other) {
+ Hashmap *deps;
+ void *dt; /* Actually of type UnitDependency, except that we don't bother casting it here,
+ * since the hashmaps all want it as void pointer. */
+
+ assert(u);
+ assert(other);
+
+ if (u == other)
+ return;
+
+ /* First, remove dependency to other. */
+ HASHMAP_FOREACH_KEY(deps, dt, u->dependencies) {
+ if (hashmap_remove(deps, other) && unit_should_warn_about_dependency(UNIT_DEPENDENCY_FROM_PTR(dt)))
+ log_unit_warning(u, "Dependency %s=%s is dropped, as %s is merged into %s.",
+ unit_dependency_to_string(UNIT_DEPENDENCY_FROM_PTR(dt)),
+ other->id, other->id, u->id);
+
+ if (hashmap_isempty(deps))
+ hashmap_free(hashmap_remove(u->dependencies, dt));
+ }
+
+ for (;;) {
+ _cleanup_(hashmap_freep) Hashmap *other_deps = NULL;
+ UnitDependencyInfo di_back;
+ Unit *back;
+
+ /* Let's focus on one dependency type at a time, that 'other' has defined. */
+ other_deps = hashmap_steal_first_key_and_value(other->dependencies, &dt);
+ if (!other_deps)
+ break; /* done! */
+
+ deps = hashmap_get(u->dependencies, dt);
+
+ /* Now iterate through all dependencies of this dependency type, of 'other'. We refer to the
+ * referenced units as 'back'. */
+ HASHMAP_FOREACH_KEY(di_back.data, back, other_deps) {
+ Hashmap *back_deps;
+ void *back_dt;
+
+ if (back == u) {
+ /* This is a dependency pointing back to the unit we want to merge with?
+ * Suppress it (but warn) */
+ if (unit_should_warn_about_dependency(UNIT_DEPENDENCY_FROM_PTR(dt)))
+ log_unit_warning(u, "Dependency %s=%s in %s is dropped, as %s is merged into %s.",
+ unit_dependency_to_string(UNIT_DEPENDENCY_FROM_PTR(dt)),
+ u->id, other->id, other->id, u->id);
+
+ hashmap_remove(other_deps, back);
+ continue;
+ }
+
+ /* Now iterate through all deps of 'back', and fix the ones pointing to 'other' to
+ * point to 'u' instead. */
+ HASHMAP_FOREACH_KEY(back_deps, back_dt, back->dependencies) {
+ UnitDependencyInfo di_move;
+
+ di_move.data = hashmap_remove(back_deps, other);
+ if (!di_move.data)
+ continue;
+
+ assert_se(unit_per_dependency_type_hashmap_update(
+ back_deps,
+ u,
+ di_move.origin_mask,
+ di_move.destination_mask) >= 0);
+ }
+
+ /* The target unit already has dependencies of this type, let's then merge this individually. */
+ if (deps)
+ assert_se(unit_per_dependency_type_hashmap_update(
+ deps,
+ back,
+ di_back.origin_mask,
+ di_back.destination_mask) >= 0);
+ }
+
+ /* Now all references towards 'other' of the current type 'dt' are corrected to point to 'u'.
+ * Lets's now move the deps of type 'dt' from 'other' to 'u'. If the unit does not have
+ * dependencies of this type, let's move them per type wholesale. */
+ if (!deps)
+ assert_se(hashmap_put(u->dependencies, dt, TAKE_PTR(other_deps)) >= 0);
+ }
+
+ other->dependencies = hashmap_free(other->dependencies);
+}
+
+int unit_merge(Unit *u, Unit *other) {
+ int r;
+
+ assert(u);
+ assert(other);
+ assert(u->manager == other->manager);
+ assert(u->type != _UNIT_TYPE_INVALID);
+
+ other = unit_follow_merge(other);
+
+ if (other == u)
+ return 0;
+
+ if (u->type != other->type)
+ return -EINVAL;
+
+ if (!unit_type_may_alias(u->type)) /* Merging only applies to unit names that support aliases */
+ return -EEXIST;
+
+ if (!IN_SET(other->load_state, UNIT_STUB, UNIT_NOT_FOUND))
+ return -EEXIST;
+
+ if (!streq_ptr(u->instance, other->instance))
+ return -EINVAL;
+
+ if (other->job)
+ return -EEXIST;
+
+ if (other->nop_job)
+ return -EEXIST;
+
+ if (!UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(other)))
+ return -EEXIST;
+
+ /* Make reservations to ensure merge_dependencies() won't fail. We don't rollback reservations if we
+ * fail. We don't have a way to undo reservations. A reservation is not a leak. */
+ r = unit_reserve_dependencies(u, other);
+ if (r < 0)
+ return r;
+
+ /* Redirect all references */
+ while (other->refs_by_target)
+ unit_ref_set(other->refs_by_target, other->refs_by_target->source, u);
+
+ /* Merge dependencies */
+ unit_merge_dependencies(u, other);
+
+ /* Merge names. It is better to do that after merging deps, otherwise the log message contains n/a. */
+ r = unit_merge_names(u, other);
+ if (r < 0)
+ return r;
+
+ other->load_state = UNIT_MERGED;
+ other->merged_into = u;
+
+ if (!u->activation_details)
+ u->activation_details = activation_details_ref(other->activation_details);
+
+ /* If there is still some data attached to the other node, we
+ * don't need it anymore, and can free it. */
+ if (other->load_state != UNIT_STUB)
+ if (UNIT_VTABLE(other)->done)
+ UNIT_VTABLE(other)->done(other);
+
+ unit_add_to_dbus_queue(u);
+ unit_add_to_cleanup_queue(other);
+
+ return 0;
+}
+
+int unit_merge_by_name(Unit *u, const char *name) {
+ _cleanup_free_ char *s = NULL;
+ Unit *other;
+ int r;
+
+ /* Either add name to u, or if a unit with name already exists, merge it with u.
+ * If name is a template, do the same for name@instance, where instance is u's instance. */
+
+ assert(u);
+ assert(name);
+
+ if (unit_name_is_valid(name, UNIT_NAME_TEMPLATE)) {
+ if (!u->instance)
+ return -EINVAL;
+
+ r = unit_name_replace_instance(name, u->instance, &s);
+ if (r < 0)
+ return r;
+
+ name = s;
+ }
+
+ other = manager_get_unit(u->manager, name);
+ if (other)
+ return unit_merge(u, other);
+
+ return unit_add_name(u, name);
+}
+
+Unit* unit_follow_merge(Unit *u) {
+ assert(u);
+
+ while (u->load_state == UNIT_MERGED)
+ assert_se(u = u->merged_into);
+
+ return u;
+}
+
+int unit_add_exec_dependencies(Unit *u, ExecContext *c) {
+ int r;
+
+ assert(u);
+ assert(c);
+
+ /* Unlike unit_add_dependency() or friends, this always returns 0 on success. */
+
+ if (c->working_directory && !c->working_directory_missing_ok) {
+ r = unit_require_mounts_for(u, c->working_directory, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->root_directory) {
+ r = unit_require_mounts_for(u, c->root_directory, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->root_image) {
+ r = unit_require_mounts_for(u, c->root_image, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+ }
+
+ for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
+ if (!u->manager->prefix[dt])
+ continue;
+
+ for (size_t i = 0; i < c->directories[dt].n_items; i++) {
+ _cleanup_free_ char *p = NULL;
+
+ p = path_join(u->manager->prefix[dt], c->directories[dt].items[i].path);
+ if (!p)
+ return -ENOMEM;
+
+ r = unit_require_mounts_for(u, p, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ if (!MANAGER_IS_SYSTEM(u->manager))
+ return 0;
+
+ /* For the following three directory types we need write access, and /var/ is possibly on the root
+ * fs. Hence order after systemd-remount-fs.service, to ensure things are writable. */
+ if (c->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
+ c->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
+ c->directories[EXEC_DIRECTORY_LOGS].n_items > 0) {
+ r = unit_add_dependency_by_name(u, UNIT_AFTER, SPECIAL_REMOUNT_FS_SERVICE, true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->private_tmp) {
+
+ /* FIXME: for now we make a special case for /tmp and add a weak dependency on
+ * tmp.mount so /tmp being masked is supported. However there's no reason to treat
+ * /tmp specifically and masking other mount units should be handled more
+ * gracefully too, see PR#16894. */
+ r = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_WANTS, "tmp.mount", true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+
+ r = unit_require_mounts_for(u, "/var/tmp", UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+
+ r = unit_add_dependency_by_name(u, UNIT_AFTER, SPECIAL_TMPFILES_SETUP_SERVICE, true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->root_image) {
+ /* We need to wait for /dev/loopX to appear when doing RootImage=, hence let's add an
+ * implicit dependency on udev */
+
+ r = unit_add_dependency_by_name(u, UNIT_AFTER, SPECIAL_UDEVD_SERVICE, true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+ }
+
+ if (!IN_SET(c->std_output,
+ EXEC_OUTPUT_JOURNAL, EXEC_OUTPUT_JOURNAL_AND_CONSOLE,
+ EXEC_OUTPUT_KMSG, EXEC_OUTPUT_KMSG_AND_CONSOLE) &&
+ !IN_SET(c->std_error,
+ EXEC_OUTPUT_JOURNAL, EXEC_OUTPUT_JOURNAL_AND_CONSOLE,
+ EXEC_OUTPUT_KMSG, EXEC_OUTPUT_KMSG_AND_CONSOLE) &&
+ !c->log_namespace)
+ return 0;
+
+ /* If syslog or kernel logging is requested (or log namespacing is), make sure our own logging daemon
+ * is run first. */
+
+ if (c->log_namespace) {
+ _cleanup_free_ char *socket_unit = NULL, *varlink_socket_unit = NULL;
+
+ r = unit_name_build_from_type("systemd-journald", c->log_namespace, UNIT_SOCKET, &socket_unit);
+ if (r < 0)
+ return r;
+
+ r = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_REQUIRES, socket_unit, true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+
+ r = unit_name_build_from_type("systemd-journald-varlink", c->log_namespace, UNIT_SOCKET, &varlink_socket_unit);
+ if (r < 0)
+ return r;
+
+ r = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_REQUIRES, varlink_socket_unit, true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+ } else
+ r = unit_add_dependency_by_name(u, UNIT_AFTER, SPECIAL_JOURNALD_SOCKET, true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+const char* unit_description(Unit *u) {
+ assert(u);
+
+ if (u->description)
+ return u->description;
+
+ return strna(u->id);
+}
+
+const char* unit_status_string(Unit *u, char **ret_combined_buffer) {
+ assert(u);
+ assert(u->id);
+
+ /* Return u->id, u->description, or "{u->id} - {u->description}".
+ * Versions with u->description are only used if it is set.
+ * The last option is used if configured and the caller provided the 'ret_combined_buffer'
+ * pointer.
+ *
+ * Note that *ret_combined_buffer may be set to NULL. */
+
+ if (!u->description ||
+ u->manager->status_unit_format == STATUS_UNIT_FORMAT_NAME ||
+ (u->manager->status_unit_format == STATUS_UNIT_FORMAT_COMBINED && !ret_combined_buffer) ||
+ streq(u->description, u->id)) {
+
+ if (ret_combined_buffer)
+ *ret_combined_buffer = NULL;
+ return u->id;
+ }
+
+ if (ret_combined_buffer) {
+ if (u->manager->status_unit_format == STATUS_UNIT_FORMAT_COMBINED) {
+ *ret_combined_buffer = strjoin(u->id, " - ", u->description);
+ if (*ret_combined_buffer)
+ return *ret_combined_buffer;
+ log_oom(); /* Fall back to ->description */
+ } else
+ *ret_combined_buffer = NULL;
+ }
+
+ return u->description;
+}
+
+/* Common implementation for multiple backends */
+int unit_load_fragment_and_dropin(Unit *u, bool fragment_required) {
+ int r;
+
+ assert(u);
+
+ /* Load a .{service,socket,...} file */
+ r = unit_load_fragment(u);
+ if (r < 0)
+ return r;
+
+ if (u->load_state == UNIT_STUB) {
+ if (fragment_required)
+ return -ENOENT;
+
+ u->load_state = UNIT_LOADED;
+ }
+
+ /* Load drop-in directory data. If u is an alias, we might be reloading the
+ * target unit needlessly. But we cannot be sure which drops-ins have already
+ * been loaded and which not, at least without doing complicated book-keeping,
+ * so let's always reread all drop-ins. */
+ r = unit_load_dropin(unit_follow_merge(u));
+ if (r < 0)
+ return r;
+
+ if (u->source_path) {
+ struct stat st;
+
+ if (stat(u->source_path, &st) >= 0)
+ u->source_mtime = timespec_load(&st.st_mtim);
+ else
+ u->source_mtime = 0;
+ }
+
+ return 0;
+}
+
+void unit_add_to_target_deps_queue(Unit *u) {
+ Manager *m = ASSERT_PTR(ASSERT_PTR(u)->manager);
+
+ if (u->in_target_deps_queue)
+ return;
+
+ LIST_PREPEND(target_deps_queue, m->target_deps_queue, u);
+ u->in_target_deps_queue = true;
+}
+
+int unit_add_default_target_dependency(Unit *u, Unit *target) {
+ assert(u);
+ assert(target);
+
+ if (target->type != UNIT_TARGET)
+ return 0;
+
+ /* Only add the dependency if both units are loaded, so that
+ * that loop check below is reliable */
+ if (u->load_state != UNIT_LOADED ||
+ target->load_state != UNIT_LOADED)
+ return 0;
+
+ /* If either side wants no automatic dependencies, then let's
+ * skip this */
+ if (!u->default_dependencies ||
+ !target->default_dependencies)
+ return 0;
+
+ /* Don't create loops */
+ if (unit_has_dependency(target, UNIT_ATOM_BEFORE, u))
+ return 0;
+
+ return unit_add_dependency(target, UNIT_AFTER, u, true, UNIT_DEPENDENCY_DEFAULT);
+}
+
+static int unit_add_slice_dependencies(Unit *u) {
+ Unit *slice;
+ assert(u);
+
+ if (!UNIT_HAS_CGROUP_CONTEXT(u))
+ return 0;
+
+ /* Slice units are implicitly ordered against their parent slices (as this relationship is encoded in the
+ name), while all other units are ordered based on configuration (as in their case Slice= configures the
+ relationship). */
+ UnitDependencyMask mask = u->type == UNIT_SLICE ? UNIT_DEPENDENCY_IMPLICIT : UNIT_DEPENDENCY_FILE;
+
+ slice = UNIT_GET_SLICE(u);
+ if (slice)
+ return unit_add_two_dependencies(u, UNIT_AFTER, UNIT_REQUIRES, slice, true, mask);
+
+ if (unit_has_name(u, SPECIAL_ROOT_SLICE))
+ return 0;
+
+ return unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_REQUIRES, SPECIAL_ROOT_SLICE, true, mask);
+}
+
+static int unit_add_mount_dependencies(Unit *u) {
+ UnitDependencyInfo di;
+ const char *path;
+ bool changed = false;
+ int r;
+
+ assert(u);
+
+ HASHMAP_FOREACH_KEY(di.data, path, u->requires_mounts_for) {
+ char prefix[strlen(path) + 1];
+
+ PATH_FOREACH_PREFIX_MORE(prefix, path) {
+ _cleanup_free_ char *p = NULL;
+ Unit *m;
+
+ r = unit_name_from_path(prefix, ".mount", &p);
+ if (r == -EINVAL)
+ continue; /* If the path cannot be converted to a mount unit name, then it's
+ * not manageable as a unit by systemd, and hence we don't need a
+ * dependency on it. Let's thus silently ignore the issue. */
+ if (r < 0)
+ return r;
+
+ m = manager_get_unit(u->manager, p);
+ if (!m) {
+ /* Make sure to load the mount unit if it exists. If so the dependencies on
+ * this unit will be added later during the loading of the mount unit. */
+ (void) manager_load_unit_prepare(u->manager, p, NULL, NULL, &m);
+ continue;
+ }
+ if (m == u)
+ continue;
+
+ if (m->load_state != UNIT_LOADED)
+ continue;
+
+ r = unit_add_dependency(u, UNIT_AFTER, m, true, di.origin_mask);
+ if (r < 0)
+ return r;
+ changed = changed || r > 0;
+
+ if (m->fragment_path) {
+ r = unit_add_dependency(u, UNIT_REQUIRES, m, true, di.origin_mask);
+ if (r < 0)
+ return r;
+ changed = changed || r > 0;
+ }
+ }
+ }
+
+ return changed;
+}
+
+static int unit_add_oomd_dependencies(Unit *u) {
+ CGroupContext *c;
+ CGroupMask mask;
+ int r;
+
+ assert(u);
+
+ if (!u->default_dependencies)
+ return 0;
+
+ c = unit_get_cgroup_context(u);
+ if (!c)
+ return 0;
+
+ bool wants_oomd = c->moom_swap == MANAGED_OOM_KILL || c->moom_mem_pressure == MANAGED_OOM_KILL;
+ if (!wants_oomd)
+ return 0;
+
+ if (!cg_all_unified())
+ return 0;
+
+ r = cg_mask_supported(&mask);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to determine supported controllers: %m");
+
+ if (!FLAGS_SET(mask, CGROUP_MASK_MEMORY))
+ return 0;
+
+ return unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_WANTS, "systemd-oomd.service", true, UNIT_DEPENDENCY_FILE);
+}
+
+static int unit_add_startup_units(Unit *u) {
+ if (!unit_has_startup_cgroup_constraints(u))
+ return 0;
+
+ return set_ensure_put(&u->manager->startup_units, NULL, u);
+}
+
+static int unit_validate_on_failure_job_mode(
+ Unit *u,
+ const char *job_mode_setting,
+ JobMode job_mode,
+ const char *dependency_name,
+ UnitDependencyAtom atom) {
+
+ Unit *other, *found = NULL;
+
+ if (job_mode != JOB_ISOLATE)
+ return 0;
+
+ UNIT_FOREACH_DEPENDENCY(other, u, atom) {
+ if (!found)
+ found = other;
+ else if (found != other)
+ return log_unit_error_errno(
+ u, SYNTHETIC_ERRNO(ENOEXEC),
+ "More than one %s dependencies specified but %sisolate set. Refusing.",
+ dependency_name, job_mode_setting);
+ }
+
+ return 0;
+}
+
+int unit_load(Unit *u) {
+ int r;
+
+ assert(u);
+
+ if (u->in_load_queue) {
+ LIST_REMOVE(load_queue, u->manager->load_queue, u);
+ u->in_load_queue = false;
+ }
+
+ if (u->type == _UNIT_TYPE_INVALID)
+ return -EINVAL;
+
+ if (u->load_state != UNIT_STUB)
+ return 0;
+
+ if (u->transient_file) {
+ /* Finalize transient file: if this is a transient unit file, as soon as we reach unit_load() the setup
+ * is complete, hence let's synchronize the unit file we just wrote to disk. */
+
+ r = fflush_and_check(u->transient_file);
+ if (r < 0)
+ goto fail;
+
+ u->transient_file = safe_fclose(u->transient_file);
+ u->fragment_mtime = now(CLOCK_REALTIME);
+ }
+
+ r = UNIT_VTABLE(u)->load(u);
+ if (r < 0)
+ goto fail;
+
+ assert(u->load_state != UNIT_STUB);
+
+ if (u->load_state == UNIT_LOADED) {
+ unit_add_to_target_deps_queue(u);
+
+ r = unit_add_slice_dependencies(u);
+ if (r < 0)
+ goto fail;
+
+ r = unit_add_mount_dependencies(u);
+ if (r < 0)
+ goto fail;
+
+ r = unit_add_oomd_dependencies(u);
+ if (r < 0)
+ goto fail;
+
+ r = unit_add_startup_units(u);
+ if (r < 0)
+ goto fail;
+
+ r = unit_validate_on_failure_job_mode(u, "OnSuccessJobMode=", u->on_success_job_mode, "OnSuccess=", UNIT_ATOM_ON_SUCCESS);
+ if (r < 0)
+ goto fail;
+
+ r = unit_validate_on_failure_job_mode(u, "OnFailureJobMode=", u->on_failure_job_mode, "OnFailure=", UNIT_ATOM_ON_FAILURE);
+ if (r < 0)
+ goto fail;
+
+ if (u->job_running_timeout != USEC_INFINITY && u->job_running_timeout > u->job_timeout)
+ log_unit_warning(u, "JobRunningTimeoutSec= is greater than JobTimeoutSec=, it has no effect.");
+
+ /* We finished loading, let's ensure our parents recalculate the members mask */
+ unit_invalidate_cgroup_members_masks(u);
+ }
+
+ assert((u->load_state != UNIT_MERGED) == !u->merged_into);
+
+ unit_add_to_dbus_queue(unit_follow_merge(u));
+ unit_add_to_gc_queue(u);
+ (void) manager_varlink_send_managed_oom_update(u);
+
+ return 0;
+
+fail:
+ /* We convert ENOEXEC errors to the UNIT_BAD_SETTING load state here. Configuration parsing code
+ * should hence return ENOEXEC to ensure units are placed in this state after loading. */
+
+ u->load_state = u->load_state == UNIT_STUB ? UNIT_NOT_FOUND :
+ r == -ENOEXEC ? UNIT_BAD_SETTING :
+ UNIT_ERROR;
+ u->load_error = r;
+
+ /* Record the timestamp on the cache, so that if the cache gets updated between now and the next time
+ * an attempt is made to load this unit, we know we need to check again. */
+ if (u->load_state == UNIT_NOT_FOUND)
+ u->fragment_not_found_timestamp_hash = u->manager->unit_cache_timestamp_hash;
+
+ unit_add_to_dbus_queue(u);
+ unit_add_to_gc_queue(u);
+
+ return log_unit_debug_errno(u, r, "Failed to load configuration: %m");
+}
+
+_printf_(7, 8)
+static int log_unit_internal(void *userdata, int level, int error, const char *file, int line, const char *func, const char *format, ...) {
+ Unit *u = userdata;
+ va_list ap;
+ int r;
+
+ if (u && !unit_log_level_test(u, level))
+ return -ERRNO_VALUE(error);
+
+ va_start(ap, format);
+ if (u)
+ r = log_object_internalv(level, error, file, line, func,
+ u->manager->unit_log_field,
+ u->id,
+ u->manager->invocation_log_field,
+ u->invocation_id_string,
+ format, ap);
+ else
+ r = log_internalv(level, error, file, line, func, format, ap);
+ va_end(ap);
+
+ return r;
+}
+
+static bool unit_test_condition(Unit *u) {
+ _cleanup_strv_free_ char **env = NULL;
+ int r;
+
+ assert(u);
+
+ dual_timestamp_get(&u->condition_timestamp);
+
+ r = manager_get_effective_environment(u->manager, &env);
+ if (r < 0) {
+ log_unit_error_errno(u, r, "Failed to determine effective environment: %m");
+ u->condition_result = true;
+ } else
+ u->condition_result = condition_test_list(
+ u->conditions,
+ env,
+ condition_type_to_string,
+ log_unit_internal,
+ u);
+
+ unit_add_to_dbus_queue(u);
+ return u->condition_result;
+}
+
+static bool unit_test_assert(Unit *u) {
+ _cleanup_strv_free_ char **env = NULL;
+ int r;
+
+ assert(u);
+
+ dual_timestamp_get(&u->assert_timestamp);
+
+ r = manager_get_effective_environment(u->manager, &env);
+ if (r < 0) {
+ log_unit_error_errno(u, r, "Failed to determine effective environment: %m");
+ u->assert_result = CONDITION_ERROR;
+ } else
+ u->assert_result = condition_test_list(
+ u->asserts,
+ env,
+ assert_type_to_string,
+ log_unit_internal,
+ u);
+
+ unit_add_to_dbus_queue(u);
+ return u->assert_result;
+}
+
+void unit_status_printf(Unit *u, StatusType status_type, const char *status, const char *format, const char *ident) {
+ if (log_get_show_color()) {
+ if (u->manager->status_unit_format == STATUS_UNIT_FORMAT_COMBINED && strchr(ident, ' '))
+ ident = strjoina(ANSI_HIGHLIGHT, u->id, ANSI_NORMAL, " - ", u->description);
+ else
+ ident = strjoina(ANSI_HIGHLIGHT, ident, ANSI_NORMAL);
+ }
+
+ DISABLE_WARNING_FORMAT_NONLITERAL;
+ manager_status_printf(u->manager, status_type, status, format, ident);
+ REENABLE_WARNING;
+}
+
+int unit_test_start_limit(Unit *u) {
+ const char *reason;
+
+ assert(u);
+
+ if (ratelimit_below(&u->start_ratelimit)) {
+ u->start_limit_hit = false;
+ return 0;
+ }
+
+ log_unit_warning(u, "Start request repeated too quickly.");
+ u->start_limit_hit = true;
+
+ reason = strjoina("unit ", u->id, " failed");
+
+ emergency_action(u->manager, u->start_limit_action,
+ EMERGENCY_ACTION_IS_WATCHDOG|EMERGENCY_ACTION_WARN,
+ u->reboot_arg, -1, reason);
+
+ return -ECANCELED;
+}
+
+bool unit_shall_confirm_spawn(Unit *u) {
+ assert(u);
+
+ if (manager_is_confirm_spawn_disabled(u->manager))
+ return false;
+
+ /* For some reasons units remaining in the same process group
+ * as PID 1 fail to acquire the console even if it's not used
+ * by any process. So skip the confirmation question for them. */
+ return !unit_get_exec_context(u)->same_pgrp;
+}
+
+static bool unit_verify_deps(Unit *u) {
+ Unit *other;
+
+ assert(u);
+
+ /* Checks whether all BindsTo= dependencies of this unit are fulfilled — if they are also combined
+ * with After=. We do not check Requires= or Requisite= here as they only should have an effect on
+ * the job processing, but do not have any effect afterwards. We don't check BindsTo= dependencies
+ * that are not used in conjunction with After= as for them any such check would make things entirely
+ * racy. */
+
+ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_CANNOT_BE_ACTIVE_WITHOUT) {
+
+ if (!unit_has_dependency(u, UNIT_ATOM_AFTER, other))
+ continue;
+
+ if (!UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(other))) {
+ log_unit_notice(u, "Bound to unit %s, but unit isn't active.", other->id);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/* Errors that aren't really errors:
+ * -EALREADY: Unit is already started.
+ * -ECOMM: Condition failed
+ * -EAGAIN: An operation is already in progress. Retry later.
+ *
+ * Errors that are real errors:
+ * -EBADR: This unit type does not support starting.
+ * -ECANCELED: Start limit hit, too many requests for now
+ * -EPROTO: Assert failed
+ * -EINVAL: Unit not loaded
+ * -EOPNOTSUPP: Unit type not supported
+ * -ENOLINK: The necessary dependencies are not fulfilled.
+ * -ESTALE: This unit has been started before and can't be started a second time
+ * -ENOENT: This is a triggering unit and unit to trigger is not loaded
+ */
+int unit_start(Unit *u, ActivationDetails *details) {
+ UnitActiveState state;
+ Unit *following;
+ int r;
+
+ assert(u);
+
+ /* Let's hold off running start jobs for mount units when /proc/self/mountinfo monitor is rate limited. */
+ if (u->type == UNIT_MOUNT && sd_event_source_is_ratelimited(u->manager->mount_event_source))
+ return -EAGAIN;
+
+ /* If this is already started, then this will succeed. Note that this will even succeed if this unit
+ * is not startable by the user. This is relied on to detect when we need to wait for units and when
+ * waiting is finished. */
+ state = unit_active_state(u);
+ if (UNIT_IS_ACTIVE_OR_RELOADING(state))
+ return -EALREADY;
+ if (state == UNIT_MAINTENANCE)
+ return -EAGAIN;
+
+ /* Units that aren't loaded cannot be started */
+ if (u->load_state != UNIT_LOADED)
+ return -EINVAL;
+
+ /* Refuse starting scope units more than once */
+ if (UNIT_VTABLE(u)->once_only && dual_timestamp_is_set(&u->inactive_enter_timestamp))
+ return -ESTALE;
+
+ /* If the conditions failed, don't do anything at all. If we already are activating this call might
+ * still be useful to speed up activation in case there is some hold-off time, but we don't want to
+ * recheck the condition in that case. */
+ if (state != UNIT_ACTIVATING &&
+ !unit_test_condition(u))
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(ECOMM), "Starting requested but condition failed. Not starting unit.");
+
+ /* If the asserts failed, fail the entire job */
+ if (state != UNIT_ACTIVATING &&
+ !unit_test_assert(u))
+ return log_unit_notice_errno(u, SYNTHETIC_ERRNO(EPROTO), "Starting requested but asserts failed.");
+
+ /* Units of types that aren't supported cannot be started. Note that we do this test only after the
+ * condition checks, so that we rather return condition check errors (which are usually not
+ * considered a true failure) than "not supported" errors (which are considered a failure).
+ */
+ if (!unit_type_supported(u->type))
+ return -EOPNOTSUPP;
+
+ /* Let's make sure that the deps really are in order before we start this. Normally the job engine
+ * should have taken care of this already, but let's check this here again. After all, our
+ * dependencies might not be in effect anymore, due to a reload or due to an unmet condition. */
+ if (!unit_verify_deps(u))
+ return -ENOLINK;
+
+ /* Forward to the main object, if we aren't it. */
+ following = unit_following(u);
+ if (following) {
+ log_unit_debug(u, "Redirecting start request from %s to %s.", u->id, following->id);
+ return unit_start(following, details);
+ }
+
+ /* Check our ability to start early so that failure conditions don't cause us to enter a busy loop. */
+ if (UNIT_VTABLE(u)->can_start) {
+ r = UNIT_VTABLE(u)->can_start(u);
+ if (r < 0)
+ return r;
+ }
+
+ /* If it is stopped, but we cannot start it, then fail */
+ if (!UNIT_VTABLE(u)->start)
+ return -EBADR;
+
+ /* We don't suppress calls to ->start() here when we are already starting, to allow this request to
+ * be used as a "hurry up" call, for example when the unit is in some "auto restart" state where it
+ * waits for a holdoff timer to elapse before it will start again. */
+
+ unit_add_to_dbus_queue(u);
+ unit_cgroup_freezer_action(u, FREEZER_THAW);
+
+ if (!u->activation_details) /* Older details object wins */
+ u->activation_details = activation_details_ref(details);
+
+ return UNIT_VTABLE(u)->start(u);
+}
+
+bool unit_can_start(Unit *u) {
+ assert(u);
+
+ if (u->load_state != UNIT_LOADED)
+ return false;
+
+ if (!unit_type_supported(u->type))
+ return false;
+
+ /* Scope units may be started only once */
+ if (UNIT_VTABLE(u)->once_only && dual_timestamp_is_set(&u->inactive_exit_timestamp))
+ return false;
+
+ return !!UNIT_VTABLE(u)->start;
+}
+
+bool unit_can_isolate(Unit *u) {
+ assert(u);
+
+ return unit_can_start(u) &&
+ u->allow_isolate;
+}
+
+/* Errors:
+ * -EBADR: This unit type does not support stopping.
+ * -EALREADY: Unit is already stopped.
+ * -EAGAIN: An operation is already in progress. Retry later.
+ */
+int unit_stop(Unit *u) {
+ UnitActiveState state;
+ Unit *following;
+
+ assert(u);
+
+ state = unit_active_state(u);
+ if (UNIT_IS_INACTIVE_OR_FAILED(state))
+ return -EALREADY;
+
+ following = unit_following(u);
+ if (following) {
+ log_unit_debug(u, "Redirecting stop request from %s to %s.", u->id, following->id);
+ return unit_stop(following);
+ }
+
+ if (!UNIT_VTABLE(u)->stop)
+ return -EBADR;
+
+ unit_add_to_dbus_queue(u);
+ unit_cgroup_freezer_action(u, FREEZER_THAW);
+
+ return UNIT_VTABLE(u)->stop(u);
+}
+
+bool unit_can_stop(Unit *u) {
+ assert(u);
+
+ /* Note: if we return true here, it does not mean that the unit may be successfully stopped.
+ * Extrinsic units follow external state and they may stop following external state changes
+ * (hence we return true here), but an attempt to do this through the manager will fail. */
+
+ if (!unit_type_supported(u->type))
+ return false;
+
+ if (u->perpetual)
+ return false;
+
+ return !!UNIT_VTABLE(u)->stop;
+}
+
+/* Errors:
+ * -EBADR: This unit type does not support reloading.
+ * -ENOEXEC: Unit is not started.
+ * -EAGAIN: An operation is already in progress. Retry later.
+ */
+int unit_reload(Unit *u) {
+ UnitActiveState state;
+ Unit *following;
+
+ assert(u);
+
+ if (u->load_state != UNIT_LOADED)
+ return -EINVAL;
+
+ if (!unit_can_reload(u))
+ return -EBADR;
+
+ state = unit_active_state(u);
+ if (state == UNIT_RELOADING)
+ return -EAGAIN;
+
+ if (state != UNIT_ACTIVE)
+ return log_unit_warning_errno(u, SYNTHETIC_ERRNO(ENOEXEC), "Unit cannot be reloaded because it is inactive.");
+
+ following = unit_following(u);
+ if (following) {
+ log_unit_debug(u, "Redirecting reload request from %s to %s.", u->id, following->id);
+ return unit_reload(following);
+ }
+
+ unit_add_to_dbus_queue(u);
+
+ if (!UNIT_VTABLE(u)->reload) {
+ /* Unit doesn't have a reload function, but we need to propagate the reload anyway */
+ unit_notify(u, unit_active_state(u), unit_active_state(u), 0);
+ return 0;
+ }
+
+ unit_cgroup_freezer_action(u, FREEZER_THAW);
+
+ return UNIT_VTABLE(u)->reload(u);
+}
+
+bool unit_can_reload(Unit *u) {
+ assert(u);
+
+ if (UNIT_VTABLE(u)->can_reload)
+ return UNIT_VTABLE(u)->can_reload(u);
+
+ if (unit_has_dependency(u, UNIT_ATOM_PROPAGATES_RELOAD_TO, NULL))
+ return true;
+
+ return UNIT_VTABLE(u)->reload;
+}
+
+bool unit_is_unneeded(Unit *u) {
+ Unit *other;
+ assert(u);
+
+ if (!u->stop_when_unneeded)
+ return false;
+
+ /* Don't clean up while the unit is transitioning or is even inactive. */
+ if (unit_active_state(u) != UNIT_ACTIVE)
+ return false;
+ if (u->job)
+ return false;
+
+ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED) {
+ /* If a dependent unit has a job queued, is active or transitioning, or is marked for
+ * restart, then don't clean this one up. */
+
+ if (other->job)
+ return false;
+
+ if (!UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(other)))
+ return false;
+
+ if (unit_will_restart(other))
+ return false;
+ }
+
+ return true;
+}
+
+bool unit_is_upheld_by_active(Unit *u, Unit **ret_culprit) {
+ Unit *other;
+
+ assert(u);
+
+ /* Checks if the unit needs to be started because it currently is not running, but some other unit
+ * that is active declared an Uphold= dependencies on it */
+
+ if (!UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u)) || u->job) {
+ if (ret_culprit)
+ *ret_culprit = NULL;
+ return false;
+ }
+
+ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_START_STEADILY) {
+ if (other->job)
+ continue;
+
+ if (UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(other))) {
+ if (ret_culprit)
+ *ret_culprit = other;
+ return true;
+ }
+ }
+
+ if (ret_culprit)
+ *ret_culprit = NULL;
+ return false;
+}
+
+bool unit_is_bound_by_inactive(Unit *u, Unit **ret_culprit) {
+ Unit *other;
+
+ assert(u);
+
+ /* Checks whether this unit is bound to another unit that is inactive, i.e. whether we should stop
+ * because the other unit is down. */
+
+ if (unit_active_state(u) != UNIT_ACTIVE || u->job) {
+ /* Don't clean up while the unit is transitioning or is even inactive. */
+ if (ret_culprit)
+ *ret_culprit = NULL;
+ return false;
+ }
+
+ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_CANNOT_BE_ACTIVE_WITHOUT) {
+ if (other->job)
+ continue;
+
+ if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(other))) {
+ if (ret_culprit)
+ *ret_culprit = other;
+
+ return true;
+ }
+ }
+
+ if (ret_culprit)
+ *ret_culprit = NULL;
+ return false;
+}
+
+static void check_unneeded_dependencies(Unit *u) {
+ Unit *other;
+ assert(u);
+
+ /* Add all units this unit depends on to the queue that processes StopWhenUnneeded= behaviour. */
+
+ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE)
+ unit_submit_to_stop_when_unneeded_queue(other);
+}
+
+static void check_uphold_dependencies(Unit *u) {
+ Unit *other;
+ assert(u);
+
+ /* Add all units this unit depends on to the queue that processes Uphold= behaviour. */
+
+ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_ADD_START_WHEN_UPHELD_QUEUE)
+ unit_submit_to_start_when_upheld_queue(other);
+}
+
+static void check_bound_by_dependencies(Unit *u) {
+ Unit *other;
+ assert(u);
+
+ /* Add all units this unit depends on to the queue that processes BindsTo= stop behaviour. */
+
+ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_ADD_CANNOT_BE_ACTIVE_WITHOUT_QUEUE)
+ unit_submit_to_stop_when_bound_queue(other);
+}
+
+static void retroactively_start_dependencies(Unit *u) {
+ Unit *other;
+
+ assert(u);
+ assert(UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(u)));
+
+ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_RETROACTIVE_START_REPLACE) /* Requires= + BindsTo= */
+ if (!unit_has_dependency(u, UNIT_ATOM_AFTER, other) &&
+ !UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(other)))
+ manager_add_job(u->manager, JOB_START, other, JOB_REPLACE, NULL, NULL, NULL);
+
+ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_RETROACTIVE_START_FAIL) /* Wants= */
+ if (!unit_has_dependency(u, UNIT_ATOM_AFTER, other) &&
+ !UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(other)))
+ manager_add_job(u->manager, JOB_START, other, JOB_FAIL, NULL, NULL, NULL);
+
+ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_RETROACTIVE_STOP_ON_START) /* Conflicts= (and inverse) */
+ if (!UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(other)))
+ manager_add_job(u->manager, JOB_STOP, other, JOB_REPLACE, NULL, NULL, NULL);
+}
+
+static void retroactively_stop_dependencies(Unit *u) {
+ Unit *other;
+
+ assert(u);
+ assert(UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(u)));
+
+ /* Pull down units which are bound to us recursively if enabled */
+ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_RETROACTIVE_STOP_ON_STOP) /* BoundBy= */
+ if (!UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(other)))
+ manager_add_job(u->manager, JOB_STOP, other, JOB_REPLACE, NULL, NULL, NULL);
+}
+
+void unit_start_on_failure(
+ Unit *u,
+ const char *dependency_name,
+ UnitDependencyAtom atom,
+ JobMode job_mode) {
+
+ int n_jobs = -1;
+ Unit *other;
+ int r;
+
+ assert(u);
+ assert(dependency_name);
+ assert(IN_SET(atom, UNIT_ATOM_ON_SUCCESS, UNIT_ATOM_ON_FAILURE));
+
+ /* Act on OnFailure= and OnSuccess= dependencies */
+
+ UNIT_FOREACH_DEPENDENCY(other, u, atom) {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+
+ if (n_jobs < 0) {
+ log_unit_info(u, "Triggering %s dependencies.", dependency_name);
+ n_jobs = 0;
+ }
+
+ r = manager_add_job(u->manager, JOB_START, other, job_mode, NULL, &error, NULL);
+ if (r < 0)
+ log_unit_warning_errno(
+ u, r, "Failed to enqueue %s job, ignoring: %s",
+ dependency_name, bus_error_message(&error, r));
+ n_jobs ++;
+ }
+
+ if (n_jobs >= 0)
+ log_unit_debug(u, "Triggering %s dependencies done (%i %s).",
+ dependency_name, n_jobs, n_jobs == 1 ? "job" : "jobs");
+}
+
+void unit_trigger_notify(Unit *u) {
+ Unit *other;
+
+ assert(u);
+
+ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_TRIGGERED_BY)
+ if (UNIT_VTABLE(other)->trigger_notify)
+ UNIT_VTABLE(other)->trigger_notify(other, u);
+}
+
+static int raise_level(int log_level, bool condition_info, bool condition_notice) {
+ if (condition_notice && log_level > LOG_NOTICE)
+ return LOG_NOTICE;
+ if (condition_info && log_level > LOG_INFO)
+ return LOG_INFO;
+ return log_level;
+}
+
+static int unit_log_resources(Unit *u) {
+ struct iovec iovec[1 + _CGROUP_IP_ACCOUNTING_METRIC_MAX + _CGROUP_IO_ACCOUNTING_METRIC_MAX + 4];
+ bool any_traffic = false, have_ip_accounting = false, any_io = false, have_io_accounting = false;
+ _cleanup_free_ char *igress = NULL, *egress = NULL, *rr = NULL, *wr = NULL;
+ int log_level = LOG_DEBUG; /* May be raised if resources consumed over a threshold */
+ size_t n_message_parts = 0, n_iovec = 0;
+ char* message_parts[1 + 2 + 2 + 1], *t;
+ nsec_t nsec = NSEC_INFINITY;
+ int r;
+ const char* const ip_fields[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = {
+ [CGROUP_IP_INGRESS_BYTES] = "IP_METRIC_INGRESS_BYTES",
+ [CGROUP_IP_INGRESS_PACKETS] = "IP_METRIC_INGRESS_PACKETS",
+ [CGROUP_IP_EGRESS_BYTES] = "IP_METRIC_EGRESS_BYTES",
+ [CGROUP_IP_EGRESS_PACKETS] = "IP_METRIC_EGRESS_PACKETS",
+ };
+ const char* const io_fields[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
+ [CGROUP_IO_READ_BYTES] = "IO_METRIC_READ_BYTES",
+ [CGROUP_IO_WRITE_BYTES] = "IO_METRIC_WRITE_BYTES",
+ [CGROUP_IO_READ_OPERATIONS] = "IO_METRIC_READ_OPERATIONS",
+ [CGROUP_IO_WRITE_OPERATIONS] = "IO_METRIC_WRITE_OPERATIONS",
+ };
+
+ assert(u);
+
+ /* Invoked whenever a unit enters failed or dead state. Logs information about consumed resources if resource
+ * accounting was enabled for a unit. It does this in two ways: a friendly human readable string with reduced
+ * information and the complete data in structured fields. */
+
+ (void) unit_get_cpu_usage(u, &nsec);
+ if (nsec != NSEC_INFINITY) {
+ /* Format the CPU time for inclusion in the structured log message */
+ if (asprintf(&t, "CPU_USAGE_NSEC=%" PRIu64, nsec) < 0) {
+ r = log_oom();
+ goto finish;
+ }
+ iovec[n_iovec++] = IOVEC_MAKE_STRING(t);
+
+ /* Format the CPU time for inclusion in the human language message string */
+ t = strjoin("consumed ", FORMAT_TIMESPAN(nsec / NSEC_PER_USEC, USEC_PER_MSEC), " CPU time");
+ if (!t) {
+ r = log_oom();
+ goto finish;
+ }
+
+ message_parts[n_message_parts++] = t;
+
+ log_level = raise_level(log_level,
+ nsec > MENTIONWORTHY_CPU_NSEC,
+ nsec > NOTICEWORTHY_CPU_NSEC);
+ }
+
+ for (CGroupIOAccountingMetric k = 0; k < _CGROUP_IO_ACCOUNTING_METRIC_MAX; k++) {
+ uint64_t value = UINT64_MAX;
+
+ assert(io_fields[k]);
+
+ (void) unit_get_io_accounting(u, k, k > 0, &value);
+ if (value == UINT64_MAX)
+ continue;
+
+ have_io_accounting = true;
+ if (value > 0)
+ any_io = true;
+
+ /* Format IO accounting data for inclusion in the structured log message */
+ if (asprintf(&t, "%s=%" PRIu64, io_fields[k], value) < 0) {
+ r = log_oom();
+ goto finish;
+ }
+ iovec[n_iovec++] = IOVEC_MAKE_STRING(t);
+
+ /* Format the IO accounting data for inclusion in the human language message string, but only
+ * for the bytes counters (and not for the operations counters) */
+ if (k == CGROUP_IO_READ_BYTES) {
+ assert(!rr);
+ rr = strjoin("read ", strna(FORMAT_BYTES(value)), " from disk");
+ if (!rr) {
+ r = log_oom();
+ goto finish;
+ }
+ } else if (k == CGROUP_IO_WRITE_BYTES) {
+ assert(!wr);
+ wr = strjoin("written ", strna(FORMAT_BYTES(value)), " to disk");
+ if (!wr) {
+ r = log_oom();
+ goto finish;
+ }
+ }
+
+ if (IN_SET(k, CGROUP_IO_READ_BYTES, CGROUP_IO_WRITE_BYTES))
+ log_level = raise_level(log_level,
+ value > MENTIONWORTHY_IO_BYTES,
+ value > NOTICEWORTHY_IO_BYTES);
+ }
+
+ if (have_io_accounting) {
+ if (any_io) {
+ if (rr)
+ message_parts[n_message_parts++] = TAKE_PTR(rr);
+ if (wr)
+ message_parts[n_message_parts++] = TAKE_PTR(wr);
+
+ } else {
+ char *k;
+
+ k = strdup("no IO");
+ if (!k) {
+ r = log_oom();
+ goto finish;
+ }
+
+ message_parts[n_message_parts++] = k;
+ }
+ }
+
+ for (CGroupIPAccountingMetric m = 0; m < _CGROUP_IP_ACCOUNTING_METRIC_MAX; m++) {
+ uint64_t value = UINT64_MAX;
+
+ assert(ip_fields[m]);
+
+ (void) unit_get_ip_accounting(u, m, &value);
+ if (value == UINT64_MAX)
+ continue;
+
+ have_ip_accounting = true;
+ if (value > 0)
+ any_traffic = true;
+
+ /* Format IP accounting data for inclusion in the structured log message */
+ if (asprintf(&t, "%s=%" PRIu64, ip_fields[m], value) < 0) {
+ r = log_oom();
+ goto finish;
+ }
+ iovec[n_iovec++] = IOVEC_MAKE_STRING(t);
+
+ /* Format the IP accounting data for inclusion in the human language message string, but only for the
+ * bytes counters (and not for the packets counters) */
+ if (m == CGROUP_IP_INGRESS_BYTES) {
+ assert(!igress);
+ igress = strjoin("received ", strna(FORMAT_BYTES(value)), " IP traffic");
+ if (!igress) {
+ r = log_oom();
+ goto finish;
+ }
+ } else if (m == CGROUP_IP_EGRESS_BYTES) {
+ assert(!egress);
+ egress = strjoin("sent ", strna(FORMAT_BYTES(value)), " IP traffic");
+ if (!egress) {
+ r = log_oom();
+ goto finish;
+ }
+ }
+
+ if (IN_SET(m, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
+ log_level = raise_level(log_level,
+ value > MENTIONWORTHY_IP_BYTES,
+ value > NOTICEWORTHY_IP_BYTES);
+ }
+
+ /* This check is here because it is the earliest point following all possible log_level assignments. If
+ * log_level is assigned anywhere after this point, move this check. */
+ if (!unit_log_level_test(u, log_level)) {
+ r = 0;
+ goto finish;
+ }
+
+ if (have_ip_accounting) {
+ if (any_traffic) {
+ if (igress)
+ message_parts[n_message_parts++] = TAKE_PTR(igress);
+ if (egress)
+ message_parts[n_message_parts++] = TAKE_PTR(egress);
+
+ } else {
+ char *k;
+
+ k = strdup("no IP traffic");
+ if (!k) {
+ r = log_oom();
+ goto finish;
+ }
+
+ message_parts[n_message_parts++] = k;
+ }
+ }
+
+ /* Is there any accounting data available at all? */
+ if (n_iovec == 0) {
+ r = 0;
+ goto finish;
+ }
+
+ if (n_message_parts == 0)
+ t = strjoina("MESSAGE=", u->id, ": Completed.");
+ else {
+ _cleanup_free_ char *joined = NULL;
+
+ message_parts[n_message_parts] = NULL;
+
+ joined = strv_join(message_parts, ", ");
+ if (!joined) {
+ r = log_oom();
+ goto finish;
+ }
+
+ joined[0] = ascii_toupper(joined[0]);
+ t = strjoina("MESSAGE=", u->id, ": ", joined, ".");
+ }
+
+ /* The following four fields we allocate on the stack or are static strings, we hence don't want to free them,
+ * and hence don't increase n_iovec for them */
+ iovec[n_iovec] = IOVEC_MAKE_STRING(t);
+ iovec[n_iovec + 1] = IOVEC_MAKE_STRING("MESSAGE_ID=" SD_MESSAGE_UNIT_RESOURCES_STR);
+
+ t = strjoina(u->manager->unit_log_field, u->id);
+ iovec[n_iovec + 2] = IOVEC_MAKE_STRING(t);
+
+ t = strjoina(u->manager->invocation_log_field, u->invocation_id_string);
+ iovec[n_iovec + 3] = IOVEC_MAKE_STRING(t);
+
+ log_unit_struct_iovec(u, log_level, iovec, n_iovec + 4);
+ r = 0;
+
+finish:
+ for (size_t i = 0; i < n_message_parts; i++)
+ free(message_parts[i]);
+
+ for (size_t i = 0; i < n_iovec; i++)
+ free(iovec[i].iov_base);
+
+ return r;
+
+}
+
+static void unit_update_on_console(Unit *u) {
+ bool b;
+
+ assert(u);
+
+ b = unit_needs_console(u);
+ if (u->on_console == b)
+ return;
+
+ u->on_console = b;
+ if (b)
+ manager_ref_console(u->manager);
+ else
+ manager_unref_console(u->manager);
+}
+
+static void unit_emit_audit_start(Unit *u) {
+ assert(u);
+
+ if (u->type != UNIT_SERVICE)
+ return;
+
+ /* Write audit record if we have just finished starting up */
+ manager_send_unit_audit(u->manager, u, AUDIT_SERVICE_START, true);
+ u->in_audit = true;
+}
+
+static void unit_emit_audit_stop(Unit *u, UnitActiveState state) {
+ assert(u);
+
+ if (u->type != UNIT_SERVICE)
+ return;
+
+ if (u->in_audit) {
+ /* Write audit record if we have just finished shutting down */
+ manager_send_unit_audit(u->manager, u, AUDIT_SERVICE_STOP, state == UNIT_INACTIVE);
+ u->in_audit = false;
+ } else {
+ /* Hmm, if there was no start record written write it now, so that we always have a nice pair */
+ manager_send_unit_audit(u->manager, u, AUDIT_SERVICE_START, state == UNIT_INACTIVE);
+
+ if (state == UNIT_INACTIVE)
+ manager_send_unit_audit(u->manager, u, AUDIT_SERVICE_STOP, true);
+ }
+}
+
+static bool unit_process_job(Job *j, UnitActiveState ns, UnitNotifyFlags flags) {
+ bool unexpected = false;
+ JobResult result;
+
+ assert(j);
+
+ if (j->state == JOB_WAITING)
+
+ /* So we reached a different state for this job. Let's see if we can run it now if it failed previously
+ * due to EAGAIN. */
+ job_add_to_run_queue(j);
+
+ /* Let's check whether the unit's new state constitutes a finished job, or maybe contradicts a running job and
+ * hence needs to invalidate jobs. */
+
+ switch (j->type) {
+
+ case JOB_START:
+ case JOB_VERIFY_ACTIVE:
+
+ if (UNIT_IS_ACTIVE_OR_RELOADING(ns))
+ job_finish_and_invalidate(j, JOB_DONE, true, false);
+ else if (j->state == JOB_RUNNING && ns != UNIT_ACTIVATING) {
+ unexpected = true;
+
+ if (UNIT_IS_INACTIVE_OR_FAILED(ns)) {
+ if (ns == UNIT_FAILED)
+ result = JOB_FAILED;
+ else
+ result = JOB_DONE;
+
+ job_finish_and_invalidate(j, result, true, false);
+ }
+ }
+
+ break;
+
+ case JOB_RELOAD:
+ case JOB_RELOAD_OR_START:
+ case JOB_TRY_RELOAD:
+
+ if (j->state == JOB_RUNNING) {
+ if (ns == UNIT_ACTIVE)
+ job_finish_and_invalidate(j, (flags & UNIT_NOTIFY_RELOAD_FAILURE) ? JOB_FAILED : JOB_DONE, true, false);
+ else if (!IN_SET(ns, UNIT_ACTIVATING, UNIT_RELOADING)) {
+ unexpected = true;
+
+ if (UNIT_IS_INACTIVE_OR_FAILED(ns))
+ job_finish_and_invalidate(j, ns == UNIT_FAILED ? JOB_FAILED : JOB_DONE, true, false);
+ }
+ }
+
+ break;
+
+ case JOB_STOP:
+ case JOB_RESTART:
+ case JOB_TRY_RESTART:
+
+ if (UNIT_IS_INACTIVE_OR_FAILED(ns))
+ job_finish_and_invalidate(j, JOB_DONE, true, false);
+ else if (j->state == JOB_RUNNING && ns != UNIT_DEACTIVATING) {
+ unexpected = true;
+ job_finish_and_invalidate(j, JOB_FAILED, true, false);
+ }
+
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ return unexpected;
+}
+
+void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, UnitNotifyFlags flags) {
+ const char *reason;
+ Manager *m;
+
+ assert(u);
+ assert(os < _UNIT_ACTIVE_STATE_MAX);
+ assert(ns < _UNIT_ACTIVE_STATE_MAX);
+
+ /* Note that this is called for all low-level state changes, even if they might map to the same high-level
+ * UnitActiveState! That means that ns == os is an expected behavior here. For example: if a mount point is
+ * remounted this function will be called too! */
+
+ m = u->manager;
+
+ /* Let's enqueue the change signal early. In case this unit has a job associated we want that this unit is in
+ * the bus queue, so that any job change signal queued will force out the unit change signal first. */
+ unit_add_to_dbus_queue(u);
+
+ /* Update systemd-oomd on the property/state change */
+ if (os != ns) {
+ /* Always send an update if the unit is going into an inactive state so systemd-oomd knows to stop
+ * monitoring.
+ * Also send an update whenever the unit goes active; this is to handle a case where an override file
+ * sets one of the ManagedOOM*= properties to "kill", then later removes it. systemd-oomd needs to
+ * know to stop monitoring when the unit changes from "kill" -> "auto" on daemon-reload, but we don't
+ * have the information on the property. Thus, indiscriminately send an update. */
+ if (UNIT_IS_INACTIVE_OR_FAILED(ns) || UNIT_IS_ACTIVE_OR_RELOADING(ns))
+ (void) manager_varlink_send_managed_oom_update(u);
+ }
+
+ /* Update timestamps for state changes */
+ if (!MANAGER_IS_RELOADING(m)) {
+ dual_timestamp_get(&u->state_change_timestamp);
+
+ if (UNIT_IS_INACTIVE_OR_FAILED(os) && !UNIT_IS_INACTIVE_OR_FAILED(ns))
+ u->inactive_exit_timestamp = u->state_change_timestamp;
+ else if (!UNIT_IS_INACTIVE_OR_FAILED(os) && UNIT_IS_INACTIVE_OR_FAILED(ns))
+ u->inactive_enter_timestamp = u->state_change_timestamp;
+
+ if (!UNIT_IS_ACTIVE_OR_RELOADING(os) && UNIT_IS_ACTIVE_OR_RELOADING(ns))
+ u->active_enter_timestamp = u->state_change_timestamp;
+ else if (UNIT_IS_ACTIVE_OR_RELOADING(os) && !UNIT_IS_ACTIVE_OR_RELOADING(ns))
+ u->active_exit_timestamp = u->state_change_timestamp;
+ }
+
+ /* Keep track of failed units */
+ (void) manager_update_failed_units(m, u, ns == UNIT_FAILED);
+
+ /* Make sure the cgroup and state files are always removed when we become inactive */
+ if (UNIT_IS_INACTIVE_OR_FAILED(ns)) {
+ SET_FLAG(u->markers,
+ (1u << UNIT_MARKER_NEEDS_RELOAD)|(1u << UNIT_MARKER_NEEDS_RESTART),
+ false);
+ unit_prune_cgroup(u);
+ unit_unlink_state_files(u);
+ } else if (ns != os && ns == UNIT_RELOADING)
+ SET_FLAG(u->markers, 1u << UNIT_MARKER_NEEDS_RELOAD, false);
+
+ unit_update_on_console(u);
+
+ if (!MANAGER_IS_RELOADING(m)) {
+ bool unexpected;
+
+ /* Let's propagate state changes to the job */
+ if (u->job)
+ unexpected = unit_process_job(u->job, ns, flags);
+ else
+ unexpected = true;
+
+ /* If this state change happened without being requested by a job, then let's retroactively start or
+ * stop dependencies. We skip that step when deserializing, since we don't want to create any
+ * additional jobs just because something is already activated. */
+
+ if (unexpected) {
+ if (UNIT_IS_INACTIVE_OR_FAILED(os) && UNIT_IS_ACTIVE_OR_ACTIVATING(ns))
+ retroactively_start_dependencies(u);
+ else if (UNIT_IS_ACTIVE_OR_ACTIVATING(os) && UNIT_IS_INACTIVE_OR_DEACTIVATING(ns))
+ retroactively_stop_dependencies(u);
+ }
+
+ if (ns != os && ns == UNIT_FAILED) {
+ log_unit_debug(u, "Unit entered failed state.");
+
+ if (!(flags & UNIT_NOTIFY_WILL_AUTO_RESTART))
+ unit_start_on_failure(u, "OnFailure=", UNIT_ATOM_ON_FAILURE, u->on_failure_job_mode);
+ }
+
+ if (UNIT_IS_ACTIVE_OR_RELOADING(ns) && !UNIT_IS_ACTIVE_OR_RELOADING(os)) {
+ /* This unit just finished starting up */
+
+ unit_emit_audit_start(u);
+ manager_send_unit_plymouth(m, u);
+ }
+
+ if (UNIT_IS_INACTIVE_OR_FAILED(ns) && !UNIT_IS_INACTIVE_OR_FAILED(os)) {
+ /* This unit just stopped/failed. */
+
+ unit_emit_audit_stop(u, ns);
+ unit_log_resources(u);
+ }
+
+ if (ns == UNIT_INACTIVE && !IN_SET(os, UNIT_FAILED, UNIT_INACTIVE, UNIT_MAINTENANCE) &&
+ !(flags & UNIT_NOTIFY_WILL_AUTO_RESTART))
+ unit_start_on_failure(u, "OnSuccess=", UNIT_ATOM_ON_SUCCESS, u->on_success_job_mode);
+ }
+
+ manager_recheck_journal(m);
+ manager_recheck_dbus(m);
+
+ unit_trigger_notify(u);
+
+ if (!MANAGER_IS_RELOADING(m)) {
+ if (os != UNIT_FAILED && ns == UNIT_FAILED) {
+ reason = strjoina("unit ", u->id, " failed");
+ emergency_action(m, u->failure_action, 0, u->reboot_arg, unit_failure_action_exit_status(u), reason);
+ } else if (!UNIT_IS_INACTIVE_OR_FAILED(os) && ns == UNIT_INACTIVE) {
+ reason = strjoina("unit ", u->id, " succeeded");
+ emergency_action(m, u->success_action, 0, u->reboot_arg, unit_success_action_exit_status(u), reason);
+ }
+ }
+
+ /* And now, add the unit or depending units to various queues that will act on the new situation if
+ * needed. These queues generally check for continuous state changes rather than events (like most of
+ * the state propagation above), and do work deferred instead of instantly, since they typically
+ * don't want to run during reloading, and usually involve checking combined state of multiple units
+ * at once. */
+
+ if (UNIT_IS_INACTIVE_OR_FAILED(ns)) {
+ /* Stop unneeded units and bound-by units regardless if going down was expected or not */
+ check_unneeded_dependencies(u);
+ check_bound_by_dependencies(u);
+
+ /* Maybe someone wants us to remain up? */
+ unit_submit_to_start_when_upheld_queue(u);
+
+ /* Maybe the unit should be GC'ed now? */
+ unit_add_to_gc_queue(u);
+ }
+
+ if (UNIT_IS_ACTIVE_OR_RELOADING(ns)) {
+ /* Start uphold units regardless if going up was expected or not */
+ check_uphold_dependencies(u);
+
+ /* Maybe we finished startup and are now ready for being stopped because unneeded? */
+ unit_submit_to_stop_when_unneeded_queue(u);
+
+ /* Maybe we finished startup, but something we needed has vanished? Let's die then. (This happens
+ * when something BindsTo= to a Type=oneshot unit, as these units go directly from starting to
+ * inactive, without ever entering started.) */
+ unit_submit_to_stop_when_bound_queue(u);
+ }
+}
+
+int unit_watch_pid(Unit *u, pid_t pid, bool exclusive) {
+ int r;
+
+ assert(u);
+ assert(pid_is_valid(pid));
+
+ /* Watch a specific PID */
+
+ /* Caller might be sure that this PID belongs to this unit only. Let's take this
+ * opportunity to remove any stalled references to this PID as they can be created
+ * easily (when watching a process which is not our direct child). */
+ if (exclusive)
+ manager_unwatch_pid(u->manager, pid);
+
+ r = set_ensure_allocated(&u->pids, NULL);
+ if (r < 0)
+ return r;
+
+ r = hashmap_ensure_allocated(&u->manager->watch_pids, NULL);
+ if (r < 0)
+ return r;
+
+ /* First try, let's add the unit keyed by "pid". */
+ r = hashmap_put(u->manager->watch_pids, PID_TO_PTR(pid), u);
+ if (r == -EEXIST) {
+ Unit **array;
+ bool found = false;
+ size_t n = 0;
+
+ /* OK, the "pid" key is already assigned to a different unit. Let's see if the "-pid" key (which points
+ * to an array of Units rather than just a Unit), lists us already. */
+
+ array = hashmap_get(u->manager->watch_pids, PID_TO_PTR(-pid));
+ if (array)
+ for (; array[n]; n++)
+ if (array[n] == u)
+ found = true;
+
+ if (found) /* Found it already? if so, do nothing */
+ r = 0;
+ else {
+ Unit **new_array;
+
+ /* Allocate a new array */
+ new_array = new(Unit*, n + 2);
+ if (!new_array)
+ return -ENOMEM;
+
+ memcpy_safe(new_array, array, sizeof(Unit*) * n);
+ new_array[n] = u;
+ new_array[n+1] = NULL;
+
+ /* Add or replace the old array */
+ r = hashmap_replace(u->manager->watch_pids, PID_TO_PTR(-pid), new_array);
+ if (r < 0) {
+ free(new_array);
+ return r;
+ }
+
+ free(array);
+ }
+ } else if (r < 0)
+ return r;
+
+ r = set_put(u->pids, PID_TO_PTR(pid));
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+void unit_unwatch_pid(Unit *u, pid_t pid) {
+ Unit **array;
+
+ assert(u);
+ assert(pid_is_valid(pid));
+
+ /* First let's drop the unit in case it's keyed as "pid". */
+ (void) hashmap_remove_value(u->manager->watch_pids, PID_TO_PTR(pid), u);
+
+ /* Then, let's also drop the unit, in case it's in the array keyed by -pid */
+ array = hashmap_get(u->manager->watch_pids, PID_TO_PTR(-pid));
+ if (array) {
+ /* Let's iterate through the array, dropping our own entry */
+
+ size_t m = 0;
+ for (size_t n = 0; array[n]; n++)
+ if (array[n] != u)
+ array[m++] = array[n];
+ array[m] = NULL;
+
+ if (m == 0) {
+ /* The array is now empty, remove the entire entry */
+ assert_se(hashmap_remove(u->manager->watch_pids, PID_TO_PTR(-pid)) == array);
+ free(array);
+ }
+ }
+
+ (void) set_remove(u->pids, PID_TO_PTR(pid));
+}
+
+void unit_unwatch_all_pids(Unit *u) {
+ assert(u);
+
+ while (!set_isempty(u->pids))
+ unit_unwatch_pid(u, PTR_TO_PID(set_first(u->pids)));
+
+ u->pids = set_free(u->pids);
+}
+
+static void unit_tidy_watch_pids(Unit *u) {
+ pid_t except1, except2;
+ void *e;
+
+ assert(u);
+
+ /* Cleans dead PIDs from our list */
+
+ except1 = unit_main_pid(u);
+ except2 = unit_control_pid(u);
+
+ SET_FOREACH(e, u->pids) {
+ pid_t pid = PTR_TO_PID(e);
+
+ if (pid == except1 || pid == except2)
+ continue;
+
+ if (!pid_is_unwaited(pid))
+ unit_unwatch_pid(u, pid);
+ }
+}
+
+static int on_rewatch_pids_event(sd_event_source *s, void *userdata) {
+ Unit *u = ASSERT_PTR(userdata);
+
+ assert(s);
+
+ unit_tidy_watch_pids(u);
+ unit_watch_all_pids(u);
+
+ /* If the PID set is empty now, then let's finish this off. */
+ unit_synthesize_cgroup_empty_event(u);
+
+ return 0;
+}
+
+int unit_enqueue_rewatch_pids(Unit *u) {
+ int r;
+
+ assert(u);
+
+ if (!u->cgroup_path)
+ return -ENOENT;
+
+ r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
+ if (r < 0)
+ return r;
+ if (r > 0) /* On unified we can use proper notifications */
+ return 0;
+
+ /* Enqueues a low-priority job that will clean up dead PIDs from our list of PIDs to watch and subscribe to new
+ * PIDs that might have appeared. We do this in a delayed job because the work might be quite slow, as it
+ * involves issuing kill(pid, 0) on all processes we watch. */
+
+ if (!u->rewatch_pids_event_source) {
+ _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
+
+ r = sd_event_add_defer(u->manager->event, &s, on_rewatch_pids_event, u);
+ if (r < 0)
+ return log_error_errno(r, "Failed to allocate event source for tidying watched PIDs: %m");
+
+ r = sd_event_source_set_priority(s, SD_EVENT_PRIORITY_IDLE);
+ if (r < 0)
+ return log_error_errno(r, "Failed to adjust priority of event source for tidying watched PIDs: %m");
+
+ (void) sd_event_source_set_description(s, "tidy-watch-pids");
+
+ u->rewatch_pids_event_source = TAKE_PTR(s);
+ }
+
+ r = sd_event_source_set_enabled(u->rewatch_pids_event_source, SD_EVENT_ONESHOT);
+ if (r < 0)
+ return log_error_errno(r, "Failed to enable event source for tidying watched PIDs: %m");
+
+ return 0;
+}
+
+void unit_dequeue_rewatch_pids(Unit *u) {
+ int r;
+ assert(u);
+
+ if (!u->rewatch_pids_event_source)
+ return;
+
+ r = sd_event_source_set_enabled(u->rewatch_pids_event_source, SD_EVENT_OFF);
+ if (r < 0)
+ log_warning_errno(r, "Failed to disable event source for tidying watched PIDs, ignoring: %m");
+
+ u->rewatch_pids_event_source = sd_event_source_disable_unref(u->rewatch_pids_event_source);
+}
+
+bool unit_job_is_applicable(Unit *u, JobType j) {
+ assert(u);
+ assert(j >= 0 && j < _JOB_TYPE_MAX);
+
+ switch (j) {
+
+ case JOB_VERIFY_ACTIVE:
+ case JOB_START:
+ case JOB_NOP:
+ /* Note that we don't check unit_can_start() here. That's because .device units and suchlike are not
+ * startable by us but may appear due to external events, and it thus makes sense to permit enqueuing
+ * jobs for it. */
+ return true;
+
+ case JOB_STOP:
+ /* Similar as above. However, perpetual units can never be stopped (neither explicitly nor due to
+ * external events), hence it makes no sense to permit enqueuing such a request either. */
+ return !u->perpetual;
+
+ case JOB_RESTART:
+ case JOB_TRY_RESTART:
+ return unit_can_stop(u) && unit_can_start(u);
+
+ case JOB_RELOAD:
+ case JOB_TRY_RELOAD:
+ return unit_can_reload(u);
+
+ case JOB_RELOAD_OR_START:
+ return unit_can_reload(u) && unit_can_start(u);
+
+ default:
+ assert_not_reached();
+ }
+}
+
+int unit_add_dependency(
+ Unit *u,
+ UnitDependency d,
+ Unit *other,
+ bool add_reference,
+ UnitDependencyMask mask) {
+
+ static const UnitDependency inverse_table[_UNIT_DEPENDENCY_MAX] = {
+ [UNIT_REQUIRES] = UNIT_REQUIRED_BY,
+ [UNIT_REQUISITE] = UNIT_REQUISITE_OF,
+ [UNIT_WANTS] = UNIT_WANTED_BY,
+ [UNIT_BINDS_TO] = UNIT_BOUND_BY,
+ [UNIT_PART_OF] = UNIT_CONSISTS_OF,
+ [UNIT_UPHOLDS] = UNIT_UPHELD_BY,
+ [UNIT_REQUIRED_BY] = UNIT_REQUIRES,
+ [UNIT_REQUISITE_OF] = UNIT_REQUISITE,
+ [UNIT_WANTED_BY] = UNIT_WANTS,
+ [UNIT_BOUND_BY] = UNIT_BINDS_TO,
+ [UNIT_CONSISTS_OF] = UNIT_PART_OF,
+ [UNIT_UPHELD_BY] = UNIT_UPHOLDS,
+ [UNIT_CONFLICTS] = UNIT_CONFLICTED_BY,
+ [UNIT_CONFLICTED_BY] = UNIT_CONFLICTS,
+ [UNIT_BEFORE] = UNIT_AFTER,
+ [UNIT_AFTER] = UNIT_BEFORE,
+ [UNIT_ON_SUCCESS] = UNIT_ON_SUCCESS_OF,
+ [UNIT_ON_SUCCESS_OF] = UNIT_ON_SUCCESS,
+ [UNIT_ON_FAILURE] = UNIT_ON_FAILURE_OF,
+ [UNIT_ON_FAILURE_OF] = UNIT_ON_FAILURE,
+ [UNIT_TRIGGERS] = UNIT_TRIGGERED_BY,
+ [UNIT_TRIGGERED_BY] = UNIT_TRIGGERS,
+ [UNIT_PROPAGATES_RELOAD_TO] = UNIT_RELOAD_PROPAGATED_FROM,
+ [UNIT_RELOAD_PROPAGATED_FROM] = UNIT_PROPAGATES_RELOAD_TO,
+ [UNIT_PROPAGATES_STOP_TO] = UNIT_STOP_PROPAGATED_FROM,
+ [UNIT_STOP_PROPAGATED_FROM] = UNIT_PROPAGATES_STOP_TO,
+ [UNIT_JOINS_NAMESPACE_OF] = UNIT_JOINS_NAMESPACE_OF, /* symmetric! 👓 */
+ [UNIT_REFERENCES] = UNIT_REFERENCED_BY,
+ [UNIT_REFERENCED_BY] = UNIT_REFERENCES,
+ [UNIT_IN_SLICE] = UNIT_SLICE_OF,
+ [UNIT_SLICE_OF] = UNIT_IN_SLICE,
+ };
+ UnitDependencyAtom a;
+ int r;
+
+ /* Helper to know whether sending a notification is necessary or not: if the dependency is already
+ * there, no need to notify! */
+ bool notify, notify_other = false;
+
+ assert(u);
+ assert(d >= 0 && d < _UNIT_DEPENDENCY_MAX);
+ assert(other);
+
+ u = unit_follow_merge(u);
+ other = unit_follow_merge(other);
+ a = unit_dependency_to_atom(d);
+ assert(a >= 0);
+
+ /* We won't allow dependencies on ourselves. We will not consider them an error however. */
+ if (u == other) {
+ if (unit_should_warn_about_dependency(d))
+ log_unit_warning(u, "Dependency %s=%s is dropped.",
+ unit_dependency_to_string(d), u->id);
+ return 0;
+ }
+
+ if (u->manager && FLAGS_SET(u->manager->test_run_flags, MANAGER_TEST_RUN_IGNORE_DEPENDENCIES))
+ return 0;
+
+ /* Note that ordering a device unit after a unit is permitted since it allows to start its job
+ * running timeout at a specific time. */
+ if (FLAGS_SET(a, UNIT_ATOM_BEFORE) && other->type == UNIT_DEVICE) {
+ log_unit_warning(u, "Dependency Before=%s ignored (.device units cannot be delayed)", other->id);
+ return 0;
+ }
+
+ if (FLAGS_SET(a, UNIT_ATOM_ON_FAILURE) && !UNIT_VTABLE(u)->can_fail) {
+ log_unit_warning(u, "Requested dependency OnFailure=%s ignored (%s units cannot fail).", other->id, unit_type_to_string(u->type));
+ return 0;
+ }
+
+ if (FLAGS_SET(a, UNIT_ATOM_TRIGGERS) && !UNIT_VTABLE(u)->can_trigger)
+ return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL),
+ "Requested dependency Triggers=%s refused (%s units cannot trigger other units).", other->id, unit_type_to_string(u->type));
+ if (FLAGS_SET(a, UNIT_ATOM_TRIGGERED_BY) && !UNIT_VTABLE(other)->can_trigger)
+ return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL),
+ "Requested dependency TriggeredBy=%s refused (%s units cannot trigger other units).", other->id, unit_type_to_string(other->type));
+
+ if (FLAGS_SET(a, UNIT_ATOM_IN_SLICE) && other->type != UNIT_SLICE)
+ return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL),
+ "Requested dependency Slice=%s refused (%s is not a slice unit).", other->id, other->id);
+ if (FLAGS_SET(a, UNIT_ATOM_SLICE_OF) && u->type != UNIT_SLICE)
+ return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL),
+ "Requested dependency SliceOf=%s refused (%s is not a slice unit).", other->id, u->id);
+
+ if (FLAGS_SET(a, UNIT_ATOM_IN_SLICE) && !UNIT_HAS_CGROUP_CONTEXT(u))
+ return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL),
+ "Requested dependency Slice=%s refused (%s is not a cgroup unit).", other->id, u->id);
+
+ if (FLAGS_SET(a, UNIT_ATOM_SLICE_OF) && !UNIT_HAS_CGROUP_CONTEXT(other))
+ return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL),
+ "Requested dependency SliceOf=%s refused (%s is not a cgroup unit).", other->id, other->id);
+
+ r = unit_add_dependency_hashmap(&u->dependencies, d, other, mask, 0);
+ if (r < 0)
+ return r;
+ notify = r > 0;
+
+ if (inverse_table[d] != _UNIT_DEPENDENCY_INVALID && inverse_table[d] != d) {
+ r = unit_add_dependency_hashmap(&other->dependencies, inverse_table[d], u, 0, mask);
+ if (r < 0)
+ return r;
+ notify_other = r > 0;
+ }
+
+ if (add_reference) {
+ r = unit_add_dependency_hashmap(&u->dependencies, UNIT_REFERENCES, other, mask, 0);
+ if (r < 0)
+ return r;
+ notify = notify || r > 0;
+
+ r = unit_add_dependency_hashmap(&other->dependencies, UNIT_REFERENCED_BY, u, 0, mask);
+ if (r < 0)
+ return r;
+ notify_other = notify_other || r > 0;
+ }
+
+ if (notify)
+ unit_add_to_dbus_queue(u);
+ if (notify_other)
+ unit_add_to_dbus_queue(other);
+
+ return notify || notify_other;
+}
+
+int unit_add_two_dependencies(Unit *u, UnitDependency d, UnitDependency e, Unit *other, bool add_reference, UnitDependencyMask mask) {
+ int r, s;
+
+ assert(u);
+
+ r = unit_add_dependency(u, d, other, add_reference, mask);
+ if (r < 0)
+ return r;
+
+ s = unit_add_dependency(u, e, other, add_reference, mask);
+ if (s < 0)
+ return s;
+
+ return r > 0 || s > 0;
+}
+
+static int resolve_template(Unit *u, const char *name, char **buf, const char **ret) {
+ int r;
+
+ assert(u);
+ assert(name);
+ assert(buf);
+ assert(ret);
+
+ if (!unit_name_is_valid(name, UNIT_NAME_TEMPLATE)) {
+ *buf = NULL;
+ *ret = name;
+ return 0;
+ }
+
+ if (u->instance)
+ r = unit_name_replace_instance(name, u->instance, buf);
+ else {
+ _cleanup_free_ char *i = NULL;
+
+ r = unit_name_to_prefix(u->id, &i);
+ if (r < 0)
+ return r;
+
+ r = unit_name_replace_instance(name, i, buf);
+ }
+ if (r < 0)
+ return r;
+
+ *ret = *buf;
+ return 0;
+}
+
+int unit_add_dependency_by_name(Unit *u, UnitDependency d, const char *name, bool add_reference, UnitDependencyMask mask) {
+ _cleanup_free_ char *buf = NULL;
+ Unit *other;
+ int r;
+
+ assert(u);
+ assert(name);
+
+ r = resolve_template(u, name, &buf, &name);
+ if (r < 0)
+ return r;
+
+ if (u->manager && FLAGS_SET(u->manager->test_run_flags, MANAGER_TEST_RUN_IGNORE_DEPENDENCIES))
+ return 0;
+
+ r = manager_load_unit(u->manager, name, NULL, NULL, &other);
+ if (r < 0)
+ return r;
+
+ return unit_add_dependency(u, d, other, add_reference, mask);
+}
+
+int unit_add_two_dependencies_by_name(Unit *u, UnitDependency d, UnitDependency e, const char *name, bool add_reference, UnitDependencyMask mask) {
+ _cleanup_free_ char *buf = NULL;
+ Unit *other;
+ int r;
+
+ assert(u);
+ assert(name);
+
+ r = resolve_template(u, name, &buf, &name);
+ if (r < 0)
+ return r;
+
+ if (u->manager && FLAGS_SET(u->manager->test_run_flags, MANAGER_TEST_RUN_IGNORE_DEPENDENCIES))
+ return 0;
+
+ r = manager_load_unit(u->manager, name, NULL, NULL, &other);
+ if (r < 0)
+ return r;
+
+ return unit_add_two_dependencies(u, d, e, other, add_reference, mask);
+}
+
+int set_unit_path(const char *p) {
+ /* This is mostly for debug purposes */
+ return RET_NERRNO(setenv("SYSTEMD_UNIT_PATH", p, 1));
+}
+
+char *unit_dbus_path(Unit *u) {
+ assert(u);
+
+ if (!u->id)
+ return NULL;
+
+ return unit_dbus_path_from_name(u->id);
+}
+
+char *unit_dbus_path_invocation_id(Unit *u) {
+ assert(u);
+
+ if (sd_id128_is_null(u->invocation_id))
+ return NULL;
+
+ return unit_dbus_path_from_name(u->invocation_id_string);
+}
+
+int unit_set_invocation_id(Unit *u, sd_id128_t id) {
+ int r;
+
+ assert(u);
+
+ /* Set the invocation ID for this unit. If we cannot, this will not roll back, but reset the whole thing. */
+
+ if (sd_id128_equal(u->invocation_id, id))
+ return 0;
+
+ if (!sd_id128_is_null(u->invocation_id))
+ (void) hashmap_remove_value(u->manager->units_by_invocation_id, &u->invocation_id, u);
+
+ if (sd_id128_is_null(id)) {
+ r = 0;
+ goto reset;
+ }
+
+ r = hashmap_ensure_allocated(&u->manager->units_by_invocation_id, &id128_hash_ops);
+ if (r < 0)
+ goto reset;
+
+ u->invocation_id = id;
+ sd_id128_to_string(id, u->invocation_id_string);
+
+ r = hashmap_put(u->manager->units_by_invocation_id, &u->invocation_id, u);
+ if (r < 0)
+ goto reset;
+
+ return 0;
+
+reset:
+ u->invocation_id = SD_ID128_NULL;
+ u->invocation_id_string[0] = 0;
+ return r;
+}
+
+int unit_set_slice(Unit *u, Unit *slice) {
+ int r;
+
+ assert(u);
+ assert(slice);
+
+ /* Sets the unit slice if it has not been set before. Is extra careful, to only allow this for units
+ * that actually have a cgroup context. Also, we don't allow to set this for slices (since the parent
+ * slice is derived from the name). Make sure the unit we set is actually a slice. */
+
+ if (!UNIT_HAS_CGROUP_CONTEXT(u))
+ return -EOPNOTSUPP;
+
+ if (u->type == UNIT_SLICE)
+ return -EINVAL;
+
+ if (unit_active_state(u) != UNIT_INACTIVE)
+ return -EBUSY;
+
+ if (slice->type != UNIT_SLICE)
+ return -EINVAL;
+
+ if (unit_has_name(u, SPECIAL_INIT_SCOPE) &&
+ !unit_has_name(slice, SPECIAL_ROOT_SLICE))
+ return -EPERM;
+
+ if (UNIT_GET_SLICE(u) == slice)
+ return 0;
+
+ /* Disallow slice changes if @u is already bound to cgroups */
+ if (UNIT_GET_SLICE(u) && u->cgroup_realized)
+ return -EBUSY;
+
+ /* Remove any slices assigned prior; we should only have one UNIT_IN_SLICE dependency */
+ if (UNIT_GET_SLICE(u))
+ unit_remove_dependencies(u, UNIT_DEPENDENCY_SLICE_PROPERTY);
+
+ r = unit_add_dependency(u, UNIT_IN_SLICE, slice, true, UNIT_DEPENDENCY_SLICE_PROPERTY);
+ if (r < 0)
+ return r;
+
+ return 1;
+}
+
+int unit_set_default_slice(Unit *u) {
+ const char *slice_name;
+ Unit *slice;
+ int r;
+
+ assert(u);
+
+ if (u->manager && FLAGS_SET(u->manager->test_run_flags, MANAGER_TEST_RUN_IGNORE_DEPENDENCIES))
+ return 0;
+
+ if (UNIT_GET_SLICE(u))
+ return 0;
+
+ if (u->instance) {
+ _cleanup_free_ char *prefix = NULL, *escaped = NULL;
+
+ /* Implicitly place all instantiated units in their
+ * own per-template slice */
+
+ r = unit_name_to_prefix(u->id, &prefix);
+ if (r < 0)
+ return r;
+
+ /* The prefix is already escaped, but it might include
+ * "-" which has a special meaning for slice units,
+ * hence escape it here extra. */
+ escaped = unit_name_escape(prefix);
+ if (!escaped)
+ return -ENOMEM;
+
+ if (MANAGER_IS_SYSTEM(u->manager))
+ slice_name = strjoina("system-", escaped, ".slice");
+ else
+ slice_name = strjoina("app-", escaped, ".slice");
+
+ } else if (unit_is_extrinsic(u))
+ /* Keep all extrinsic units (e.g. perpetual units and swap and mount units in user mode) in
+ * the root slice. They don't really belong in one of the subslices. */
+ slice_name = SPECIAL_ROOT_SLICE;
+
+ else if (MANAGER_IS_SYSTEM(u->manager))
+ slice_name = SPECIAL_SYSTEM_SLICE;
+ else
+ slice_name = SPECIAL_APP_SLICE;
+
+ r = manager_load_unit(u->manager, slice_name, NULL, NULL, &slice);
+ if (r < 0)
+ return r;
+
+ return unit_set_slice(u, slice);
+}
+
+const char *unit_slice_name(Unit *u) {
+ Unit *slice;
+ assert(u);
+
+ slice = UNIT_GET_SLICE(u);
+ if (!slice)
+ return NULL;
+
+ return slice->id;
+}
+
+int unit_load_related_unit(Unit *u, const char *type, Unit **_found) {
+ _cleanup_free_ char *t = NULL;
+ int r;
+
+ assert(u);
+ assert(type);
+ assert(_found);
+
+ r = unit_name_change_suffix(u->id, type, &t);
+ if (r < 0)
+ return r;
+ if (unit_has_name(u, t))
+ return -EINVAL;
+
+ r = manager_load_unit(u->manager, t, NULL, NULL, _found);
+ assert(r < 0 || *_found != u);
+ return r;
+}
+
+static int signal_name_owner_changed(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ const char *new_owner;
+ Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = sd_bus_message_read(message, "sss", NULL, NULL, &new_owner);
+ if (r < 0) {
+ bus_log_parse_error(r);
+ return 0;
+ }
+
+ if (UNIT_VTABLE(u)->bus_name_owner_change)
+ UNIT_VTABLE(u)->bus_name_owner_change(u, empty_to_null(new_owner));
+
+ return 0;
+}
+
+static int get_name_owner_handler(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ const sd_bus_error *e;
+ const char *new_owner;
+ Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ u->get_name_owner_slot = sd_bus_slot_unref(u->get_name_owner_slot);
+
+ e = sd_bus_message_get_error(message);
+ if (e) {
+ if (!sd_bus_error_has_name(e, "org.freedesktop.DBus.Error.NameHasNoOwner")) {
+ r = sd_bus_error_get_errno(e);
+ log_unit_error_errno(u, r,
+ "Unexpected error response from GetNameOwner(): %s",
+ bus_error_message(e, r));
+ }
+
+ new_owner = NULL;
+ } else {
+ r = sd_bus_message_read(message, "s", &new_owner);
+ if (r < 0)
+ return bus_log_parse_error(r);
+
+ assert(!isempty(new_owner));
+ }
+
+ if (UNIT_VTABLE(u)->bus_name_owner_change)
+ UNIT_VTABLE(u)->bus_name_owner_change(u, new_owner);
+
+ return 0;
+}
+
+int unit_install_bus_match(Unit *u, sd_bus *bus, const char *name) {
+ const char *match;
+ int r;
+
+ assert(u);
+ assert(bus);
+ assert(name);
+
+ if (u->match_bus_slot || u->get_name_owner_slot)
+ return -EBUSY;
+
+ match = strjoina("type='signal',"
+ "sender='org.freedesktop.DBus',"
+ "path='/org/freedesktop/DBus',"
+ "interface='org.freedesktop.DBus',"
+ "member='NameOwnerChanged',"
+ "arg0='", name, "'");
+
+ r = sd_bus_add_match_async(bus, &u->match_bus_slot, match, signal_name_owner_changed, NULL, u);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_call_method_async(
+ bus,
+ &u->get_name_owner_slot,
+ "org.freedesktop.DBus",
+ "/org/freedesktop/DBus",
+ "org.freedesktop.DBus",
+ "GetNameOwner",
+ get_name_owner_handler,
+ u,
+ "s", name);
+ if (r < 0) {
+ u->match_bus_slot = sd_bus_slot_unref(u->match_bus_slot);
+ return r;
+ }
+
+ log_unit_debug(u, "Watching D-Bus name '%s'.", name);
+ return 0;
+}
+
+int unit_watch_bus_name(Unit *u, const char *name) {
+ int r;
+
+ assert(u);
+ assert(name);
+
+ /* Watch a specific name on the bus. We only support one unit
+ * watching each name for now. */
+
+ if (u->manager->api_bus) {
+ /* If the bus is already available, install the match directly.
+ * Otherwise, just put the name in the list. bus_setup_api() will take care later. */
+ r = unit_install_bus_match(u, u->manager->api_bus, name);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to subscribe to NameOwnerChanged signal for '%s': %m", name);
+ }
+
+ r = hashmap_put(u->manager->watch_bus, name, u);
+ if (r < 0) {
+ u->match_bus_slot = sd_bus_slot_unref(u->match_bus_slot);
+ u->get_name_owner_slot = sd_bus_slot_unref(u->get_name_owner_slot);
+ return log_warning_errno(r, "Failed to put bus name to hashmap: %m");
+ }
+
+ return 0;
+}
+
+void unit_unwatch_bus_name(Unit *u, const char *name) {
+ assert(u);
+ assert(name);
+
+ (void) hashmap_remove_value(u->manager->watch_bus, name, u);
+ u->match_bus_slot = sd_bus_slot_unref(u->match_bus_slot);
+ u->get_name_owner_slot = sd_bus_slot_unref(u->get_name_owner_slot);
+}
+
+int unit_add_node_dependency(Unit *u, const char *what, UnitDependency dep, UnitDependencyMask mask) {
+ _cleanup_free_ char *e = NULL;
+ Unit *device;
+ int r;
+
+ assert(u);
+
+ /* Adds in links to the device node that this unit is based on */
+ if (isempty(what))
+ return 0;
+
+ if (!is_device_path(what))
+ return 0;
+
+ /* When device units aren't supported (such as in a container), don't create dependencies on them. */
+ if (!unit_type_supported(UNIT_DEVICE))
+ return 0;
+
+ r = unit_name_from_path(what, ".device", &e);
+ if (r < 0)
+ return r;
+
+ r = manager_load_unit(u->manager, e, NULL, NULL, &device);
+ if (r < 0)
+ return r;
+
+ if (dep == UNIT_REQUIRES && device_shall_be_bound_by(device, u))
+ dep = UNIT_BINDS_TO;
+
+ return unit_add_two_dependencies(u, UNIT_AFTER,
+ MANAGER_IS_SYSTEM(u->manager) ? dep : UNIT_WANTS,
+ device, true, mask);
+}
+
+int unit_add_blockdev_dependency(Unit *u, const char *what, UnitDependencyMask mask) {
+ _cleanup_free_ char *escaped = NULL, *target = NULL;
+ int r;
+
+ assert(u);
+
+ if (isempty(what))
+ return 0;
+
+ if (!path_startswith(what, "/dev/"))
+ return 0;
+
+ /* If we don't support devices, then also don't bother with blockdev@.target */
+ if (!unit_type_supported(UNIT_DEVICE))
+ return 0;
+
+ r = unit_name_path_escape(what, &escaped);
+ if (r < 0)
+ return r;
+
+ r = unit_name_build("blockdev", escaped, ".target", &target);
+ if (r < 0)
+ return r;
+
+ return unit_add_dependency_by_name(u, UNIT_AFTER, target, true, mask);
+}
+
+int unit_coldplug(Unit *u) {
+ int r = 0, q;
+
+ assert(u);
+
+ /* Make sure we don't enter a loop, when coldplugging recursively. */
+ if (u->coldplugged)
+ return 0;
+
+ u->coldplugged = true;
+
+ STRV_FOREACH(i, u->deserialized_refs) {
+ q = bus_unit_track_add_name(u, *i);
+ if (q < 0 && r >= 0)
+ r = q;
+ }
+ u->deserialized_refs = strv_free(u->deserialized_refs);
+
+ if (UNIT_VTABLE(u)->coldplug) {
+ q = UNIT_VTABLE(u)->coldplug(u);
+ if (q < 0 && r >= 0)
+ r = q;
+ }
+
+ if (u->job) {
+ q = job_coldplug(u->job);
+ if (q < 0 && r >= 0)
+ r = q;
+ }
+ if (u->nop_job) {
+ q = job_coldplug(u->nop_job);
+ if (q < 0 && r >= 0)
+ r = q;
+ }
+
+ return r;
+}
+
+void unit_catchup(Unit *u) {
+ assert(u);
+
+ if (UNIT_VTABLE(u)->catchup)
+ UNIT_VTABLE(u)->catchup(u);
+
+ unit_cgroup_catchup(u);
+}
+
+static bool fragment_mtime_newer(const char *path, usec_t mtime, bool path_masked) {
+ struct stat st;
+
+ if (!path)
+ return false;
+
+ /* If the source is some virtual kernel file system, then we assume we watch it anyway, and hence pretend we
+ * are never out-of-date. */
+ if (PATH_STARTSWITH_SET(path, "/proc", "/sys"))
+ return false;
+
+ if (stat(path, &st) < 0)
+ /* What, cannot access this anymore? */
+ return true;
+
+ if (path_masked)
+ /* For masked files check if they are still so */
+ return !null_or_empty(&st);
+ else
+ /* For non-empty files check the mtime */
+ return timespec_load(&st.st_mtim) > mtime;
+
+ return false;
+}
+
+bool unit_need_daemon_reload(Unit *u) {
+ _cleanup_strv_free_ char **t = NULL;
+
+ assert(u);
+
+ /* For unit files, we allow masking… */
+ if (fragment_mtime_newer(u->fragment_path, u->fragment_mtime,
+ u->load_state == UNIT_MASKED))
+ return true;
+
+ /* Source paths should not be masked… */
+ if (fragment_mtime_newer(u->source_path, u->source_mtime, false))
+ return true;
+
+ if (u->load_state == UNIT_LOADED)
+ (void) unit_find_dropin_paths(u, &t);
+ if (!strv_equal(u->dropin_paths, t))
+ return true;
+
+ /* … any drop-ins that are masked are simply omitted from the list. */
+ STRV_FOREACH(path, u->dropin_paths)
+ if (fragment_mtime_newer(*path, u->dropin_mtime, false))
+ return true;
+
+ return false;
+}
+
+void unit_reset_failed(Unit *u) {
+ assert(u);
+
+ if (UNIT_VTABLE(u)->reset_failed)
+ UNIT_VTABLE(u)->reset_failed(u);
+
+ ratelimit_reset(&u->start_ratelimit);
+ u->start_limit_hit = false;
+}
+
+Unit *unit_following(Unit *u) {
+ assert(u);
+
+ if (UNIT_VTABLE(u)->following)
+ return UNIT_VTABLE(u)->following(u);
+
+ return NULL;
+}
+
+bool unit_stop_pending(Unit *u) {
+ assert(u);
+
+ /* This call does check the current state of the unit. It's
+ * hence useful to be called from state change calls of the
+ * unit itself, where the state isn't updated yet. This is
+ * different from unit_inactive_or_pending() which checks both
+ * the current state and for a queued job. */
+
+ return unit_has_job_type(u, JOB_STOP);
+}
+
+bool unit_inactive_or_pending(Unit *u) {
+ assert(u);
+
+ /* Returns true if the unit is inactive or going down */
+
+ if (UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(u)))
+ return true;
+
+ if (unit_stop_pending(u))
+ return true;
+
+ return false;
+}
+
+bool unit_active_or_pending(Unit *u) {
+ assert(u);
+
+ /* Returns true if the unit is active or going up */
+
+ if (UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(u)))
+ return true;
+
+ if (u->job &&
+ IN_SET(u->job->type, JOB_START, JOB_RELOAD_OR_START, JOB_RESTART))
+ return true;
+
+ return false;
+}
+
+bool unit_will_restart_default(Unit *u) {
+ assert(u);
+
+ return unit_has_job_type(u, JOB_START);
+}
+
+bool unit_will_restart(Unit *u) {
+ assert(u);
+
+ if (!UNIT_VTABLE(u)->will_restart)
+ return false;
+
+ return UNIT_VTABLE(u)->will_restart(u);
+}
+
+int unit_kill(Unit *u, KillWho w, int signo, sd_bus_error *error) {
+ assert(u);
+ assert(w >= 0 && w < _KILL_WHO_MAX);
+ assert(SIGNAL_VALID(signo));
+
+ if (!UNIT_VTABLE(u)->kill)
+ return -EOPNOTSUPP;
+
+ return UNIT_VTABLE(u)->kill(u, w, signo, error);
+}
+
+void unit_notify_cgroup_oom(Unit *u, bool managed_oom) {
+ assert(u);
+
+ if (UNIT_VTABLE(u)->notify_cgroup_oom)
+ UNIT_VTABLE(u)->notify_cgroup_oom(u, managed_oom);
+}
+
+static Set *unit_pid_set(pid_t main_pid, pid_t control_pid) {
+ _cleanup_set_free_ Set *pid_set = NULL;
+ int r;
+
+ pid_set = set_new(NULL);
+ if (!pid_set)
+ return NULL;
+
+ /* Exclude the main/control pids from being killed via the cgroup */
+ if (main_pid > 0) {
+ r = set_put(pid_set, PID_TO_PTR(main_pid));
+ if (r < 0)
+ return NULL;
+ }
+
+ if (control_pid > 0) {
+ r = set_put(pid_set, PID_TO_PTR(control_pid));
+ if (r < 0)
+ return NULL;
+ }
+
+ return TAKE_PTR(pid_set);
+}
+
+static int kill_common_log(pid_t pid, int signo, void *userdata) {
+ _cleanup_free_ char *comm = NULL;
+ Unit *u = ASSERT_PTR(userdata);
+
+ (void) get_process_comm(pid, &comm);
+ log_unit_info(u, "Sending signal SIG%s to process " PID_FMT " (%s) on client request.",
+ signal_to_string(signo), pid, strna(comm));
+
+ return 1;
+}
+
+int unit_kill_common(
+ Unit *u,
+ KillWho who,
+ int signo,
+ pid_t main_pid,
+ pid_t control_pid,
+ sd_bus_error *error) {
+
+ int r = 0;
+ bool killed = false;
+
+ /* This is the common implementation for explicit user-requested killing of unit processes, shared by
+ * various unit types. Do not confuse with unit_kill_context(), which is what we use when we want to
+ * stop a service ourselves. */
+
+ if (IN_SET(who, KILL_MAIN, KILL_MAIN_FAIL)) {
+ if (main_pid < 0)
+ return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_PROCESS, "%s units have no main processes", unit_type_to_string(u->type));
+ if (main_pid == 0)
+ return sd_bus_error_set_const(error, BUS_ERROR_NO_SUCH_PROCESS, "No main process to kill");
+ }
+
+ if (IN_SET(who, KILL_CONTROL, KILL_CONTROL_FAIL)) {
+ if (control_pid < 0)
+ return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_PROCESS, "%s units have no control processes", unit_type_to_string(u->type));
+ if (control_pid == 0)
+ return sd_bus_error_set_const(error, BUS_ERROR_NO_SUCH_PROCESS, "No control process to kill");
+ }
+
+ if (IN_SET(who, KILL_CONTROL, KILL_CONTROL_FAIL, KILL_ALL, KILL_ALL_FAIL))
+ if (control_pid > 0) {
+ _cleanup_free_ char *comm = NULL;
+ (void) get_process_comm(control_pid, &comm);
+
+ if (kill(control_pid, signo) < 0) {
+ /* Report this failure both to the logs and to the client */
+ sd_bus_error_set_errnof(
+ error, errno,
+ "Failed to send signal SIG%s to control process " PID_FMT " (%s): %m",
+ signal_to_string(signo), control_pid, strna(comm));
+ r = log_unit_warning_errno(
+ u, errno,
+ "Failed to send signal SIG%s to control process " PID_FMT " (%s) on client request: %m",
+ signal_to_string(signo), control_pid, strna(comm));
+ } else {
+ log_unit_info(u, "Sent signal SIG%s to control process " PID_FMT " (%s) on client request.",
+ signal_to_string(signo), control_pid, strna(comm));
+ killed = true;
+ }
+ }
+
+ if (IN_SET(who, KILL_MAIN, KILL_MAIN_FAIL, KILL_ALL, KILL_ALL_FAIL))
+ if (main_pid > 0) {
+ _cleanup_free_ char *comm = NULL;
+ (void) get_process_comm(main_pid, &comm);
+
+ if (kill(main_pid, signo) < 0) {
+ if (r == 0)
+ sd_bus_error_set_errnof(
+ error, errno,
+ "Failed to send signal SIG%s to main process " PID_FMT " (%s): %m",
+ signal_to_string(signo), main_pid, strna(comm));
+
+ r = log_unit_warning_errno(
+ u, errno,
+ "Failed to send signal SIG%s to main process " PID_FMT " (%s) on client request: %m",
+ signal_to_string(signo), main_pid, strna(comm));
+ } else {
+ log_unit_info(u, "Sent signal SIG%s to main process " PID_FMT " (%s) on client request.",
+ signal_to_string(signo), main_pid, strna(comm));
+ killed = true;
+ }
+ }
+
+ if (IN_SET(who, KILL_ALL, KILL_ALL_FAIL) && u->cgroup_path) {
+ _cleanup_set_free_ Set *pid_set = NULL;
+ int q;
+
+ /* Exclude the main/control pids from being killed via the cgroup */
+ pid_set = unit_pid_set(main_pid, control_pid);
+ if (!pid_set)
+ return log_oom();
+
+ q = cg_kill_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, signo, 0, pid_set, kill_common_log, u);
+ if (q < 0) {
+ if (!IN_SET(q, -ESRCH, -ENOENT)) {
+ if (r == 0)
+ sd_bus_error_set_errnof(
+ error, q,
+ "Failed to send signal SIG%s to auxiliary processes: %m",
+ signal_to_string(signo));
+
+ r = log_unit_warning_errno(
+ u, q,
+ "Failed to send signal SIG%s to auxiliary processes on client request: %m",
+ signal_to_string(signo));
+ }
+ } else
+ killed = true;
+ }
+
+ /* If the "fail" versions of the operation are requested, then complain if the set of processes we killed is empty */
+ if (r == 0 && !killed && IN_SET(who, KILL_ALL_FAIL, KILL_CONTROL_FAIL, KILL_MAIN_FAIL))
+ return sd_bus_error_set_const(error, BUS_ERROR_NO_SUCH_PROCESS, "No matching processes to kill");
+
+ return r;
+}
+
+int unit_following_set(Unit *u, Set **s) {
+ assert(u);
+ assert(s);
+
+ if (UNIT_VTABLE(u)->following_set)
+ return UNIT_VTABLE(u)->following_set(u, s);
+
+ *s = NULL;
+ return 0;
+}
+
+UnitFileState unit_get_unit_file_state(Unit *u) {
+ int r;
+
+ assert(u);
+
+ if (u->unit_file_state < 0 && u->fragment_path) {
+ r = unit_file_get_state(
+ u->manager->unit_file_scope,
+ NULL,
+ u->id,
+ &u->unit_file_state);
+ if (r < 0)
+ u->unit_file_state = UNIT_FILE_BAD;
+ }
+
+ return u->unit_file_state;
+}
+
+int unit_get_unit_file_preset(Unit *u) {
+ assert(u);
+
+ if (u->unit_file_preset < 0 && u->fragment_path)
+ u->unit_file_preset = unit_file_query_preset(
+ u->manager->unit_file_scope,
+ NULL,
+ basename(u->fragment_path),
+ NULL);
+
+ return u->unit_file_preset;
+}
+
+Unit* unit_ref_set(UnitRef *ref, Unit *source, Unit *target) {
+ assert(ref);
+ assert(source);
+ assert(target);
+
+ if (ref->target)
+ unit_ref_unset(ref);
+
+ ref->source = source;
+ ref->target = target;
+ LIST_PREPEND(refs_by_target, target->refs_by_target, ref);
+ return target;
+}
+
+void unit_ref_unset(UnitRef *ref) {
+ assert(ref);
+
+ if (!ref->target)
+ return;
+
+ /* We are about to drop a reference to the unit, make sure the garbage collection has a look at it as it might
+ * be unreferenced now. */
+ unit_add_to_gc_queue(ref->target);
+
+ LIST_REMOVE(refs_by_target, ref->target->refs_by_target, ref);
+ ref->source = ref->target = NULL;
+}
+
+static int user_from_unit_name(Unit *u, char **ret) {
+
+ static const uint8_t hash_key[] = {
+ 0x58, 0x1a, 0xaf, 0xe6, 0x28, 0x58, 0x4e, 0x96,
+ 0xb4, 0x4e, 0xf5, 0x3b, 0x8c, 0x92, 0x07, 0xec
+ };
+
+ _cleanup_free_ char *n = NULL;
+ int r;
+
+ r = unit_name_to_prefix(u->id, &n);
+ if (r < 0)
+ return r;
+
+ if (valid_user_group_name(n, 0)) {
+ *ret = TAKE_PTR(n);
+ return 0;
+ }
+
+ /* If we can't use the unit name as a user name, then let's hash it and use that */
+ if (asprintf(ret, "_du%016" PRIx64, siphash24(n, strlen(n), hash_key)) < 0)
+ return -ENOMEM;
+
+ return 0;
+}
+
+int unit_patch_contexts(Unit *u) {
+ CGroupContext *cc;
+ ExecContext *ec;
+ int r;
+
+ assert(u);
+
+ /* Patch in the manager defaults into the exec and cgroup
+ * contexts, _after_ the rest of the settings have been
+ * initialized */
+
+ ec = unit_get_exec_context(u);
+ if (ec) {
+ /* This only copies in the ones that need memory */
+ for (unsigned i = 0; i < _RLIMIT_MAX; i++)
+ if (u->manager->rlimit[i] && !ec->rlimit[i]) {
+ ec->rlimit[i] = newdup(struct rlimit, u->manager->rlimit[i], 1);
+ if (!ec->rlimit[i])
+ return -ENOMEM;
+ }
+
+ if (MANAGER_IS_USER(u->manager) &&
+ !ec->working_directory) {
+
+ r = get_home_dir(&ec->working_directory);
+ if (r < 0)
+ return r;
+
+ /* Allow user services to run, even if the
+ * home directory is missing */
+ ec->working_directory_missing_ok = true;
+ }
+
+ if (ec->private_devices)
+ ec->capability_bounding_set &= ~((UINT64_C(1) << CAP_MKNOD) | (UINT64_C(1) << CAP_SYS_RAWIO));
+
+ if (ec->protect_kernel_modules)
+ ec->capability_bounding_set &= ~(UINT64_C(1) << CAP_SYS_MODULE);
+
+ if (ec->protect_kernel_logs)
+ ec->capability_bounding_set &= ~(UINT64_C(1) << CAP_SYSLOG);
+
+ if (ec->protect_clock)
+ ec->capability_bounding_set &= ~((UINT64_C(1) << CAP_SYS_TIME) | (UINT64_C(1) << CAP_WAKE_ALARM));
+
+ if (ec->dynamic_user) {
+ if (!ec->user) {
+ r = user_from_unit_name(u, &ec->user);
+ if (r < 0)
+ return r;
+ }
+
+ if (!ec->group) {
+ ec->group = strdup(ec->user);
+ if (!ec->group)
+ return -ENOMEM;
+ }
+
+ /* If the dynamic user option is on, let's make sure that the unit can't leave its
+ * UID/GID around in the file system or on IPC objects. Hence enforce a strict
+ * sandbox. */
+
+ ec->private_tmp = true;
+ ec->remove_ipc = true;
+ ec->protect_system = PROTECT_SYSTEM_STRICT;
+ if (ec->protect_home == PROTECT_HOME_NO)
+ ec->protect_home = PROTECT_HOME_READ_ONLY;
+
+ /* Make sure this service can neither benefit from SUID/SGID binaries nor create
+ * them. */
+ ec->no_new_privileges = true;
+ ec->restrict_suid_sgid = true;
+ }
+
+ for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++)
+ exec_directory_sort(ec->directories + dt);
+ }
+
+ cc = unit_get_cgroup_context(u);
+ if (cc && ec) {
+
+ if (ec->private_devices &&
+ cc->device_policy == CGROUP_DEVICE_POLICY_AUTO)
+ cc->device_policy = CGROUP_DEVICE_POLICY_CLOSED;
+
+ /* Only add these if needed, as they imply that everything else is blocked. */
+ if (cc->device_policy != CGROUP_DEVICE_POLICY_AUTO || cc->device_allow) {
+ if (ec->root_image || ec->mount_images) {
+
+ /* When RootImage= or MountImages= is specified, the following devices are touched. */
+ FOREACH_STRING(p, "/dev/loop-control", "/dev/mapper/control") {
+ r = cgroup_add_device_allow(cc, p, "rw");
+ if (r < 0)
+ return r;
+ }
+ FOREACH_STRING(p, "block-loop", "block-blkext", "block-device-mapper") {
+ r = cgroup_add_device_allow(cc, p, "rwm");
+ if (r < 0)
+ return r;
+ }
+
+ /* Make sure "block-loop" can be resolved, i.e. make sure "loop" shows up in /proc/devices.
+ * Same for mapper and verity. */
+ FOREACH_STRING(p, "modprobe@loop.service", "modprobe@dm_mod.service", "modprobe@dm_verity.service") {
+ r = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_WANTS, p, true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ if (ec->protect_clock) {
+ r = cgroup_add_device_allow(cc, "char-rtc", "r");
+ if (r < 0)
+ return r;
+ }
+
+ /* If there are encrypted credentials we might need to access the TPM. */
+ bool allow_tpm = false;
+ ExecLoadCredential *load_cred;
+ ExecSetCredential *set_cred;
+ HASHMAP_FOREACH(load_cred, ec->load_credentials)
+ if ((allow_tpm |= load_cred->encrypted))
+ break;
+ HASHMAP_FOREACH(set_cred, ec->set_credentials)
+ if ((allow_tpm |= set_cred->encrypted))
+ break;
+
+ if (allow_tpm) {
+ r = cgroup_add_device_allow(cc, "/dev/tpmrm0", "rw");
+ if (r < 0)
+ return r;
+ }
+ }
+ }
+
+ return 0;
+}
+
+ExecContext *unit_get_exec_context(const Unit *u) {
+ size_t offset;
+ assert(u);
+
+ if (u->type < 0)
+ return NULL;
+
+ offset = UNIT_VTABLE(u)->exec_context_offset;
+ if (offset <= 0)
+ return NULL;
+
+ return (ExecContext*) ((uint8_t*) u + offset);
+}
+
+KillContext *unit_get_kill_context(Unit *u) {
+ size_t offset;
+ assert(u);
+
+ if (u->type < 0)
+ return NULL;
+
+ offset = UNIT_VTABLE(u)->kill_context_offset;
+ if (offset <= 0)
+ return NULL;
+
+ return (KillContext*) ((uint8_t*) u + offset);
+}
+
+CGroupContext *unit_get_cgroup_context(Unit *u) {
+ size_t offset;
+
+ if (u->type < 0)
+ return NULL;
+
+ offset = UNIT_VTABLE(u)->cgroup_context_offset;
+ if (offset <= 0)
+ return NULL;
+
+ return (CGroupContext*) ((uint8_t*) u + offset);
+}
+
+ExecRuntime *unit_get_exec_runtime(Unit *u) {
+ size_t offset;
+
+ if (u->type < 0)
+ return NULL;
+
+ offset = UNIT_VTABLE(u)->exec_runtime_offset;
+ if (offset <= 0)
+ return NULL;
+
+ return *(ExecRuntime**) ((uint8_t*) u + offset);
+}
+
+static const char* unit_drop_in_dir(Unit *u, UnitWriteFlags flags) {
+ assert(u);
+
+ if (UNIT_WRITE_FLAGS_NOOP(flags))
+ return NULL;
+
+ if (u->transient) /* Redirect drop-ins for transient units always into the transient directory. */
+ return u->manager->lookup_paths.transient;
+
+ if (flags & UNIT_PERSISTENT)
+ return u->manager->lookup_paths.persistent_control;
+
+ if (flags & UNIT_RUNTIME)
+ return u->manager->lookup_paths.runtime_control;
+
+ return NULL;
+}
+
+char* unit_escape_setting(const char *s, UnitWriteFlags flags, char **buf) {
+ assert(!FLAGS_SET(flags, UNIT_ESCAPE_EXEC_SYNTAX | UNIT_ESCAPE_C));
+
+ _cleanup_free_ char *t = NULL;
+
+ if (!s)
+ return NULL;
+
+ /* Escapes the input string as requested. Returns the escaped string. If 'buf' is specified then the
+ * allocated return buffer pointer is also written to *buf, except if no escaping was necessary, in
+ * which case *buf is set to NULL, and the input pointer is returned as-is. This means the return
+ * value always contains a properly escaped version, but *buf when passed only contains a pointer if
+ * an allocation was necessary. If *buf is not specified, then the return value always needs to be
+ * freed. Callers can use this to optimize memory allocations. */
+
+ if (flags & UNIT_ESCAPE_SPECIFIERS) {
+ t = specifier_escape(s);
+ if (!t)
+ return NULL;
+
+ s = t;
+ }
+
+ /* We either do c-escaping or shell-escaping, to additionally escape characters that we parse for
+ * ExecStart= and friend, i.e. '$' and ';' and quotes. */
+
+ if (flags & UNIT_ESCAPE_EXEC_SYNTAX) {
+ char *t2 = shell_escape(s, "$;'\"");
+ if (!t2)
+ return NULL;
+ free_and_replace(t, t2);
+
+ s = t;
+
+ } else if (flags & UNIT_ESCAPE_C) {
+ char *t2 = cescape(s);
+ if (!t2)
+ return NULL;
+ free_and_replace(t, t2);
+
+ s = t;
+ }
+
+ if (buf) {
+ *buf = TAKE_PTR(t);
+ return (char*) s;
+ }
+
+ return TAKE_PTR(t) ?: strdup(s);
+}
+
+char* unit_concat_strv(char **l, UnitWriteFlags flags) {
+ _cleanup_free_ char *result = NULL;
+ size_t n = 0;
+
+ /* Takes a list of strings, escapes them, and concatenates them. This may be used to format command
+ * lines in a way suitable for ExecStart= stanzas. */
+
+ STRV_FOREACH(i, l) {
+ _cleanup_free_ char *buf = NULL;
+ const char *p;
+ size_t a;
+ char *q;
+
+ p = unit_escape_setting(*i, flags, &buf);
+ if (!p)
+ return NULL;
+
+ a = (n > 0) + 1 + strlen(p) + 1; /* separating space + " + entry + " */
+ if (!GREEDY_REALLOC(result, n + a + 1))
+ return NULL;
+
+ q = result + n;
+ if (n > 0)
+ *(q++) = ' ';
+
+ *(q++) = '"';
+ q = stpcpy(q, p);
+ *(q++) = '"';
+
+ n += a;
+ }
+
+ if (!GREEDY_REALLOC(result, n + 1))
+ return NULL;
+
+ result[n] = 0;
+
+ return TAKE_PTR(result);
+}
+
+int unit_write_setting(Unit *u, UnitWriteFlags flags, const char *name, const char *data) {
+ _cleanup_free_ char *p = NULL, *q = NULL, *escaped = NULL;
+ const char *dir, *wrapped;
+ int r;
+
+ assert(u);
+ assert(name);
+ assert(data);
+
+ if (UNIT_WRITE_FLAGS_NOOP(flags))
+ return 0;
+
+ data = unit_escape_setting(data, flags, &escaped);
+ if (!data)
+ return -ENOMEM;
+
+ /* Prefix the section header. If we are writing this out as transient file, then let's suppress this if the
+ * previous section header is the same */
+
+ if (flags & UNIT_PRIVATE) {
+ if (!UNIT_VTABLE(u)->private_section)
+ return -EINVAL;
+
+ if (!u->transient_file || u->last_section_private < 0)
+ data = strjoina("[", UNIT_VTABLE(u)->private_section, "]\n", data);
+ else if (u->last_section_private == 0)
+ data = strjoina("\n[", UNIT_VTABLE(u)->private_section, "]\n", data);
+ } else {
+ if (!u->transient_file || u->last_section_private < 0)
+ data = strjoina("[Unit]\n", data);
+ else if (u->last_section_private > 0)
+ data = strjoina("\n[Unit]\n", data);
+ }
+
+ if (u->transient_file) {
+ /* When this is a transient unit file in creation, then let's not create a new drop-in but instead
+ * write to the transient unit file. */
+ fputs(data, u->transient_file);
+
+ if (!endswith(data, "\n"))
+ fputc('\n', u->transient_file);
+
+ /* Remember which section we wrote this entry to */
+ u->last_section_private = !!(flags & UNIT_PRIVATE);
+ return 0;
+ }
+
+ dir = unit_drop_in_dir(u, flags);
+ if (!dir)
+ return -EINVAL;
+
+ wrapped = strjoina("# This is a drop-in unit file extension, created via \"systemctl set-property\"\n"
+ "# or an equivalent operation. Do not edit.\n",
+ data,
+ "\n");
+
+ r = drop_in_file(dir, u->id, 50, name, &p, &q);
+ if (r < 0)
+ return r;
+
+ (void) mkdir_p_label(p, 0755);
+
+ /* Make sure the drop-in dir is registered in our path cache. This way we don't need to stupidly
+ * recreate the cache after every drop-in we write. */
+ if (u->manager->unit_path_cache) {
+ r = set_put_strdup(&u->manager->unit_path_cache, p);
+ if (r < 0)
+ return r;
+ }
+
+ r = write_string_file_atomic_label(q, wrapped);
+ if (r < 0)
+ return r;
+
+ r = strv_push(&u->dropin_paths, q);
+ if (r < 0)
+ return r;
+ q = NULL;
+
+ strv_uniq(u->dropin_paths);
+
+ u->dropin_mtime = now(CLOCK_REALTIME);
+
+ return 0;
+}
+
+int unit_write_settingf(Unit *u, UnitWriteFlags flags, const char *name, const char *format, ...) {
+ _cleanup_free_ char *p = NULL;
+ va_list ap;
+ int r;
+
+ assert(u);
+ assert(name);
+ assert(format);
+
+ if (UNIT_WRITE_FLAGS_NOOP(flags))
+ return 0;
+
+ va_start(ap, format);
+ r = vasprintf(&p, format, ap);
+ va_end(ap);
+
+ if (r < 0)
+ return -ENOMEM;
+
+ return unit_write_setting(u, flags, name, p);
+}
+
+int unit_make_transient(Unit *u) {
+ _cleanup_free_ char *path = NULL;
+ FILE *f;
+
+ assert(u);
+
+ if (!UNIT_VTABLE(u)->can_transient)
+ return -EOPNOTSUPP;
+
+ (void) mkdir_p_label(u->manager->lookup_paths.transient, 0755);
+
+ path = path_join(u->manager->lookup_paths.transient, u->id);
+ if (!path)
+ return -ENOMEM;
+
+ /* Let's open the file we'll write the transient settings into. This file is kept open as long as we are
+ * creating the transient, and is closed in unit_load(), as soon as we start loading the file. */
+
+ RUN_WITH_UMASK(0022) {
+ f = fopen(path, "we");
+ if (!f)
+ return -errno;
+ }
+
+ safe_fclose(u->transient_file);
+ u->transient_file = f;
+
+ free_and_replace(u->fragment_path, path);
+
+ u->source_path = mfree(u->source_path);
+ u->dropin_paths = strv_free(u->dropin_paths);
+ u->fragment_mtime = u->source_mtime = u->dropin_mtime = 0;
+
+ u->load_state = UNIT_STUB;
+ u->load_error = 0;
+ u->transient = true;
+
+ unit_add_to_dbus_queue(u);
+ unit_add_to_gc_queue(u);
+
+ fputs("# This is a transient unit file, created programmatically via the systemd API. Do not edit.\n",
+ u->transient_file);
+
+ return 0;
+}
+
+static int log_kill(pid_t pid, int sig, void *userdata) {
+ _cleanup_free_ char *comm = NULL;
+
+ (void) get_process_comm(pid, &comm);
+
+ /* Don't log about processes marked with brackets, under the assumption that these are temporary processes
+ only, like for example systemd's own PAM stub process. */
+ if (comm && comm[0] == '(')
+ /* Although we didn't log anything, as this callback is used in unit_kill_context we must return 1
+ * here to let the manager know that a process was killed. */
+ return 1;
+
+ log_unit_notice(userdata,
+ "Killing process " PID_FMT " (%s) with signal SIG%s.",
+ pid,
+ strna(comm),
+ signal_to_string(sig));
+
+ return 1;
+}
+
+static int operation_to_signal(const KillContext *c, KillOperation k, bool *noteworthy) {
+ assert(c);
+
+ switch (k) {
+
+ case KILL_TERMINATE:
+ case KILL_TERMINATE_AND_LOG:
+ *noteworthy = false;
+ return c->kill_signal;
+
+ case KILL_RESTART:
+ *noteworthy = false;
+ return restart_kill_signal(c);
+
+ case KILL_KILL:
+ *noteworthy = true;
+ return c->final_kill_signal;
+
+ case KILL_WATCHDOG:
+ *noteworthy = true;
+ return c->watchdog_signal;
+
+ default:
+ assert_not_reached();
+ }
+}
+
+int unit_kill_context(
+ Unit *u,
+ KillContext *c,
+ KillOperation k,
+ pid_t main_pid,
+ pid_t control_pid,
+ bool main_pid_alien) {
+
+ bool wait_for_exit = false, send_sighup;
+ cg_kill_log_func_t log_func = NULL;
+ int sig, r;
+
+ assert(u);
+ assert(c);
+
+ /* Kill the processes belonging to this unit, in preparation for shutting the unit down. Returns > 0
+ * if we killed something worth waiting for, 0 otherwise. Do not confuse with unit_kill_common()
+ * which is used for user-requested killing of unit processes. */
+
+ if (c->kill_mode == KILL_NONE)
+ return 0;
+
+ bool noteworthy;
+ sig = operation_to_signal(c, k, &noteworthy);
+ if (noteworthy)
+ log_func = log_kill;
+
+ send_sighup =
+ c->send_sighup &&
+ IN_SET(k, KILL_TERMINATE, KILL_TERMINATE_AND_LOG) &&
+ sig != SIGHUP;
+
+ if (main_pid > 0) {
+ if (log_func)
+ log_func(main_pid, sig, u);
+
+ r = kill_and_sigcont(main_pid, sig);
+ if (r < 0 && r != -ESRCH) {
+ _cleanup_free_ char *comm = NULL;
+ (void) get_process_comm(main_pid, &comm);
+
+ log_unit_warning_errno(u, r, "Failed to kill main process " PID_FMT " (%s), ignoring: %m", main_pid, strna(comm));
+ } else {
+ if (!main_pid_alien)
+ wait_for_exit = true;
+
+ if (r != -ESRCH && send_sighup)
+ (void) kill(main_pid, SIGHUP);
+ }
+ }
+
+ if (control_pid > 0) {
+ if (log_func)
+ log_func(control_pid, sig, u);
+
+ r = kill_and_sigcont(control_pid, sig);
+ if (r < 0 && r != -ESRCH) {
+ _cleanup_free_ char *comm = NULL;
+ (void) get_process_comm(control_pid, &comm);
+
+ log_unit_warning_errno(u, r, "Failed to kill control process " PID_FMT " (%s), ignoring: %m", control_pid, strna(comm));
+ } else {
+ wait_for_exit = true;
+
+ if (r != -ESRCH && send_sighup)
+ (void) kill(control_pid, SIGHUP);
+ }
+ }
+
+ if (u->cgroup_path &&
+ (c->kill_mode == KILL_CONTROL_GROUP || (c->kill_mode == KILL_MIXED && k == KILL_KILL))) {
+ _cleanup_set_free_ Set *pid_set = NULL;
+
+ /* Exclude the main/control pids from being killed via the cgroup */
+ pid_set = unit_pid_set(main_pid, control_pid);
+ if (!pid_set)
+ return -ENOMEM;
+
+ r = cg_kill_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
+ sig,
+ CGROUP_SIGCONT|CGROUP_IGNORE_SELF,
+ pid_set,
+ log_func, u);
+ if (r < 0) {
+ if (!IN_SET(r, -EAGAIN, -ESRCH, -ENOENT))
+ log_unit_warning_errno(u, r, "Failed to kill control group %s, ignoring: %m", empty_to_root(u->cgroup_path));
+
+ } else if (r > 0) {
+
+ /* FIXME: For now, on the legacy hierarchy, we will not wait for the cgroup members to die if
+ * we are running in a container or if this is a delegation unit, simply because cgroup
+ * notification is unreliable in these cases. It doesn't work at all in containers, and outside
+ * of containers it can be confused easily by left-over directories in the cgroup — which
+ * however should not exist in non-delegated units. On the unified hierarchy that's different,
+ * there we get proper events. Hence rely on them. */
+
+ if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0 ||
+ (detect_container() == 0 && !unit_cgroup_delegate(u)))
+ wait_for_exit = true;
+
+ if (send_sighup) {
+ set_free(pid_set);
+
+ pid_set = unit_pid_set(main_pid, control_pid);
+ if (!pid_set)
+ return -ENOMEM;
+
+ (void) cg_kill_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
+ SIGHUP,
+ CGROUP_IGNORE_SELF,
+ pid_set,
+ NULL, NULL);
+ }
+ }
+ }
+
+ return wait_for_exit;
+}
+
+int unit_require_mounts_for(Unit *u, const char *path, UnitDependencyMask mask) {
+ int r;
+
+ assert(u);
+ assert(path);
+
+ /* Registers a unit for requiring a certain path and all its prefixes. We keep a hashtable of these
+ * paths in the unit (from the path to the UnitDependencyInfo structure indicating how to the
+ * dependency came to be). However, we build a prefix table for all possible prefixes so that new
+ * appearing mount units can easily determine which units to make themselves a dependency of. */
+
+ if (!path_is_absolute(path))
+ return -EINVAL;
+
+ if (hashmap_contains(u->requires_mounts_for, path)) /* Exit quickly if the path is already covered. */
+ return 0;
+
+ _cleanup_free_ char *p = strdup(path);
+ if (!p)
+ return -ENOMEM;
+
+ /* Use the canonical form of the path as the stored key. We call path_is_normalized()
+ * only after simplification, since path_is_normalized() rejects paths with '.'.
+ * path_is_normalized() also verifies that the path fits in PATH_MAX. */
+ path = path_simplify(p);
+
+ if (!path_is_normalized(path))
+ return -EPERM;
+
+ UnitDependencyInfo di = {
+ .origin_mask = mask
+ };
+
+ r = hashmap_ensure_put(&u->requires_mounts_for, &path_hash_ops, p, di.data);
+ if (r < 0)
+ return r;
+ assert(r > 0);
+ TAKE_PTR(p); /* path remains a valid pointer to the string stored in the hashmap */
+
+ char prefix[strlen(path) + 1];
+ PATH_FOREACH_PREFIX_MORE(prefix, path) {
+ Set *x;
+
+ x = hashmap_get(u->manager->units_requiring_mounts_for, prefix);
+ if (!x) {
+ _cleanup_free_ char *q = NULL;
+
+ r = hashmap_ensure_allocated(&u->manager->units_requiring_mounts_for, &path_hash_ops);
+ if (r < 0)
+ return r;
+
+ q = strdup(prefix);
+ if (!q)
+ return -ENOMEM;
+
+ x = set_new(NULL);
+ if (!x)
+ return -ENOMEM;
+
+ r = hashmap_put(u->manager->units_requiring_mounts_for, q, x);
+ if (r < 0) {
+ set_free(x);
+ return r;
+ }
+ q = NULL;
+ }
+
+ r = set_put(x, u);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+int unit_setup_exec_runtime(Unit *u) {
+ ExecRuntime **rt;
+ size_t offset;
+ Unit *other;
+ int r;
+
+ offset = UNIT_VTABLE(u)->exec_runtime_offset;
+ assert(offset > 0);
+
+ /* Check if there already is an ExecRuntime for this unit? */
+ rt = (ExecRuntime**) ((uint8_t*) u + offset);
+ if (*rt)
+ return 0;
+
+ /* Try to get it from somebody else */
+ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_JOINS_NAMESPACE_OF) {
+ r = exec_runtime_acquire(u->manager, NULL, other->id, false, rt);
+ if (r == 1)
+ return 1;
+ }
+
+ return exec_runtime_acquire(u->manager, unit_get_exec_context(u), u->id, true, rt);
+}
+
+int unit_setup_dynamic_creds(Unit *u) {
+ ExecContext *ec;
+ DynamicCreds *dcreds;
+ size_t offset;
+
+ assert(u);
+
+ offset = UNIT_VTABLE(u)->dynamic_creds_offset;
+ assert(offset > 0);
+ dcreds = (DynamicCreds*) ((uint8_t*) u + offset);
+
+ ec = unit_get_exec_context(u);
+ assert(ec);
+
+ if (!ec->dynamic_user)
+ return 0;
+
+ return dynamic_creds_acquire(dcreds, u->manager, ec->user, ec->group);
+}
+
+bool unit_type_supported(UnitType t) {
+ if (_unlikely_(t < 0))
+ return false;
+ if (_unlikely_(t >= _UNIT_TYPE_MAX))
+ return false;
+
+ if (!unit_vtable[t]->supported)
+ return true;
+
+ return unit_vtable[t]->supported();
+}
+
+void unit_warn_if_dir_nonempty(Unit *u, const char* where) {
+ int r;
+
+ assert(u);
+ assert(where);
+
+ if (!unit_log_level_test(u, LOG_NOTICE))
+ return;
+
+ r = dir_is_empty(where, /* ignore_hidden_or_backup= */ false);
+ if (r > 0 || r == -ENOTDIR)
+ return;
+ if (r < 0) {
+ log_unit_warning_errno(u, r, "Failed to check directory %s: %m", where);
+ return;
+ }
+
+ log_unit_struct(u, LOG_NOTICE,
+ "MESSAGE_ID=" SD_MESSAGE_OVERMOUNTING_STR,
+ LOG_UNIT_INVOCATION_ID(u),
+ LOG_UNIT_MESSAGE(u, "Directory %s to mount over is not empty, mounting anyway.", where),
+ "WHERE=%s", where);
+}
+
+int unit_fail_if_noncanonical(Unit *u, const char* where) {
+ _cleanup_free_ char *canonical_where = NULL;
+ int r;
+
+ assert(u);
+ assert(where);
+
+ r = chase_symlinks(where, NULL, CHASE_NONEXISTENT, &canonical_where, NULL);
+ if (r < 0) {
+ log_unit_debug_errno(u, r, "Failed to check %s for symlinks, ignoring: %m", where);
+ return 0;
+ }
+
+ /* We will happily ignore a trailing slash (or any redundant slashes) */
+ if (path_equal(where, canonical_where))
+ return 0;
+
+ /* No need to mention "." or "..", they would already have been rejected by unit_name_from_path() */
+ log_unit_struct(u, LOG_ERR,
+ "MESSAGE_ID=" SD_MESSAGE_OVERMOUNTING_STR,
+ LOG_UNIT_INVOCATION_ID(u),
+ LOG_UNIT_MESSAGE(u, "Mount path %s is not canonical (contains a symlink).", where),
+ "WHERE=%s", where);
+
+ return -ELOOP;
+}
+
+bool unit_is_pristine(Unit *u) {
+ assert(u);
+
+ /* Check if the unit already exists or is already around, in a number of different ways. Note that to
+ * cater for unit types such as slice, we are generally fine with units that are marked UNIT_LOADED
+ * even though nothing was actually loaded, as those unit types don't require a file on disk.
+ *
+ * Note that we don't check for drop-ins here, because we allow drop-ins for transient units
+ * identically to non-transient units, both unit-specific and hierarchical. E.g. for a-b-c.service:
+ * service.d/….conf, a-.service.d/….conf, a-b-.service.d/….conf, a-b-c.service.d/….conf.
+ */
+
+ return IN_SET(u->load_state, UNIT_NOT_FOUND, UNIT_LOADED) &&
+ !u->fragment_path &&
+ !u->source_path &&
+ !u->job &&
+ !u->merged_into;
+}
+
+pid_t unit_control_pid(Unit *u) {
+ assert(u);
+
+ if (UNIT_VTABLE(u)->control_pid)
+ return UNIT_VTABLE(u)->control_pid(u);
+
+ return 0;
+}
+
+pid_t unit_main_pid(Unit *u) {
+ assert(u);
+
+ if (UNIT_VTABLE(u)->main_pid)
+ return UNIT_VTABLE(u)->main_pid(u);
+
+ return 0;
+}
+
+static void unit_unref_uid_internal(
+ Unit *u,
+ uid_t *ref_uid,
+ bool destroy_now,
+ void (*_manager_unref_uid)(Manager *m, uid_t uid, bool destroy_now)) {
+
+ assert(u);
+ assert(ref_uid);
+ assert(_manager_unref_uid);
+
+ /* Generic implementation of both unit_unref_uid() and unit_unref_gid(), under the assumption that uid_t and
+ * gid_t are actually the same time, with the same validity rules.
+ *
+ * Drops a reference to UID/GID from a unit. */
+
+ assert_cc(sizeof(uid_t) == sizeof(gid_t));
+ assert_cc(UID_INVALID == (uid_t) GID_INVALID);
+
+ if (!uid_is_valid(*ref_uid))
+ return;
+
+ _manager_unref_uid(u->manager, *ref_uid, destroy_now);
+ *ref_uid = UID_INVALID;
+}
+
+static void unit_unref_uid(Unit *u, bool destroy_now) {
+ unit_unref_uid_internal(u, &u->ref_uid, destroy_now, manager_unref_uid);
+}
+
+static void unit_unref_gid(Unit *u, bool destroy_now) {
+ unit_unref_uid_internal(u, (uid_t*) &u->ref_gid, destroy_now, manager_unref_gid);
+}
+
+void unit_unref_uid_gid(Unit *u, bool destroy_now) {
+ assert(u);
+
+ unit_unref_uid(u, destroy_now);
+ unit_unref_gid(u, destroy_now);
+}
+
+static int unit_ref_uid_internal(
+ Unit *u,
+ uid_t *ref_uid,
+ uid_t uid,
+ bool clean_ipc,
+ int (*_manager_ref_uid)(Manager *m, uid_t uid, bool clean_ipc)) {
+
+ int r;
+
+ assert(u);
+ assert(ref_uid);
+ assert(uid_is_valid(uid));
+ assert(_manager_ref_uid);
+
+ /* Generic implementation of both unit_ref_uid() and unit_ref_guid(), under the assumption that uid_t and gid_t
+ * are actually the same type, and have the same validity rules.
+ *
+ * Adds a reference on a specific UID/GID to this unit. Each unit referencing the same UID/GID maintains a
+ * reference so that we can destroy the UID/GID's IPC resources as soon as this is requested and the counter
+ * drops to zero. */
+
+ assert_cc(sizeof(uid_t) == sizeof(gid_t));
+ assert_cc(UID_INVALID == (uid_t) GID_INVALID);
+
+ if (*ref_uid == uid)
+ return 0;
+
+ if (uid_is_valid(*ref_uid)) /* Already set? */
+ return -EBUSY;
+
+ r = _manager_ref_uid(u->manager, uid, clean_ipc);
+ if (r < 0)
+ return r;
+
+ *ref_uid = uid;
+ return 1;
+}
+
+static int unit_ref_uid(Unit *u, uid_t uid, bool clean_ipc) {
+ return unit_ref_uid_internal(u, &u->ref_uid, uid, clean_ipc, manager_ref_uid);
+}
+
+static int unit_ref_gid(Unit *u, gid_t gid, bool clean_ipc) {
+ return unit_ref_uid_internal(u, (uid_t*) &u->ref_gid, (uid_t) gid, clean_ipc, manager_ref_gid);
+}
+
+static int unit_ref_uid_gid_internal(Unit *u, uid_t uid, gid_t gid, bool clean_ipc) {
+ int r = 0, q = 0;
+
+ assert(u);
+
+ /* Reference both a UID and a GID in one go. Either references both, or neither. */
+
+ if (uid_is_valid(uid)) {
+ r = unit_ref_uid(u, uid, clean_ipc);
+ if (r < 0)
+ return r;
+ }
+
+ if (gid_is_valid(gid)) {
+ q = unit_ref_gid(u, gid, clean_ipc);
+ if (q < 0) {
+ if (r > 0)
+ unit_unref_uid(u, false);
+
+ return q;
+ }
+ }
+
+ return r > 0 || q > 0;
+}
+
+int unit_ref_uid_gid(Unit *u, uid_t uid, gid_t gid) {
+ ExecContext *c;
+ int r;
+
+ assert(u);
+
+ c = unit_get_exec_context(u);
+
+ r = unit_ref_uid_gid_internal(u, uid, gid, c ? c->remove_ipc : false);
+ if (r < 0)
+ return log_unit_warning_errno(u, r, "Couldn't add UID/GID reference to unit, proceeding without: %m");
+
+ return r;
+}
+
+void unit_notify_user_lookup(Unit *u, uid_t uid, gid_t gid) {
+ int r;
+
+ assert(u);
+
+ /* This is invoked whenever one of the forked off processes let's us know the UID/GID its user name/group names
+ * resolved to. We keep track of which UID/GID is currently assigned in order to be able to destroy its IPC
+ * objects when no service references the UID/GID anymore. */
+
+ r = unit_ref_uid_gid(u, uid, gid);
+ if (r > 0)
+ unit_add_to_dbus_queue(u);
+}
+
+int unit_acquire_invocation_id(Unit *u) {
+ sd_id128_t id;
+ int r;
+
+ assert(u);
+
+ r = sd_id128_randomize(&id);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to generate invocation ID for unit: %m");
+
+ r = unit_set_invocation_id(u, id);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to set invocation ID for unit: %m");
+
+ unit_add_to_dbus_queue(u);
+ return 0;
+}
+
+int unit_set_exec_params(Unit *u, ExecParameters *p) {
+ int r;
+
+ assert(u);
+ assert(p);
+
+ /* Copy parameters from manager */
+ r = manager_get_effective_environment(u->manager, &p->environment);
+ if (r < 0)
+ return r;
+
+ p->confirm_spawn = manager_get_confirm_spawn(u->manager);
+ p->cgroup_supported = u->manager->cgroup_supported;
+ p->prefix = u->manager->prefix;
+ SET_FLAG(p->flags, EXEC_PASS_LOG_UNIT|EXEC_CHOWN_DIRECTORIES, MANAGER_IS_SYSTEM(u->manager));
+
+ /* Copy parameters from unit */
+ p->cgroup_path = u->cgroup_path;
+ SET_FLAG(p->flags, EXEC_CGROUP_DELEGATE, unit_cgroup_delegate(u));
+
+ p->received_credentials_directory = u->manager->received_credentials_directory;
+ p->received_encrypted_credentials_directory = u->manager->received_encrypted_credentials_directory;
+
+ return 0;
+}
+
+int unit_fork_helper_process(Unit *u, const char *name, pid_t *ret) {
+ int r;
+
+ assert(u);
+ assert(ret);
+
+ /* Forks off a helper process and makes sure it is a member of the unit's cgroup. Returns == 0 in the child,
+ * and > 0 in the parent. The pid parameter is always filled in with the child's PID. */
+
+ (void) unit_realize_cgroup(u);
+
+ r = safe_fork(name, FORK_REOPEN_LOG, ret);
+ if (r != 0)
+ return r;
+
+ (void) default_signals(SIGNALS_CRASH_HANDLER, SIGNALS_IGNORE);
+ (void) ignore_signals(SIGPIPE);
+
+ (void) prctl(PR_SET_PDEATHSIG, SIGTERM);
+
+ if (u->cgroup_path) {
+ r = cg_attach_everywhere(u->manager->cgroup_supported, u->cgroup_path, 0, NULL, NULL);
+ if (r < 0) {
+ log_unit_error_errno(u, r, "Failed to join unit cgroup %s: %m", empty_to_root(u->cgroup_path));
+ _exit(EXIT_CGROUP);
+ }
+ }
+
+ return 0;
+}
+
+int unit_fork_and_watch_rm_rf(Unit *u, char **paths, pid_t *ret_pid) {
+ pid_t pid;
+ int r;
+
+ assert(u);
+ assert(ret_pid);
+
+ r = unit_fork_helper_process(u, "(sd-rmrf)", &pid);
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ int ret = EXIT_SUCCESS;
+
+ STRV_FOREACH(i, paths) {
+ r = rm_rf(*i, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_MISSING_OK);
+ if (r < 0) {
+ log_error_errno(r, "Failed to remove '%s': %m", *i);
+ ret = EXIT_FAILURE;
+ }
+ }
+
+ _exit(ret);
+ }
+
+ r = unit_watch_pid(u, pid, true);
+ if (r < 0)
+ return r;
+
+ *ret_pid = pid;
+ return 0;
+}
+
+static void unit_update_dependency_mask(Hashmap *deps, Unit *other, UnitDependencyInfo di) {
+ assert(deps);
+ assert(other);
+
+ if (di.origin_mask == 0 && di.destination_mask == 0)
+ /* No bit set anymore, let's drop the whole entry */
+ assert_se(hashmap_remove(deps, other));
+ else
+ /* Mask was reduced, let's update the entry */
+ assert_se(hashmap_update(deps, other, di.data) == 0);
+}
+
+void unit_remove_dependencies(Unit *u, UnitDependencyMask mask) {
+ Hashmap *deps;
+ assert(u);
+
+ /* Removes all dependencies u has on other units marked for ownership by 'mask'. */
+
+ if (mask == 0)
+ return;
+
+ HASHMAP_FOREACH(deps, u->dependencies) {
+ bool done;
+
+ do {
+ UnitDependencyInfo di;
+ Unit *other;
+
+ done = true;
+
+ HASHMAP_FOREACH_KEY(di.data, other, deps) {
+ Hashmap *other_deps;
+
+ if (FLAGS_SET(~mask, di.origin_mask))
+ continue;
+
+ di.origin_mask &= ~mask;
+ unit_update_dependency_mask(deps, other, di);
+
+ /* We updated the dependency from our unit to the other unit now. But most
+ * dependencies imply a reverse dependency. Hence, let's delete that one
+ * too. For that we go through all dependency types on the other unit and
+ * delete all those which point to us and have the right mask set. */
+
+ HASHMAP_FOREACH(other_deps, other->dependencies) {
+ UnitDependencyInfo dj;
+
+ dj.data = hashmap_get(other_deps, u);
+ if (FLAGS_SET(~mask, dj.destination_mask))
+ continue;
+
+ dj.destination_mask &= ~mask;
+ unit_update_dependency_mask(other_deps, u, dj);
+ }
+
+ unit_add_to_gc_queue(other);
+
+ /* The unit 'other' may not be wanted by the unit 'u'. */
+ unit_submit_to_stop_when_unneeded_queue(other);
+
+ done = false;
+ break;
+ }
+
+ } while (!done);
+ }
+}
+
+static int unit_get_invocation_path(Unit *u, char **ret) {
+ char *p;
+ int r;
+
+ assert(u);
+ assert(ret);
+
+ if (MANAGER_IS_SYSTEM(u->manager))
+ p = strjoin("/run/systemd/units/invocation:", u->id);
+ else {
+ _cleanup_free_ char *user_path = NULL;
+ r = xdg_user_runtime_dir(&user_path, "/systemd/units/invocation:");
+ if (r < 0)
+ return r;
+ p = strjoin(user_path, u->id);
+ }
+
+ if (!p)
+ return -ENOMEM;
+
+ *ret = p;
+ return 0;
+}
+
+static int unit_export_invocation_id(Unit *u) {
+ _cleanup_free_ char *p = NULL;
+ int r;
+
+ assert(u);
+
+ if (u->exported_invocation_id)
+ return 0;
+
+ if (sd_id128_is_null(u->invocation_id))
+ return 0;
+
+ r = unit_get_invocation_path(u, &p);
+ if (r < 0)
+ return log_unit_debug_errno(u, r, "Failed to get invocation path: %m");
+
+ r = symlink_atomic_label(u->invocation_id_string, p);
+ if (r < 0)
+ return log_unit_debug_errno(u, r, "Failed to create invocation ID symlink %s: %m", p);
+
+ u->exported_invocation_id = true;
+ return 0;
+}
+
+static int unit_export_log_level_max(Unit *u, const ExecContext *c) {
+ const char *p;
+ char buf[2];
+ int r;
+
+ assert(u);
+ assert(c);
+
+ if (u->exported_log_level_max)
+ return 0;
+
+ if (c->log_level_max < 0)
+ return 0;
+
+ assert(c->log_level_max <= 7);
+
+ buf[0] = '0' + c->log_level_max;
+ buf[1] = 0;
+
+ p = strjoina("/run/systemd/units/log-level-max:", u->id);
+ r = symlink_atomic(buf, p);
+ if (r < 0)
+ return log_unit_debug_errno(u, r, "Failed to create maximum log level symlink %s: %m", p);
+
+ u->exported_log_level_max = true;
+ return 0;
+}
+
+static int unit_export_log_extra_fields(Unit *u, const ExecContext *c) {
+ _cleanup_close_ int fd = -1;
+ struct iovec *iovec;
+ const char *p;
+ char *pattern;
+ le64_t *sizes;
+ ssize_t n;
+ int r;
+
+ if (u->exported_log_extra_fields)
+ return 0;
+
+ if (c->n_log_extra_fields <= 0)
+ return 0;
+
+ sizes = newa(le64_t, c->n_log_extra_fields);
+ iovec = newa(struct iovec, c->n_log_extra_fields * 2);
+
+ for (size_t i = 0; i < c->n_log_extra_fields; i++) {
+ sizes[i] = htole64(c->log_extra_fields[i].iov_len);
+
+ iovec[i*2] = IOVEC_MAKE(sizes + i, sizeof(le64_t));
+ iovec[i*2+1] = c->log_extra_fields[i];
+ }
+
+ p = strjoina("/run/systemd/units/log-extra-fields:", u->id);
+ pattern = strjoina(p, ".XXXXXX");
+
+ fd = mkostemp_safe(pattern);
+ if (fd < 0)
+ return log_unit_debug_errno(u, fd, "Failed to create extra fields file %s: %m", p);
+
+ n = writev(fd, iovec, c->n_log_extra_fields*2);
+ if (n < 0) {
+ r = log_unit_debug_errno(u, errno, "Failed to write extra fields: %m");
+ goto fail;
+ }
+
+ (void) fchmod(fd, 0644);
+
+ if (rename(pattern, p) < 0) {
+ r = log_unit_debug_errno(u, errno, "Failed to rename extra fields file: %m");
+ goto fail;
+ }
+
+ u->exported_log_extra_fields = true;
+ return 0;
+
+fail:
+ (void) unlink(pattern);
+ return r;
+}
+
+static int unit_export_log_ratelimit_interval(Unit *u, const ExecContext *c) {
+ _cleanup_free_ char *buf = NULL;
+ const char *p;
+ int r;
+
+ assert(u);
+ assert(c);
+
+ if (u->exported_log_ratelimit_interval)
+ return 0;
+
+ if (c->log_ratelimit_interval_usec == 0)
+ return 0;
+
+ p = strjoina("/run/systemd/units/log-rate-limit-interval:", u->id);
+
+ if (asprintf(&buf, "%" PRIu64, c->log_ratelimit_interval_usec) < 0)
+ return log_oom();
+
+ r = symlink_atomic(buf, p);
+ if (r < 0)
+ return log_unit_debug_errno(u, r, "Failed to create log rate limit interval symlink %s: %m", p);
+
+ u->exported_log_ratelimit_interval = true;
+ return 0;
+}
+
+static int unit_export_log_ratelimit_burst(Unit *u, const ExecContext *c) {
+ _cleanup_free_ char *buf = NULL;
+ const char *p;
+ int r;
+
+ assert(u);
+ assert(c);
+
+ if (u->exported_log_ratelimit_burst)
+ return 0;
+
+ if (c->log_ratelimit_burst == 0)
+ return 0;
+
+ p = strjoina("/run/systemd/units/log-rate-limit-burst:", u->id);
+
+ if (asprintf(&buf, "%u", c->log_ratelimit_burst) < 0)
+ return log_oom();
+
+ r = symlink_atomic(buf, p);
+ if (r < 0)
+ return log_unit_debug_errno(u, r, "Failed to create log rate limit burst symlink %s: %m", p);
+
+ u->exported_log_ratelimit_burst = true;
+ return 0;
+}
+
+void unit_export_state_files(Unit *u) {
+ const ExecContext *c;
+
+ assert(u);
+
+ if (!u->id)
+ return;
+
+ if (MANAGER_IS_TEST_RUN(u->manager))
+ return;
+
+ /* Exports a couple of unit properties to /run/systemd/units/, so that journald can quickly query this data
+ * from there. Ideally, journald would use IPC to query this, like everybody else, but that's hard, as long as
+ * the IPC system itself and PID 1 also log to the journal.
+ *
+ * Note that these files really shouldn't be considered API for anyone else, as use a runtime file system as
+ * IPC replacement is not compatible with today's world of file system namespaces. However, this doesn't really
+ * apply to communication between the journal and systemd, as we assume that these two daemons live in the same
+ * namespace at least.
+ *
+ * Note that some of the "files" exported here are actually symlinks and not regular files. Symlinks work
+ * better for storing small bits of data, in particular as we can write them with two system calls, and read
+ * them with one. */
+
+ (void) unit_export_invocation_id(u);
+
+ if (!MANAGER_IS_SYSTEM(u->manager))
+ return;
+
+ c = unit_get_exec_context(u);
+ if (c) {
+ (void) unit_export_log_level_max(u, c);
+ (void) unit_export_log_extra_fields(u, c);
+ (void) unit_export_log_ratelimit_interval(u, c);
+ (void) unit_export_log_ratelimit_burst(u, c);
+ }
+}
+
+void unit_unlink_state_files(Unit *u) {
+ const char *p;
+
+ assert(u);
+
+ if (!u->id)
+ return;
+
+ /* Undoes the effect of unit_export_state() */
+
+ if (u->exported_invocation_id) {
+ _cleanup_free_ char *invocation_path = NULL;
+ int r = unit_get_invocation_path(u, &invocation_path);
+ if (r >= 0) {
+ (void) unlink(invocation_path);
+ u->exported_invocation_id = false;
+ }
+ }
+
+ if (!MANAGER_IS_SYSTEM(u->manager))
+ return;
+
+ if (u->exported_log_level_max) {
+ p = strjoina("/run/systemd/units/log-level-max:", u->id);
+ (void) unlink(p);
+
+ u->exported_log_level_max = false;
+ }
+
+ if (u->exported_log_extra_fields) {
+ p = strjoina("/run/systemd/units/extra-fields:", u->id);
+ (void) unlink(p);
+
+ u->exported_log_extra_fields = false;
+ }
+
+ if (u->exported_log_ratelimit_interval) {
+ p = strjoina("/run/systemd/units/log-rate-limit-interval:", u->id);
+ (void) unlink(p);
+
+ u->exported_log_ratelimit_interval = false;
+ }
+
+ if (u->exported_log_ratelimit_burst) {
+ p = strjoina("/run/systemd/units/log-rate-limit-burst:", u->id);
+ (void) unlink(p);
+
+ u->exported_log_ratelimit_burst = false;
+ }
+}
+
+int unit_prepare_exec(Unit *u) {
+ int r;
+
+ assert(u);
+
+ /* Load any custom firewall BPF programs here once to test if they are existing and actually loadable.
+ * Fail here early since later errors in the call chain unit_realize_cgroup to cgroup_context_apply are ignored. */
+ r = bpf_firewall_load_custom(u);
+ if (r < 0)
+ return r;
+
+ /* Prepares everything so that we can fork of a process for this unit */
+
+ (void) unit_realize_cgroup(u);
+
+ if (u->reset_accounting) {
+ (void) unit_reset_accounting(u);
+ u->reset_accounting = false;
+ }
+
+ unit_export_state_files(u);
+
+ r = unit_setup_exec_runtime(u);
+ if (r < 0)
+ return r;
+
+ r = unit_setup_dynamic_creds(u);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+static bool ignore_leftover_process(const char *comm) {
+ return comm && comm[0] == '('; /* Most likely our own helper process (PAM?), ignore */
+}
+
+int unit_log_leftover_process_start(pid_t pid, int sig, void *userdata) {
+ _cleanup_free_ char *comm = NULL;
+
+ (void) get_process_comm(pid, &comm);
+
+ if (ignore_leftover_process(comm))
+ return 0;
+
+ /* During start we print a warning */
+
+ log_unit_warning(userdata,
+ "Found left-over process " PID_FMT " (%s) in control group while starting unit. Ignoring.\n"
+ "This usually indicates unclean termination of a previous run, or service implementation deficiencies.",
+ pid, strna(comm));
+
+ return 1;
+}
+
+int unit_log_leftover_process_stop(pid_t pid, int sig, void *userdata) {
+ _cleanup_free_ char *comm = NULL;
+
+ (void) get_process_comm(pid, &comm);
+
+ if (ignore_leftover_process(comm))
+ return 0;
+
+ /* During stop we only print an informational message */
+
+ log_unit_info(userdata,
+ "Unit process " PID_FMT " (%s) remains running after unit stopped.",
+ pid, strna(comm));
+
+ return 1;
+}
+
+int unit_warn_leftover_processes(Unit *u, cg_kill_log_func_t log_func) {
+ assert(u);
+
+ (void) unit_pick_cgroup_path(u);
+
+ if (!u->cgroup_path)
+ return 0;
+
+ return cg_kill_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, 0, 0, NULL, log_func, u);
+}
+
+bool unit_needs_console(Unit *u) {
+ ExecContext *ec;
+ UnitActiveState state;
+
+ assert(u);
+
+ state = unit_active_state(u);
+
+ if (UNIT_IS_INACTIVE_OR_FAILED(state))
+ return false;
+
+ if (UNIT_VTABLE(u)->needs_console)
+ return UNIT_VTABLE(u)->needs_console(u);
+
+ /* If this unit type doesn't implement this call, let's use a generic fallback implementation: */
+ ec = unit_get_exec_context(u);
+ if (!ec)
+ return false;
+
+ return exec_context_may_touch_console(ec);
+}
+
+int unit_pid_attachable(Unit *u, pid_t pid, sd_bus_error *error) {
+ int r;
+
+ assert(u);
+
+ /* Checks whether the specified PID is generally good for attaching, i.e. a valid PID, not our manager itself,
+ * and not a kernel thread either */
+
+ /* First, a simple range check */
+ if (!pid_is_valid(pid))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Process identifier " PID_FMT " is not valid.", pid);
+
+ /* Some extra safety check */
+ if (pid == 1 || pid == getpid_cached())
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Process " PID_FMT " is a manager process, refusing.", pid);
+
+ /* Don't even begin to bother with kernel threads */
+ r = is_kernel_thread(pid);
+ if (r == -ESRCH)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_UNIX_PROCESS_ID_UNKNOWN, "Process with ID " PID_FMT " does not exist.", pid);
+ if (r < 0)
+ return sd_bus_error_set_errnof(error, r, "Failed to determine whether process " PID_FMT " is a kernel thread: %m", pid);
+ if (r > 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Process " PID_FMT " is a kernel thread, refusing.", pid);
+
+ return 0;
+}
+
+void unit_log_success(Unit *u) {
+ assert(u);
+
+ /* Let's show message "Deactivated successfully" in debug mode (when manager is user) rather than in info mode.
+ * This message has low information value for regular users and it might be a bit overwhelming on a system with
+ * a lot of devices. */
+ log_unit_struct(u,
+ MANAGER_IS_USER(u->manager) ? LOG_DEBUG : LOG_INFO,
+ "MESSAGE_ID=" SD_MESSAGE_UNIT_SUCCESS_STR,
+ LOG_UNIT_INVOCATION_ID(u),
+ LOG_UNIT_MESSAGE(u, "Deactivated successfully."));
+}
+
+void unit_log_failure(Unit *u, const char *result) {
+ assert(u);
+ assert(result);
+
+ log_unit_struct(u, LOG_WARNING,
+ "MESSAGE_ID=" SD_MESSAGE_UNIT_FAILURE_RESULT_STR,
+ LOG_UNIT_INVOCATION_ID(u),
+ LOG_UNIT_MESSAGE(u, "Failed with result '%s'.", result),
+ "UNIT_RESULT=%s", result);
+}
+
+void unit_log_skip(Unit *u, const char *result) {
+ assert(u);
+ assert(result);
+
+ log_unit_struct(u, LOG_INFO,
+ "MESSAGE_ID=" SD_MESSAGE_UNIT_SKIPPED_STR,
+ LOG_UNIT_INVOCATION_ID(u),
+ LOG_UNIT_MESSAGE(u, "Skipped due to '%s'.", result),
+ "UNIT_RESULT=%s", result);
+}
+
+void unit_log_process_exit(
+ Unit *u,
+ const char *kind,
+ const char *command,
+ bool success,
+ int code,
+ int status) {
+
+ int level;
+
+ assert(u);
+ assert(kind);
+
+ /* If this is a successful exit, let's log about the exit code on DEBUG level. If this is a failure
+ * and the process exited on its own via exit(), then let's make this a NOTICE, under the assumption
+ * that the service already logged the reason at a higher log level on its own. Otherwise, make it a
+ * WARNING. */
+ if (success)
+ level = LOG_DEBUG;
+ else if (code == CLD_EXITED)
+ level = LOG_NOTICE;
+ else
+ level = LOG_WARNING;
+
+ log_unit_struct(u, level,
+ "MESSAGE_ID=" SD_MESSAGE_UNIT_PROCESS_EXIT_STR,
+ LOG_UNIT_MESSAGE(u, "%s exited, code=%s, status=%i/%s%s",
+ kind,
+ sigchld_code_to_string(code), status,
+ strna(code == CLD_EXITED
+ ? exit_status_to_string(status, EXIT_STATUS_FULL)
+ : signal_to_string(status)),
+ success ? " (success)" : ""),
+ "EXIT_CODE=%s", sigchld_code_to_string(code),
+ "EXIT_STATUS=%i", status,
+ "COMMAND=%s", strna(command),
+ LOG_UNIT_INVOCATION_ID(u));
+}
+
+int unit_exit_status(Unit *u) {
+ assert(u);
+
+ /* Returns the exit status to propagate for the most recent cycle of this unit. Returns a value in the range
+ * 0…255 if there's something to propagate. EOPNOTSUPP if the concept does not apply to this unit type, ENODATA
+ * if no data is currently known (for example because the unit hasn't deactivated yet) and EBADE if the main
+ * service process has exited abnormally (signal/coredump). */
+
+ if (!UNIT_VTABLE(u)->exit_status)
+ return -EOPNOTSUPP;
+
+ return UNIT_VTABLE(u)->exit_status(u);
+}
+
+int unit_failure_action_exit_status(Unit *u) {
+ int r;
+
+ assert(u);
+
+ /* Returns the exit status to propagate on failure, or an error if there's nothing to propagate */
+
+ if (u->failure_action_exit_status >= 0)
+ return u->failure_action_exit_status;
+
+ r = unit_exit_status(u);
+ if (r == -EBADE) /* Exited, but not cleanly (i.e. by signal or such) */
+ return 255;
+
+ return r;
+}
+
+int unit_success_action_exit_status(Unit *u) {
+ int r;
+
+ assert(u);
+
+ /* Returns the exit status to propagate on success, or an error if there's nothing to propagate */
+
+ if (u->success_action_exit_status >= 0)
+ return u->success_action_exit_status;
+
+ r = unit_exit_status(u);
+ if (r == -EBADE) /* Exited, but not cleanly (i.e. by signal or such) */
+ return 255;
+
+ return r;
+}
+
+int unit_test_trigger_loaded(Unit *u) {
+ Unit *trigger;
+
+ /* Tests whether the unit to trigger is loaded */
+
+ trigger = UNIT_TRIGGER(u);
+ if (!trigger)
+ return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOENT),
+ "Refusing to start, no unit to trigger.");
+ if (trigger->load_state != UNIT_LOADED)
+ return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOENT),
+ "Refusing to start, unit %s to trigger not loaded.", trigger->id);
+
+ return 0;
+}
+
+void unit_destroy_runtime_data(Unit *u, const ExecContext *context) {
+ assert(u);
+ assert(context);
+
+ if (context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO ||
+ (context->runtime_directory_preserve_mode == EXEC_PRESERVE_RESTART && !unit_will_restart(u)))
+ exec_context_destroy_runtime_directory(context, u->manager->prefix[EXEC_DIRECTORY_RUNTIME]);
+
+ exec_context_destroy_credentials(context, u->manager->prefix[EXEC_DIRECTORY_RUNTIME], u->id);
+ exec_context_destroy_mount_ns_dir(u);
+}
+
+int unit_clean(Unit *u, ExecCleanMask mask) {
+ UnitActiveState state;
+
+ assert(u);
+
+ /* Special return values:
+ *
+ * -EOPNOTSUPP → cleaning not supported for this unit type
+ * -EUNATCH → cleaning not defined for this resource type
+ * -EBUSY → unit currently can't be cleaned since it's running or not properly loaded, or has
+ * a job queued or similar
+ */
+
+ if (!UNIT_VTABLE(u)->clean)
+ return -EOPNOTSUPP;
+
+ if (mask == 0)
+ return -EUNATCH;
+
+ if (u->load_state != UNIT_LOADED)
+ return -EBUSY;
+
+ if (u->job)
+ return -EBUSY;
+
+ state = unit_active_state(u);
+ if (!IN_SET(state, UNIT_INACTIVE))
+ return -EBUSY;
+
+ return UNIT_VTABLE(u)->clean(u, mask);
+}
+
+int unit_can_clean(Unit *u, ExecCleanMask *ret) {
+ assert(u);
+
+ if (!UNIT_VTABLE(u)->clean ||
+ u->load_state != UNIT_LOADED) {
+ *ret = 0;
+ return 0;
+ }
+
+ /* When the clean() method is set, can_clean() really should be set too */
+ assert(UNIT_VTABLE(u)->can_clean);
+
+ return UNIT_VTABLE(u)->can_clean(u, ret);
+}
+
+bool unit_can_freeze(Unit *u) {
+ assert(u);
+
+ if (UNIT_VTABLE(u)->can_freeze)
+ return UNIT_VTABLE(u)->can_freeze(u);
+
+ return UNIT_VTABLE(u)->freeze;
+}
+
+void unit_frozen(Unit *u) {
+ assert(u);
+
+ u->freezer_state = FREEZER_FROZEN;
+
+ bus_unit_send_pending_freezer_message(u, false);
+}
+
+void unit_thawed(Unit *u) {
+ assert(u);
+
+ u->freezer_state = FREEZER_RUNNING;
+
+ bus_unit_send_pending_freezer_message(u, false);
+}
+
+static int unit_freezer_action(Unit *u, FreezerAction action) {
+ UnitActiveState s;
+ int (*method)(Unit*);
+ int r;
+
+ assert(u);
+ assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW));
+
+ method = action == FREEZER_FREEZE ? UNIT_VTABLE(u)->freeze : UNIT_VTABLE(u)->thaw;
+ if (!method || !cg_freezer_supported())
+ return -EOPNOTSUPP;
+
+ if (u->job)
+ return -EBUSY;
+
+ if (u->load_state != UNIT_LOADED)
+ return -EHOSTDOWN;
+
+ s = unit_active_state(u);
+ if (s != UNIT_ACTIVE)
+ return -EHOSTDOWN;
+
+ if ((IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_THAWING) && action == FREEZER_FREEZE) ||
+ (u->freezer_state == FREEZER_THAWING && action == FREEZER_THAW))
+ return -EALREADY;
+
+ r = method(u);
+ if (r <= 0)
+ return r;
+
+ assert(IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_THAWING));
+
+ return 1;
+}
+
+int unit_freeze(Unit *u) {
+ return unit_freezer_action(u, FREEZER_FREEZE);
+}
+
+int unit_thaw(Unit *u) {
+ return unit_freezer_action(u, FREEZER_THAW);
+}
+
+/* Wrappers around low-level cgroup freezer operations common for service and scope units */
+int unit_freeze_vtable_common(Unit *u) {
+ return unit_cgroup_freezer_action(u, FREEZER_FREEZE);
+}
+
+int unit_thaw_vtable_common(Unit *u) {
+ return unit_cgroup_freezer_action(u, FREEZER_THAW);
+}
+
+Condition *unit_find_failed_condition(Unit *u) {
+ Condition *failed_trigger = NULL;
+ bool has_succeeded_trigger = false;
+
+ if (u->condition_result)
+ return NULL;
+
+ LIST_FOREACH(conditions, c, u->conditions)
+ if (c->trigger) {
+ if (c->result == CONDITION_SUCCEEDED)
+ has_succeeded_trigger = true;
+ else if (!failed_trigger)
+ failed_trigger = c;
+ } else if (c->result != CONDITION_SUCCEEDED)
+ return c;
+
+ return failed_trigger && !has_succeeded_trigger ? failed_trigger : NULL;
+}
+
+static const char* const collect_mode_table[_COLLECT_MODE_MAX] = {
+ [COLLECT_INACTIVE] = "inactive",
+ [COLLECT_INACTIVE_OR_FAILED] = "inactive-or-failed",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(collect_mode, CollectMode);
+
+Unit* unit_has_dependency(const Unit *u, UnitDependencyAtom atom, Unit *other) {
+ Unit *i;
+
+ assert(u);
+
+ /* Checks if the unit has a dependency on 'other' with the specified dependency atom. If 'other' is
+ * NULL checks if the unit has *any* dependency of that atom. Returns 'other' if found (or if 'other'
+ * is NULL the first entry found), or NULL if not found. */
+
+ UNIT_FOREACH_DEPENDENCY(i, u, atom)
+ if (!other || other == i)
+ return i;
+
+ return NULL;
+}
+
+int unit_get_dependency_array(const Unit *u, UnitDependencyAtom atom, Unit ***ret_array) {
+ _cleanup_free_ Unit **array = NULL;
+ size_t n = 0;
+ Unit *other;
+
+ assert(u);
+ assert(ret_array);
+
+ /* Gets a list of units matching a specific atom as array. This is useful when iterating through
+ * dependencies while modifying them: the array is an "atomic snapshot" of sorts, that can be read
+ * while the dependency table is continuously updated. */
+
+ UNIT_FOREACH_DEPENDENCY(other, u, atom) {
+ if (!GREEDY_REALLOC(array, n + 1))
+ return -ENOMEM;
+
+ array[n++] = other;
+ }
+
+ *ret_array = TAKE_PTR(array);
+
+ assert(n <= INT_MAX);
+ return (int) n;
+}
+
+const ActivationDetailsVTable * const activation_details_vtable[_UNIT_TYPE_MAX] = {
+ [UNIT_PATH] = &activation_details_path_vtable,
+ [UNIT_TIMER] = &activation_details_timer_vtable,
+};
+
+ActivationDetails *activation_details_new(Unit *trigger_unit) {
+ _cleanup_free_ ActivationDetails *details = NULL;
+
+ assert(trigger_unit);
+ assert(trigger_unit->type != _UNIT_TYPE_INVALID);
+ assert(trigger_unit->id);
+
+ details = malloc0(activation_details_vtable[trigger_unit->type]->object_size);
+ if (!details)
+ return NULL;
+
+ *details = (ActivationDetails) {
+ .n_ref = 1,
+ .trigger_unit_type = trigger_unit->type,
+ };
+
+ details->trigger_unit_name = strdup(trigger_unit->id);
+ if (!details->trigger_unit_name)
+ return NULL;
+
+ if (ACTIVATION_DETAILS_VTABLE(details)->init)
+ ACTIVATION_DETAILS_VTABLE(details)->init(details, trigger_unit);
+
+ return TAKE_PTR(details);
+}
+
+static ActivationDetails *activation_details_free(ActivationDetails *details) {
+ if (!details)
+ return NULL;
+
+ if (ACTIVATION_DETAILS_VTABLE(details)->done)
+ ACTIVATION_DETAILS_VTABLE(details)->done(details);
+
+ free(details->trigger_unit_name);
+
+ return mfree(details);
+}
+
+void activation_details_serialize(ActivationDetails *details, FILE *f) {
+ if (!details || details->trigger_unit_type == _UNIT_TYPE_INVALID)
+ return;
+
+ (void) serialize_item(f, "activation-details-unit-type", unit_type_to_string(details->trigger_unit_type));
+ if (details->trigger_unit_name)
+ (void) serialize_item(f, "activation-details-unit-name", details->trigger_unit_name);
+ if (ACTIVATION_DETAILS_VTABLE(details)->serialize)
+ ACTIVATION_DETAILS_VTABLE(details)->serialize(details, f);
+}
+
+int activation_details_deserialize(const char *key, const char *value, ActivationDetails **details) {
+ assert(key);
+ assert(value);
+ assert(details);
+
+ if (!*details) {
+ UnitType t;
+
+ if (!streq(key, "activation-details-unit-type"))
+ return -EINVAL;
+
+ t = unit_type_from_string(value);
+ /* The activation details vtable has defined ops only for path
+ * and timer units */
+ if (!IN_SET(t, UNIT_PATH, UNIT_TIMER))
+ return -EINVAL;
+
+ *details = malloc0(activation_details_vtable[t]->object_size);
+ if (!*details)
+ return -ENOMEM;
+
+ **details = (ActivationDetails) {
+ .n_ref = 1,
+ .trigger_unit_type = t,
+ };
+
+ return 0;
+ }
+
+ if (streq(key, "activation-details-unit-name")) {
+ (*details)->trigger_unit_name = strdup(value);
+ if (!(*details)->trigger_unit_name)
+ return -ENOMEM;
+
+ return 0;
+ }
+
+ if (ACTIVATION_DETAILS_VTABLE(*details)->deserialize)
+ return ACTIVATION_DETAILS_VTABLE(*details)->deserialize(key, value, details);
+
+ return -EINVAL;
+}
+
+int activation_details_append_env(ActivationDetails *details, char ***strv) {
+ int r = 0;
+
+ assert(strv);
+
+ if (!details)
+ return 0;
+
+ if (!isempty(details->trigger_unit_name)) {
+ char *s = strjoin("TRIGGER_UNIT=", details->trigger_unit_name);
+ if (!s)
+ return -ENOMEM;
+
+ r = strv_consume(strv, TAKE_PTR(s));
+ if (r < 0)
+ return r;
+ }
+
+ if (ACTIVATION_DETAILS_VTABLE(details)->append_env) {
+ r = ACTIVATION_DETAILS_VTABLE(details)->append_env(details, strv);
+ if (r < 0)
+ return r;
+ }
+
+ return r + !isempty(details->trigger_unit_name); /* Return the number of variables added to the env block */
+}
+
+int activation_details_append_pair(ActivationDetails *details, char ***strv) {
+ int r = 0;
+
+ assert(strv);
+
+ if (!details)
+ return 0;
+
+ if (!isempty(details->trigger_unit_name)) {
+ r = strv_extend(strv, "trigger_unit");
+ if (r < 0)
+ return r;
+
+ r = strv_extend(strv, details->trigger_unit_name);
+ if (r < 0)
+ return r;
+ }
+
+ if (ACTIVATION_DETAILS_VTABLE(details)->append_pair) {
+ r = ACTIVATION_DETAILS_VTABLE(details)->append_pair(details, strv);
+ if (r < 0)
+ return r;
+ }
+
+ return r + !isempty(details->trigger_unit_name); /* Return the number of pairs added to the strv */
+}
+
+DEFINE_TRIVIAL_REF_UNREF_FUNC(ActivationDetails, activation_details, activation_details_free);
diff --git a/src/core/unit.h b/src/core/unit.h
new file mode 100644
index 0000000..65a1d26
--- /dev/null
+++ b/src/core/unit.h
@@ -0,0 +1,1194 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+#include "sd-id128.h"
+
+#include "bpf-program.h"
+#include "condition.h"
+#include "emergency-action.h"
+#include "list.h"
+#include "show-status.h"
+#include "set.h"
+#include "unit-file.h"
+#include "cgroup.h"
+
+typedef struct UnitRef UnitRef;
+
+typedef enum KillOperation {
+ KILL_TERMINATE,
+ KILL_TERMINATE_AND_LOG,
+ KILL_RESTART,
+ KILL_KILL,
+ KILL_WATCHDOG,
+ _KILL_OPERATION_MAX,
+ _KILL_OPERATION_INVALID = -EINVAL,
+} KillOperation;
+
+typedef enum CollectMode {
+ COLLECT_INACTIVE,
+ COLLECT_INACTIVE_OR_FAILED,
+ _COLLECT_MODE_MAX,
+ _COLLECT_MODE_INVALID = -EINVAL,
+} CollectMode;
+
+static inline bool UNIT_IS_ACTIVE_OR_RELOADING(UnitActiveState t) {
+ return IN_SET(t, UNIT_ACTIVE, UNIT_RELOADING);
+}
+
+static inline bool UNIT_IS_ACTIVE_OR_ACTIVATING(UnitActiveState t) {
+ return IN_SET(t, UNIT_ACTIVE, UNIT_ACTIVATING, UNIT_RELOADING);
+}
+
+static inline bool UNIT_IS_INACTIVE_OR_DEACTIVATING(UnitActiveState t) {
+ return IN_SET(t, UNIT_INACTIVE, UNIT_FAILED, UNIT_DEACTIVATING);
+}
+
+static inline bool UNIT_IS_INACTIVE_OR_FAILED(UnitActiveState t) {
+ return IN_SET(t, UNIT_INACTIVE, UNIT_FAILED);
+}
+
+static inline bool UNIT_IS_LOAD_COMPLETE(UnitLoadState t) {
+ return t >= 0 && t < _UNIT_LOAD_STATE_MAX && t != UNIT_STUB && t != UNIT_MERGED;
+}
+
+/* Stores the 'reason' a dependency was created as a bit mask, i.e. due to which configuration source it came to be. We
+ * use this so that we can selectively flush out parts of dependencies again. Note that the same dependency might be
+ * created as a result of multiple "reasons", hence the bitmask. */
+typedef enum UnitDependencyMask {
+ /* Configured directly by the unit file, .wants/.requires symlink or drop-in, or as an immediate result of a
+ * non-dependency option configured that way. */
+ UNIT_DEPENDENCY_FILE = 1 << 0,
+
+ /* As unconditional implicit dependency (not affected by unit configuration — except by the unit name and
+ * type) */
+ UNIT_DEPENDENCY_IMPLICIT = 1 << 1,
+
+ /* A dependency effected by DefaultDependencies=yes. Note that dependencies marked this way are conceptually
+ * just a subset of UNIT_DEPENDENCY_FILE, as DefaultDependencies= is itself a unit file setting that can only
+ * be set in unit files. We make this two separate bits only to help debugging how dependencies came to be. */
+ UNIT_DEPENDENCY_DEFAULT = 1 << 2,
+
+ /* A dependency created from udev rules */
+ UNIT_DEPENDENCY_UDEV = 1 << 3,
+
+ /* A dependency created because of some unit's RequiresMountsFor= setting */
+ UNIT_DEPENDENCY_PATH = 1 << 4,
+
+ /* A dependency initially configured from the mount unit file however the dependency will be updated
+ * from /proc/self/mountinfo as soon as the kernel will make the entry for that mount available in
+ * the /proc file */
+ UNIT_DEPENDENCY_MOUNT_FILE = 1 << 5,
+
+ /* A dependency created or updated because of data read from /proc/self/mountinfo */
+ UNIT_DEPENDENCY_MOUNTINFO = 1 << 6,
+
+ /* A dependency created because of data read from /proc/swaps and no other configuration source */
+ UNIT_DEPENDENCY_PROC_SWAP = 1 << 7,
+
+ /* A dependency for units in slices assigned by directly setting Slice= */
+ UNIT_DEPENDENCY_SLICE_PROPERTY = 1 << 8,
+
+ _UNIT_DEPENDENCY_MASK_FULL = (1 << 9) - 1,
+} UnitDependencyMask;
+
+/* The Unit's dependencies[] hashmaps use this structure as value. It has the same size as a void pointer, and thus can
+ * be stored directly as hashmap value, without any indirection. Note that this stores two masks, as both the origin
+ * and the destination of a dependency might have created it. */
+typedef union UnitDependencyInfo {
+ void *data;
+ struct {
+ UnitDependencyMask origin_mask:16;
+ UnitDependencyMask destination_mask:16;
+ } _packed_;
+} UnitDependencyInfo;
+
+/* Store information about why a unit was activated.
+ * We start with trigger units (.path/.timer), eventually it will be expanded to include more metadata. */
+typedef struct ActivationDetails {
+ unsigned n_ref;
+ UnitType trigger_unit_type;
+ char *trigger_unit_name;
+} ActivationDetails;
+
+/* For casting an activation event into the various unit-specific types */
+#define DEFINE_ACTIVATION_DETAILS_CAST(UPPERCASE, MixedCase, UNIT_TYPE) \
+ static inline MixedCase* UPPERCASE(ActivationDetails *a) { \
+ if (_unlikely_(!a || a->trigger_unit_type != UNIT_##UNIT_TYPE)) \
+ return NULL; \
+ \
+ return (MixedCase*) a; \
+ }
+
+/* For casting the various unit types into a unit */
+#define ACTIVATION_DETAILS(u) \
+ ({ \
+ typeof(u) _u_ = (u); \
+ ActivationDetails *_w_ = _u_ ? &(_u_)->meta : NULL; \
+ _w_; \
+ })
+
+ActivationDetails *activation_details_new(Unit *trigger_unit);
+ActivationDetails *activation_details_ref(ActivationDetails *p);
+ActivationDetails *activation_details_unref(ActivationDetails *p);
+void activation_details_serialize(ActivationDetails *p, FILE *f);
+int activation_details_deserialize(const char *key, const char *value, ActivationDetails **info);
+int activation_details_append_env(ActivationDetails *info, char ***strv);
+int activation_details_append_pair(ActivationDetails *info, char ***strv);
+DEFINE_TRIVIAL_CLEANUP_FUNC(ActivationDetails*, activation_details_unref);
+
+typedef struct ActivationDetailsVTable {
+ /* How much memory does an object of this activation type need */
+ size_t object_size;
+
+ /* This should reset all type-specific variables. This should not allocate memory, and is called
+ * with zero-initialized data. It should hence only initialize variables that need to be set != 0. */
+ void (*init)(ActivationDetails *info, Unit *trigger_unit);
+
+ /* This should free all type-specific variables. It should be idempotent. */
+ void (*done)(ActivationDetails *info);
+
+ /* This should serialize all type-specific variables. */
+ void (*serialize)(ActivationDetails *info, FILE *f);
+
+ /* This should deserialize all type-specific variables, one at a time. */
+ int (*deserialize)(const char *key, const char *value, ActivationDetails **info);
+
+ /* This should format the type-specific variables for the env block of the spawned service,
+ * and return the number of added items. */
+ int (*append_env)(ActivationDetails *info, char ***strv);
+
+ /* This should append type-specific variables as key/value pairs for the D-Bus property of the job,
+ * and return the number of added pairs. */
+ int (*append_pair)(ActivationDetails *info, char ***strv);
+} ActivationDetailsVTable;
+
+extern const ActivationDetailsVTable * const activation_details_vtable[_UNIT_TYPE_MAX];
+
+static inline const ActivationDetailsVTable* ACTIVATION_DETAILS_VTABLE(const ActivationDetails *a) {
+ assert(a);
+ assert(a->trigger_unit_type < _UNIT_TYPE_MAX);
+
+ return activation_details_vtable[a->trigger_unit_type];
+}
+
+/* Newer LLVM versions don't like implicit casts from large pointer types to smaller enums, hence let's add
+ * explicit type-safe helpers for that. */
+static inline UnitDependency UNIT_DEPENDENCY_FROM_PTR(const void *p) {
+ return PTR_TO_INT(p);
+}
+
+static inline void* UNIT_DEPENDENCY_TO_PTR(UnitDependency d) {
+ return INT_TO_PTR(d);
+}
+
+#include "job.h"
+
+struct UnitRef {
+ /* Keeps tracks of references to a unit. This is useful so
+ * that we can merge two units if necessary and correct all
+ * references to them */
+
+ Unit *source, *target;
+ LIST_FIELDS(UnitRef, refs_by_target);
+};
+
+typedef struct Unit {
+ Manager *manager;
+
+ UnitType type;
+ UnitLoadState load_state;
+ Unit *merged_into;
+
+ char *id; /* The one special name that we use for identification */
+ char *instance;
+
+ Set *aliases; /* All the other names. */
+
+ /* For each dependency type we can look up another Hashmap with this, whose key is a Unit* object,
+ * and whose value encodes why the dependency exists, using the UnitDependencyInfo type. i.e. a
+ * Hashmap(UnitDependency → Hashmap(Unit* → UnitDependencyInfo)) */
+ Hashmap *dependencies;
+
+ /* Similar, for RequiresMountsFor= path dependencies. The key is the path, the value the
+ * UnitDependencyInfo type */
+ Hashmap *requires_mounts_for;
+
+ char *description;
+ char **documentation;
+
+ /* The SELinux context used for checking access to this unit read off the unit file at load time (do
+ * not confuse with the selinux_context field in ExecContext which is the SELinux context we'll set
+ * for processes) */
+ char *access_selinux_context;
+
+ char *fragment_path; /* if loaded from a config file this is the primary path to it */
+ char *source_path; /* if converted, the source file */
+ char **dropin_paths;
+
+ usec_t fragment_not_found_timestamp_hash;
+ usec_t fragment_mtime;
+ usec_t source_mtime;
+ usec_t dropin_mtime;
+
+ /* If this is a transient unit we are currently writing, this is where we are writing it to */
+ FILE *transient_file;
+
+ /* Freezer state */
+ sd_bus_message *pending_freezer_message;
+ FreezerState freezer_state;
+
+ /* Job timeout and action to take */
+ EmergencyAction job_timeout_action;
+ usec_t job_timeout;
+ usec_t job_running_timeout;
+ char *job_timeout_reboot_arg;
+
+ /* If there is something to do with this unit, then this is the installed job for it */
+ Job *job;
+
+ /* JOB_NOP jobs are special and can be installed without disturbing the real job. */
+ Job *nop_job;
+
+ /* The slot used for watching NameOwnerChanged signals */
+ sd_bus_slot *match_bus_slot;
+ sd_bus_slot *get_name_owner_slot;
+
+ /* References to this unit from clients */
+ sd_bus_track *bus_track;
+ char **deserialized_refs;
+
+ /* References to this */
+ LIST_HEAD(UnitRef, refs_by_target);
+
+ /* Conditions to check */
+ LIST_HEAD(Condition, conditions);
+ LIST_HEAD(Condition, asserts);
+
+ dual_timestamp condition_timestamp;
+ dual_timestamp assert_timestamp;
+
+ /* Updated whenever the low-level state changes */
+ dual_timestamp state_change_timestamp;
+
+ /* Updated whenever the (high-level) active state enters or leaves the active or inactive states */
+ dual_timestamp inactive_exit_timestamp;
+ dual_timestamp active_enter_timestamp;
+ dual_timestamp active_exit_timestamp;
+ dual_timestamp inactive_enter_timestamp;
+
+ /* Per type list */
+ LIST_FIELDS(Unit, units_by_type);
+
+ /* Load queue */
+ LIST_FIELDS(Unit, load_queue);
+
+ /* D-Bus queue */
+ LIST_FIELDS(Unit, dbus_queue);
+
+ /* Cleanup queue */
+ LIST_FIELDS(Unit, cleanup_queue);
+
+ /* GC queue */
+ LIST_FIELDS(Unit, gc_queue);
+
+ /* CGroup realize members queue */
+ LIST_FIELDS(Unit, cgroup_realize_queue);
+
+ /* cgroup empty queue */
+ LIST_FIELDS(Unit, cgroup_empty_queue);
+
+ /* cgroup OOM queue */
+ LIST_FIELDS(Unit, cgroup_oom_queue);
+
+ /* Target dependencies queue */
+ LIST_FIELDS(Unit, target_deps_queue);
+
+ /* Queue of units with StopWhenUnneeded= set that shall be checked for clean-up. */
+ LIST_FIELDS(Unit, stop_when_unneeded_queue);
+
+ /* Queue of units that have an Uphold= dependency from some other unit, and should be checked for starting */
+ LIST_FIELDS(Unit, start_when_upheld_queue);
+
+ /* Queue of units that have a BindTo= dependency on some other unit, and should possibly be shut down */
+ LIST_FIELDS(Unit, stop_when_bound_queue);
+
+ /* PIDs we keep an eye on. Note that a unit might have many
+ * more, but these are the ones we care enough about to
+ * process SIGCHLD for */
+ Set *pids;
+
+ /* Used in SIGCHLD and sd_notify() message event invocation logic to avoid that we dispatch the same event
+ * multiple times on the same unit. */
+ unsigned sigchldgen;
+ unsigned notifygen;
+
+ /* Used during GC sweeps */
+ unsigned gc_marker;
+
+ /* Error code when we didn't manage to load the unit (negative) */
+ int load_error;
+
+ /* Put a ratelimit on unit starting */
+ RateLimit start_ratelimit;
+ EmergencyAction start_limit_action;
+
+ /* The unit has been marked for reload, restart, etc. Stored as 1u << marker1 | 1u << marker2. */
+ unsigned markers;
+
+ /* What to do on failure or success */
+ EmergencyAction success_action, failure_action;
+ int success_action_exit_status, failure_action_exit_status;
+ char *reboot_arg;
+
+ /* Make sure we never enter endless loops with the StopWhenUnneeded=, BindsTo=, Uphold= logic */
+ RateLimit auto_start_stop_ratelimit;
+ sd_event_source *auto_start_stop_event_source;
+
+ /* Reference to a specific UID/GID */
+ uid_t ref_uid;
+ gid_t ref_gid;
+
+ /* Cached unit file state and preset */
+ UnitFileState unit_file_state;
+ int unit_file_preset;
+
+ /* Where the cpu.stat or cpuacct.usage was at the time the unit was started */
+ nsec_t cpu_usage_base;
+ nsec_t cpu_usage_last; /* the most recently read value */
+
+ /* The current counter of OOM kills initiated by systemd-oomd */
+ uint64_t managed_oom_kill_last;
+
+ /* The current counter of the oom_kill field in the memory.events cgroup attribute */
+ uint64_t oom_kill_last;
+
+ /* Where the io.stat data was at the time the unit was started */
+ uint64_t io_accounting_base[_CGROUP_IO_ACCOUNTING_METRIC_MAX];
+ uint64_t io_accounting_last[_CGROUP_IO_ACCOUNTING_METRIC_MAX]; /* the most recently read value */
+
+ /* Counterparts in the cgroup filesystem */
+ char *cgroup_path;
+ uint64_t cgroup_id;
+ CGroupMask cgroup_realized_mask; /* In which hierarchies does this unit's cgroup exist? (only relevant on cgroup v1) */
+ CGroupMask cgroup_enabled_mask; /* Which controllers are enabled (or more correctly: enabled for the children) for this unit's cgroup? (only relevant on cgroup v2) */
+ CGroupMask cgroup_invalidated_mask; /* A mask specifying controllers which shall be considered invalidated, and require re-realization */
+ CGroupMask cgroup_members_mask; /* A cache for the controllers required by all children of this cgroup (only relevant for slice units) */
+
+ /* Inotify watch descriptors for watching cgroup.events and memory.events on cgroupv2 */
+ int cgroup_control_inotify_wd;
+ int cgroup_memory_inotify_wd;
+
+ /* Device Controller BPF program */
+ BPFProgram *bpf_device_control_installed;
+
+ /* IP BPF Firewalling/accounting */
+ int ip_accounting_ingress_map_fd;
+ int ip_accounting_egress_map_fd;
+ uint64_t ip_accounting_extra[_CGROUP_IP_ACCOUNTING_METRIC_MAX];
+
+ int ipv4_allow_map_fd;
+ int ipv6_allow_map_fd;
+ int ipv4_deny_map_fd;
+ int ipv6_deny_map_fd;
+ BPFProgram *ip_bpf_ingress, *ip_bpf_ingress_installed;
+ BPFProgram *ip_bpf_egress, *ip_bpf_egress_installed;
+
+ Set *ip_bpf_custom_ingress;
+ Set *ip_bpf_custom_ingress_installed;
+ Set *ip_bpf_custom_egress;
+ Set *ip_bpf_custom_egress_installed;
+
+ /* BPF programs managed (e.g. loaded to kernel) by an entity external to systemd,
+ * attached to unit cgroup by provided program fd and attach type. */
+ Hashmap *bpf_foreign_by_key;
+
+ FDSet *initial_socket_bind_link_fds;
+#if BPF_FRAMEWORK
+ /* BPF links to BPF programs attached to cgroup/bind{4|6} hooks and
+ * responsible for allowing or denying a unit to bind(2) to a socket
+ * address. */
+ struct bpf_link *ipv4_socket_bind_link;
+ struct bpf_link *ipv6_socket_bind_link;
+#endif
+
+ FDSet *initial_restric_ifaces_link_fds;
+#if BPF_FRAMEWORK
+ struct bpf_link *restrict_ifaces_ingress_bpf_link;
+ struct bpf_link *restrict_ifaces_egress_bpf_link;
+#endif
+
+ /* Low-priority event source which is used to remove watched PIDs that have gone away, and subscribe to any new
+ * ones which might have appeared. */
+ sd_event_source *rewatch_pids_event_source;
+
+ /* How to start OnSuccess=/OnFailure= units */
+ JobMode on_success_job_mode;
+ JobMode on_failure_job_mode;
+
+ /* If the job had a specific trigger that needs to be advertised (eg: a path unit), store it. */
+ ActivationDetails *activation_details;
+
+ /* Tweaking the GC logic */
+ CollectMode collect_mode;
+
+ /* The current invocation ID */
+ sd_id128_t invocation_id;
+ char invocation_id_string[SD_ID128_STRING_MAX]; /* useful when logging */
+
+ /* Garbage collect us we nobody wants or requires us anymore */
+ bool stop_when_unneeded;
+
+ /* Create default dependencies */
+ bool default_dependencies;
+
+ /* Refuse manual starting, allow starting only indirectly via dependency. */
+ bool refuse_manual_start;
+
+ /* Don't allow the user to stop this unit manually, allow stopping only indirectly via dependency. */
+ bool refuse_manual_stop;
+
+ /* Allow isolation requests */
+ bool allow_isolate;
+
+ /* Ignore this unit when isolating */
+ bool ignore_on_isolate;
+
+ /* Did the last condition check succeed? */
+ bool condition_result;
+ bool assert_result;
+
+ /* Is this a transient unit? */
+ bool transient;
+
+ /* Is this a unit that is always running and cannot be stopped? */
+ bool perpetual;
+
+ /* Booleans indicating membership of this unit in the various queues */
+ bool in_load_queue:1;
+ bool in_dbus_queue:1;
+ bool in_cleanup_queue:1;
+ bool in_gc_queue:1;
+ bool in_cgroup_realize_queue:1;
+ bool in_cgroup_empty_queue:1;
+ bool in_cgroup_oom_queue:1;
+ bool in_target_deps_queue:1;
+ bool in_stop_when_unneeded_queue:1;
+ bool in_start_when_upheld_queue:1;
+ bool in_stop_when_bound_queue:1;
+
+ bool sent_dbus_new_signal:1;
+
+ bool job_running_timeout_set:1;
+
+ bool in_audit:1;
+ bool on_console:1;
+
+ bool cgroup_realized:1;
+ bool cgroup_members_mask_valid:1;
+
+ /* Reset cgroup accounting next time we fork something off */
+ bool reset_accounting:1;
+
+ bool start_limit_hit:1;
+
+ /* Did we already invoke unit_coldplug() for this unit? */
+ bool coldplugged:1;
+
+ /* For transient units: whether to add a bus track reference after creating the unit */
+ bool bus_track_add:1;
+
+ /* Remember which unit state files we created */
+ bool exported_invocation_id:1;
+ bool exported_log_level_max:1;
+ bool exported_log_extra_fields:1;
+ bool exported_log_ratelimit_interval:1;
+ bool exported_log_ratelimit_burst:1;
+
+ /* Whether we warned about clamping the CPU quota period */
+ bool warned_clamping_cpu_quota_period:1;
+
+ /* When writing transient unit files, stores which section we stored last. If < 0, we didn't write any yet. If
+ * == 0 we are in the [Unit] section, if > 0 we are in the unit type-specific section. */
+ signed int last_section_private:2;
+} Unit;
+
+typedef struct UnitStatusMessageFormats {
+ const char *starting_stopping[2];
+ const char *finished_start_job[_JOB_RESULT_MAX];
+ const char *finished_stop_job[_JOB_RESULT_MAX];
+ /* If this entry is present, it'll be called to provide a context-dependent format string,
+ * or NULL to fall back to finished_{start,stop}_job; if those are NULL too, fall back to generic. */
+ const char *(*finished_job)(Unit *u, JobType t, JobResult result);
+} UnitStatusMessageFormats;
+
+/* Flags used when writing drop-in files or transient unit files */
+typedef enum UnitWriteFlags {
+ /* Write a runtime unit file or drop-in (i.e. one below /run) */
+ UNIT_RUNTIME = 1 << 0,
+
+ /* Write a persistent drop-in (i.e. one below /etc) */
+ UNIT_PERSISTENT = 1 << 1,
+
+ /* Place this item in the per-unit-type private section, instead of [Unit] */
+ UNIT_PRIVATE = 1 << 2,
+
+ /* Apply specifier escaping before writing */
+ UNIT_ESCAPE_SPECIFIERS = 1 << 3,
+
+ /* Escape elements of ExecStart= syntax before writing */
+ UNIT_ESCAPE_EXEC_SYNTAX = 1 << 4,
+
+ /* Apply C escaping before writing */
+ UNIT_ESCAPE_C = 1 << 5,
+} UnitWriteFlags;
+
+/* Returns true if neither persistent, nor runtime storage is requested, i.e. this is a check invocation only */
+static inline bool UNIT_WRITE_FLAGS_NOOP(UnitWriteFlags flags) {
+ return (flags & (UNIT_RUNTIME|UNIT_PERSISTENT)) == 0;
+}
+
+#include "kill.h"
+
+typedef struct UnitVTable {
+ /* How much memory does an object of this unit type need */
+ size_t object_size;
+
+ /* If greater than 0, the offset into the object where
+ * ExecContext is found, if the unit type has that */
+ size_t exec_context_offset;
+
+ /* If greater than 0, the offset into the object where
+ * CGroupContext is found, if the unit type has that */
+ size_t cgroup_context_offset;
+
+ /* If greater than 0, the offset into the object where
+ * KillContext is found, if the unit type has that */
+ size_t kill_context_offset;
+
+ /* If greater than 0, the offset into the object where the
+ * pointer to ExecRuntime is found, if the unit type has
+ * that */
+ size_t exec_runtime_offset;
+
+ /* If greater than 0, the offset into the object where the pointer to DynamicCreds is found, if the unit type
+ * has that. */
+ size_t dynamic_creds_offset;
+
+ /* The name of the configuration file section with the private settings of this unit */
+ const char *private_section;
+
+ /* Config file sections this unit type understands, separated
+ * by NUL chars */
+ const char *sections;
+
+ /* This should reset all type-specific variables. This should
+ * not allocate memory, and is called with zero-initialized
+ * data. It should hence only initialize variables that need
+ * to be set != 0. */
+ void (*init)(Unit *u);
+
+ /* This should free all type-specific variables. It should be
+ * idempotent. */
+ void (*done)(Unit *u);
+
+ /* Actually load data from disk. This may fail, and should set
+ * load_state to UNIT_LOADED, UNIT_MERGED or leave it at
+ * UNIT_STUB if no configuration could be found. */
+ int (*load)(Unit *u);
+
+ /* During deserialization we only record the intended state to return to. With coldplug() we actually put the
+ * deserialized state in effect. This is where unit_notify() should be called to start things up. Note that
+ * this callback is invoked *before* we leave the reloading state of the manager, i.e. *before* we consider the
+ * reloading to be complete. Thus, this callback should just restore the exact same state for any unit that was
+ * in effect before the reload, i.e. units should not catch up with changes happened during the reload. That's
+ * what catchup() below is for. */
+ int (*coldplug)(Unit *u);
+
+ /* This is called shortly after all units' coldplug() call was invoked, and *after* the manager left the
+ * reloading state. It's supposed to catch up with state changes due to external events we missed so far (for
+ * example because they took place while we were reloading/reexecing) */
+ void (*catchup)(Unit *u);
+
+ void (*dump)(Unit *u, FILE *f, const char *prefix);
+
+ int (*start)(Unit *u);
+ int (*stop)(Unit *u);
+ int (*reload)(Unit *u);
+
+ int (*kill)(Unit *u, KillWho w, int signo, sd_bus_error *error);
+
+ /* Clear out the various runtime/state/cache/logs/configuration data */
+ int (*clean)(Unit *u, ExecCleanMask m);
+
+ /* Freeze the unit */
+ int (*freeze)(Unit *u);
+ int (*thaw)(Unit *u);
+ bool (*can_freeze)(Unit *u);
+
+ /* Return which kind of data can be cleaned */
+ int (*can_clean)(Unit *u, ExecCleanMask *ret);
+
+ bool (*can_reload)(Unit *u);
+
+ /* Serialize state and file descriptors that should be carried over into the new
+ * instance after reexecution. */
+ int (*serialize)(Unit *u, FILE *f, FDSet *fds);
+
+ /* Restore one item from the serialization */
+ int (*deserialize_item)(Unit *u, const char *key, const char *data, FDSet *fds);
+
+ /* Try to match up fds with what we need for this unit */
+ void (*distribute_fds)(Unit *u, FDSet *fds);
+
+ /* Boils down the more complex internal state of this unit to
+ * a simpler one that the engine can understand */
+ UnitActiveState (*active_state)(Unit *u);
+
+ /* Returns the substate specific to this unit type as
+ * string. This is purely information so that we can give the
+ * user a more fine grained explanation in which actual state a
+ * unit is in. */
+ const char* (*sub_state_to_string)(Unit *u);
+
+ /* Additionally to UnitActiveState determine whether unit is to be restarted. */
+ bool (*will_restart)(Unit *u);
+
+ /* Return false when there is a reason to prevent this unit from being gc'ed
+ * even though nothing references it and it isn't active in any way. */
+ bool (*may_gc)(Unit *u);
+
+ /* Return true when the unit is not controlled by the manager (e.g. extrinsic mounts). */
+ bool (*is_extrinsic)(Unit *u);
+
+ /* When the unit is not running and no job for it queued we shall release its runtime resources */
+ void (*release_resources)(Unit *u);
+
+ /* Invoked on every child that died */
+ void (*sigchld_event)(Unit *u, pid_t pid, int code, int status);
+
+ /* Reset failed state if we are in failed state */
+ void (*reset_failed)(Unit *u);
+
+ /* Called whenever any of the cgroups this unit watches for ran empty */
+ void (*notify_cgroup_empty)(Unit *u);
+
+ /* Called whenever an OOM kill event on this unit was seen */
+ void (*notify_cgroup_oom)(Unit *u, bool managed_oom);
+
+ /* Called whenever a process of this unit sends us a message */
+ void (*notify_message)(Unit *u, const struct ucred *ucred, char * const *tags, FDSet *fds);
+
+ /* Called whenever a name this Unit registered for comes or goes away. */
+ void (*bus_name_owner_change)(Unit *u, const char *new_owner);
+
+ /* Called for each property that is being set */
+ int (*bus_set_property)(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+
+ /* Called after at least one property got changed to apply the necessary change */
+ int (*bus_commit_properties)(Unit *u);
+
+ /* Return the unit this unit is following */
+ Unit *(*following)(Unit *u);
+
+ /* Return the set of units that are following each other */
+ int (*following_set)(Unit *u, Set **s);
+
+ /* Invoked each time a unit this unit is triggering changes
+ * state or gains/loses a job */
+ void (*trigger_notify)(Unit *u, Unit *trigger);
+
+ /* Called whenever CLOCK_REALTIME made a jump */
+ void (*time_change)(Unit *u);
+
+ /* Called whenever /etc/localtime was modified */
+ void (*timezone_change)(Unit *u);
+
+ /* Returns the next timeout of a unit */
+ int (*get_timeout)(Unit *u, usec_t *timeout);
+
+ /* Returns the main PID if there is any defined, or 0. */
+ pid_t (*main_pid)(Unit *u);
+
+ /* Returns the main PID if there is any defined, or 0. */
+ pid_t (*control_pid)(Unit *u);
+
+ /* Returns true if the unit currently needs access to the console */
+ bool (*needs_console)(Unit *u);
+
+ /* Returns the exit status to propagate in case of FailureAction=exit/SuccessAction=exit; usually returns the
+ * exit code of the "main" process of the service or similar. */
+ int (*exit_status)(Unit *u);
+
+ /* Return a copy of the status string pointer. */
+ const char* (*status_text)(Unit *u);
+
+ /* Like the enumerate() callback further down, but only enumerates the perpetual units, i.e. all units that
+ * unconditionally exist and are always active. The main reason to keep both enumeration functions separate is
+ * philosophical: the state of perpetual units should be put in place by coldplug(), while the state of those
+ * discovered through regular enumeration should be put in place by catchup(), see below. */
+ void (*enumerate_perpetual)(Manager *m);
+
+ /* This is called for each unit type and should be used to enumerate units already existing in the system
+ * internally and load them. However, everything that is loaded here should still stay in inactive state. It is
+ * the job of the catchup() call above to put the units into the discovered state. */
+ void (*enumerate)(Manager *m);
+
+ /* Type specific cleanups. */
+ void (*shutdown)(Manager *m);
+
+ /* If this function is set and returns false all jobs for units
+ * of this type will immediately fail. */
+ bool (*supported)(void);
+
+ /* If this function is set, it's invoked first as part of starting a unit to allow start rate
+ * limiting checks to occur before we do anything else. */
+ int (*can_start)(Unit *u);
+
+ /* The strings to print in status messages */
+ UnitStatusMessageFormats status_message_formats;
+
+ /* True if transient units of this type are OK */
+ bool can_transient;
+
+ /* True if cgroup delegation is permissible */
+ bool can_delegate;
+
+ /* True if the unit type triggers other units, i.e. can have a UNIT_TRIGGERS dependency */
+ bool can_trigger;
+
+ /* True if the unit type knows a failure state, and thus can be source of an OnFailure= dependency */
+ bool can_fail;
+
+ /* True if units of this type shall be startable only once and then never again */
+ bool once_only;
+
+ /* Do not serialize this unit when preparing for root switch */
+ bool exclude_from_switch_root_serialization;
+
+ /* True if queued jobs of this type should be GC'ed if no other job needs them anymore */
+ bool gc_jobs;
+
+ /* True if systemd-oomd can monitor and act on this unit's recursive children's cgroups */
+ bool can_set_managed_oom;
+} UnitVTable;
+
+extern const UnitVTable * const unit_vtable[_UNIT_TYPE_MAX];
+
+static inline const UnitVTable* UNIT_VTABLE(const Unit *u) {
+ return unit_vtable[u->type];
+}
+
+/* For casting a unit into the various unit types */
+#define DEFINE_CAST(UPPERCASE, MixedCase) \
+ static inline MixedCase* UPPERCASE(Unit *u) { \
+ if (_unlikely_(!u || u->type != UNIT_##UPPERCASE)) \
+ return NULL; \
+ \
+ return (MixedCase*) u; \
+ }
+
+/* For casting the various unit types into a unit */
+#define UNIT(u) \
+ ({ \
+ typeof(u) _u_ = (u); \
+ Unit *_w_ = _u_ ? &(_u_)->meta : NULL; \
+ _w_; \
+ })
+
+#define UNIT_HAS_EXEC_CONTEXT(u) (UNIT_VTABLE(u)->exec_context_offset > 0)
+#define UNIT_HAS_CGROUP_CONTEXT(u) (UNIT_VTABLE(u)->cgroup_context_offset > 0)
+#define UNIT_HAS_KILL_CONTEXT(u) (UNIT_VTABLE(u)->kill_context_offset > 0)
+
+Unit* unit_has_dependency(const Unit *u, UnitDependencyAtom atom, Unit *other);
+int unit_get_dependency_array(const Unit *u, UnitDependencyAtom atom, Unit ***ret_array);
+
+static inline Hashmap* unit_get_dependencies(Unit *u, UnitDependency d) {
+ return hashmap_get(u->dependencies, UNIT_DEPENDENCY_TO_PTR(d));
+}
+
+static inline Unit* UNIT_TRIGGER(Unit *u) {
+ return unit_has_dependency(u, UNIT_ATOM_TRIGGERS, NULL);
+}
+
+static inline Unit* UNIT_GET_SLICE(const Unit *u) {
+ return unit_has_dependency(u, UNIT_ATOM_IN_SLICE, NULL);
+}
+
+Unit* unit_new(Manager *m, size_t size);
+Unit* unit_free(Unit *u);
+DEFINE_TRIVIAL_CLEANUP_FUNC(Unit *, unit_free);
+
+int unit_new_for_name(Manager *m, size_t size, const char *name, Unit **ret);
+int unit_add_name(Unit *u, const char *name);
+
+int unit_add_dependency(Unit *u, UnitDependency d, Unit *other, bool add_reference, UnitDependencyMask mask);
+int unit_add_two_dependencies(Unit *u, UnitDependency d, UnitDependency e, Unit *other, bool add_reference, UnitDependencyMask mask);
+
+int unit_add_dependency_by_name(Unit *u, UnitDependency d, const char *name, bool add_reference, UnitDependencyMask mask);
+int unit_add_two_dependencies_by_name(Unit *u, UnitDependency d, UnitDependency e, const char *name, bool add_reference, UnitDependencyMask mask);
+
+int unit_add_exec_dependencies(Unit *u, ExecContext *c);
+
+int unit_choose_id(Unit *u, const char *name);
+int unit_set_description(Unit *u, const char *description);
+
+bool unit_may_gc(Unit *u);
+
+static inline bool unit_is_extrinsic(Unit *u) {
+ return u->perpetual ||
+ (UNIT_VTABLE(u)->is_extrinsic && UNIT_VTABLE(u)->is_extrinsic(u));
+}
+
+static inline const char* unit_status_text(Unit *u) {
+ if (u && UNIT_VTABLE(u)->status_text)
+ return UNIT_VTABLE(u)->status_text(u);
+ return NULL;
+}
+
+void unit_add_to_load_queue(Unit *u);
+void unit_add_to_dbus_queue(Unit *u);
+void unit_add_to_cleanup_queue(Unit *u);
+void unit_add_to_gc_queue(Unit *u);
+void unit_add_to_target_deps_queue(Unit *u);
+void unit_submit_to_stop_when_unneeded_queue(Unit *u);
+void unit_submit_to_start_when_upheld_queue(Unit *u);
+void unit_submit_to_stop_when_bound_queue(Unit *u);
+
+int unit_merge(Unit *u, Unit *other);
+int unit_merge_by_name(Unit *u, const char *other);
+
+Unit *unit_follow_merge(Unit *u) _pure_;
+
+int unit_load_fragment_and_dropin(Unit *u, bool fragment_required);
+int unit_load(Unit *unit);
+
+int unit_set_slice(Unit *u, Unit *slice);
+int unit_set_default_slice(Unit *u);
+
+const char *unit_description(Unit *u) _pure_;
+const char *unit_status_string(Unit *u, char **combined);
+
+bool unit_has_name(const Unit *u, const char *name);
+
+UnitActiveState unit_active_state(Unit *u);
+FreezerState unit_freezer_state(Unit *u);
+int unit_freezer_state_kernel(Unit *u, FreezerState *ret);
+
+const char* unit_sub_state_to_string(Unit *u);
+
+bool unit_can_reload(Unit *u) _pure_;
+bool unit_can_start(Unit *u) _pure_;
+bool unit_can_stop(Unit *u) _pure_;
+bool unit_can_isolate(Unit *u) _pure_;
+
+int unit_start(Unit *u, ActivationDetails *details);
+int unit_stop(Unit *u);
+int unit_reload(Unit *u);
+
+int unit_kill(Unit *u, KillWho w, int signo, sd_bus_error *error);
+int unit_kill_common(Unit *u, KillWho who, int signo, pid_t main_pid, pid_t control_pid, sd_bus_error *error);
+
+void unit_notify_cgroup_oom(Unit *u, bool managed_oom);
+
+typedef enum UnitNotifyFlags {
+ UNIT_NOTIFY_RELOAD_FAILURE = 1 << 0,
+ UNIT_NOTIFY_WILL_AUTO_RESTART = 1 << 1,
+} UnitNotifyFlags;
+
+void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, UnitNotifyFlags flags);
+
+int unit_watch_pid(Unit *u, pid_t pid, bool exclusive);
+void unit_unwatch_pid(Unit *u, pid_t pid);
+void unit_unwatch_all_pids(Unit *u);
+
+int unit_enqueue_rewatch_pids(Unit *u);
+void unit_dequeue_rewatch_pids(Unit *u);
+
+int unit_install_bus_match(Unit *u, sd_bus *bus, const char *name);
+int unit_watch_bus_name(Unit *u, const char *name);
+void unit_unwatch_bus_name(Unit *u, const char *name);
+
+bool unit_job_is_applicable(Unit *u, JobType j);
+
+int set_unit_path(const char *p);
+
+char *unit_dbus_path(Unit *u);
+char *unit_dbus_path_invocation_id(Unit *u);
+
+int unit_load_related_unit(Unit *u, const char *type, Unit **_found);
+
+int unit_add_node_dependency(Unit *u, const char *what, UnitDependency d, UnitDependencyMask mask);
+int unit_add_blockdev_dependency(Unit *u, const char *what, UnitDependencyMask mask);
+
+int unit_coldplug(Unit *u);
+void unit_catchup(Unit *u);
+
+void unit_status_printf(Unit *u, StatusType status_type, const char *status, const char *format, const char *ident) _printf_(4, 0);
+
+bool unit_need_daemon_reload(Unit *u);
+
+void unit_reset_failed(Unit *u);
+
+Unit *unit_following(Unit *u);
+int unit_following_set(Unit *u, Set **s);
+
+const char *unit_slice_name(Unit *u);
+
+bool unit_stop_pending(Unit *u) _pure_;
+bool unit_inactive_or_pending(Unit *u) _pure_;
+bool unit_active_or_pending(Unit *u);
+bool unit_will_restart_default(Unit *u);
+bool unit_will_restart(Unit *u);
+
+int unit_add_default_target_dependency(Unit *u, Unit *target);
+
+void unit_start_on_failure(Unit *u, const char *dependency_name, UnitDependencyAtom atom, JobMode job_mode);
+void unit_trigger_notify(Unit *u);
+
+UnitFileState unit_get_unit_file_state(Unit *u);
+int unit_get_unit_file_preset(Unit *u);
+
+Unit* unit_ref_set(UnitRef *ref, Unit *source, Unit *target);
+void unit_ref_unset(UnitRef *ref);
+
+#define UNIT_DEREF(ref) ((ref).target)
+#define UNIT_ISSET(ref) (!!(ref).target)
+
+int unit_patch_contexts(Unit *u);
+
+ExecContext *unit_get_exec_context(const Unit *u) _pure_;
+KillContext *unit_get_kill_context(Unit *u) _pure_;
+CGroupContext *unit_get_cgroup_context(Unit *u) _pure_;
+
+ExecRuntime *unit_get_exec_runtime(Unit *u) _pure_;
+
+int unit_setup_exec_runtime(Unit *u);
+int unit_setup_dynamic_creds(Unit *u);
+
+char* unit_escape_setting(const char *s, UnitWriteFlags flags, char **buf);
+char* unit_concat_strv(char **l, UnitWriteFlags flags);
+
+int unit_write_setting(Unit *u, UnitWriteFlags flags, const char *name, const char *data);
+int unit_write_settingf(Unit *u, UnitWriteFlags mode, const char *name, const char *format, ...) _printf_(4,5);
+
+int unit_kill_context(Unit *u, KillContext *c, KillOperation k, pid_t main_pid, pid_t control_pid, bool main_pid_alien);
+
+int unit_make_transient(Unit *u);
+
+int unit_require_mounts_for(Unit *u, const char *path, UnitDependencyMask mask);
+
+bool unit_type_supported(UnitType t);
+
+bool unit_is_pristine(Unit *u);
+
+bool unit_is_unneeded(Unit *u);
+bool unit_is_upheld_by_active(Unit *u, Unit **ret_culprit);
+bool unit_is_bound_by_inactive(Unit *u, Unit **ret_culprit);
+
+pid_t unit_control_pid(Unit *u);
+pid_t unit_main_pid(Unit *u);
+
+void unit_warn_if_dir_nonempty(Unit *u, const char* where);
+int unit_fail_if_noncanonical(Unit *u, const char* where);
+
+int unit_test_start_limit(Unit *u);
+
+int unit_ref_uid_gid(Unit *u, uid_t uid, gid_t gid);
+void unit_unref_uid_gid(Unit *u, bool destroy_now);
+
+void unit_notify_user_lookup(Unit *u, uid_t uid, gid_t gid);
+
+int unit_set_invocation_id(Unit *u, sd_id128_t id);
+int unit_acquire_invocation_id(Unit *u);
+
+bool unit_shall_confirm_spawn(Unit *u);
+
+int unit_set_exec_params(Unit *s, ExecParameters *p);
+
+int unit_fork_helper_process(Unit *u, const char *name, pid_t *ret);
+int unit_fork_and_watch_rm_rf(Unit *u, char **paths, pid_t *ret_pid);
+
+void unit_remove_dependencies(Unit *u, UnitDependencyMask mask);
+
+void unit_export_state_files(Unit *u);
+void unit_unlink_state_files(Unit *u);
+
+int unit_prepare_exec(Unit *u);
+
+int unit_log_leftover_process_start(pid_t pid, int sig, void *userdata);
+int unit_log_leftover_process_stop(pid_t pid, int sig, void *userdata);
+int unit_warn_leftover_processes(Unit *u, cg_kill_log_func_t log_func);
+
+bool unit_needs_console(Unit *u);
+
+int unit_pid_attachable(Unit *unit, pid_t pid, sd_bus_error *error);
+
+static inline bool unit_has_job_type(Unit *u, JobType type) {
+ return u && u->job && u->job->type == type;
+}
+
+static inline bool unit_log_level_test(const Unit *u, int level) {
+ ExecContext *ec = unit_get_exec_context(u);
+ return !ec || ec->log_level_max < 0 || ec->log_level_max >= LOG_PRI(level);
+}
+
+/* unit_log_skip is for cases like ExecCondition= where a unit is considered "done"
+ * after some execution, rather than succeeded or failed. */
+void unit_log_skip(Unit *u, const char *result);
+void unit_log_success(Unit *u);
+void unit_log_failure(Unit *u, const char *result);
+static inline void unit_log_result(Unit *u, bool success, const char *result) {
+ if (success)
+ unit_log_success(u);
+ else
+ unit_log_failure(u, result);
+}
+
+void unit_log_process_exit(Unit *u, const char *kind, const char *command, bool success, int code, int status);
+
+int unit_exit_status(Unit *u);
+int unit_success_action_exit_status(Unit *u);
+int unit_failure_action_exit_status(Unit *u);
+
+int unit_test_trigger_loaded(Unit *u);
+
+void unit_destroy_runtime_data(Unit *u, const ExecContext *context);
+int unit_clean(Unit *u, ExecCleanMask mask);
+int unit_can_clean(Unit *u, ExecCleanMask *ret_mask);
+
+bool unit_can_freeze(Unit *u);
+int unit_freeze(Unit *u);
+void unit_frozen(Unit *u);
+
+int unit_thaw(Unit *u);
+void unit_thawed(Unit *u);
+
+int unit_freeze_vtable_common(Unit *u);
+int unit_thaw_vtable_common(Unit *u);
+
+Condition *unit_find_failed_condition(Unit *u);
+
+/* Macros which append UNIT= or USER_UNIT= to the message */
+
+#define log_unit_full_errno_zerook(unit, level, error, ...) \
+ ({ \
+ const Unit *_u = (unit); \
+ const int _l = (level); \
+ (log_get_max_level() < LOG_PRI(_l) || (_u && !unit_log_level_test(_u, _l))) ? -ERRNO_VALUE(error) : \
+ _u ? log_object_internal(_l, error, PROJECT_FILE, __LINE__, __func__, _u->manager->unit_log_field, _u->id, _u->manager->invocation_log_field, _u->invocation_id_string, ##__VA_ARGS__) : \
+ log_internal(_l, error, PROJECT_FILE, __LINE__, __func__, ##__VA_ARGS__); \
+ })
+
+#define log_unit_full_errno(unit, level, error, ...) \
+ ({ \
+ int _error = (error); \
+ ASSERT_NON_ZERO(_error); \
+ log_unit_full_errno_zerook(unit, level, _error, ##__VA_ARGS__); \
+ })
+
+#define log_unit_full(unit, level, ...) (void) log_unit_full_errno_zerook(unit, level, 0, __VA_ARGS__)
+
+#define log_unit_debug(unit, ...) log_unit_full(unit, LOG_DEBUG, __VA_ARGS__)
+#define log_unit_info(unit, ...) log_unit_full(unit, LOG_INFO, __VA_ARGS__)
+#define log_unit_notice(unit, ...) log_unit_full(unit, LOG_NOTICE, __VA_ARGS__)
+#define log_unit_warning(unit, ...) log_unit_full(unit, LOG_WARNING, __VA_ARGS__)
+#define log_unit_error(unit, ...) log_unit_full(unit, LOG_ERR, __VA_ARGS__)
+
+#define log_unit_debug_errno(unit, error, ...) log_unit_full_errno(unit, LOG_DEBUG, error, __VA_ARGS__)
+#define log_unit_info_errno(unit, error, ...) log_unit_full_errno(unit, LOG_INFO, error, __VA_ARGS__)
+#define log_unit_notice_errno(unit, error, ...) log_unit_full_errno(unit, LOG_NOTICE, error, __VA_ARGS__)
+#define log_unit_warning_errno(unit, error, ...) log_unit_full_errno(unit, LOG_WARNING, error, __VA_ARGS__)
+#define log_unit_error_errno(unit, error, ...) log_unit_full_errno(unit, LOG_ERR, error, __VA_ARGS__)
+
+#if LOG_TRACE
+# define log_unit_trace(...) log_unit_debug(__VA_ARGS__)
+# define log_unit_trace_errno(...) log_unit_debug_errno(__VA_ARGS__)
+#else
+# define log_unit_trace(...) do {} while (0)
+# define log_unit_trace_errno(e, ...) (-ERRNO_VALUE(e))
+#endif
+
+#define log_unit_struct_errno(unit, level, error, ...) \
+ ({ \
+ const Unit *_u = (unit); \
+ const int _l = (level); \
+ unit_log_level_test(_u, _l) ? \
+ log_struct_errno(_l, error, __VA_ARGS__, LOG_UNIT_ID(_u)) : \
+ -ERRNO_VALUE(error); \
+ })
+
+#define log_unit_struct(unit, level, ...) log_unit_struct_errno(unit, level, 0, __VA_ARGS__)
+
+#define log_unit_struct_iovec_errno(unit, level, error, iovec, n_iovec) \
+ ({ \
+ const int _l = (level); \
+ unit_log_level_test(unit, _l) ? \
+ log_struct_iovec_errno(_l, error, iovec, n_iovec) : \
+ -ERRNO_VALUE(error); \
+ })
+
+#define log_unit_struct_iovec(unit, level, iovec, n_iovec) log_unit_struct_iovec_errno(unit, level, 0, iovec, n_iovec)
+
+/* Like LOG_MESSAGE(), but with the unit name prefixed. */
+#define LOG_UNIT_MESSAGE(unit, fmt, ...) LOG_MESSAGE("%s: " fmt, (unit)->id, ##__VA_ARGS__)
+#define LOG_UNIT_ID(unit) (unit)->manager->unit_log_format_string, (unit)->id
+#define LOG_UNIT_INVOCATION_ID(unit) (unit)->manager->invocation_log_format_string, (unit)->invocation_id_string
+
+const char* collect_mode_to_string(CollectMode m) _const_;
+CollectMode collect_mode_from_string(const char *s) _pure_;
+
+typedef struct UnitForEachDependencyData {
+ /* Stores state for the FOREACH macro below for iterating through all deps that have any of the
+ * specified dependency atom bits set */
+ UnitDependencyAtom match_atom;
+ Hashmap *by_type, *by_unit;
+ void *current_type;
+ Iterator by_type_iterator, by_unit_iterator;
+ Unit **current_unit;
+} UnitForEachDependencyData;
+
+/* Iterates through all dependencies that have a specific atom in the dependency type set. This tries to be
+ * smart: if the atom is unique, we'll directly go to right entry. Otherwise we'll iterate through the
+ * per-dependency type hashmap and match all dep that have the right atom set. */
+#define _UNIT_FOREACH_DEPENDENCY(other, u, ma, data) \
+ for (UnitForEachDependencyData data = { \
+ .match_atom = (ma), \
+ .by_type = (u)->dependencies, \
+ .by_type_iterator = ITERATOR_FIRST, \
+ .current_unit = &(other), \
+ }; \
+ ({ \
+ UnitDependency _dt = _UNIT_DEPENDENCY_INVALID; \
+ bool _found; \
+ \
+ if (data.by_type && ITERATOR_IS_FIRST(data.by_type_iterator)) { \
+ _dt = unit_dependency_from_unique_atom(data.match_atom); \
+ if (_dt >= 0) { \
+ data.by_unit = hashmap_get(data.by_type, UNIT_DEPENDENCY_TO_PTR(_dt)); \
+ data.current_type = UNIT_DEPENDENCY_TO_PTR(_dt); \
+ data.by_type = NULL; \
+ _found = !!data.by_unit; \
+ } \
+ } \
+ if (_dt < 0) \
+ _found = hashmap_iterate(data.by_type, \
+ &data.by_type_iterator, \
+ (void**)&(data.by_unit), \
+ (const void**) &(data.current_type)); \
+ _found; \
+ }); ) \
+ if ((unit_dependency_to_atom(UNIT_DEPENDENCY_FROM_PTR(data.current_type)) & data.match_atom) != 0) \
+ for (data.by_unit_iterator = ITERATOR_FIRST; \
+ hashmap_iterate(data.by_unit, \
+ &data.by_unit_iterator, \
+ NULL, \
+ (const void**) data.current_unit); )
+
+/* Note: this matches deps that have *any* of the atoms specified in match_atom set */
+#define UNIT_FOREACH_DEPENDENCY(other, u, match_atom) \
+ _UNIT_FOREACH_DEPENDENCY(other, u, match_atom, UNIQ_T(data, UNIQ))
diff --git a/src/core/user.conf.in b/src/core/user.conf.in
new file mode 100644
index 0000000..b699749
--- /dev/null
+++ b/src/core/user.conf.in
@@ -0,0 +1,50 @@
+# This file is part of systemd.
+#
+# systemd is free software; you can redistribute it and/or modify it under the
+# terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation; either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# Entries in this file show the compile time defaults. Local configuration
+# should be created by either modifying this file, or by creating "drop-ins" in
+# the user.conf.d/ subdirectory. The latter is generally recommended.
+# Defaults can be restored by simply deleting this file and all drop-ins.
+#
+# See systemd-user.conf(5) for details.
+
+[Manager]
+#LogLevel=info
+#LogTarget=auto
+#LogColor=yes
+#LogLocation=no
+#LogTime=no
+#SystemCallArchitectures=
+#TimerSlackNSec=
+#StatusUnitFormat={{STATUS_UNIT_FORMAT_DEFAULT_STR}}
+#DefaultTimerAccuracySec=1min
+#DefaultStandardOutput=inherit
+#DefaultStandardError=inherit
+#DefaultTimeoutStartSec=90s
+#DefaultTimeoutStopSec=90s
+#DefaultTimeoutAbortSec=
+#DefaultRestartSec=100ms
+#DefaultStartLimitIntervalSec=10s
+#DefaultStartLimitBurst=5
+#DefaultEnvironment=
+#DefaultLimitCPU=
+#DefaultLimitFSIZE=
+#DefaultLimitDATA=
+#DefaultLimitSTACK=
+#DefaultLimitCORE=
+#DefaultLimitRSS=
+#DefaultLimitNOFILE=
+#DefaultLimitAS=
+#DefaultLimitNPROC=
+#DefaultLimitMEMLOCK=
+#DefaultLimitLOCKS=
+#DefaultLimitSIGPENDING=
+#DefaultLimitMSGQUEUE=
+#DefaultLimitNICE=
+#DefaultLimitRTPRIO=
+#DefaultLimitRTTIME=
+#DefaultSmackProcessLabel=