summaryrefslogtreecommitdiffstats
path: root/src/core
diff options
context:
space:
mode:
Diffstat (limited to 'src/core')
-rw-r--r--src/core/all-units.h15
-rw-r--r--src/core/apparmor-setup.c99
-rw-r--r--src/core/apparmor-setup.h4
-rw-r--r--src/core/audit-fd.c62
-rw-r--r--src/core/audit-fd.h5
-rw-r--r--src/core/automount.c1149
-rw-r--r--src/core/automount.h45
-rw-r--r--src/core/bpf-devices.c505
-rw-r--r--src/core/bpf-devices.h21
-rw-r--r--src/core/bpf-firewall.c974
-rw-r--r--src/core/bpf-firewall.h25
-rw-r--r--src/core/bpf-foreign.c154
-rw-r--r--src/core/bpf-foreign.h15
-rw-r--r--src/core/bpf-lsm.c320
-rw-r--r--src/core/bpf-lsm.h28
-rw-r--r--src/core/bpf-socket-bind.c244
-rw-r--r--src/core/bpf-socket-bind.h15
-rw-r--r--src/core/bpf-util.c36
-rw-r--r--src/core/bpf-util.h5
-rw-r--r--src/core/bpf/restrict_fs/meson.build24
-rw-r--r--src/core/bpf/restrict_fs/restrict-fs-skel.h14
-rw-r--r--src/core/bpf/restrict_fs/restrict-fs.bpf.c82
-rw-r--r--src/core/bpf/restrict_ifaces/meson.build24
-rw-r--r--src/core/bpf/restrict_ifaces/restrict-ifaces-skel.h14
-rw-r--r--src/core/bpf/restrict_ifaces/restrict-ifaces.bpf.c52
-rw-r--r--src/core/bpf/socket_bind/meson.build24
-rw-r--r--src/core/bpf/socket_bind/socket-bind-api.bpf.h51
-rw-r--r--src/core/bpf/socket_bind/socket-bind-skel.h14
-rw-r--r--src/core/bpf/socket_bind/socket-bind.bpf.c111
-rw-r--r--src/core/cgroup.c4665
-rw-r--r--src/core/cgroup.h429
-rw-r--r--src/core/core-varlink.c652
-rw-r--r--src/core/core-varlink.h16
-rw-r--r--src/core/crash-handler.c193
-rw-r--r--src/core/crash-handler.h7
-rw-r--r--src/core/dbus-automount.c68
-rw-r--r--src/core/dbus-automount.h11
-rw-r--r--src/core/dbus-cgroup.c2287
-rw-r--r--src/core/dbus-cgroup.h15
-rw-r--r--src/core/dbus-device.c11
-rw-r--r--src/core/dbus-device.h6
-rw-r--r--src/core/dbus-execute.c3758
-rw-r--r--src/core/dbus-execute.h35
-rw-r--r--src/core/dbus-job.c374
-rw-r--r--src/core/dbus-job.h20
-rw-r--r--src/core/dbus-kill.c81
-rw-r--r--src/core/dbus-kill.h12
-rw-r--r--src/core/dbus-manager.c3628
-rw-r--r--src/core/dbus-manager.h18
-rw-r--r--src/core/dbus-mount.c174
-rw-r--r--src/core/dbus-mount.h12
-rw-r--r--src/core/dbus-path.c164
-rw-r--r--src/core/dbus-path.h11
-rw-r--r--src/core/dbus-scope.c318
-rw-r--r--src/core/dbus-scope.h19
-rw-r--r--src/core/dbus-service.c791
-rw-r--r--src/core/dbus-service.h15
-rw-r--r--src/core/dbus-slice.c34
-rw-r--r--src/core/dbus-slice.h12
-rw-r--r--src/core/dbus-socket.c470
-rw-r--r--src/core/dbus-socket.h12
-rw-r--r--src/core/dbus-swap.c55
-rw-r--r--src/core/dbus-swap.h16
-rw-r--r--src/core/dbus-target.c9
-rw-r--r--src/core/dbus-target.h6
-rw-r--r--src/core/dbus-timer.c364
-rw-r--r--src/core/dbus-timer.h11
-rw-r--r--src/core/dbus-unit.c2629
-rw-r--r--src/core/dbus-unit.h55
-rw-r--r--src/core/dbus-util.c286
-rw-r--r--src/core/dbus-util.h256
-rw-r--r--src/core/dbus.c1273
-rw-r--r--src/core/dbus.h37
-rw-r--r--src/core/device.c1301
-rw-r--r--src/core/device.h44
-rw-r--r--src/core/dynamic-user.c871
-rw-r--r--src/core/dynamic-user.h49
-rw-r--r--src/core/efi-random.c34
-rw-r--r--src/core/efi-random.h4
-rw-r--r--src/core/emergency-action.c224
-rw-r--r--src/core/emergency-action.h45
-rw-r--r--src/core/exec-credential.c1023
-rw-r--r--src/core/exec-credential.h54
-rw-r--r--src/core/exec-invoke.c5235
-rw-r--r--src/core/exec-invoke.h16
-rw-r--r--src/core/execute-serialize.c3896
-rw-r--r--src/core/execute-serialize.h23
-rw-r--r--src/core/execute.c2742
-rw-r--r--src/core/execute.h701
-rw-r--r--src/core/executor.c272
-rw-r--r--src/core/fuzz-execute-serialize.c89
-rw-r--r--src/core/fuzz-manager-serialize.c36
-rw-r--r--src/core/fuzz-manager-serialize.options2
-rw-r--r--src/core/fuzz-unit-file.c86
-rw-r--r--src/core/fuzz-unit-file.options2
-rw-r--r--src/core/generator-setup.c58
-rw-r--r--src/core/generator-setup.h8
-rw-r--r--src/core/ima-setup.c92
-rw-r--r--src/core/ima-setup.h9
-rw-r--r--src/core/import-creds.c938
-rw-r--r--src/core/import-creds.h4
-rw-r--r--src/core/job.c1712
-rw-r--r--src/core/job.h250
-rw-r--r--src/core/kill.c56
-rw-r--r--src/core/kill.h56
-rw-r--r--src/core/kmod-setup.c201
-rw-r--r--src/core/kmod-setup.h4
-rw-r--r--src/core/load-dropin.c130
-rw-r--r--src/core/load-dropin.h20
-rw-r--r--src/core/load-fragment-gperf-nulstr.awk16
-rw-r--r--src/core/load-fragment-gperf.gperf.in595
-rw-r--r--src/core/load-fragment.c6735
-rw-r--r--src/core/load-fragment.h165
-rw-r--r--src/core/main.c3227
-rw-r--r--src/core/main.h9
-rw-r--r--src/core/manager-dump.c119
-rw-r--r--src/core/manager-dump.h13
-rw-r--r--src/core/manager-serialize.c539
-rw-r--r--src/core/manager-serialize.h13
-rw-r--r--src/core/manager.c5039
-rw-r--r--src/core/manager.h646
-rw-r--r--src/core/meson.build260
-rw-r--r--src/core/mount.c2502
-rw-r--r--src/core/mount.h110
-rw-r--r--src/core/namespace.c3047
-rw-r--r--src/core/namespace.h200
-rw-r--r--src/core/org.freedesktop.systemd1.conf452
-rw-r--r--src/core/org.freedesktop.systemd1.policy.in83
-rw-r--r--src/core/org.freedesktop.systemd1.service13
-rw-r--r--src/core/path.c1075
-rw-r--r--src/core/path.h89
-rw-r--r--src/core/restrict-ifaces.c200
-rw-r--r--src/core/restrict-ifaces.h16
-rw-r--r--src/core/scope.c829
-rw-r--r--src/core/scope.h52
-rw-r--r--src/core/selinux-access.c288
-rw-r--r--src/core/selinux-access.h14
-rw-r--r--src/core/selinux-setup.c106
-rw-r--r--src/core/selinux-setup.h6
-rw-r--r--src/core/service.c5161
-rw-r--r--src/core/service.h290
-rw-r--r--src/core/show-status.c128
-rw-r--r--src/core/show-status.h44
-rw-r--r--src/core/slice.c462
-rw-r--r--src/core/slice.h18
-rw-r--r--src/core/smack-setup.c393
-rw-r--r--src/core/smack-setup.h10
-rw-r--r--src/core/socket.c3617
-rw-r--r--src/core/socket.h204
-rw-r--r--src/core/swap.c1680
-rw-r--r--src/core/swap.h103
-rw-r--r--src/core/system.conf.in83
-rw-r--r--src/core/systemd.pc.in108
-rw-r--r--src/core/target.c216
-rw-r--r--src/core/target.h16
-rw-r--r--src/core/timer.c1106
-rw-r--r--src/core/timer.h91
-rw-r--r--src/core/transaction.c1261
-rw-r--r--src/core/transaction.h51
-rw-r--r--src/core/unit-dependency-atom.c251
-rw-r--r--src/core/unit-dependency-atom.h92
-rw-r--r--src/core/unit-printf.c265
-rw-r--r--src/core/unit-printf.h26
-rw-r--r--src/core/unit-serialize.c890
-rw-r--r--src/core/unit-serialize.h16
-rw-r--r--src/core/unit.c6617
-rw-r--r--src/core/unit.h1249
-rw-r--r--src/core/user.conf.in59
168 files changed, 98698 insertions, 0 deletions
diff --git a/src/core/all-units.h b/src/core/all-units.h
new file mode 100644
index 0000000..fad814b
--- /dev/null
+++ b/src/core/all-units.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "unit.h"
+
+#include "automount.h"
+#include "device.h"
+#include "path.h"
+#include "scope.h"
+#include "service.h"
+#include "slice.h"
+#include "socket.h"
+#include "swap.h"
+#include "target.h"
+#include "timer.h"
diff --git a/src/core/apparmor-setup.c b/src/core/apparmor-setup.c
new file mode 100644
index 0000000..3426a10
--- /dev/null
+++ b/src/core/apparmor-setup.c
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#if HAVE_APPARMOR
+# include <sys/apparmor.h>
+#endif
+#include <unistd.h>
+
+#include "apparmor-setup.h"
+#include "apparmor-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "log.h"
+#include "macro.h"
+#include "string-util.h"
+#include "strv.h"
+
+#if HAVE_APPARMOR
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(aa_policy_cache *, aa_policy_cache_unref, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(aa_features *, aa_features_unref, NULL);
+#endif
+
+int mac_apparmor_setup(void) {
+#if HAVE_APPARMOR
+ _cleanup_(aa_policy_cache_unrefp) aa_policy_cache *policy_cache = NULL;
+ _cleanup_(aa_features_unrefp) aa_features *features = NULL;
+ _cleanup_free_ char *current_profile = NULL, *cache_dir_path = NULL;
+ int r;
+
+ if (!mac_apparmor_use()) {
+ log_debug("AppArmor either not supported by the kernel or disabled.");
+ return 0;
+ }
+
+ /* To enable LSM stacking a patch to the kernel is proposed to create a
+ * per-LSM subdirectory to distinguish between the LSMs. Therefore, we
+ * read the file from the LSM specific directory first and only if that
+ * fails the one from the generic directory.
+ */
+ FOREACH_STRING(current_file, "/proc/self/attr/apparmor/current", "/proc/self/attr/current") {
+ r = read_one_line_file(current_file, &current_profile);
+ if (r == -ENOENT)
+ continue;
+ else if (r < 0)
+ log_warning_errno(r, "Failed to read current AppArmor profile from file %s, ignoring: %m", current_file);
+ else
+ break;
+ }
+ if (!current_profile) {
+ log_warning("Failed to get the current AppArmor profile of systemd from /proc/self/attr/apparmor/current or /proc/self/attr/current, ignoring.");
+ return 0;
+ }
+ if (!streq(current_profile, "unconfined")) {
+ log_debug("We are already confined in an AppArmor profile.");
+ return 0;
+ }
+
+ r = aa_features_new_from_kernel(&features);
+ if (r < 0) {
+ log_warning_errno(errno, "Failed to get the AppArmor feature set from the kernel, ignoring: %m");
+ return 0;
+ }
+ cache_dir_path = aa_policy_cache_dir_path_preview(features, AT_FDCWD, "/etc/apparmor/earlypolicy");
+ if (!cache_dir_path) {
+ log_debug_errno(errno, "Failed to get the path of the early AppArmor policy cache directory.");
+ return 0;
+ }
+
+ /* aa_policy_cache_new will internally use the same path as aa_policy_cache_dir_path_preview has returned. */
+ r = aa_policy_cache_new(&policy_cache, features, AT_FDCWD, "/etc/apparmor/earlypolicy", 0);
+ if (r < 0) {
+ if (errno == ENOENT) {
+ log_debug_errno(errno, "The early AppArmor policy cache directory %s does not exist.", cache_dir_path);
+ return 0;
+ }
+ log_warning_errno(errno, "Failed to create a new AppArmor policy cache, ignoring: %m");
+ return 0;
+ }
+ r = aa_policy_cache_replace_all(policy_cache, NULL);
+ if (r < 0) {
+ log_warning_errno(errno, "Failed to load the profiles from the early AppArmor policy cache directory %s, ignoring: %m", cache_dir_path);
+ return 0;
+ }
+
+ log_info("Successfully loaded all binary profiles from AppArmor early policy cache at %s.", cache_dir_path);
+
+ r = aa_change_profile("systemd");
+ if (r < 0) {
+ if (errno == ENOENT)
+ log_debug_errno(errno, "Failed to change to AppArmor profile 'systemd'. Please ensure that one of the binary profile files in policy cache directory %s contains a profile with that name.", cache_dir_path);
+ else
+ log_error_errno(errno, "Failed to change to AppArmor profile 'systemd': %m");
+ return 0;
+ }
+
+ log_info("Changed to AppArmor profile systemd.");
+#endif
+ return 0;
+}
diff --git a/src/core/apparmor-setup.h b/src/core/apparmor-setup.h
new file mode 100644
index 0000000..f3b7382
--- /dev/null
+++ b/src/core/apparmor-setup.h
@@ -0,0 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+int mac_apparmor_setup(void);
diff --git a/src/core/audit-fd.c b/src/core/audit-fd.c
new file mode 100644
index 0000000..6674fa8
--- /dev/null
+++ b/src/core/audit-fd.c
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+
+#include "audit-fd.h"
+
+#if HAVE_AUDIT
+
+#include <libaudit.h>
+#include <stdbool.h>
+
+#include "capability-util.h"
+#include "fd-util.h"
+#include "log.h"
+
+static bool initialized = false;
+static int audit_fd;
+
+int get_audit_fd(void) {
+
+ if (!initialized) {
+ if (have_effective_cap(CAP_AUDIT_WRITE) <= 0) {
+ audit_fd = -EPERM;
+ initialized = true;
+
+ return audit_fd;
+ }
+
+ audit_fd = audit_open();
+
+ if (audit_fd < 0) {
+ if (!IN_SET(errno, EAFNOSUPPORT, EPROTONOSUPPORT))
+ log_error_errno(errno, "Failed to connect to audit log: %m");
+
+ audit_fd = errno ? -errno : -EINVAL;
+ }
+
+ initialized = true;
+ }
+
+ return audit_fd;
+}
+
+void close_audit_fd(void) {
+
+ if (initialized && audit_fd >= 0)
+ safe_close(audit_fd);
+
+ initialized = true;
+ audit_fd = -ECONNRESET;
+}
+
+#else
+
+int get_audit_fd(void) {
+ return -EAFNOSUPPORT;
+}
+
+void close_audit_fd(void) {
+}
+
+#endif
diff --git a/src/core/audit-fd.h b/src/core/audit-fd.h
new file mode 100644
index 0000000..5cdf61e
--- /dev/null
+++ b/src/core/audit-fd.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+int get_audit_fd(void);
+void close_audit_fd(void);
diff --git a/src/core/automount.c b/src/core/automount.c
new file mode 100644
index 0000000..14bf7e6
--- /dev/null
+++ b/src/core/automount.c
@@ -0,0 +1,1149 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/auto_dev-ioctl.h>
+#include <linux/auto_fs4.h>
+#include <sys/epoll.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "async.h"
+#include "automount.h"
+#include "bus-error.h"
+#include "bus-util.h"
+#include "dbus-automount.h"
+#include "dbus-unit.h"
+#include "fd-util.h"
+#include "format-util.h"
+#include "fstab-util.h"
+#include "io-util.h"
+#include "label-util.h"
+#include "mkdir-label.h"
+#include "mount-util.h"
+#include "mount.h"
+#include "mountpoint-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "serialize.h"
+#include "special.h"
+#include "stdio-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "unit-name.h"
+#include "unit.h"
+
+static const UnitActiveState state_translation_table[_AUTOMOUNT_STATE_MAX] = {
+ [AUTOMOUNT_DEAD] = UNIT_INACTIVE,
+ [AUTOMOUNT_WAITING] = UNIT_ACTIVE,
+ [AUTOMOUNT_RUNNING] = UNIT_ACTIVE,
+ [AUTOMOUNT_FAILED] = UNIT_FAILED
+};
+
+static int open_dev_autofs(Manager *m);
+static int automount_dispatch_io(sd_event_source *s, int fd, uint32_t events, void *userdata);
+static int automount_start_expire(Automount *a);
+static void automount_stop_expire(Automount *a);
+static int automount_send_ready(Automount *a, Set *tokens, int status);
+
+static void automount_init(Unit *u) {
+ Automount *a = AUTOMOUNT(u);
+
+ assert(a);
+ assert(u);
+ assert(u->load_state == UNIT_STUB);
+
+ a->pipe_fd = -EBADF;
+ a->directory_mode = 0755;
+ UNIT(a)->ignore_on_isolate = true;
+}
+
+static void unmount_autofs(Automount *a) {
+ int r;
+
+ assert(a);
+
+ if (a->pipe_fd < 0)
+ return;
+
+ a->pipe_event_source = sd_event_source_disable_unref(a->pipe_event_source);
+ a->pipe_fd = safe_close(a->pipe_fd);
+
+ /* If we reload/reexecute things we keep the mount point around */
+ if (!IN_SET(UNIT(a)->manager->objective, MANAGER_RELOAD, MANAGER_REEXECUTE)) {
+
+ automount_send_ready(a, a->tokens, -EHOSTDOWN);
+ automount_send_ready(a, a->expire_tokens, -EHOSTDOWN);
+
+ if (a->where) {
+ r = repeat_unmount(a->where, MNT_DETACH|UMOUNT_NOFOLLOW);
+ if (r < 0)
+ log_unit_error_errno(UNIT(a), r, "Failed to unmount: %m");
+ }
+ }
+}
+
+static void automount_done(Unit *u) {
+ Automount *a = AUTOMOUNT(u);
+
+ assert(a);
+
+ unmount_autofs(a);
+
+ a->where = mfree(a->where);
+ a->extra_options = mfree(a->extra_options);
+
+ a->tokens = set_free(a->tokens);
+ a->expire_tokens = set_free(a->expire_tokens);
+
+ a->expire_event_source = sd_event_source_disable_unref(a->expire_event_source);
+}
+
+static int automount_add_trigger_dependencies(Automount *a) {
+ Unit *x;
+ int r;
+
+ assert(a);
+
+ r = unit_load_related_unit(UNIT(a), ".mount", &x);
+ if (r < 0)
+ return r;
+
+ return unit_add_two_dependencies(UNIT(a), UNIT_BEFORE, UNIT_TRIGGERS, x, true, UNIT_DEPENDENCY_IMPLICIT);
+}
+
+static int automount_add_mount_dependencies(Automount *a) {
+ _cleanup_free_ char *parent = NULL;
+ int r;
+
+ assert(a);
+
+ r = path_extract_directory(a->where, &parent);
+ if (r < 0)
+ return r;
+
+ return unit_require_mounts_for(UNIT(a), parent, UNIT_DEPENDENCY_IMPLICIT);
+}
+
+static int automount_add_default_dependencies(Automount *a) {
+ int r;
+
+ assert(a);
+
+ if (!UNIT(a)->default_dependencies)
+ return 0;
+
+ if (!MANAGER_IS_SYSTEM(UNIT(a)->manager))
+ return 0;
+
+ r = unit_add_dependency_by_name(UNIT(a), UNIT_BEFORE, SPECIAL_LOCAL_FS_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+ if (r < 0)
+ return r;
+
+ r = unit_add_dependency_by_name(UNIT(a), UNIT_AFTER, SPECIAL_LOCAL_FS_PRE_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+ if (r < 0)
+ return r;
+
+ r = unit_add_two_dependencies_by_name(UNIT(a), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_UMOUNT_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+static int automount_verify(Automount *a) {
+ static const char *const reserved_options[] = {
+ "fd\0",
+ "pgrp\0",
+ "minproto\0",
+ "maxproto\0",
+ "direct\0",
+ "indirect\0",
+ };
+
+ _cleanup_free_ char *e = NULL;
+ int r;
+
+ assert(a);
+ assert(UNIT(a)->load_state == UNIT_LOADED);
+
+ if (path_equal(a->where, "/"))
+ return log_unit_error_errno(UNIT(a), SYNTHETIC_ERRNO(ENOEXEC), "Cannot have an automount unit for the root directory. Refusing.");
+
+ r = unit_name_from_path(a->where, ".automount", &e);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(a), r, "Failed to generate unit name from path: %m");
+
+ if (!unit_has_name(UNIT(a), e))
+ return log_unit_error_errno(UNIT(a), SYNTHETIC_ERRNO(ENOEXEC), "Where= setting doesn't match unit name. Refusing.");
+
+ for (size_t i = 0; i < ELEMENTSOF(reserved_options); i++)
+ if (fstab_test_option(a->extra_options, reserved_options[i]))
+ return log_unit_error_errno(
+ UNIT(a),
+ SYNTHETIC_ERRNO(ENOEXEC),
+ "ExtraOptions= setting may not contain reserved option %s.",
+ reserved_options[i]);
+
+ return 0;
+}
+
+static int automount_set_where(Automount *a) {
+ int r;
+
+ assert(a);
+
+ if (a->where)
+ return 0;
+
+ r = unit_name_to_path(UNIT(a)->id, &a->where);
+ if (r < 0)
+ return r;
+
+ path_simplify(a->where);
+ return 1;
+}
+
+static int automount_add_extras(Automount *a) {
+ int r;
+
+ r = automount_set_where(a);
+ if (r < 0)
+ return r;
+
+ r = automount_add_trigger_dependencies(a);
+ if (r < 0)
+ return r;
+
+ r = automount_add_mount_dependencies(a);
+ if (r < 0)
+ return r;
+
+ return automount_add_default_dependencies(a);
+}
+
+static int automount_load(Unit *u) {
+ Automount *a = AUTOMOUNT(u);
+ int r;
+
+ assert(u);
+ assert(u->load_state == UNIT_STUB);
+
+ /* Load a .automount file */
+ r = unit_load_fragment_and_dropin(u, true);
+ if (r < 0)
+ return r;
+
+ if (u->load_state != UNIT_LOADED)
+ return 0;
+
+ r = automount_add_extras(a);
+ if (r < 0)
+ return r;
+
+ return automount_verify(a);
+}
+
+static void automount_set_state(Automount *a, AutomountState state) {
+ AutomountState old_state;
+ assert(a);
+
+ if (a->state != state)
+ bus_unit_send_pending_change_signal(UNIT(a), false);
+
+ old_state = a->state;
+ a->state = state;
+
+ if (state != AUTOMOUNT_RUNNING)
+ automount_stop_expire(a);
+
+ if (!IN_SET(state, AUTOMOUNT_WAITING, AUTOMOUNT_RUNNING))
+ unmount_autofs(a);
+
+ if (state != old_state)
+ log_unit_debug(UNIT(a), "Changed %s -> %s", automount_state_to_string(old_state), automount_state_to_string(state));
+
+ unit_notify(UNIT(a), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true);
+}
+
+static int automount_coldplug(Unit *u) {
+ Automount *a = AUTOMOUNT(u);
+ int r;
+
+ assert(a);
+ assert(a->state == AUTOMOUNT_DEAD);
+
+ if (a->deserialized_state == a->state)
+ return 0;
+
+ if (IN_SET(a->deserialized_state, AUTOMOUNT_WAITING, AUTOMOUNT_RUNNING)) {
+
+ r = automount_set_where(a);
+ if (r < 0)
+ return r;
+
+ r = open_dev_autofs(u->manager);
+ if (r < 0)
+ return r;
+
+ assert(a->pipe_fd >= 0);
+
+ r = sd_event_add_io(u->manager->event, &a->pipe_event_source, a->pipe_fd, EPOLLIN, automount_dispatch_io, u);
+ if (r < 0)
+ return r;
+
+ (void) sd_event_source_set_description(a->pipe_event_source, "automount-io");
+ if (a->deserialized_state == AUTOMOUNT_RUNNING) {
+ r = automount_start_expire(a);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(a), r, "Failed to start expiration timer, ignoring: %m");
+ }
+
+ automount_set_state(a, a->deserialized_state);
+ }
+
+ return 0;
+}
+
+static void automount_dump(Unit *u, FILE *f, const char *prefix) {
+ Automount *a = AUTOMOUNT(u);
+
+ assert(a);
+
+ fprintf(f,
+ "%sAutomount State: %s\n"
+ "%sResult: %s\n"
+ "%sWhere: %s\n"
+ "%sExtraOptions: %s\n"
+ "%sDirectoryMode: %04o\n"
+ "%sTimeoutIdleUSec: %s\n",
+ prefix, automount_state_to_string(a->state),
+ prefix, automount_result_to_string(a->result),
+ prefix, a->where,
+ prefix, a->extra_options,
+ prefix, a->directory_mode,
+ prefix, FORMAT_TIMESPAN(a->timeout_idle_usec, USEC_PER_SEC));
+}
+
+static void automount_enter_dead(Automount *a, AutomountResult f) {
+ assert(a);
+
+ if (a->result == AUTOMOUNT_SUCCESS)
+ a->result = f;
+
+ unit_log_result(UNIT(a), a->result == AUTOMOUNT_SUCCESS, automount_result_to_string(a->result));
+ automount_set_state(a, a->result != AUTOMOUNT_SUCCESS ? AUTOMOUNT_FAILED : AUTOMOUNT_DEAD);
+}
+
+static int open_dev_autofs(Manager *m) {
+ struct autofs_dev_ioctl param;
+ int r;
+
+ assert(m);
+
+ if (m->dev_autofs_fd >= 0)
+ return m->dev_autofs_fd;
+
+ (void) label_fix("/dev/autofs", 0);
+
+ m->dev_autofs_fd = open("/dev/autofs", O_CLOEXEC|O_RDONLY);
+ if (m->dev_autofs_fd < 0)
+ return log_error_errno(errno, "Failed to open /dev/autofs: %m");
+
+ init_autofs_dev_ioctl(&param);
+ r = RET_NERRNO(ioctl(m->dev_autofs_fd, AUTOFS_DEV_IOCTL_VERSION, &param));
+ if (r < 0) {
+ m->dev_autofs_fd = safe_close(m->dev_autofs_fd);
+ return log_error_errno(r, "Failed to issue AUTOFS_DEV_IOCTL_VERSION ioctl: %m");
+ }
+
+ log_debug("Autofs kernel version %u.%u", param.ver_major, param.ver_minor);
+
+ return m->dev_autofs_fd;
+}
+
+static int open_ioctl_fd(int dev_autofs_fd, const char *where, dev_t devid) {
+ struct autofs_dev_ioctl *param;
+ size_t l;
+
+ assert(dev_autofs_fd >= 0);
+ assert(where);
+
+ l = sizeof(struct autofs_dev_ioctl) + strlen(where) + 1;
+ param = alloca_safe(l);
+
+ init_autofs_dev_ioctl(param);
+ param->size = l;
+ param->ioctlfd = -EBADF;
+ param->openmount.devid = devid;
+ strcpy(param->path, where);
+
+ if (ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_OPENMOUNT, param) < 0)
+ return -errno;
+
+ if (param->ioctlfd < 0)
+ return -EIO;
+
+ (void) fd_cloexec(param->ioctlfd, true);
+ return param->ioctlfd;
+}
+
+static int autofs_protocol(int dev_autofs_fd, int ioctl_fd) {
+ uint32_t major, minor;
+ struct autofs_dev_ioctl param;
+
+ assert(dev_autofs_fd >= 0);
+ assert(ioctl_fd >= 0);
+
+ init_autofs_dev_ioctl(&param);
+ param.ioctlfd = ioctl_fd;
+
+ if (ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_PROTOVER, &param) < 0)
+ return -errno;
+
+ major = param.protover.version;
+
+ init_autofs_dev_ioctl(&param);
+ param.ioctlfd = ioctl_fd;
+
+ if (ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_PROTOSUBVER, &param) < 0)
+ return -errno;
+
+ minor = param.protosubver.sub_version;
+
+ log_debug("Autofs protocol version %u.%u", major, minor);
+ return 0;
+}
+
+static int autofs_set_timeout(int dev_autofs_fd, int ioctl_fd, usec_t usec) {
+ struct autofs_dev_ioctl param;
+
+ assert(dev_autofs_fd >= 0);
+ assert(ioctl_fd >= 0);
+
+ init_autofs_dev_ioctl(&param);
+ param.ioctlfd = ioctl_fd;
+
+ if (usec == USEC_INFINITY)
+ param.timeout.timeout = 0;
+ else
+ /* Convert to seconds, rounding up. */
+ param.timeout.timeout = DIV_ROUND_UP(usec, USEC_PER_SEC);
+
+ return RET_NERRNO(ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_TIMEOUT, &param));
+}
+
+static int autofs_send_ready(int dev_autofs_fd, int ioctl_fd, uint32_t token, int status) {
+ struct autofs_dev_ioctl param;
+
+ assert(dev_autofs_fd >= 0);
+ assert(ioctl_fd >= 0);
+
+ init_autofs_dev_ioctl(&param);
+ param.ioctlfd = ioctl_fd;
+
+ if (status != 0) {
+ param.fail.token = token;
+ param.fail.status = status;
+ } else
+ param.ready.token = token;
+
+ return RET_NERRNO(ioctl(dev_autofs_fd, status ? AUTOFS_DEV_IOCTL_FAIL : AUTOFS_DEV_IOCTL_READY, &param));
+}
+
+static int automount_send_ready(Automount *a, Set *tokens, int status) {
+ _cleanup_close_ int ioctl_fd = -EBADF;
+ unsigned token;
+ int r;
+
+ assert(a);
+ assert(status <= 0);
+
+ if (set_isempty(tokens))
+ return 0;
+
+ ioctl_fd = open_ioctl_fd(UNIT(a)->manager->dev_autofs_fd, a->where, a->dev_id);
+ if (ioctl_fd < 0)
+ return ioctl_fd;
+
+ if (status != 0)
+ log_unit_debug_errno(UNIT(a), status, "Sending failure: %m");
+ else
+ log_unit_debug(UNIT(a), "Sending success.");
+
+ r = 0;
+
+ /* Autofs thankfully does not hand out 0 as a token */
+ while ((token = PTR_TO_UINT(set_steal_first(tokens)))) {
+ int k;
+
+ /* Autofs fun fact:
+ *
+ * if you pass a positive status code here, kernels
+ * prior to 4.12 will freeze! Yay! */
+
+ k = autofs_send_ready(UNIT(a)->manager->dev_autofs_fd,
+ ioctl_fd,
+ token,
+ status);
+ if (k < 0)
+ r = k;
+ }
+
+ return r;
+}
+
+static void automount_trigger_notify(Unit *u, Unit *other) {
+ Automount *a = AUTOMOUNT(u);
+ int r;
+
+ assert(a);
+ assert(other);
+
+ /* Filter out invocations with bogus state */
+ assert(UNIT_IS_LOAD_COMPLETE(other->load_state));
+ assert(other->type == UNIT_MOUNT);
+
+ /* Don't propagate state changes from the mount if we are already down */
+ if (!IN_SET(a->state, AUTOMOUNT_WAITING, AUTOMOUNT_RUNNING))
+ return;
+
+ /* Propagate start limit hit state */
+ if (other->start_limit_hit) {
+ automount_enter_dead(a, AUTOMOUNT_FAILURE_MOUNT_START_LIMIT_HIT);
+ return;
+ }
+
+ /* Don't propagate anything if there's still a job queued */
+ if (other->job)
+ return;
+
+ /* The mount is successfully established */
+ if (IN_SET(MOUNT(other)->state, MOUNT_MOUNTED, MOUNT_REMOUNTING)) {
+ (void) automount_send_ready(a, a->tokens, 0);
+
+ r = automount_start_expire(a);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(a), r, "Failed to start expiration timer, ignoring: %m");
+
+ automount_set_state(a, AUTOMOUNT_RUNNING);
+ }
+
+ if (IN_SET(MOUNT(other)->state,
+ MOUNT_MOUNTING, MOUNT_MOUNTING_DONE,
+ MOUNT_MOUNTED, MOUNT_REMOUNTING,
+ MOUNT_REMOUNTING_SIGTERM, MOUNT_REMOUNTING_SIGKILL,
+ MOUNT_UNMOUNTING_SIGTERM, MOUNT_UNMOUNTING_SIGKILL,
+ MOUNT_FAILED))
+ (void) automount_send_ready(a, a->expire_tokens, -ENODEV);
+
+ if (MOUNT(other)->state == MOUNT_DEAD)
+ (void) automount_send_ready(a, a->expire_tokens, 0);
+
+ /* The mount is in some unhappy state now, let's unfreeze any waiting clients */
+ if (IN_SET(MOUNT(other)->state,
+ MOUNT_DEAD, MOUNT_UNMOUNTING,
+ MOUNT_REMOUNTING_SIGTERM, MOUNT_REMOUNTING_SIGKILL,
+ MOUNT_UNMOUNTING_SIGTERM, MOUNT_UNMOUNTING_SIGKILL,
+ MOUNT_FAILED)) {
+
+ (void) automount_send_ready(a, a->tokens, -ENODEV);
+
+ automount_set_state(a, AUTOMOUNT_WAITING);
+ }
+}
+
+static void automount_enter_waiting(Automount *a) {
+ _cleanup_close_pair_ int pipe_fd[2] = EBADF_PAIR;
+ _cleanup_close_ int ioctl_fd = -EBADF;
+ char name[STRLEN("systemd-") + DECIMAL_STR_MAX(pid_t) + 1];
+ _cleanup_free_ char *options = NULL;
+ bool mounted = false;
+ int r, dev_autofs_fd;
+ struct stat st;
+
+ assert(a);
+ assert(a->pipe_fd < 0);
+ assert(a->where);
+
+ set_clear(a->tokens);
+
+ r = unit_fail_if_noncanonical(UNIT(a), a->where);
+ if (r < 0)
+ goto fail;
+
+ (void) mkdir_p_label(a->where, a->directory_mode);
+
+ unit_warn_if_dir_nonempty(UNIT(a), a->where);
+
+ dev_autofs_fd = open_dev_autofs(UNIT(a)->manager);
+ if (dev_autofs_fd < 0)
+ goto fail;
+
+ if (pipe2(pipe_fd, O_CLOEXEC) < 0) {
+ log_unit_warning_errno(UNIT(a), errno, "Failed to allocate autofs pipe: %m");
+ goto fail;
+ }
+ r = fd_nonblock(pipe_fd[0], true);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(a), r, "Failed to make read side of pipe non-blocking: %m");
+ goto fail;
+ }
+
+ if (asprintf(
+ &options,
+ "fd=%i,pgrp="PID_FMT",minproto=5,maxproto=5,direct%s%s",
+ pipe_fd[1],
+ getpgrp(),
+ isempty(a->extra_options) ? "" : ",",
+ strempty(a->extra_options)) < 0) {
+ log_oom();
+ goto fail;
+ }
+
+ xsprintf(name, "systemd-"PID_FMT, getpid_cached());
+ r = mount_nofollow_verbose(LOG_WARNING, name, a->where, "autofs", 0, options);
+ if (r < 0)
+ goto fail;
+
+ mounted = true;
+
+ pipe_fd[1] = safe_close(pipe_fd[1]);
+
+ if (stat(a->where, &st) < 0) {
+ log_unit_warning_errno(UNIT(a), errno, "Failed to stat new automount point '%s': %m", a->where);
+ goto fail;
+ }
+
+ ioctl_fd = open_ioctl_fd(dev_autofs_fd, a->where, st.st_dev);
+ if (ioctl_fd < 0) {
+ log_unit_warning_errno(UNIT(a), ioctl_fd, "Failed to open automount ioctl fd for '%s': %m", a->where);
+ goto fail;
+ }
+
+ r = autofs_protocol(dev_autofs_fd, ioctl_fd);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(a), r, "Failed to validate autofs protocol for '%s': %m", a->where);
+ goto fail;
+ }
+
+ r = autofs_set_timeout(dev_autofs_fd, ioctl_fd, a->timeout_idle_usec);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(a), r, "Failed to set autofs timeout for '%s': %m", a->where);
+ goto fail;
+ }
+
+ r = sd_event_add_io(UNIT(a)->manager->event, &a->pipe_event_source, pipe_fd[0], EPOLLIN, automount_dispatch_io, a);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(a), r, "Failed to allocate IO event source for autofs mount '%s': %m", a->where);
+ goto fail;
+ }
+
+ (void) sd_event_source_set_description(a->pipe_event_source, "automount-io");
+
+ a->pipe_fd = TAKE_FD(pipe_fd[0]);
+ a->dev_id = st.st_dev;
+
+ automount_set_state(a, AUTOMOUNT_WAITING);
+ return;
+
+fail:
+ if (mounted) {
+ r = repeat_unmount(a->where, MNT_DETACH|UMOUNT_NOFOLLOW);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(a), r, "Failed to unmount, ignoring: %m");
+ }
+
+ automount_enter_dead(a, AUTOMOUNT_FAILURE_RESOURCES);
+}
+
+static int asynchronous_expire(int dev_autofs_fd, int ioctl_fd) {
+ int r;
+
+ assert(dev_autofs_fd >= 0);
+ assert(ioctl_fd >= 0);
+
+ /* Issue AUTOFS_DEV_IOCTL_EXPIRE in subprocess, asynchronously. Note that we don't keep track of the
+ * child's PID, we are PID1/autoreaper after all, hence when it dies we'll automatically clean it up
+ * anyway. */
+
+ r = safe_fork_full("(sd-expire)",
+ /* stdio_fds= */ NULL,
+ (int[]) { dev_autofs_fd, ioctl_fd },
+ /* n_except_fds= */ 2,
+ FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_REOPEN_LOG,
+ /* pid= */ NULL);
+ if (r != 0)
+ return r;
+
+ /* Child */
+ for (;;) {
+ struct autofs_dev_ioctl param;
+ init_autofs_dev_ioctl(&param);
+ param.ioctlfd = ioctl_fd;
+
+ if (ioctl(dev_autofs_fd, AUTOFS_DEV_IOCTL_EXPIRE, &param) < 0)
+ break;
+ }
+
+ if (errno != EAGAIN)
+ log_warning_errno(errno, "Failed to expire automount, ignoring: %m");
+
+ _exit(EXIT_SUCCESS);
+}
+
+static int automount_dispatch_expire(sd_event_source *source, usec_t usec, void *userdata) {
+ _cleanup_close_ int ioctl_fd = -EBADF;
+ Automount *a = AUTOMOUNT(userdata);
+ int r;
+
+ assert(a);
+ assert(source == a->expire_event_source);
+
+ ioctl_fd = open_ioctl_fd(UNIT(a)->manager->dev_autofs_fd, a->where, a->dev_id);
+ if (ioctl_fd < 0)
+ return log_unit_error_errno(UNIT(a), ioctl_fd, "Couldn't open autofs ioctl fd: %m");
+
+ r = asynchronous_expire(UNIT(a)->manager->dev_autofs_fd, ioctl_fd);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(a), r, "Failed to start expire job: %m");
+
+ return automount_start_expire(a);
+}
+
+static int automount_start_expire(Automount *a) {
+ usec_t timeout;
+ int r;
+
+ assert(a);
+
+ if (a->timeout_idle_usec == 0)
+ return 0;
+
+ timeout = MAX(a->timeout_idle_usec/3, USEC_PER_SEC);
+
+ if (a->expire_event_source) {
+ r = sd_event_source_set_time_relative(a->expire_event_source, timeout);
+ if (r < 0)
+ return r;
+
+ return sd_event_source_set_enabled(a->expire_event_source, SD_EVENT_ONESHOT);
+ }
+
+ r = sd_event_add_time_relative(
+ UNIT(a)->manager->event,
+ &a->expire_event_source,
+ CLOCK_MONOTONIC, timeout, 0,
+ automount_dispatch_expire, a);
+ if (r < 0)
+ return r;
+
+ (void) sd_event_source_set_description(a->expire_event_source, "automount-expire");
+
+ return 0;
+}
+
+static void automount_stop_expire(Automount *a) {
+ assert(a);
+
+ if (!a->expire_event_source)
+ return;
+
+ (void) sd_event_source_set_enabled(a->expire_event_source, SD_EVENT_OFF);
+}
+
+/* Reacts to a kernel automount request: after sanity checks, queues a start job for
+ * the triggered mount unit and moves this unit to AUTOMOUNT_RUNNING. On unrecoverable
+ * problems the unit enters the dead state with AUTOMOUNT_FAILURE_RESOURCES. */
+static void automount_enter_running(Automount *a) {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ Unit *trigger;
+ struct stat st;
+ int r;
+
+ assert(a);
+
+ /* If the user masked our unit in the meantime, fail */
+ if (UNIT(a)->load_state != UNIT_LOADED) {
+ log_unit_error(UNIT(a), "Suppressing automount event since unit is no longer loaded.");
+ goto fail;
+ }
+
+ /* We don't take mount requests anymore if we are supposed to
+ * shut down anyway */
+ if (unit_stop_pending(UNIT(a))) {
+ log_unit_debug(UNIT(a), "Suppressing automount request since unit stop is scheduled.");
+ /* Fail all queued wait tokens so blocked processes are released. */
+ automount_send_ready(a, a->tokens, -EHOSTDOWN);
+ automount_send_ready(a, a->expire_tokens, -EHOSTDOWN);
+ return;
+ }
+
+ (void) mkdir_p_label(a->where, a->directory_mode);
+
+ /* Before we do anything, let's see if somebody is playing games with us? */
+ if (lstat(a->where, &st) < 0) {
+ log_unit_warning_errno(UNIT(a), errno, "Failed to stat automount point: %m");
+ goto fail;
+ }
+
+ /* The mount unit may have been explicitly started before we got the
+ * autofs request. Ack it to unblock anything waiting on the mount point. */
+ if (!S_ISDIR(st.st_mode) || st.st_dev != a->dev_id) {
+ log_unit_info(UNIT(a), "Automount point already active?");
+ automount_send_ready(a, a->tokens, 0);
+ return;
+ }
+
+ trigger = UNIT_TRIGGER(UNIT(a));
+ if (!trigger) {
+ log_unit_error(UNIT(a), "Unit to trigger vanished.");
+ goto fail;
+ }
+
+ /* Queue a start job for the backing mount unit. */
+ r = manager_add_job(UNIT(a)->manager, JOB_START, trigger, JOB_REPLACE, NULL, &error, NULL);
+ if (r < 0) {
+ log_unit_warning(UNIT(a), "Failed to queue mount startup job: %s", bus_error_message(&error, r));
+ goto fail;
+ }
+
+ automount_set_state(a, AUTOMOUNT_RUNNING);
+ return;
+
+fail:
+ automount_enter_dead(a, AUTOMOUNT_FAILURE_RESOURCES);
+}
+
+/* UnitVTable.start: refuses to start when the path is already a mount point,
+ * verifies the trigger unit is loaded, acquires an invocation ID and enters the
+ * waiting state. Returns 1 on success, negative errno on failure. */
+static int automount_start(Unit *u) {
+ Automount *a = AUTOMOUNT(u);
+ int r;
+
+ assert(a);
+ assert(IN_SET(a->state, AUTOMOUNT_DEAD, AUTOMOUNT_FAILED));
+
+ if (path_is_mount_point(a->where, NULL, 0) > 0)
+ return log_unit_error_errno(u, SYNTHETIC_ERRNO(EEXIST), "Path %s is already a mount point, refusing start.", a->where);
+
+ r = unit_test_trigger_loaded(u);
+ if (r < 0)
+ return r;
+
+ r = unit_acquire_invocation_id(u);
+ if (r < 0)
+ return r;
+
+ a->result = AUTOMOUNT_SUCCESS;
+ automount_enter_waiting(a);
+ return 1;
+}
+
+/* UnitVTable.stop: tears the automount down with a success result. Returns 1. */
+static int automount_stop(Unit *u) {
+ Automount *a = AUTOMOUNT(u);
+
+ assert(a);
+ assert(IN_SET(a->state, AUTOMOUNT_WAITING, AUTOMOUNT_RUNNING));
+
+ automount_enter_dead(a, AUTOMOUNT_SUCCESS);
+ return 1;
+}
+
+/* UnitVTable.serialize: writes state, result, device ID, all pending kernel wait
+ * tokens and the autofs pipe fd so the unit survives a daemon re-execution. */
+static int automount_serialize(Unit *u, FILE *f, FDSet *fds) {
+ Automount *a = AUTOMOUNT(u);
+ void *p;
+ int r;
+
+ assert(a);
+ assert(f);
+ assert(fds);
+
+ (void) serialize_item(f, "state", automount_state_to_string(a->state));
+ (void) serialize_item(f, "result", automount_result_to_string(a->result));
+ (void) serialize_item_format(f, "dev-id", "%lu", (unsigned long) a->dev_id);
+
+ /* Tokens are stored as integers disguised as pointers in the sets. */
+ SET_FOREACH(p, a->tokens)
+ (void) serialize_item_format(f, "token", "%u", PTR_TO_UINT(p));
+ SET_FOREACH(p, a->expire_tokens)
+ (void) serialize_item_format(f, "expire-token", "%u", PTR_TO_UINT(p));
+
+ r = serialize_fd(f, fds, "pipe-fd", a->pipe_fd);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+/* UnitVTable.deserialize_item: restores one key/value pair written by
+ * automount_serialize(). Parse failures are logged at debug level and tolerated
+ * so that a partially unreadable serialization does not abort the reload. */
+static int automount_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
+ Automount *a = AUTOMOUNT(u);
+ int r;
+
+ assert(a);
+ assert(fds);
+
+ if (streq(key, "state")) {
+ AutomountState state;
+
+ state = automount_state_from_string(value);
+ if (state < 0)
+ log_unit_debug(u, "Failed to parse state value: %s", value);
+ else
+ a->deserialized_state = state;
+ } else if (streq(key, "result")) {
+ AutomountResult f;
+
+ f = automount_result_from_string(value);
+ if (f < 0)
+ log_unit_debug(u, "Failed to parse result value: %s", value);
+ else if (f != AUTOMOUNT_SUCCESS)
+ a->result = f;
+
+ } else if (streq(key, "dev-id")) {
+ unsigned long d;
+
+ if (safe_atolu(value, &d) < 0)
+ log_unit_debug(u, "Failed to parse dev-id value: %s", value);
+ else
+ a->dev_id = (dev_t) d;
+
+ } else if (streq(key, "token")) {
+ unsigned token;
+
+ if (safe_atou(value, &token) < 0)
+ log_unit_debug(u, "Failed to parse token value: %s", value);
+ else {
+ r = set_ensure_put(&a->tokens, NULL, UINT_TO_PTR(token));
+ if (r < 0)
+ log_unit_error_errno(u, r, "Failed to add token to set: %m");
+ }
+ } else if (streq(key, "expire-token")) {
+ unsigned token;
+
+ if (safe_atou(value, &token) < 0)
+ log_unit_debug(u, "Failed to parse token value: %s", value);
+ else {
+ r = set_ensure_put(&a->expire_tokens, NULL, UINT_TO_PTR(token));
+ if (r < 0)
+ log_unit_error_errno(u, r, "Failed to add expire token to set: %m");
+ }
+ } else if (streq(key, "pipe-fd")) {
+ /* Drop any fd we already hold before adopting the deserialized one. */
+ safe_close(a->pipe_fd);
+ a->pipe_fd = deserialize_fd(fds, value);
+ } else
+ log_unit_debug(u, "Unknown serialization key: %s", key);
+
+ return 0;
+}
+
+/* UnitVTable.active_state: maps the automount-specific state to the generic one. */
+static UnitActiveState automount_active_state(Unit *u) {
+ assert(u);
+
+ return state_translation_table[AUTOMOUNT(u)->state];
+}
+
+/* UnitVTable.sub_state_to_string: human-readable name of the current sub-state. */
+static const char *automount_sub_state_to_string(Unit *u) {
+ assert(u);
+
+ return automount_state_to_string(AUTOMOUNT(u)->state);
+}
+
+/* UnitVTable.may_gc: an automount unit may be garbage-collected only if its
+ * trigger unit is gone or is itself collectable. */
+static bool automount_may_gc(Unit *u) {
+ Unit *t;
+
+ assert(u);
+
+ t = UNIT_TRIGGER(u);
+ if (!t)
+ return true;
+
+ return UNIT_VTABLE(t)->may_gc(t);
+}
+
+/* sd-event I/O callback for the autofs kernel pipe: reads one autofs v5 packet and
+ * dispatches on its type — a "missing" packet triggers mounting, an "expire" packet
+ * triggers unmounting. Wait-queue tokens are remembered so the kernel can be acked
+ * once the corresponding job finishes. */
+static int automount_dispatch_io(sd_event_source *s, int fd, uint32_t events, void *userdata) {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ union autofs_v5_packet_union packet;
+ Automount *a = AUTOMOUNT(userdata);
+ Unit *trigger;
+ int r;
+
+ assert(a);
+ assert(fd == a->pipe_fd);
+
+ if (events & (EPOLLHUP|EPOLLERR)) {
+ log_unit_error(UNIT(a), "Got hangup/error on autofs pipe from kernel. Likely our automount point has been unmounted by someone or something else?");
+ automount_enter_dead(a, AUTOMOUNT_FAILURE_UNMOUNTED);
+ return 0;
+ }
+
+ if (events != EPOLLIN) {
+ log_unit_error(UNIT(a), "Got invalid poll event %"PRIu32" on pipe (fd=%d)", events, fd);
+ goto fail;
+ }
+
+ /* Packets are fixed-size; a short read indicates a broken pipe/protocol. */
+ r = loop_read_exact(a->pipe_fd, &packet, sizeof(packet), true);
+ if (r < 0) {
+ log_unit_error_errno(UNIT(a), r, "Invalid read from pipe: %m");
+ goto fail;
+ }
+
+ switch (packet.hdr.type) {
+
+ case autofs_ptype_missing_direct:
+
+ if (packet.v5_packet.pid > 0) {
+ _cleanup_free_ char *p = NULL;
+
+ (void) pid_get_comm(packet.v5_packet.pid, &p);
+ log_unit_info(UNIT(a), "Got automount request for %s, triggered by %"PRIu32" (%s)", a->where, packet.v5_packet.pid, strna(p));
+ } else
+ log_unit_debug(UNIT(a), "Got direct mount request on %s", a->where);
+
+ /* Remember the token so the requester can be unblocked later. */
+ r = set_ensure_put(&a->tokens, NULL, UINT_TO_PTR(packet.v5_packet.wait_queue_token));
+ if (r < 0) {
+ log_unit_error_errno(UNIT(a), r, "Failed to remember token: %m");
+ goto fail;
+ }
+
+ automount_enter_running(a);
+ break;
+
+ case autofs_ptype_expire_direct:
+ log_unit_debug(UNIT(a), "Got direct umount request on %s", a->where);
+
+ /* The kernel initiated expiry; no need to keep probing ourselves. */
+ automount_stop_expire(a);
+
+ r = set_ensure_put(&a->expire_tokens, NULL, UINT_TO_PTR(packet.v5_packet.wait_queue_token));
+ if (r < 0) {
+ log_unit_error_errno(UNIT(a), r, "Failed to remember token: %m");
+ goto fail;
+ }
+
+ trigger = UNIT_TRIGGER(UNIT(a));
+ if (!trigger) {
+ log_unit_error(UNIT(a), "Unit to trigger vanished.");
+ goto fail;
+ }
+
+ /* Queue a stop job for the backing mount unit. */
+ r = manager_add_job(UNIT(a)->manager, JOB_STOP, trigger, JOB_REPLACE, NULL, &error, NULL);
+ if (r < 0) {
+ log_unit_warning(UNIT(a), "Failed to queue unmount job: %s", bus_error_message(&error, r));
+ goto fail;
+ }
+ break;
+
+ default:
+ log_unit_error(UNIT(a), "Received unknown automount request %i", packet.hdr.type);
+ break;
+ }
+
+ return 0;
+
+fail:
+ automount_enter_dead(a, AUTOMOUNT_FAILURE_RESOURCES);
+ return 0;
+}
+
+/* UnitVTable.shutdown: closes the manager's shared /dev/autofs fd. */
+static void automount_shutdown(Manager *m) {
+ assert(m);
+
+ m->dev_autofs_fd = safe_close(m->dev_autofs_fd);
+}
+
+/* UnitVTable.reset_failed: clears a failed state and the stored failure result. */
+static void automount_reset_failed(Unit *u) {
+ Automount *a = AUTOMOUNT(u);
+
+ assert(a);
+
+ if (a->state == AUTOMOUNT_FAILED)
+ automount_set_state(a, AUTOMOUNT_DEAD);
+
+ a->result = AUTOMOUNT_SUCCESS;
+}
+
+/* UnitVTable.supported: automount units require /dev/autofs (the autofs4 kernel
+ * module). The result is probed once and cached for the process lifetime. */
+static bool automount_supported(void) {
+ static int supported = -1;
+
+ if (supported < 0)
+ supported = access("/dev/autofs", F_OK) >= 0;
+
+ return supported;
+}
+
+/* UnitVTable.can_start: enforces the unit's start rate limit; on a hit the unit
+ * enters the dead state with AUTOMOUNT_FAILURE_START_LIMIT_HIT. Returns 1 if the
+ * unit may start, negative errno otherwise. */
+static int automount_can_start(Unit *u) {
+ Automount *a = AUTOMOUNT(u);
+ int r;
+
+ assert(a);
+
+ r = unit_test_start_limit(u);
+ if (r < 0) {
+ automount_enter_dead(a, AUTOMOUNT_FAILURE_START_LIMIT_HIT);
+ return r;
+ }
+
+ return 1;
+}
+
+/* String names for AutomountResult, used for (de)serialization and D-Bus. */
+static const char* const automount_result_table[_AUTOMOUNT_RESULT_MAX] = {
+ [AUTOMOUNT_SUCCESS] = "success",
+ [AUTOMOUNT_FAILURE_RESOURCES] = "resources",
+ [AUTOMOUNT_FAILURE_START_LIMIT_HIT] = "start-limit-hit",
+ [AUTOMOUNT_FAILURE_MOUNT_START_LIMIT_HIT] = "mount-start-limit-hit",
+ [AUTOMOUNT_FAILURE_UNMOUNTED] = "unmounted",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(automount_result, AutomountResult);
+
+/* Virtual method table wiring the automount unit type into the generic unit
+ * machinery (see UnitVTable in unit.h for field semantics). */
+const UnitVTable automount_vtable = {
+ .object_size = sizeof(Automount),
+
+ .sections =
+ "Unit\0"
+ "Automount\0"
+ "Install\0",
+ .private_section = "Automount",
+
+ .can_transient = true,
+ .can_fail = true,
+ .can_trigger = true,
+ .exclude_from_switch_root_serialization = true,
+
+ .init = automount_init,
+ .load = automount_load,
+ .done = automount_done,
+
+ .coldplug = automount_coldplug,
+
+ .dump = automount_dump,
+
+ .start = automount_start,
+ .stop = automount_stop,
+
+ .serialize = automount_serialize,
+ .deserialize_item = automount_deserialize_item,
+
+ .active_state = automount_active_state,
+ .sub_state_to_string = automount_sub_state_to_string,
+
+ .may_gc = automount_may_gc,
+
+ .trigger_notify = automount_trigger_notify,
+
+ .reset_failed = automount_reset_failed,
+
+ .bus_set_property = bus_automount_set_property,
+
+ .shutdown = automount_shutdown,
+ .supported = automount_supported,
+
+ .status_message_formats = {
+ .finished_start_job = {
+ [JOB_DONE] = "Set up automount %s.",
+ [JOB_FAILED] = "Failed to set up automount %s.",
+ },
+ .finished_stop_job = {
+ [JOB_DONE] = "Unset automount %s.",
+ [JOB_FAILED] = "Failed to unset automount %s.",
+ },
+ },
+
+ .can_start = automount_can_start,
+};
diff --git a/src/core/automount.h b/src/core/automount.h
new file mode 100644
index 0000000..e413f23
--- /dev/null
+++ b/src/core/automount.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct Automount Automount;
+
+#include "unit.h"
+
+/* Terminal result of an automount unit's lifecycle. */
+typedef enum AutomountResult {
+ AUTOMOUNT_SUCCESS,
+ AUTOMOUNT_FAILURE_RESOURCES,
+ AUTOMOUNT_FAILURE_UNMOUNTED,
+ AUTOMOUNT_FAILURE_START_LIMIT_HIT,
+ AUTOMOUNT_FAILURE_MOUNT_START_LIMIT_HIT,
+ _AUTOMOUNT_RESULT_MAX,
+ _AUTOMOUNT_RESULT_INVALID = -EINVAL,
+} AutomountResult;
+
+/* Per-unit state for .automount units (autofs-backed on-demand mounting). */
+struct Automount {
+ Unit meta;
+
+ AutomountState state, deserialized_state;
+
+ /* Mount point path and extra autofs mount options. */
+ char *where;
+ char *extra_options;
+ /* Idle time after which the mount is expired; 0 disables expiry. */
+ usec_t timeout_idle_usec;
+
+ /* Kernel autofs notification pipe and its event source. */
+ int pipe_fd;
+ sd_event_source *pipe_event_source;
+ mode_t directory_mode;
+ /* st_dev of the autofs mount point, used to detect foreign mounts. */
+ dev_t dev_id;
+
+ /* Pending kernel wait-queue tokens for mount and expire requests. */
+ Set *tokens;
+ Set *expire_tokens;
+
+ sd_event_source *expire_event_source;
+
+ AutomountResult result;
+};
+
+extern const UnitVTable automount_vtable;
+
+const char* automount_result_to_string(AutomountResult i) _const_;
+AutomountResult automount_result_from_string(const char *s) _pure_;
+
+DEFINE_CAST(AUTOMOUNT, Automount);
diff --git a/src/core/bpf-devices.c b/src/core/bpf-devices.c
new file mode 100644
index 0000000..06d2146
--- /dev/null
+++ b/src/core/bpf-devices.c
@@ -0,0 +1,505 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fnmatch.h>
+#include <linux/bpf_insn.h>
+
+#include "bpf-devices.h"
+#include "bpf-program.h"
+#include "devnum-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "nulstr-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "stdio-util.h"
+#include "string-util.h"
+
+#define PASS_JUMP_OFF 4096
+
+/* Ensure the high level flags we use and the low-level BPF flags exposed on the kernel are defined the same way */
+assert_cc((unsigned) BPF_DEVCG_ACC_MKNOD == (unsigned) CGROUP_DEVICE_MKNOD);
+assert_cc((unsigned) BPF_DEVCG_ACC_READ == (unsigned) CGROUP_DEVICE_READ);
+assert_cc((unsigned) BPF_DEVCG_ACC_WRITE == (unsigned) CGROUP_DEVICE_WRITE);
+
+/* Appends BPF instructions that allow access to one specific device node
+ * (type + major + minor) with the given permission mask. On a full match the
+ * program jumps to the PASS label (patched later via PASS_JUMP_OFF). */
+static int bpf_prog_allow_list_device(
+ BPFProgram *prog,
+ char type,
+ int major,
+ int minor,
+ CGroupDevicePermissions p) {
+
+ int r;
+
+ assert(prog);
+
+ log_trace("%s: %c %d:%d %s", __func__, type, major, minor, cgroup_device_permissions_to_string(p));
+
+ if (p <= 0 || p >= _CGROUP_DEVICE_PERMISSIONS_MAX)
+ return -EINVAL;
+
+ assert(IN_SET(type, 'b', 'c'));
+ const int bpf_type = type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK;
+
+ const struct bpf_insn insn[] = {
+ BPF_MOV32_REG(BPF_REG_1, BPF_REG_3),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_1, p),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, 4), /* compare access type */
+
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_2, bpf_type, 3), /* compare device type */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_4, major, 2), /* compare major */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_5, minor, 1), /* compare minor */
+ BPF_JMP_A(PASS_JUMP_OFF), /* jump to PASS */
+ };
+
+ /* When all permissions are granted the access-type check can be skipped. */
+ if (p == _CGROUP_DEVICE_PERMISSIONS_ALL)
+ r = bpf_program_add_instructions(prog, insn + 3, ELEMENTSOF(insn) - 3);
+ else
+ r = bpf_program_add_instructions(prog, insn, ELEMENTSOF(insn));
+ if (r < 0)
+ log_error_errno(r, "Extending device control BPF program failed: %m");
+
+ return r;
+}
+
+/* Appends BPF instructions that allow access to all devices of a given type and
+ * major number (minor wildcard), with the given permission mask. */
+static int bpf_prog_allow_list_major(
+ BPFProgram *prog,
+ char type,
+ int major,
+ CGroupDevicePermissions p) {
+
+ int r;
+
+ assert(prog);
+
+ log_trace("%s: %c %d:* %s", __func__, type, major, cgroup_device_permissions_to_string(p));
+
+ if (p <= 0 || p >= _CGROUP_DEVICE_PERMISSIONS_MAX)
+ return -EINVAL;
+
+ assert(IN_SET(type, 'b', 'c'));
+ const int bpf_type = type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK;
+
+ const struct bpf_insn insn[] = {
+ BPF_MOV32_REG(BPF_REG_1, BPF_REG_3),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_1, p),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, 3), /* compare access type */
+
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_2, bpf_type, 2), /* compare device type */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_4, major, 1), /* compare major */
+ BPF_JMP_A(PASS_JUMP_OFF), /* jump to PASS */
+ };
+
+ /* When all permissions are granted the access-type check can be skipped. */
+ if (p == _CGROUP_DEVICE_PERMISSIONS_ALL)
+ r = bpf_program_add_instructions(prog, insn + 3, ELEMENTSOF(insn) - 3);
+ else
+ r = bpf_program_add_instructions(prog, insn, ELEMENTSOF(insn));
+ if (r < 0)
+ log_error_errno(r, "Extending device control BPF program failed: %m");
+
+ return r;
+}
+
+/* Appends BPF instructions that allow access to an entire device class
+ * (all majors and minors of the given type), with the given permission mask. */
+static int bpf_prog_allow_list_class(
+ BPFProgram *prog,
+ char type,
+ CGroupDevicePermissions p) {
+
+ int r;
+
+ assert(prog);
+
+ log_trace("%s: %c *:* %s", __func__, type, cgroup_device_permissions_to_string(p));
+
+ if (p <= 0 || p >= _CGROUP_DEVICE_PERMISSIONS_MAX)
+ return -EINVAL;
+
+ assert(IN_SET(type, 'b', 'c'));
+ const int bpf_type = type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK;
+
+ const struct bpf_insn insn[] = {
+ BPF_MOV32_REG(BPF_REG_1, BPF_REG_3),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_1, p),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, 2), /* compare access type */
+
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_2, bpf_type, 1), /* compare device type */
+ BPF_JMP_A(PASS_JUMP_OFF), /* jump to PASS */
+ };
+
+ /* When all permissions are granted the access-type check can be skipped. */
+ if (p == _CGROUP_DEVICE_PERMISSIONS_ALL)
+ r = bpf_program_add_instructions(prog, insn + 3, ELEMENTSOF(insn) - 3);
+ else
+ r = bpf_program_add_instructions(prog, insn, ELEMENTSOF(insn));
+ if (r < 0)
+ log_error_errno(r, "Extending device control BPF program failed: %m");
+
+ return r;
+}
+
+/* Allocates a fresh BPF_PROG_TYPE_CGROUP_DEVICE program and, when a device filter
+ * will actually be needed, emits the prologue that unpacks the bpf_cgroup_dev_ctx
+ * fields into registers r2..r5 for the allow-list snippets appended later.
+ * Returns 0 (with *ret possibly left NULL when no filtering is needed). */
+int bpf_devices_cgroup_init(
+ BPFProgram **ret,
+ CGroupDevicePolicy policy,
+ bool allow_list) {
+
+ const struct bpf_insn pre_insn[] = {
+ /* load device type to r2 */
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct bpf_cgroup_dev_ctx, access_type)),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),
+
+ /* load access type to r3 */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct bpf_cgroup_dev_ctx, access_type)),
+ BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
+
+ /* load major number to r4 */
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
+ offsetof(struct bpf_cgroup_dev_ctx, major)),
+
+ /* load minor number to r5 */
+ BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1,
+ offsetof(struct bpf_cgroup_dev_ctx, minor)),
+ };
+
+ _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
+ int r;
+
+ assert(ret);
+
+ /* "auto" policy with no allow list means everything is allowed: no program needed. */
+ if (policy == CGROUP_DEVICE_POLICY_AUTO && !allow_list)
+ return 0;
+
+ r = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE, "sd_devices", &prog);
+ if (r < 0)
+ return log_error_errno(r, "Loading device control BPF program failed: %m");
+
+ /* The prologue is only required when per-device checks will follow. */
+ if (policy == CGROUP_DEVICE_POLICY_CLOSED || allow_list) {
+ r = bpf_program_add_instructions(prog, pre_insn, ELEMENTSOF(pre_insn));
+ if (r < 0)
+ return log_error_errno(r, "Extending device control BPF program failed: %m");
+ }
+
+ *ret = TAKE_PTR(prog);
+
+ return 0;
+}
+
+/* Finalizes the device-filter program (patches PASS jump offsets, appends the
+ * DENY/ALLOW epilogue) and attaches it to the unit's cgroup with
+ * BPF_F_ALLOW_MULTI. On success the previously installed program, if any, is
+ * freed (implicitly detaching it) and replaced by the new one. */
+int bpf_devices_apply_policy(
+ BPFProgram **prog,
+ CGroupDevicePolicy policy,
+ bool allow_list,
+ const char *cgroup_path,
+ BPFProgram **prog_installed) {
+
+ _cleanup_free_ char *controller_path = NULL;
+ int r;
+
+ /* This will assign *prog_installed if everything goes well. */
+
+ assert(prog);
+ if (!*prog)
+ goto finish;
+
+ const bool deny_everything = policy == CGROUP_DEVICE_POLICY_STRICT && !allow_list;
+
+ const struct bpf_insn post_insn[] = {
+ /* return DENY */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_JMP_A(1),
+ };
+
+ const struct bpf_insn exit_insn[] = {
+ /* finally return DENY if deny_everything else ALLOW */
+ BPF_MOV64_IMM(BPF_REG_0, deny_everything ? 0 : 1),
+ BPF_EXIT_INSN()
+ };
+
+ if (!deny_everything) {
+ r = bpf_program_add_instructions(*prog, post_insn, ELEMENTSOF(post_insn));
+ if (r < 0)
+ return log_error_errno(r, "Extending device control BPF program failed: %m");
+
+ /* Fixup PASS_JUMP_OFF jump offsets. */
+ for (size_t off = 0; off < (*prog)->n_instructions; off++) {
+ struct bpf_insn *ins = &((*prog)->instructions[off]);
+
+ /* Retarget each placeholder jump at the final ALLOW instruction. */
+ if (ins->code == (BPF_JMP | BPF_JA) && ins->off == PASS_JUMP_OFF)
+ ins->off = (*prog)->n_instructions - off - 1;
+ }
+ }
+
+ r = bpf_program_add_instructions(*prog, exit_insn, ELEMENTSOF(exit_insn));
+ if (r < 0)
+ return log_error_errno(r, "Extending device control BPF program failed: %m");
+
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, NULL, &controller_path);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine cgroup path: %m");
+
+ r = bpf_program_cgroup_attach(*prog, BPF_CGROUP_DEVICE, controller_path, BPF_F_ALLOW_MULTI);
+ if (r < 0)
+ return log_error_errno(r, "Attaching device control BPF program to cgroup %s failed: %m",
+ empty_to_root(cgroup_path));
+
+ finish:
+ /* Unref the old BPF program (which will implicitly detach it) right before attaching the new program. */
+ if (prog_installed) {
+ bpf_program_free(*prog_installed);
+ *prog_installed = TAKE_PTR(*prog);
+ }
+ return 0;
+}
+
+/* Probes once whether the BPF device controller can be used, caching the verdict. */
+int bpf_devices_supported(void) {
+ const struct bpf_insn trivial[] = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN()
+ };
+
+ _cleanup_(bpf_program_freep) BPFProgram *program = NULL;
+ static int supported = -1;
+ int r;
+
+ /* Checks whether BPF device controller is supported. For this, we check three things:
+ *
+ * a) whether we are privileged
+ * b) whether the unified hierarchy is being used
+ * c) the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_DEVICE programs, which we require
+ */
+
+ if (supported >= 0)
+ return supported;
+
+ if (geteuid() != 0) {
+ log_debug("Not enough privileges, BPF device control is not supported.");
+ return supported = 0;
+ }
+
+ r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
+ if (r < 0)
+ return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m");
+ if (r == 0) {
+ log_debug("Not running with unified cgroups, BPF device control is not supported.");
+ return supported = 0;
+ }
+
+ r = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE, "sd_devices", &program);
+ if (r < 0) {
+ log_debug_errno(r, "Can't allocate CGROUP DEVICE BPF program, BPF device control is not supported: %m");
+ return supported = 0;
+ }
+
+ r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
+ if (r < 0) {
+ log_debug_errno(r, "Can't add trivial instructions to CGROUP DEVICE BPF program, BPF device control is not supported: %m");
+ return supported = 0;
+ }
+
+ /* Try to actually load a trivial program into the kernel as the final check. */
+ r = bpf_program_load_kernel(program, NULL, 0);
+ if (r < 0) {
+ log_debug_errno(r, "Can't load kernel CGROUP DEVICE BPF program, BPF device control is not supported: %m");
+ return supported = 0;
+ }
+
+ return supported = 1;
+}
+
+/* Dispatches one allow-list entry to the right backend: on unified cgroup v2 the
+ * BPF program is extended (maj/min may each be NULL for wildcards); on legacy
+ * hierarchies the entry is written to the cgroup v1 "devices.allow" attribute. */
+static int allow_list_device_pattern(
+ BPFProgram *prog,
+ const char *path,
+ char type,
+ const unsigned *maj,
+ const unsigned *min,
+ CGroupDevicePermissions p) {
+
+ assert(IN_SET(type, 'b', 'c'));
+
+ if (cg_all_unified() > 0) {
+ if (!prog)
+ return 0;
+
+ if (maj && min)
+ return bpf_prog_allow_list_device(prog, type, *maj, *min, p);
+ else if (maj)
+ return bpf_prog_allow_list_major(prog, type, *maj, p);
+ else
+ return bpf_prog_allow_list_class(prog, type, p);
+
+ } else {
+ char buf[2+DECIMAL_STR_MAX(unsigned)*2+2+4];
+ int r;
+
+ if (maj && min)
+ xsprintf(buf, "%c %u:%u %s", type, *maj, *min, cgroup_device_permissions_to_string(p));
+ else if (maj)
+ xsprintf(buf, "%c %u:* %s", type, *maj, cgroup_device_permissions_to_string(p));
+ else
+ xsprintf(buf, "%c *:* %s", type, cgroup_device_permissions_to_string(p));
+
+ /* Changing the devices list of a populated cgroup might result in EINVAL, hence ignore
+ * EINVAL here. */
+
+ r = cg_set_attribute("devices", path, "devices.allow", buf);
+ if (r < 0)
+ log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING,
+ r, "Failed to set devices.allow on %s: %m", path);
+
+ return r;
+ }
+}
+
+/* Allow-lists one device node path with permissions 'p'. The major/minor is
+ * parsed from special paths directly when possible; otherwise the node is
+ * stat()ed and must be a character or block device. */
+int bpf_devices_allow_list_device(
+ BPFProgram *prog,
+ const char *path,
+ const char *node,
+ CGroupDevicePermissions p) {
+
+ mode_t mode;
+ dev_t rdev;
+ int r;
+
+ assert(path);
+ assert(p >= 0 && p < _CGROUP_DEVICE_PERMISSIONS_MAX);
+
+ log_trace("%s: %s %s", __func__, node, cgroup_device_permissions_to_string(p));
+
+ /* Some special handling for /dev/block/%u:%u, /dev/char/%u:%u, /run/systemd/inaccessible/chr and
+ * /run/systemd/inaccessible/blk paths. Instead of stat()ing these we parse out the major/minor directly. This
+ * means clients can use these path without the device node actually around */
+ r = device_path_parse_major_minor(node, &mode, &rdev);
+ if (r < 0) {
+ if (r != -ENODEV)
+ return log_warning_errno(r, "Couldn't parse major/minor from device path '%s': %m", node);
+
+ struct stat st;
+ if (stat(node, &st) < 0)
+ return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
+
+ if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
+ return log_warning_errno(SYNTHETIC_ERRNO(ENODEV), "%s is not a device.", node);
+
+ mode = st.st_mode;
+ rdev = (dev_t) st.st_rdev;
+ }
+
+ unsigned maj = major(rdev), min = minor(rdev);
+ return allow_list_device_pattern(prog, path, S_ISCHR(mode) ? 'c' : 'b', &maj, &min, p);
+}
+
+/* Allow-lists devices by driver name or major number. "*" matches the whole
+ * device class; a numeric name is used as the major directly; otherwise the name
+ * is fnmatch()ed against driver names in /proc/devices and every matching major
+ * is allow-listed. Returns 0 on success, -ENOENT if nothing matched a pattern. */
+int bpf_devices_allow_list_major(
+ BPFProgram *prog,
+ const char *path,
+ const char *name,
+ char type,
+ CGroupDevicePermissions permissions) {
+
+ unsigned maj;
+ int r;
+
+ assert(path);
+ assert(IN_SET(type, 'b', 'c'));
+ assert(permissions >= 0 && permissions < _CGROUP_DEVICE_PERMISSIONS_MAX);
+
+ if (streq(name, "*"))
+ /* If the name is a wildcard, then apply this list to all devices of this type */
+ return allow_list_device_pattern(prog, path, type, NULL, NULL, permissions);
+
+ if (safe_atou(name, &maj) >= 0 && DEVICE_MAJOR_VALID(maj))
+ /* The name is numeric and suitable as major. In that case, let's take its major, and create
+ * the entry directly. */
+ return allow_list_device_pattern(prog, path, type, &maj, NULL, permissions);
+
+ _cleanup_fclose_ FILE *f = NULL;
+ bool good = false, any = false;
+
+ f = fopen("/proc/devices", "re");
+ if (!f)
+ return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s: %m", name);
+
+ /* /proc/devices lists "Character devices:" then "Block devices:" sections, each
+ * containing "<major> <driver-name>" lines. Track which section we are in. */
+ for (;;) {
+ _cleanup_free_ char *line = NULL;
+ char *w, *p;
+
+ r = read_line(f, LONG_LINE_MAX, &line);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to read /proc/devices: %m");
+ if (r == 0)
+ break;
+
+ if (type == 'c' && streq(line, "Character devices:")) {
+ good = true;
+ continue;
+ }
+
+ if (type == 'b' && streq(line, "Block devices:")) {
+ good = true;
+ continue;
+ }
+
+ if (isempty(line)) {
+ good = false;
+ continue;
+ }
+
+ if (!good)
+ continue;
+
+ p = strstrip(line);
+
+ /* Split "<major> <name>" at the first whitespace run. */
+ w = strpbrk(p, WHITESPACE);
+ if (!w)
+ continue;
+ *w = 0;
+
+ r = safe_atou(p, &maj);
+ if (r < 0)
+ continue;
+ if (maj <= 0)
+ continue;
+
+ w++;
+ w += strspn(w, WHITESPACE);
+
+ if (fnmatch(name, w, 0) != 0)
+ continue;
+
+ any = true;
+ (void) allow_list_device_pattern(prog, path, type, &maj, NULL, permissions);
+ }
+
+ if (!any)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOENT),
+ "Device allow list pattern \"%s\" did not match anything.", name);
+
+ return 0;
+}
+
+/* Allow-lists the standard set of pseudo devices every unit should be able to use
+ * (null, zero, full, random, tty, ptmx, the inaccessible placeholders and /dev/pts).
+ * Returns 0 on success, or the first error encountered (later entries are still
+ * attempted). */
+int bpf_devices_allow_list_static(
+ BPFProgram *prog,
+ const char *path) {
+
+ static const char auto_devices[] =
+ "/dev/null\0" "rwm\0"
+ "/dev/zero\0" "rwm\0"
+ "/dev/full\0" "rwm\0"
+ "/dev/random\0" "rwm\0"
+ "/dev/urandom\0" "rwm\0"
+ "/dev/tty\0" "rwm\0"
+ "/dev/ptmx\0" "rwm\0"
+ /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
+ "/run/systemd/inaccessible/chr\0" "rwm\0"
+ "/run/systemd/inaccessible/blk\0" "rwm\0";
+ int r = 0, k;
+
+ NULSTR_FOREACH_PAIR(node, acc, auto_devices) {
+ k = bpf_devices_allow_list_device(prog, path, node, cgroup_device_permissions_from_string(acc));
+ if (r >= 0 && k < 0)
+ r = k;
+ }
+
+ /* PTS (/dev/pts) devices may not be duplicated, but accessed */
+ k = bpf_devices_allow_list_major(prog, path, "pts", 'c', CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE);
+ if (r >= 0 && k < 0)
+ r = k;
+
+ return r;
+}
diff --git a/src/core/bpf-devices.h b/src/core/bpf-devices.h
new file mode 100644
index 0000000..5660e1a
--- /dev/null
+++ b/src/core/bpf-devices.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <inttypes.h>
+
+#include "cgroup.h"
+
+typedef struct BPFProgram BPFProgram;
+
+/* Helpers implementing per-unit device access control via
+ * BPF_PROG_TYPE_CGROUP_DEVICE programs (cgroup v2) or the legacy "devices"
+ * cgroup v1 controller; see bpf-devices.c for details. */
+int bpf_devices_cgroup_init(BPFProgram **ret, CGroupDevicePolicy policy, bool allow_list);
+int bpf_devices_apply_policy(
+ BPFProgram **prog,
+ CGroupDevicePolicy policy,
+ bool allow_list,
+ const char *cgroup_path,
+ BPFProgram **prog_installed);
+
+int bpf_devices_supported(void);
+int bpf_devices_allow_list_device(BPFProgram *prog, const char *path, const char *node, CGroupDevicePermissions p);
+int bpf_devices_allow_list_major(BPFProgram *prog, const char *path, const char *name, char type, CGroupDevicePermissions p);
+int bpf_devices_allow_list_static(BPFProgram *prog, const char *path);
diff --git a/src/core/bpf-firewall.c b/src/core/bpf-firewall.c
new file mode 100644
index 0000000..66773e1
--- /dev/null
+++ b/src/core/bpf-firewall.c
@@ -0,0 +1,974 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <arpa/inet.h>
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/bpf_insn.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "bpf-firewall.h"
+#include "bpf-program.h"
+#include "fd-util.h"
+#include "in-addr-prefix-util.h"
+#include "memory-util.h"
+#include "missing_syscall.h"
+#include "unit.h"
+#include "strv.h"
+#include "virt.h"
+
+/* Indices into a per-unit accounting map (a 2-element BPF array of uint64_t). */
+enum {
+ MAP_KEY_PACKETS,
+ MAP_KEY_BYTES,
+};
+
+/* Bits OR'ed into the verdict register (R8) by the generated lookup code. */
+enum {
+ ACCESS_ALLOWED = 1,
+ ACCESS_DENIED = 2,
+};
+
+/* Compile instructions for one list of addresses, one direction and one specific verdict on matches. */
+
+/* Append instructions that load the packet's source (ingress) or destination
+ * (egress) address for @protocol (ETH_P_IP or ETH_P_IPV6), look it up in the
+ * LPM-trie map @map_fd, and OR @verdict into R8 on a match. Packets of any
+ * other protocol skip the whole fragment via the jump fixed up at insn[0].
+ * Expects R6 = skb pointer and R7 = cached skb->protocol (set by the
+ * preamble emitted in bpf_firewall_compile_bpf()). */
+static int add_lookup_instructions(
+ BPFProgram *p,
+ int map_fd,
+ int protocol,
+ bool is_ingress,
+ int verdict) {
+
+ int r, addr_offset, addr_size;
+
+ assert(p);
+ assert(map_fd >= 0);
+
+ switch (protocol) {
+
+ case ETH_P_IP:
+ addr_size = sizeof(uint32_t);
+ addr_offset = is_ingress ?
+ offsetof(struct iphdr, saddr) :
+ offsetof(struct iphdr, daddr);
+ break;
+
+ case ETH_P_IPV6:
+ addr_size = 4 * sizeof(uint32_t);
+ addr_offset = is_ingress ?
+ offsetof(struct ip6_hdr, ip6_src.s6_addr) :
+ offsetof(struct ip6_hdr, ip6_dst.s6_addr);
+ break;
+
+ default:
+ return -EAFNOSUPPORT;
+ }
+
+ do {
+ /* Compare IPv4 with one word instruction (32-bit) */
+ struct bpf_insn insn[] = {
+ /* If skb->protocol != ETH_P_IP, skip this whole block. The offset will be set later. */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0),
+
+ /*
+ * Call into BPF_FUNC_skb_load_bytes to load the dst/src IP address
+ *
+ * R1: Pointer to the skb
+ * R2: Data offset
+ * R3: Destination buffer on the stack (r10 - 4)
+ * R4: Number of bytes to read (4)
+ */
+
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_MOV32_IMM(BPF_REG_2, addr_offset),
+
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size),
+
+ BPF_MOV32_IMM(BPF_REG_4, addr_size),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),
+
+ /*
+ * Call into BPF_FUNC_map_lookup_elem to see if the address matches any entry in the
+ * LPM trie map. For this to work, the prefixlen field of 'struct bpf_lpm_trie_key'
+ * has to be set to the maximum possible value.
+ *
+ * On success, the looked up value is stored in R0. For this application, the actual
+ * value doesn't matter, however; we just set the bit in @verdict in R8 if we found any
+ * matching value.
+ */
+
+ BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)),
+ BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8),
+
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+ BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
+ };
+
+ /* Jump label fixup */
+ insn[0].off = ELEMENTSOF(insn) - 1;
+
+ r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
+ if (r < 0)
+ return r;
+
+ } while (false);
+
+ return 0;
+}
+
+/* Unconditionally OR @verdict into the verdict register (R8); used for the
+ * "any address" allow/deny case, where no map lookup is needed. */
+static int add_instructions_for_ip_any(
+ BPFProgram *p,
+ int verdict) {
+
+ assert(p);
+
+ const struct bpf_insn insn[] = {
+ BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
+ };
+
+ return bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
+}
+
+/* Build the complete per-unit CGROUP_SKB program for one direction
+ * (@is_ingress): access-control lookups against the unit's allow/deny LPM
+ * maps, optional packet/byte accounting, then the verdict in R0 (0 = drop,
+ * 1 = pass). Stores NULL in *ret when neither accounting nor access control
+ * is configured for the unit. */
+static int bpf_firewall_compile_bpf(
+ Unit *u,
+ const char *prog_name,
+ bool is_ingress,
+ BPFProgram **ret,
+ bool ip_allow_any,
+ bool ip_deny_any) {
+
+ const struct bpf_insn pre_insn[] = {
+ /*
+ * When the eBPF program is entered, R1 contains the address of the skb.
+ * However, R1-R5 are scratch registers that are not preserved when calling
+ * into kernel functions, so we need to save anything that's supposed to
+ * stay around to R6-R9. Save the skb to R6.
+ */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+
+ /*
+ * Although we cannot access the skb data directly from eBPF programs used in this
+ * scenario, the kernel has prepared some fields for us to access through struct __sk_buff.
+ * Load the protocol (IPv4, IPv6) used by the packet in flight once and cache it in R7
+ * for later use.
+ */
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct __sk_buff, protocol)),
+
+ /*
+ * R8 is used to keep track of whether any address check has explicitly allowed or denied the packet
+ * through ACCESS_DENIED or ACCESS_ALLOWED bits. Reset them both to 0 in the beginning.
+ */
+ BPF_MOV32_IMM(BPF_REG_8, 0),
+ };
+
+ /*
+ * The access checkers compiled for the configured allowance and denial lists
+ * write to R8 at runtime. The following code prepares for an early exit that
+ * skip the accounting if the packet is denied.
+ *
+ * R0 = 1
+ * if (R8 == ACCESS_DENIED)
+ * R0 = 0
+ *
+ * This means that if both ACCESS_DENIED and ACCESS_ALLOWED are set, the packet
+ * is allowed to pass.
+ */
+ const struct bpf_insn post_insn[] = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ };
+
+ _cleanup_(bpf_program_freep) BPFProgram *p = NULL;
+ int accounting_map_fd, r;
+ bool access_enabled;
+
+ assert(u);
+ assert(ret);
+
+ accounting_map_fd = is_ingress ?
+ u->ip_accounting_ingress_map_fd :
+ u->ip_accounting_egress_map_fd;
+
+ access_enabled =
+ u->ipv4_allow_map_fd >= 0 ||
+ u->ipv6_allow_map_fd >= 0 ||
+ u->ipv4_deny_map_fd >= 0 ||
+ u->ipv6_deny_map_fd >= 0 ||
+ ip_allow_any ||
+ ip_deny_any;
+
+ /* Nothing to do for this unit in this direction: no program at all. */
+ if (accounting_map_fd < 0 && !access_enabled) {
+ *ret = NULL;
+ return 0;
+ }
+
+ r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, prog_name, &p);
+ if (r < 0)
+ return r;
+
+ r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn));
+ if (r < 0)
+ return r;
+
+ if (access_enabled) {
+ /*
+ * The simple rule this function translates into eBPF instructions is:
+ *
+ * - Access will be granted when an address matches an entry in @list_allow
+ * - Otherwise, access will be denied when an address matches an entry in @list_deny
+ * - Otherwise, access will be granted
+ */
+
+ if (u->ipv4_deny_map_fd >= 0) {
+ r = add_lookup_instructions(p, u->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED);
+ if (r < 0)
+ return r;
+ }
+
+ if (u->ipv6_deny_map_fd >= 0) {
+ r = add_lookup_instructions(p, u->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED);
+ if (r < 0)
+ return r;
+ }
+
+ if (u->ipv4_allow_map_fd >= 0) {
+ r = add_lookup_instructions(p, u->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED);
+ if (r < 0)
+ return r;
+ }
+
+ if (u->ipv6_allow_map_fd >= 0) {
+ r = add_lookup_instructions(p, u->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED);
+ if (r < 0)
+ return r;
+ }
+
+ if (ip_allow_any) {
+ r = add_instructions_for_ip_any(p, ACCESS_ALLOWED);
+ if (r < 0)
+ return r;
+ }
+
+ if (ip_deny_any) {
+ r = add_instructions_for_ip_any(p, ACCESS_DENIED);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn));
+ if (r < 0)
+ return r;
+
+ if (accounting_map_fd >= 0) {
+ struct bpf_insn insn[] = {
+ /*
+ * If R0 == 0, the packet will be denied; skip the accounting instructions in this case.
+ * The jump label will be fixed up later.
+ */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0),
+
+ /* Count packets */
+ BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
+ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
+ BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
+ BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+
+ /* Count bytes */
+ BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
+ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
+ BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
+ BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+
+ /* Allow the packet to pass */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ };
+
+ /* Jump label fixup */
+ insn[0].off = ELEMENTSOF(insn) - 1;
+
+ r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
+ if (r < 0)
+ return r;
+ }
+
+ do {
+ /*
+ * Exit from the eBPF program, R0 contains the verdict.
+ * 0 means the packet is denied, 1 means the packet may pass.
+ */
+ const struct bpf_insn insn[] = {
+ BPF_EXIT_INSN()
+ };
+
+ r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
+ if (r < 0)
+ return r;
+ } while (false);
+
+ *ret = TAKE_PTR(p);
+
+ return 0;
+}
+
+/* Count the IPv4 and IPv6 entries in @prefixes, adding to *n_ipv4 and
+ * *n_ipv6 (which the caller initializes). Returns -EAFNOSUPPORT if an entry
+ * carries any other address family. */
+static int bpf_firewall_count_access_items(Set *prefixes, size_t *n_ipv4, size_t *n_ipv6) {
+ struct in_addr_prefix *a;
+
+ assert(n_ipv4);
+ assert(n_ipv6);
+
+ SET_FOREACH(a, prefixes) {
+ if (a->family == AF_INET)
+ (*n_ipv4)++;
+ else if (a->family == AF_INET6)
+ (*n_ipv6)++;
+ else
+ return -EAFNOSUPPORT;
+ }
+
+ return 0;
+}
+
+/* Populate the LPM-trie maps with every prefix in @prefixes, storing @verdict
+ * as the map value (the generated BPF code only checks for lookup success,
+ * not the value). The key layout is struct bpf_lpm_trie_key: a 32-bit prefix
+ * length followed by the 4- or 16-byte address. */
+static int bpf_firewall_add_access_items(
+ Set *prefixes,
+ int ipv4_map_fd,
+ int ipv6_map_fd,
+ int verdict) {
+
+ struct bpf_lpm_trie_key *key_ipv4, *key_ipv6;
+ struct in_addr_prefix *a;
+ uint64_t value = verdict;
+ int r;
+
+ /* Stack-allocated once and reused (overwritten) for every entry. */
+ key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t));
+ key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4);
+
+ SET_FOREACH(a, prefixes)
+ switch (a->family) {
+
+ case AF_INET:
+ key_ipv4->prefixlen = a->prefixlen;
+ memcpy(key_ipv4->data, &a->address, sizeof(uint32_t));
+
+ r = bpf_map_update_element(ipv4_map_fd, key_ipv4, &value);
+ if (r < 0)
+ return r;
+
+ break;
+
+ case AF_INET6:
+ key_ipv6->prefixlen = a->prefixlen;
+ memcpy(key_ipv6->data, &a->address, 4 * sizeof(uint32_t));
+
+ r = bpf_map_update_element(ipv6_map_fd, key_ipv6, &value);
+ if (r < 0)
+ return r;
+
+ break;
+
+ default:
+ return -EAFNOSUPPORT;
+ }
+
+ return 0;
+}
+
+/* Build the allow or deny (selected by @verdict) LPM-trie maps for @u,
+ * merging the prefix lists of the unit and all its parent slices. On success
+ * the map fds (or -EBADF when a family has no entries) are returned through
+ * *ret_ipv4_map_fd/*ret_ipv6_map_fd. If any list contains the "any" prefix,
+ * *ret_has_any is set and no maps are created at all (the caller then emits
+ * an unconditional match instead, avoiding the CAP_SYS_ADMIN requirement of
+ * LPM-trie allocation). */
+static int bpf_firewall_prepare_access_maps(
+ Unit *u,
+ int verdict,
+ int *ret_ipv4_map_fd,
+ int *ret_ipv6_map_fd,
+ bool *ret_has_any) {
+
+ _cleanup_close_ int ipv4_map_fd = -EBADF, ipv6_map_fd = -EBADF;
+ size_t n_ipv4 = 0, n_ipv6 = 0;
+ Unit *p;
+ int r;
+
+ assert(ret_ipv4_map_fd);
+ assert(ret_ipv6_map_fd);
+ assert(ret_has_any);
+
+ for (p = u; p; p = UNIT_GET_SLICE(p)) {
+ CGroupContext *cc;
+ Set *prefixes;
+ bool *reduced;
+
+ cc = unit_get_cgroup_context(p);
+ if (!cc)
+ continue;
+
+ prefixes = verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny;
+ reduced = verdict == ACCESS_ALLOWED ? &cc->ip_address_allow_reduced : &cc->ip_address_deny_reduced;
+
+ /* Deduplicate/merge the prefix list once; remember that we did. */
+ if (!*reduced) {
+ r = in_addr_prefixes_reduce(prefixes);
+ if (r < 0)
+ return r;
+
+ *reduced = true;
+ }
+
+ /* Fix: the return value used to be ignored here, so an entry with an
+ * unsupported address family would silently produce wrongly-sized maps
+ * instead of failing early. */
+ r = bpf_firewall_count_access_items(prefixes, &n_ipv4, &n_ipv6);
+ if (r < 0)
+ return r;
+
+ /* Skip making the LPM trie map in cases where we are using "any" in order to hack around
+ * needing CAP_SYS_ADMIN for allocating LPM trie map. */
+ if (in_addr_prefixes_is_any(prefixes)) {
+ *ret_has_any = true;
+ return 0;
+ }
+ }
+
+ if (n_ipv4 > 0) {
+ char *name = strjoina("4_", u->id);
+ ipv4_map_fd = bpf_map_new(
+ name,
+ BPF_MAP_TYPE_LPM_TRIE,
+ offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t),
+ sizeof(uint64_t),
+ n_ipv4,
+ BPF_F_NO_PREALLOC);
+ if (ipv4_map_fd < 0)
+ return ipv4_map_fd;
+ }
+
+ if (n_ipv6 > 0) {
+ char *name = strjoina("6_", u->id);
+ ipv6_map_fd = bpf_map_new(
+ name,
+ BPF_MAP_TYPE_LPM_TRIE,
+ offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)*4,
+ sizeof(uint64_t),
+ n_ipv6,
+ BPF_F_NO_PREALLOC);
+ if (ipv6_map_fd < 0)
+ return ipv6_map_fd;
+ }
+
+ /* Second pass: actually fill the freshly sized maps. */
+ for (p = u; p; p = UNIT_GET_SLICE(p)) {
+ CGroupContext *cc;
+
+ cc = unit_get_cgroup_context(p);
+ if (!cc)
+ continue;
+
+ r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny,
+ ipv4_map_fd, ipv6_map_fd, verdict);
+ if (r < 0)
+ return r;
+ }
+
+ *ret_ipv4_map_fd = TAKE_FD(ipv4_map_fd);
+ *ret_ipv6_map_fd = TAKE_FD(ipv6_map_fd);
+ *ret_has_any = false;
+ return 0;
+}
+
+/* Create (when @enabled) or tear down the per-unit ingress/egress accounting
+ * maps, each a 2-element BPF array (packets, bytes). Existing fds are kept,
+ * so counters survive firewall recompilation; on disable both fds are closed
+ * and the unit's extra accounting counters are zeroed. */
+static int bpf_firewall_prepare_accounting_maps(Unit *u, bool enabled, int *fd_ingress, int *fd_egress) {
+ int r;
+
+ assert(u);
+ assert(fd_ingress);
+ assert(fd_egress);
+
+ if (enabled) {
+ if (*fd_ingress < 0) {
+ char *name = strjoina("I_", u->id);
+ r = bpf_map_new(name, BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
+ if (r < 0)
+ return r;
+
+ *fd_ingress = r;
+ }
+
+ if (*fd_egress < 0) {
+ char *name = strjoina("E_", u->id);
+ r = bpf_map_new(name, BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
+ if (r < 0)
+ return r;
+
+ *fd_egress = r;
+ }
+
+ } else {
+ *fd_ingress = safe_close(*fd_ingress);
+ *fd_egress = safe_close(*fd_egress);
+
+ zero(u->ip_accounting_extra);
+ }
+
+ return 0;
+}
+
+/* (Re)build the unit's ingress and egress firewall programs from its cgroup
+ * context: flushes the old programs and access maps, rebuilds the allow/deny
+ * maps (non-slice units only), reuses the accounting maps, then compiles both
+ * directions. Does not attach anything; see bpf_firewall_install(). */
+int bpf_firewall_compile(Unit *u) {
+ const char *ingress_name = NULL, *egress_name = NULL;
+ bool ip_allow_any = false, ip_deny_any = false;
+ CGroupContext *cc;
+ int r, supported;
+
+ assert(u);
+
+ cc = unit_get_cgroup_context(u);
+ if (!cc)
+ return -EINVAL;
+
+ supported = bpf_firewall_supported();
+ if (supported < 0)
+ return supported;
+ if (supported == BPF_FIREWALL_UNSUPPORTED)
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "bpf-firewall: BPF firewalling not supported, proceeding without.");
+ if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE)
+ /* If BPF_F_ALLOW_MULTI is not supported we don't support any BPF magic on inner nodes (i.e. on slice
+ * units), since that would mean leaf nodes couldn't do any BPF anymore at all. Under the assumption
+ * that BPF is more interesting on leaf nodes we hence avoid it on inner nodes in that case. This is
+ * consistent with old systemd behaviour from before v238, where BPF wasn't supported in inner nodes at
+ * all, either. */
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "bpf-firewall: BPF_F_ALLOW_MULTI is not supported, not doing BPF firewall on slice units.");
+
+ /* If BPF_F_ALLOW_MULTI flag is supported program name is also supported (both were added to v4.15
+ * kernel). */
+ if (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI) {
+ ingress_name = "sd_fw_ingress";
+ egress_name = "sd_fw_egress";
+ }
+
+ /* Note that when we compile a new firewall we first flush out the access maps and the BPF programs themselves,
+ * but we reuse the accounting maps. That way the firewall in effect always maps to the actual
+ * configuration, but we don't flush out the accounting unnecessarily */
+
+ u->ip_bpf_ingress = bpf_program_free(u->ip_bpf_ingress);
+ u->ip_bpf_egress = bpf_program_free(u->ip_bpf_egress);
+
+ u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
+ u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);
+
+ u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
+ u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);
+
+ if (u->type != UNIT_SLICE) {
+ /* In inner nodes we only do accounting, we do not actually bother with access control. However, leaf
+ * nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that
+ * they can optionally cancel out system-wide rules. Since inner nodes can't contain processes this
+ * means that all configure IP access rules *will* take effect on processes, even though we never
+ * compile them for inner nodes. */
+
+ r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd, &ip_allow_any);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF allow maps failed: %m");
+
+ r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd, &ip_deny_any);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF deny maps failed: %m");
+ }
+
+ r = bpf_firewall_prepare_accounting_maps(u, cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF accounting maps failed: %m");
+
+ r = bpf_firewall_compile_bpf(u, ingress_name, true, &u->ip_bpf_ingress, ip_allow_any, ip_deny_any);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "bpf-firewall: Compilation of ingress BPF program failed: %m");
+
+ r = bpf_firewall_compile_bpf(u, egress_name, false, &u->ip_bpf_egress, ip_allow_any, ip_deny_any);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "bpf-firewall: Compilation of egress BPF program failed: %m");
+
+ return 0;
+}
+
+/* Load each pinned BPF program listed in @filter_paths from the BPF file
+ * system into the kernel and store the resulting programs in @set, replacing
+ * any previous contents. Used for both the ingress and egress filter lists. */
+static int load_bpf_progs_from_fs_to_set(Unit *u, char **filter_paths, Set **set) {
+ assert(u);
+ assert(set);
+
+ set_clear(*set);
+
+ STRV_FOREACH(bpf_fs_path, filter_paths) {
+ _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
+ int r;
+
+ r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, NULL, &prog);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "bpf-firewall: Allocation of SKB BPF program failed: %m");
+
+ r = bpf_program_load_from_bpf_fs(prog, *bpf_fs_path);
+ if (r < 0)
+ /* Fix: this helper loads both ingress and egress filters, so don't claim "ingress" in the message. */
+ return log_unit_error_errno(u, r, "bpf-firewall: Loading of BPF program %s failed: %m", *bpf_fs_path);
+
+ r = set_ensure_consume(set, &bpf_program_hash_ops, TAKE_PTR(prog));
+ if (r < 0)
+ return log_oom();
+ }
+
+ return 0;
+}
+
+/* Load the unit's custom (user-supplied, pinned) ingress and egress filter
+ * programs into the kernel, if any are configured. */
+int bpf_firewall_load_custom(Unit *u) {
+ CGroupContext *cc;
+ int supported, r;
+
+ assert(u);
+
+ cc = unit_get_cgroup_context(u);
+ if (!cc)
+ return 0;
+
+ if (!cc->ip_filters_ingress && !cc->ip_filters_egress)
+ return 0;
+
+ supported = bpf_firewall_supported();
+ if (supported < 0)
+ return supported;
+
+ if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI)
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "bpf-firewall: BPF_F_ALLOW_MULTI not supported, cannot attach custom BPF programs.");
+
+ r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_ingress, &u->ip_bpf_custom_ingress);
+ if (r >= 0)
+ r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_egress, &u->ip_bpf_custom_egress);
+
+ return r < 0 ? r : 0;
+}
+
+/* Attach every custom program in @set to the cgroup @path for @attach_type,
+ * moving each into @set_installed so its lifetime tracks the installation.
+ * Any previously installed set is cleared (and thereby detached) first. */
+static int attach_custom_bpf_progs(Unit *u, const char *path, int attach_type, Set **set, Set **set_installed) {
+ BPFProgram *prog;
+ int r;
+
+ assert(u);
+
+ set_clear(*set_installed);
+ r = set_ensure_allocated(set_installed, &bpf_program_hash_ops);
+ if (r < 0)
+ return log_oom();
+
+ SET_FOREACH_MOVE(prog, *set_installed, *set) {
+ r = bpf_program_cgroup_attach(prog, attach_type, path, BPF_F_ALLOW_MULTI);
+ if (r < 0)
+ /* Fix: this helper handles both directions, so don't claim "egress" unconditionally. */
+ return log_unit_error_errno(u, r, "bpf-firewall: Attaching custom BPF program to cgroup %s failed: %m", path);
+ }
+ return 0;
+}
+
+/* Attach the previously compiled programs (and any loaded custom programs) to
+ * the unit's realized cgroup. With BPF_F_ALLOW_MULTI the old programs are
+ * only detached after the new ones are attached, so there is no window in
+ * which no filter is in effect. */
+int bpf_firewall_install(Unit *u) {
+ _cleanup_(bpf_program_freep) BPFProgram *ip_bpf_ingress_uninstall = NULL, *ip_bpf_egress_uninstall = NULL;
+ _cleanup_free_ char *path = NULL;
+ CGroupContext *cc;
+ int r, supported;
+ uint32_t flags;
+
+ assert(u);
+
+ cc = unit_get_cgroup_context(u);
+ if (!cc)
+ return -EINVAL;
+ if (!u->cgroup_path)
+ return -EINVAL;
+ if (!u->cgroup_realized)
+ return -EINVAL;
+
+ supported = bpf_firewall_supported();
+ if (supported < 0)
+ return supported;
+ if (supported == BPF_FIREWALL_UNSUPPORTED)
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "bpf-firewall: BPF firewalling not supported, proceeding without.");
+ if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE)
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "bpf-firewall: BPF_F_ALLOW_MULTI not supported, not doing BPF firewall on slice units.");
+ if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
+ (!set_isempty(u->ip_bpf_custom_ingress) || !set_isempty(u->ip_bpf_custom_egress)))
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "bpf-firewall: BPF_F_ALLOW_MULTI not supported, cannot attach custom BPF programs.");
+
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "bpf-firewall: Failed to determine cgroup path: %m");
+
+ flags = supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI ? BPF_F_ALLOW_MULTI : 0;
+
+ if (FLAGS_SET(flags, BPF_F_ALLOW_MULTI)) {
+ /* If we have BPF_F_ALLOW_MULTI, then let's clear the fields, but destroy the programs only
+ * after attaching the new programs, so that there's no time window where neither program is
+ * attached. (There will be a program where both are attached, but that's OK, since this is a
+ * security feature where we rather want to lock down too much than too little */
+ ip_bpf_egress_uninstall = TAKE_PTR(u->ip_bpf_egress_installed);
+ ip_bpf_ingress_uninstall = TAKE_PTR(u->ip_bpf_ingress_installed);
+ } else {
+ /* If we don't have BPF_F_ALLOW_MULTI then unref the old BPF programs (which will implicitly
+ * detach them) right before attaching the new program, to minimize the time window when we
+ * don't account for IP traffic. */
+ u->ip_bpf_egress_installed = bpf_program_free(u->ip_bpf_egress_installed);
+ u->ip_bpf_ingress_installed = bpf_program_free(u->ip_bpf_ingress_installed);
+ }
+
+ if (u->ip_bpf_egress) {
+ r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags);
+ if (r < 0)
+ return log_unit_error_errno(u, r,
+ "bpf-firewall: Attaching egress BPF program to cgroup %s failed: %m", path);
+
+ /* Remember that this BPF program is installed now. */
+ u->ip_bpf_egress_installed = TAKE_PTR(u->ip_bpf_egress);
+ }
+
+ if (u->ip_bpf_ingress) {
+ r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags);
+ if (r < 0)
+ return log_unit_error_errno(u, r,
+ "bpf-firewall: Attaching ingress BPF program to cgroup %s failed: %m", path);
+
+ u->ip_bpf_ingress_installed = TAKE_PTR(u->ip_bpf_ingress);
+ }
+
+ /* And now, definitely get rid of the old programs, and detach them */
+ ip_bpf_egress_uninstall = bpf_program_free(ip_bpf_egress_uninstall);
+ ip_bpf_ingress_uninstall = bpf_program_free(ip_bpf_ingress_uninstall);
+
+ r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_EGRESS, &u->ip_bpf_custom_egress, &u->ip_bpf_custom_egress_installed);
+ if (r < 0)
+ return r;
+
+ r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_INGRESS, &u->ip_bpf_custom_ingress, &u->ip_bpf_custom_ingress_installed);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+/* Read the byte and/or packet counter out of an accounting map; either output
+ * pointer may be NULL. The packet count is read into a local and only copied
+ * out at the end, so *ret_packets stays untouched if the later byte lookup
+ * fails (the byte lookup writes into *ret_bytes directly). */
+int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets) {
+ uint64_t key, packets;
+ int r;
+
+ if (map_fd < 0)
+ return -EBADF;
+
+ if (ret_packets) {
+ key = MAP_KEY_PACKETS;
+ r = bpf_map_lookup_element(map_fd, &key, &packets);
+ if (r < 0)
+ return r;
+ }
+
+ if (ret_bytes) {
+ key = MAP_KEY_BYTES;
+ r = bpf_map_lookup_element(map_fd, &key, ret_bytes);
+ if (r < 0)
+ return r;
+ }
+
+ if (ret_packets)
+ *ret_packets = packets;
+
+ return 0;
+}
+
+/* Zero both counters (packets and bytes) of an accounting map. */
+int bpf_firewall_reset_accounting(int map_fd) {
+ uint64_t value = 0;
+ int r;
+
+ if (map_fd < 0)
+ return -EBADF;
+
+ /* MAP_KEY_PACKETS and MAP_KEY_BYTES are the two consecutive array indices. */
+ for (uint64_t key = MAP_KEY_PACKETS; key <= MAP_KEY_BYTES; key++) {
+ r = bpf_map_update_element(map_fd, &key, &value);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+static int bpf_firewall_unsupported_reason = 0;
+
+/* Probe whether cgroup-BPF firewalling is usable, caching the result in a
+ * static. Returns BPF_FIREWALL_UNSUPPORTED, BPF_FIREWALL_SUPPORTED or
+ * BPF_FIREWALL_SUPPORTED_WITH_MULTI; negative only when the cgroup setup
+ * cannot be determined. The reason for unsupport is recorded in
+ * bpf_firewall_unsupported_reason for emit_bpf_firewall_warning(). */
+int bpf_firewall_supported(void) {
+ const struct bpf_insn trivial[] = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN()
+ };
+
+ _cleanup_(bpf_program_freep) BPFProgram *program = NULL;
+ static int supported = -1;
+ union bpf_attr attr;
+ int r;
+
+ /* Checks whether BPF firewalling is supported. For this, we check the following things:
+ *
+ * - whether the unified hierarchy is being used
+ * - the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_SKB programs, which we require
+ * - the BPF implementation in the kernel supports the BPF_PROG_DETACH call, which we require
+ */
+ if (supported >= 0)
+ return supported;
+
+ r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
+ if (r < 0)
+ return log_error_errno(r, "bpf-firewall: Can't determine whether the unified hierarchy is used: %m");
+ if (r == 0) {
+ bpf_firewall_unsupported_reason =
+ log_debug_errno(SYNTHETIC_ERRNO(EUCLEAN),
+ "bpf-firewall: Not running with unified cgroup hierarchy, BPF firewalling is not supported.");
+ return supported = BPF_FIREWALL_UNSUPPORTED;
+ }
+
+ /* prog_name is NULL since it is supported only starting from v4.15 kernel. */
+ r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, NULL, &program);
+ if (r < 0) {
+ bpf_firewall_unsupported_reason =
+ log_debug_errno(r, "bpf-firewall: Can't allocate CGROUP SKB BPF program, BPF firewalling is not supported: %m");
+ return supported = BPF_FIREWALL_UNSUPPORTED;
+ }
+
+ r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
+ if (r < 0) {
+ bpf_firewall_unsupported_reason =
+ log_debug_errno(r, "bpf-firewall: Can't add trivial instructions to CGROUP SKB BPF program, BPF firewalling is not supported: %m");
+ return supported = BPF_FIREWALL_UNSUPPORTED;
+ }
+
+ r = bpf_program_load_kernel(program, NULL, 0);
+ if (r < 0) {
+ bpf_firewall_unsupported_reason =
+ log_debug_errno(r, "bpf-firewall: Can't load kernel CGROUP SKB BPF program, BPF firewalling is not supported: %m");
+ return supported = BPF_FIREWALL_UNSUPPORTED;
+ }
+
+ /* Unfortunately the kernel allows us to create BPF_PROG_TYPE_CGROUP_SKB programs even when CONFIG_CGROUP_BPF
+ * is turned off at kernel compilation time. This sucks of course: why does it allow us to create a cgroup BPF
+ * program if we can't do a thing with it later?
+ *
+ * We detect this case by issuing the BPF_PROG_DETACH bpf() call with invalid file descriptors: if
+ * CONFIG_CGROUP_BPF is turned off, then the call will fail early with EINVAL. If it is turned on the
+ * parameters are validated however, and that'll fail with EBADF then. */
+
+ // FIXME: Clang doesn't 0-pad with structured initialization, causing
+ // the kernel to reject the bpf_attr as invalid. See:
+ // https://github.com/torvalds/linux/blob/v5.9/kernel/bpf/syscall.c#L65
+ // Ideally it should behave like GCC, so that we can remove these workarounds.
+ zero(attr);
+ attr.attach_type = BPF_CGROUP_INET_EGRESS;
+ attr.target_fd = -EBADF;
+ attr.attach_bpf_fd = -EBADF;
+
+ if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0) {
+ if (errno != EBADF) {
+ bpf_firewall_unsupported_reason =
+ log_debug_errno(errno, "bpf-firewall: Didn't get EBADF from BPF_PROG_DETACH, BPF firewalling is not supported: %m");
+ return supported = BPF_FIREWALL_UNSUPPORTED;
+ }
+
+ /* YAY! */
+ } else {
+ bpf_firewall_unsupported_reason =
+ log_debug_errno(SYNTHETIC_ERRNO(EBADE),
+ "bpf-firewall: Wut? Kernel accepted our invalid BPF_PROG_DETACH call? "
+ "Something is weird, assuming BPF firewalling is broken and hence not supported.");
+ return supported = BPF_FIREWALL_UNSUPPORTED;
+ }
+
+ /* So now we know that the BPF program is generally available, let's see if BPF_F_ALLOW_MULTI is also supported
+ * (which was added in kernel 4.15). We use a similar logic as before, but this time we use the BPF_PROG_ATTACH
+ * bpf() call and the BPF_F_ALLOW_MULTI flags value. Since the flags are checked early in the system call we'll
+ * get EINVAL if it's not supported, and EBADF as before if it is available.
+ * Use probe result as the indicator that program name is also supported since they both were
+ * added in kernel 4.15. */
+
+ zero(attr);
+ attr.attach_type = BPF_CGROUP_INET_EGRESS;
+ attr.target_fd = -EBADF;
+ attr.attach_bpf_fd = -EBADF;
+ attr.attach_flags = BPF_F_ALLOW_MULTI;
+
+ if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0) {
+ if (errno == EBADF) {
+ log_debug_errno(errno, "bpf-firewall: Got EBADF when using BPF_F_ALLOW_MULTI, which indicates it is supported. Yay!");
+ return supported = BPF_FIREWALL_SUPPORTED_WITH_MULTI;
+ }
+
+ if (errno == EINVAL)
+ log_debug_errno(errno, "bpf-firewall: Got EINVAL error when using BPF_F_ALLOW_MULTI, which indicates it's not supported.");
+ else
+ log_debug_errno(errno, "bpf-firewall: Got unexpected error when using BPF_F_ALLOW_MULTI, assuming it's not supported: %m");
+
+ return supported = BPF_FIREWALL_SUPPORTED;
+ } else {
+ bpf_firewall_unsupported_reason =
+ log_debug_errno(SYNTHETIC_ERRNO(EBADE),
+ "bpf-firewall: Wut? Kernel accepted our invalid BPF_PROG_ATTACH+BPF_F_ALLOW_MULTI call? "
+ "Something is weird, assuming BPF firewalling is broken and hence not supported.");
+ return supported = BPF_FIREWALL_UNSUPPORTED;
+ }
+}
+
+/* Warn (once per manager run) that a unit requested IP firewalling although
+ * it is unavailable, using the reason recorded by bpf_firewall_supported(). */
+void emit_bpf_firewall_warning(Unit *u) {
+ static bool warned = false;
+
+ assert(u);
+ assert(u->manager);
+
+ if (warned)
+ return;
+ if (MANAGER_IS_TEST_RUN(u->manager))
+ return;
+
+ /* Inside a container a privilege failure is expected; keep quiet then. */
+ int level = (ERRNO_IS_PRIVILEGE(bpf_firewall_unsupported_reason) && detect_container() > 0) ? LOG_DEBUG : LOG_WARNING;
+
+ log_unit_full_errno(u, level, bpf_firewall_unsupported_reason,
+ "unit configures an IP firewall, but %s.\n"
+ "(This warning is only shown for the first unit using IP firewalling.)",
+ getuid() != 0 ? "not running as root" :
+ "the local system does not support BPF/cgroup firewalling");
+ warned = true;
+}
+
+/* Release all per-unit BPF firewall state: access maps, accounting maps,
+ * compiled programs (pending and installed) and the custom program sets. */
+void bpf_firewall_close(Unit *u) {
+ assert(u);
+
+ u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
+ u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);
+ u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
+ u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);
+
+ u->ip_accounting_ingress_map_fd = safe_close(u->ip_accounting_ingress_map_fd);
+ u->ip_accounting_egress_map_fd = safe_close(u->ip_accounting_egress_map_fd);
+
+ u->ip_bpf_ingress = bpf_program_free(u->ip_bpf_ingress);
+ u->ip_bpf_egress = bpf_program_free(u->ip_bpf_egress);
+ u->ip_bpf_ingress_installed = bpf_program_free(u->ip_bpf_ingress_installed);
+ u->ip_bpf_egress_installed = bpf_program_free(u->ip_bpf_egress_installed);
+
+ u->ip_bpf_custom_ingress = set_free(u->ip_bpf_custom_ingress);
+ u->ip_bpf_custom_ingress_installed = set_free(u->ip_bpf_custom_ingress_installed);
+ u->ip_bpf_custom_egress = set_free(u->ip_bpf_custom_egress);
+ u->ip_bpf_custom_egress_installed = set_free(u->ip_bpf_custom_egress_installed);
+}
diff --git a/src/core/bpf-firewall.h b/src/core/bpf-firewall.h
new file mode 100644
index 0000000..58b401f
--- /dev/null
+++ b/src/core/bpf-firewall.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <inttypes.h>
+
+#include "unit.h"
+
+enum {
+ BPF_FIREWALL_UNSUPPORTED = 0,
+ BPF_FIREWALL_SUPPORTED = 1,
+ BPF_FIREWALL_SUPPORTED_WITH_MULTI = 2,
+};
+
+int bpf_firewall_supported(void);
+
+int bpf_firewall_compile(Unit *u);
+int bpf_firewall_install(Unit *u);
+int bpf_firewall_load_custom(Unit *u);
+
+int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets);
+int bpf_firewall_reset_accounting(int map_fd);
+
+void emit_bpf_firewall_warning(Unit *u);
+
+void bpf_firewall_close(Unit *u);
diff --git a/src/core/bpf-foreign.c b/src/core/bpf-foreign.c
new file mode 100644
index 0000000..cff2f61
--- /dev/null
+++ b/src/core/bpf-foreign.c
@@ -0,0 +1,154 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bpf-foreign.h"
+#include "bpf-program.h"
+#include "cgroup.h"
+#include "memory-util.h"
+#include "missing_magic.h"
+#include "mountpoint-util.h"
+#include "set.h"
+#include "stat-util.h"
+
+typedef struct BPFForeignKey BPFForeignKey;
+struct BPFForeignKey {
+ uint32_t prog_id;
+ uint32_t attach_type;
+};
+
+static int bpf_foreign_key_new(uint32_t prog_id,
+ enum bpf_attach_type attach_type,
+ BPFForeignKey **ret) {
+ _cleanup_free_ BPFForeignKey *p = NULL;
+
+ assert(ret);
+
+ p = new(BPFForeignKey, 1);
+ if (!p)
+ return -ENOMEM;
+
+ *p = (BPFForeignKey) {
+ .prog_id = prog_id,
+ .attach_type = attach_type,
+ };
+
+ *ret = TAKE_PTR(p);
+
+ return 0;
+}
+
+static int bpf_foreign_key_compare_func(const BPFForeignKey *a, const BPFForeignKey *b) {
+ int r = CMP(a->prog_id, b->prog_id);
+ if (r != 0)
+ return r;
+
+ return CMP(a->attach_type, b->attach_type);
+}
+
+static void bpf_foreign_key_hash_func(const BPFForeignKey *p, struct siphash *h) {
+ siphash24_compress(&p->prog_id, sizeof(p->prog_id), h);
+ siphash24_compress(&p->attach_type, sizeof(p->attach_type), h);
+}
+
+DEFINE_PRIVATE_HASH_OPS_FULL(bpf_foreign_by_key_hash_ops,
+ BPFForeignKey, bpf_foreign_key_hash_func, bpf_foreign_key_compare_func, free,
+ BPFProgram, bpf_program_free);
+
+static int attach_programs(Unit *u, const char *path, Hashmap* foreign_by_key, uint32_t attach_flags) {
+ const BPFForeignKey *key;
+ BPFProgram *prog;
+ int r, ret = 0;
+
+ assert(u);
+
+ HASHMAP_FOREACH_KEY(prog, key, foreign_by_key) {
+ r = bpf_program_cgroup_attach(prog, key->attach_type, path, attach_flags);
+ if (r < 0) {
+ log_unit_error_errno(u, r, "bpf-foreign: Attaching foreign BPF program to cgroup %s failed: %m", path);
+ if (ret >= 0)
+ ret = r;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Prepare foreign BPF program for installation:
+ * - Load the program from BPF filesystem to the kernel;
+ * - Store program FD identified by program ID and attach type in the unit.
+ */
+static int bpf_foreign_prepare(
+ Unit *u,
+ enum bpf_attach_type attach_type,
+ const char *bpffs_path) {
+ _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
+ _cleanup_free_ BPFForeignKey *key = NULL;
+ uint32_t prog_id;
+ int r;
+
+ assert(u);
+ assert(bpffs_path);
+
+ r = path_is_fs_type(bpffs_path, BPF_FS_MAGIC);
+ if (r == -ENOENT) {
+ log_unit_warning_errno(u, r, "bpf-foreign: foreign program %s does not exist, skipping.", bpffs_path);
+ return 0;
+ }
+ if (r < 0)
+ return log_unit_error_errno(u, r,
+ "bpf-foreign: Failed to determine filesystem type of %s: %m", bpffs_path);
+ if (r == 0)
+ return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL),
+ "bpf-foreign: Path in BPF filesystem is expected.");
+
+ r = bpf_program_new_from_bpffs_path(bpffs_path, &prog);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "bpf-foreign: Failed to create foreign BPF program: %m");
+
+ r = bpf_program_get_id_by_fd(prog->kernel_fd, &prog_id);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "bpf-foreign: Failed to get BPF program id from fd: %m");
+
+ r = bpf_foreign_key_new(prog_id, attach_type, &key);
+ if (r < 0)
+ return log_unit_error_errno(u, r,
+ "bpf-foreign: Failed to create foreign BPF program key from path '%s': %m", bpffs_path);
+
+ r = hashmap_ensure_put(&u->bpf_foreign_by_key, &bpf_foreign_by_key_hash_ops, key, prog);
+ if (r == -EEXIST) {
+ log_unit_warning_errno(u, r, "bpf-foreign: Foreign BPF program already exists, ignoring: %m");
+ return 0;
+ }
+ if (r < 0)
+ return log_unit_error_errno(u, r, "bpf-foreign: Failed to put foreign BPF program into map: %m");
+
+ TAKE_PTR(key);
+ TAKE_PTR(prog);
+
+ return 0;
+}
+
+int bpf_foreign_install(Unit *u) {
+ _cleanup_free_ char *cgroup_path = NULL;
+ CGroupContext *cc;
+ int r, ret = 0;
+
+ assert(u);
+
+ cc = unit_get_cgroup_context(u);
+ if (!cc)
+ return 0;
+
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &cgroup_path);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "bpf-foreign: Failed to get cgroup path: %m");
+
+ LIST_FOREACH(programs, p, cc->bpf_foreign_programs) {
+ r = bpf_foreign_prepare(u, p->attach_type, p->bpffs_path);
+ if (r < 0 && ret >= 0)
+ ret = r;
+ }
+
+ r = attach_programs(u, cgroup_path, u->bpf_foreign_by_key, BPF_F_ALLOW_MULTI);
+ return ret < 0 ? ret : r;
+}
diff --git a/src/core/bpf-foreign.h b/src/core/bpf-foreign.h
new file mode 100644
index 0000000..e387b1b
--- /dev/null
+++ b/src/core/bpf-foreign.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#pragma once
+
+#include "unit.h"
+
+static inline int bpf_foreign_supported(void) {
+ return cg_all_unified();
+}
+
+/*
+ * Attach cgroup-bpf programs foreign to systemd, i.e. loaded to the kernel by an entity
+ * external to systemd.
+ */
+int bpf_foreign_install(Unit *u);
diff --git a/src/core/bpf-lsm.c b/src/core/bpf-lsm.c
new file mode 100644
index 0000000..216fc34
--- /dev/null
+++ b/src/core/bpf-lsm.c
@@ -0,0 +1,320 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/types.h>
+#include <sys/resource.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "bpf-lsm.h"
+#include "cgroup-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "filesystems.h"
+#include "log.h"
+#include "lsm-util.h"
+#include "manager.h"
+#include "mkdir.h"
+#include "nulstr-util.h"
+#include "stat-util.h"
+#include "strv.h"
+
+#if BPF_FRAMEWORK
+/* libbpf, clang and llc compile time dependencies are satisfied */
+#include "bpf-dlopen.h"
+#include "bpf-link.h"
+#include "bpf-util.h"
+#include "bpf/restrict_fs/restrict-fs-skel.h"
+
+#define CGROUP_HASH_SIZE_MAX 2048
+
+static struct restrict_fs_bpf *restrict_fs_bpf_free(struct restrict_fs_bpf *obj) {
+ /* restrict_fs_bpf__destroy handles object == NULL case */
+ (void) restrict_fs_bpf__destroy(obj);
+
+ return NULL;
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(struct restrict_fs_bpf *, restrict_fs_bpf_free);
+
+static bool bpf_can_link_lsm_program(struct bpf_program *prog) {
+ _cleanup_(bpf_link_freep) struct bpf_link *link = NULL;
+
+ assert(prog);
+
+ link = sym_bpf_program__attach_lsm(prog);
+
+ /* If bpf_program__attach_lsm fails the resulting value stores libbpf error code instead of memory
+ * pointer. That is the case when the helper is called on architectures where BPF trampoline (hence
+ * BPF_LSM_MAC attach type) is not supported. */
+ return sym_libbpf_get_error(link) == 0;
+}
+
+static int prepare_restrict_fs_bpf(struct restrict_fs_bpf **ret_obj) {
+ _cleanup_(restrict_fs_bpf_freep) struct restrict_fs_bpf *obj = NULL;
+ _cleanup_close_ int inner_map_fd = -EBADF;
+ int r;
+
+ assert(ret_obj);
+
+ obj = restrict_fs_bpf__open();
+ if (!obj)
+ return log_error_errno(errno, "bpf-lsm: Failed to open BPF object: %m");
+
+ /* TODO Maybe choose a number based on runtime information? */
+ r = sym_bpf_map__set_max_entries(obj->maps.cgroup_hash, CGROUP_HASH_SIZE_MAX);
+ assert(r <= 0);
+ if (r < 0)
+ return log_error_errno(r, "bpf-lsm: Failed to resize BPF map '%s': %m",
+ sym_bpf_map__name(obj->maps.cgroup_hash));
+
+ /* Dummy map to satisfy the verifier */
+ inner_map_fd = compat_bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(uint32_t), sizeof(uint32_t), 128U, NULL);
+ if (inner_map_fd < 0)
+ return log_error_errno(errno, "bpf-lsm: Failed to create BPF map: %m");
+
+ r = sym_bpf_map__set_inner_map_fd(obj->maps.cgroup_hash, inner_map_fd);
+ assert(r <= 0);
+ if (r < 0)
+ return log_error_errno(r, "bpf-lsm: Failed to set inner map fd: %m");
+
+ r = restrict_fs_bpf__load(obj);
+ assert(r <= 0);
+ if (r < 0)
+ return log_error_errno(r, "bpf-lsm: Failed to load BPF object: %m");
+
+ *ret_obj = TAKE_PTR(obj);
+
+ return 0;
+}
+
+bool lsm_bpf_supported(bool initialize) {
+ _cleanup_(restrict_fs_bpf_freep) struct restrict_fs_bpf *obj = NULL;
+ static int supported = -1;
+ int r;
+
+ if (supported >= 0)
+ return supported;
+ if (!initialize)
+ return false;
+
+ if (!cgroup_bpf_supported())
+ return (supported = false);
+
+ r = lsm_supported("bpf");
+ if (r < 0) {
+ log_warning_errno(r, "bpf-lsm: Can't determine whether the BPF LSM module is used: %m");
+ return (supported = false);
+ }
+ if (r == 0) {
+ log_info_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "bpf-lsm: BPF LSM hook not enabled in the kernel, BPF LSM not supported");
+ return (supported = false);
+ }
+
+ r = prepare_restrict_fs_bpf(&obj);
+ if (r < 0)
+ return (supported = false);
+
+ if (!bpf_can_link_lsm_program(obj->progs.restrict_filesystems)) {
+ log_warning_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "bpf-lsm: Failed to link program; assuming BPF LSM is not available");
+ return (supported = false);
+ }
+
+ return (supported = true);
+}
+
+int lsm_bpf_setup(Manager *m) {
+ _cleanup_(restrict_fs_bpf_freep) struct restrict_fs_bpf *obj = NULL;
+ _cleanup_(bpf_link_freep) struct bpf_link *link = NULL;
+ int r;
+
+ assert(m);
+
+ r = prepare_restrict_fs_bpf(&obj);
+ if (r < 0)
+ return r;
+
+ link = sym_bpf_program__attach_lsm(obj->progs.restrict_filesystems);
+ r = sym_libbpf_get_error(link);
+ if (r != 0)
+ return log_error_errno(r, "bpf-lsm: Failed to link '%s' LSM BPF program: %m",
+ sym_bpf_program__name(obj->progs.restrict_filesystems));
+
+ log_info("bpf-lsm: LSM BPF program attached");
+
+ obj->links.restrict_filesystems = TAKE_PTR(link);
+ m->restrict_fs = TAKE_PTR(obj);
+
+ return 0;
+}
+
+int lsm_bpf_restrict_filesystems(const Set *filesystems, uint64_t cgroup_id, int outer_map_fd, bool allow_list) {
+ uint32_t dummy_value = 1, zero = 0;
+ const char *fs;
+ const statfs_f_type_t *magic;
+ int r;
+
+ assert(filesystems);
+ assert(outer_map_fd >= 0);
+
+ int inner_map_fd = compat_bpf_map_create(
+ BPF_MAP_TYPE_HASH,
+ NULL,
+ sizeof(uint32_t),
+ sizeof(uint32_t),
+ 128U, /* Should be enough for all filesystem types */
+ NULL);
+ if (inner_map_fd < 0)
+ return log_error_errno(errno, "bpf-lsm: Failed to create inner BPF map: %m");
+
+ if (sym_bpf_map_update_elem(outer_map_fd, &cgroup_id, &inner_map_fd, BPF_ANY) != 0)
+ return log_error_errno(errno, "bpf-lsm: Error populating BPF map: %m");
+
+ uint32_t allow = allow_list;
+
+ /* Use key 0 to store whether this is an allow list or a deny list */
+ if (sym_bpf_map_update_elem(inner_map_fd, &zero, &allow, BPF_ANY) != 0)
+ return log_error_errno(errno, "bpf-lsm: Error initializing map: %m");
+
+ SET_FOREACH(fs, filesystems) {
+ r = fs_type_from_string(fs, &magic);
+ if (r < 0) {
+ log_warning("bpf-lsm: Invalid filesystem name '%s', ignoring.", fs);
+ continue;
+ }
+
+ log_debug("bpf-lsm: Restricting filesystem access to '%s'", fs);
+
+ for (int i = 0; i < FILESYSTEM_MAGIC_MAX; i++) {
+ if (magic[i] == 0)
+ break;
+
+ if (sym_bpf_map_update_elem(inner_map_fd, &magic[i], &dummy_value, BPF_ANY) != 0) {
+ r = log_error_errno(errno, "bpf-lsm: Failed to update BPF map: %m");
+
+ if (sym_bpf_map_delete_elem(outer_map_fd, &cgroup_id) != 0)
+ log_debug_errno(errno, "bpf-lsm: Failed to delete cgroup entry from BPF map: %m");
+
+ return r;
+ }
+ }
+ }
+
+ return 0;
+}
+
+int lsm_bpf_cleanup(const Unit *u) {
+ assert(u);
+ assert(u->manager);
+
+ /* If we never successfully detected support, there is nothing to clean up. */
+ if (!lsm_bpf_supported(/* initialize = */ false))
+ return 0;
+
+ if (!u->manager->restrict_fs)
+ return 0;
+
+ if (u->cgroup_id == 0)
+ return 0;
+
+ int fd = sym_bpf_map__fd(u->manager->restrict_fs->maps.cgroup_hash);
+ if (fd < 0)
+ return log_unit_error_errno(u, errno, "bpf-lsm: Failed to get BPF map fd: %m");
+
+ if (sym_bpf_map_delete_elem(fd, &u->cgroup_id) != 0 && errno != ENOENT)
+ return log_unit_debug_errno(u, errno, "bpf-lsm: Failed to delete cgroup entry from LSM BPF map: %m");
+
+ return 0;
+}
+
+int lsm_bpf_map_restrict_fs_fd(Unit *unit) {
+ assert(unit);
+ assert(unit->manager);
+
+ if (!unit->manager->restrict_fs)
+ return -ENOMEDIUM;
+
+ return sym_bpf_map__fd(unit->manager->restrict_fs->maps.cgroup_hash);
+}
+
+void lsm_bpf_destroy(struct restrict_fs_bpf *prog) {
+ restrict_fs_bpf__destroy(prog);
+}
+#else /* ! BPF_FRAMEWORK */
+bool lsm_bpf_supported(bool initialize) {
+ return false;
+}
+
+int lsm_bpf_setup(Manager *m) {
+ return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "bpf-lsm: Failed to set up LSM BPF: %m");
+}
+
+int lsm_bpf_restrict_filesystems(const Set *filesystems, uint64_t cgroup_id, int outer_map_fd, const bool allow_list) {
+ return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "bpf-lsm: Failed to restrict filesystems using LSM BPF: %m");
+}
+
+int lsm_bpf_cleanup(const Unit *u) {
+ return 0;
+}
+
+int lsm_bpf_map_restrict_fs_fd(Unit *unit) {
+ return -ENOMEDIUM;
+}
+
+void lsm_bpf_destroy(struct restrict_fs_bpf *prog) {
+ return;
+}
+#endif
+
+int lsm_bpf_parse_filesystem(
+ const char *name,
+ Set **filesystems,
+ FilesystemParseFlags flags,
+ const char *unit,
+ const char *filename,
+ unsigned line) {
+ int r;
+
+ assert(name);
+ assert(filesystems);
+
+ if (name[0] == '@') {
+ const FilesystemSet *set;
+
+ set = filesystem_set_find(name);
+ if (!set) {
+ log_syntax(unit, flags & FILESYSTEM_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
+ "bpf-lsm: Unknown filesystem group, ignoring: %s", name);
+ return 0;
+ }
+
+ NULSTR_FOREACH(i, set->value) {
+ /* Call ourselves again, for the group to parse. Note that we downgrade logging here
+ * (i.e. take away the FILESYSTEM_PARSE_LOG flag) since any issues in the group table
+ * are our own problem, not a problem in user configuration data and we shouldn't
+ * pretend otherwise by complaining about them. */
+ r = lsm_bpf_parse_filesystem(i, filesystems, flags &~ FILESYSTEM_PARSE_LOG, unit, filename, line);
+ if (r < 0)
+ return r;
+ }
+ } else {
+ /* If we previously wanted to forbid access to a filesystem and now
+ * we want to allow it, then remove it from the list. */
+ if (!(flags & FILESYSTEM_PARSE_INVERT) == !!(flags & FILESYSTEM_PARSE_ALLOW_LIST)) {
+ r = set_put_strdup(filesystems, name);
+ if (r == -ENOMEM)
+ return flags & FILESYSTEM_PARSE_LOG ? log_oom() : -ENOMEM;
+ if (r < 0 && r != -EEXIST) /* When already in set, ignore */
+ return r;
+ } else
+ free(set_remove(*filesystems, name));
+ }
+
+ return 0;
+}
diff --git a/src/core/bpf-lsm.h b/src/core/bpf-lsm.h
new file mode 100644
index 0000000..a6eda19
--- /dev/null
+++ b/src/core/bpf-lsm.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "hashmap.h"
+
+typedef enum FilesystemParseFlags {
+ FILESYSTEM_PARSE_INVERT = 1 << 0,
+ FILESYSTEM_PARSE_ALLOW_LIST = 1 << 1,
+ FILESYSTEM_PARSE_LOG = 1 << 2,
+} FilesystemParseFlags;
+
+typedef struct Unit Unit;
+typedef struct Manager Manager;
+
+typedef struct restrict_fs_bpf restrict_fs_bpf;
+
+bool lsm_bpf_supported(bool initialize);
+int lsm_bpf_setup(Manager *m);
+int lsm_bpf_restrict_filesystems(const Set *filesystems, uint64_t cgroup_id, int outer_map_fd, bool allow_list);
+int lsm_bpf_cleanup(const Unit *u);
+int lsm_bpf_map_restrict_fs_fd(Unit *u);
+void lsm_bpf_destroy(struct restrict_fs_bpf *prog);
+int lsm_bpf_parse_filesystem(const char *name,
+ Set **filesystems,
+ FilesystemParseFlags flags,
+ const char *unit,
+ const char *filename,
+ unsigned line);
diff --git a/src/core/bpf-socket-bind.c b/src/core/bpf-socket-bind.c
new file mode 100644
index 0000000..9f290ab
--- /dev/null
+++ b/src/core/bpf-socket-bind.c
@@ -0,0 +1,244 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#if BPF_FRAMEWORK
+#include <bpf/bpf.h>
+#endif
+
+#include "fd-util.h"
+#include "bpf-socket-bind.h"
+
+#if BPF_FRAMEWORK
+/* libbpf, clang, llvm and bpftool compile time dependencies are satisfied */
+#include "bpf-dlopen.h"
+#include "bpf-link.h"
+#include "bpf-util.h"
+#include "bpf/socket_bind/socket-bind-api.bpf.h"
+#include "bpf/socket_bind/socket-bind-skel.h"
+
+static struct socket_bind_bpf *socket_bind_bpf_free(struct socket_bind_bpf *obj) {
+ /* socket_bind_bpf__destroy handles object == NULL case */
+ (void) socket_bind_bpf__destroy(obj);
+
+ return NULL;
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(struct socket_bind_bpf *, socket_bind_bpf_free);
+
+static int update_rules_map(
+ int map_fd,
+ CGroupSocketBindItem *head) {
+
+ uint32_t i = 0;
+
+ assert(map_fd >= 0);
+
+ LIST_FOREACH(socket_bind_items, item, head) {
+ struct socket_bind_rule val = {
+ .address_family = (uint32_t) item->address_family,
+ .protocol = item->ip_protocol,
+ .nr_ports = item->nr_ports,
+ .port_min = item->port_min,
+ };
+
+ uint32_t key = i++;
+
+ if (sym_bpf_map_update_elem(map_fd, &key, &val, BPF_ANY) != 0)
+ return -errno;
+ }
+
+ return 0;
+}
+
+static int prepare_socket_bind_bpf(
+ Unit *u,
+ CGroupSocketBindItem *allow,
+ CGroupSocketBindItem *deny,
+ struct socket_bind_bpf **ret_obj) {
+
+ _cleanup_(socket_bind_bpf_freep) struct socket_bind_bpf *obj = NULL;
+ size_t allow_count = 0, deny_count = 0;
+ int allow_map_fd, deny_map_fd, r;
+
+ assert(ret_obj);
+
+ LIST_FOREACH(socket_bind_items, item, allow)
+ allow_count++;
+
+ LIST_FOREACH(socket_bind_items, item, deny)
+ deny_count++;
+
+ if (allow_count > SOCKET_BIND_MAX_RULES)
+ return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, SYNTHETIC_ERRNO(EINVAL),
+ "bpf-socket-bind: Maximum number of socket bind rules=%i is exceeded", SOCKET_BIND_MAX_RULES);
+
+ if (deny_count > SOCKET_BIND_MAX_RULES)
+ return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, SYNTHETIC_ERRNO(EINVAL),
+ "bpf-socket-bind: Maximum number of socket bind rules=%i is exceeded", SOCKET_BIND_MAX_RULES);
+
+ obj = socket_bind_bpf__open();
+ if (!obj)
+ return log_unit_full_errno(u, u ? LOG_ERR : LOG_DEBUG, errno, "bpf-socket-bind: Failed to open BPF object: %m");
+
+ if (sym_bpf_map__set_max_entries(obj->maps.sd_bind_allow, MAX(allow_count, 1u)) != 0)
+ return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, errno,
+ "bpf-socket-bind: Failed to resize BPF map '%s': %m", sym_bpf_map__name(obj->maps.sd_bind_allow));
+
+ if (sym_bpf_map__set_max_entries(obj->maps.sd_bind_deny, MAX(deny_count, 1u)) != 0)
+ return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, errno,
+ "bpf-socket-bind: Failed to resize BPF map '%s': %m", sym_bpf_map__name(obj->maps.sd_bind_deny));
+
+ if (socket_bind_bpf__load(obj) != 0)
+ return log_unit_full_errno(u, u ? LOG_ERR : LOG_DEBUG, errno,
+ "bpf-socket-bind: Failed to load BPF object: %m");
+
+ allow_map_fd = sym_bpf_map__fd(obj->maps.sd_bind_allow);
+ assert(allow_map_fd >= 0);
+
+ r = update_rules_map(allow_map_fd, allow);
+ if (r < 0)
+ return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, r,
+ "bpf-socket-bind: Failed to put socket bind allow rules into BPF map '%s'",
+ sym_bpf_map__name(obj->maps.sd_bind_allow));
+
+ deny_map_fd = sym_bpf_map__fd(obj->maps.sd_bind_deny);
+ assert(deny_map_fd >= 0);
+
+ r = update_rules_map(deny_map_fd, deny);
+ if (r < 0)
+ return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, r,
+ "bpf-socket-bind: Failed to put socket bind deny rules into BPF map '%s'",
+ sym_bpf_map__name(obj->maps.sd_bind_deny));
+
+ *ret_obj = TAKE_PTR(obj);
+ return 0;
+}
+
+int bpf_socket_bind_supported(void) {
+ _cleanup_(socket_bind_bpf_freep) struct socket_bind_bpf *obj = NULL;
+ int r;
+
+ if (!cgroup_bpf_supported())
+ return false;
+
+ if (!compat_libbpf_probe_bpf_prog_type(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, /*opts=*/NULL)) {
+ log_debug("bpf-socket-bind: BPF program type cgroup_sock_addr is not supported");
+ return false;
+ }
+
+ r = prepare_socket_bind_bpf(/*unit=*/NULL, /*allow_rules=*/NULL, /*deny_rules=*/NULL, &obj);
+ if (r < 0) {
+ log_debug_errno(r, "bpf-socket-bind: socket bind filtering is not supported: %m");
+ return false;
+ }
+
+ return bpf_can_link_program(obj->progs.sd_bind4);
+}
+
+int bpf_socket_bind_add_initial_link_fd(Unit *u, int fd) {
+ int r;
+
+ assert(u);
+
+ if (!u->initial_socket_bind_link_fds) {
+ u->initial_socket_bind_link_fds = fdset_new();
+ if (!u->initial_socket_bind_link_fds)
+ return log_oom();
+ }
+
+ r = fdset_put(u->initial_socket_bind_link_fds, fd);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to put BPF fd %d to initial fdset", fd);
+
+ return 0;
+}
+
+static int socket_bind_install_impl(Unit *u) {
+ _cleanup_(bpf_link_freep) struct bpf_link *ipv4 = NULL, *ipv6 = NULL;
+ _cleanup_(socket_bind_bpf_freep) struct socket_bind_bpf *obj = NULL;
+ _cleanup_free_ char *cgroup_path = NULL;
+ _cleanup_close_ int cgroup_fd = -EBADF;
+ CGroupContext *cc;
+ int r;
+
+ assert(u);
+
+ cc = unit_get_cgroup_context(u);
+ if (!cc)
+ return 0;
+
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &cgroup_path);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to get cgroup path: %m");
+
+ if (!cc->socket_bind_allow && !cc->socket_bind_deny)
+ return 0;
+
+ r = prepare_socket_bind_bpf(u, cc->socket_bind_allow, cc->socket_bind_deny, &obj);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to load BPF object: %m");
+
+ cgroup_fd = open(cgroup_path, O_RDONLY | O_CLOEXEC, 0);
+ if (cgroup_fd < 0)
+ return log_unit_error_errno(u, errno, "bpf-socket-bind: Failed to open cgroup %s for reading: %m", cgroup_path);
+
+ ipv4 = sym_bpf_program__attach_cgroup(obj->progs.sd_bind4, cgroup_fd);
+ r = sym_libbpf_get_error(ipv4);
+ if (r != 0)
+ return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to link '%s' cgroup-bpf program: %m",
+ sym_bpf_program__name(obj->progs.sd_bind4));
+
+ ipv6 = sym_bpf_program__attach_cgroup(obj->progs.sd_bind6, cgroup_fd);
+ r = sym_libbpf_get_error(ipv6);
+ if (r != 0)
+ return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to link '%s' cgroup-bpf program: %m",
+ sym_bpf_program__name(obj->progs.sd_bind6));
+
+ u->ipv4_socket_bind_link = TAKE_PTR(ipv4);
+ u->ipv6_socket_bind_link = TAKE_PTR(ipv6);
+
+ return 0;
+}
+
+int bpf_socket_bind_install(Unit *u) {
+ int r;
+
+ assert(u);
+
+ r = socket_bind_install_impl(u);
+ if (r == -ENOMEM)
+ return r;
+
+ fdset_close(u->initial_socket_bind_link_fds);
+ return r;
+}
+
+int bpf_serialize_socket_bind(Unit *u, FILE *f, FDSet *fds) {
+ int r;
+
+ assert(u);
+
+ r = bpf_serialize_link(f, fds, "ipv4-socket-bind-bpf-link", u->ipv4_socket_bind_link);
+ if (r < 0)
+ return r;
+
+ return bpf_serialize_link(f, fds, "ipv6-socket-bind-bpf-link", u->ipv6_socket_bind_link);
+}
+
+#else /* ! BPF_FRAMEWORK */
+int bpf_socket_bind_supported(void) {
+ return false;
+}
+
+int bpf_socket_bind_add_initial_link_fd(Unit *u, int fd) {
+ return 0;
+}
+
+int bpf_socket_bind_install(Unit *u) {
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "bpf-socket-bind: Failed to install; BPF framework is not supported");
+}
+
+int bpf_serialize_socket_bind(Unit *u, FILE *f, FDSet *fds) {
+ return 0;
+}
+#endif
diff --git a/src/core/bpf-socket-bind.h b/src/core/bpf-socket-bind.h
new file mode 100644
index 0000000..7d426df
--- /dev/null
+++ b/src/core/bpf-socket-bind.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "fdset.h"
+#include "unit.h"
+
+int bpf_socket_bind_supported(void);
+
+/* Add BPF link fd created before daemon-reload or daemon-reexec. FDs will be closed at the end of
+ * socket_bind_install. */
+int bpf_socket_bind_add_initial_link_fd(Unit *u, int fd);
+
+int bpf_socket_bind_install(Unit *u);
+
+int bpf_serialize_socket_bind(Unit *u, FILE *f, FDSet *fds);
diff --git a/src/core/bpf-util.c b/src/core/bpf-util.c
new file mode 100644
index 0000000..6fe229e
--- /dev/null
+++ b/src/core/bpf-util.c
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bpf-dlopen.h"
+#include "bpf-util.h"
+#include "cgroup-util.h"
+#include "initrd-util.h"
+#include "log.h"
+
+bool cgroup_bpf_supported(void) {
+ static int supported = -1;
+ int r;
+
+ if (supported >= 0)
+ return supported;
+
+ r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
+ if (r < 0) {
+ log_warning_errno(r, "Can't determine whether the unified hierarchy is used: %m");
+ return (supported = false);
+ }
+
+ if (r == 0) {
+ log_info_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "Not running with unified cgroup hierarchy, disabling cgroup BPF features.");
+ return (supported = false);
+ }
+
+ r = dlopen_bpf();
+ if (r < 0) {
+ log_full_errno(in_initrd() ? LOG_DEBUG : LOG_INFO,
+ r, "Failed to open libbpf, cgroup BPF features disabled: %m");
+ return (supported = false);
+ }
+
+ return (supported = true);
+}
diff --git a/src/core/bpf-util.h b/src/core/bpf-util.h
new file mode 100644
index 0000000..a6c55cd
--- /dev/null
+++ b/src/core/bpf-util.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <stdbool.h>
+
+bool cgroup_bpf_supported(void);
diff --git a/src/core/bpf/restrict_fs/meson.build b/src/core/bpf/restrict_fs/meson.build
new file mode 100644
index 0000000..69cde02
--- /dev/null
+++ b/src/core/bpf/restrict_fs/meson.build
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+if conf.get('BPF_FRAMEWORK') != 1
+ subdir_done()
+endif
+
+restrict_fs_bpf_o_unstripped = custom_target(
+ 'restrict-fs.bpf.unstripped.o',
+ input : 'restrict-fs.bpf.c',
+ output : 'restrict-fs.bpf.unstripped.o',
+ command : bpf_o_unstripped_cmd)
+
+restrict_fs_bpf_o = custom_target(
+ 'restrict-fs.bpf.o',
+ input : restrict_fs_bpf_o_unstripped,
+ output : 'restrict-fs.bpf.o',
+ command : bpf_o_cmd)
+
+restrict_fs_skel_h = custom_target(
+ 'restrict-fs.skel.h',
+ input : restrict_fs_bpf_o,
+ output : 'restrict-fs.skel.h',
+ command : skel_h_cmd,
+ capture : true)
diff --git a/src/core/bpf/restrict_fs/restrict-fs-skel.h b/src/core/bpf/restrict_fs/restrict-fs-skel.h
new file mode 100644
index 0000000..412cf62
--- /dev/null
+++ b/src/core/bpf/restrict_fs/restrict-fs-skel.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* The SPDX header above is actually correct in claiming this was
+ * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that
+ * compatible with GPL we will claim this to be GPL however, which should be
+ * fine given that LGPL-2.1-or-later downgrades to GPL if needed.
+ */
+
+/* libbpf is used via dlopen(), so rename symbols */
+#define bpf_object__open_skeleton sym_bpf_object__open_skeleton
+#define bpf_object__load_skeleton sym_bpf_object__load_skeleton
+#define bpf_object__destroy_skeleton sym_bpf_object__destroy_skeleton
+
+#include "bpf/restrict_fs/restrict-fs.skel.h"
diff --git a/src/core/bpf/restrict_fs/restrict-fs.bpf.c b/src/core/bpf/restrict_fs/restrict-fs.bpf.c
new file mode 100644
index 0000000..eb5ed3e
--- /dev/null
+++ b/src/core/bpf/restrict_fs/restrict-fs.bpf.c
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* The SPDX header above is actually correct in claiming this was
+ * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that
+ * compatible with GPL we will claim this to be GPL however, which should be
+ * fine given that LGPL-2.1-or-later downgrades to GPL if needed.
+ */
+
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+#include <errno.h>
+#include <stddef.h>
+#include <stdint.h>
+
+struct super_block {
+ unsigned long int s_magic;
+} __attribute__((preserve_access_index));
+
+struct inode {
+ struct super_block *i_sb;
+} __attribute__((preserve_access_index));
+
+struct file {
+ struct inode *f_inode;
+} __attribute__((preserve_access_index));
+
+/*
+ * max_entries is set from user space with the bpf_map__set_max_entries helper.
+ * */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
+ __type(key, uint64_t); /* cgroup ID */
+ __type(value, uint32_t); /* fs magic set */
+} cgroup_hash SEC(".maps");
+
+SEC("lsm/file_open")
+int BPF_PROG(restrict_filesystems, struct file *file, int ret)
+{
+ unsigned long raw_magic_number;
+ uint64_t cgroup_id;
+ uint32_t *value, *magic_map, magic_number, zero = 0, *is_allow;
+
+ /* ret is the return value from the previous BPF program or 0 if it's
+ * the first hook */
+ if (ret != 0)
+ return ret;
+
+ BPF_CORE_READ_INTO(&raw_magic_number, file, f_inode, i_sb, s_magic);
+ /* super_block.s_magic is unsigned long, but magic_map keys are
+ * uint32_t. Using s_magic as-is would fail on big-endian systems,
+ * which have 64-bit unsigned long. So cast it. */
+ magic_number = (uint32_t)raw_magic_number;
+
+ cgroup_id = bpf_get_current_cgroup_id();
+
+ magic_map = bpf_map_lookup_elem(&cgroup_hash, &cgroup_id);
+ if (!magic_map)
+ return 0;
+
+ is_allow = bpf_map_lookup_elem(magic_map, &zero);
+ if (!is_allow)
+ /* Malformed map, it doesn't include whether it's an allow list
+ * or a deny list. Allow. */
+ return 0;
+
+ if (*is_allow) {
+ /* Allow-list: Allow access only if magic_number present in inner map */
+ if (!bpf_map_lookup_elem(magic_map, &magic_number))
+ return -EPERM;
+ } else {
+ /* Deny-list: Allow access only if magic_number is not present in inner map */
+ if (bpf_map_lookup_elem(magic_map, &magic_number))
+ return -EPERM;
+ }
+
+ return 0;
+}
+
+static const char _license[] SEC("license") = "GPL";
diff --git a/src/core/bpf/restrict_ifaces/meson.build b/src/core/bpf/restrict_ifaces/meson.build
new file mode 100644
index 0000000..5f36178
--- /dev/null
+++ b/src/core/bpf/restrict_ifaces/meson.build
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+if conf.get('BPF_FRAMEWORK') != 1
+ subdir_done()
+endif
+
+restrict_ifaces_bpf_o_unstripped = custom_target(
+ 'restrict-ifaces.bpf.unstripped.o',
+ input : 'restrict-ifaces.bpf.c',
+ output : 'restrict-ifaces.bpf.unstripped.o',
+ command : bpf_o_unstripped_cmd)
+
+restrict_ifaces_bpf_o = custom_target(
+ 'restrict-ifaces.bpf.o',
+ input : restrict_ifaces_bpf_o_unstripped,
+ output : 'restrict-ifaces.bpf.o',
+ command : bpf_o_cmd)
+
+restrict_ifaces_skel_h = custom_target(
+ 'restrict-ifaces.skel.h',
+ input : restrict_ifaces_bpf_o,
+ output : 'restrict-ifaces.skel.h',
+ command : skel_h_cmd,
+ capture : true)
diff --git a/src/core/bpf/restrict_ifaces/restrict-ifaces-skel.h b/src/core/bpf/restrict_ifaces/restrict-ifaces-skel.h
new file mode 100644
index 0000000..f937490
--- /dev/null
+++ b/src/core/bpf/restrict_ifaces/restrict-ifaces-skel.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* The SPDX header above is actually correct in claiming this was
+ * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that
+ * compatible with GPL we will claim this to be GPL however, which should be
+ * fine given that LGPL-2.1-or-later downgrades to GPL if needed.
+ */
+
+/* libbpf is used via dlopen(), so rename symbols */
+#define bpf_object__open_skeleton sym_bpf_object__open_skeleton
+#define bpf_object__load_skeleton sym_bpf_object__load_skeleton
+#define bpf_object__destroy_skeleton sym_bpf_object__destroy_skeleton
+
+#include "bpf/restrict_ifaces/restrict-ifaces.skel.h"
diff --git a/src/core/bpf/restrict_ifaces/restrict-ifaces.bpf.c b/src/core/bpf/restrict_ifaces/restrict-ifaces.bpf.c
new file mode 100644
index 0000000..32cde5c
--- /dev/null
+++ b/src/core/bpf/restrict_ifaces/restrict-ifaces.bpf.c
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* <linux/bpf.h> must precede <bpf/bpf_helpers.h> due to integer types
+ * in bpf helpers signatures.
+ */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+const volatile __u8 is_allow_list = 0;
+
+/* Map containing the network interfaces indexes.
+ * The interpretation of the map depends on the value of is_allow_list.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, __u32);
+ __type(value, __u8);
+} sd_restrictif SEC(".maps");
+
+#define DROP 0
+#define PASS 1
+
+static __always_inline int restrict_network_interfaces_impl(const struct __sk_buff *sk) {
+ __u32 zero = 0, ifindex;
+ __u8 *lookup_result;
+
+ ifindex = sk->ifindex;
+ lookup_result = bpf_map_lookup_elem(&sd_restrictif, &ifindex);
+ if (is_allow_list) {
+ /* allow-list: let the packet pass if iface in the list */
+ if (lookup_result)
+ return PASS;
+ } else {
+ /* deny-list: let the packet pass if iface *not* in the list */
+ if (!lookup_result)
+ return PASS;
+ }
+
+ return DROP;
+}
+
+SEC("cgroup_skb/egress")
+int sd_restrictif_e(const struct __sk_buff *sk) {
+ return restrict_network_interfaces_impl(sk);
+}
+
+SEC("cgroup_skb/ingress")
+int sd_restrictif_i(const struct __sk_buff *sk) {
+ return restrict_network_interfaces_impl(sk);
+}
+
+static const char _license[] SEC("license") = "LGPL-2.1-or-later";
diff --git a/src/core/bpf/socket_bind/meson.build b/src/core/bpf/socket_bind/meson.build
new file mode 100644
index 0000000..05a2b9d
--- /dev/null
+++ b/src/core/bpf/socket_bind/meson.build
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+if conf.get('BPF_FRAMEWORK') != 1
+ subdir_done()
+endif
+
+socket_bind_bpf_o_unstripped = custom_target(
+ 'socket-bind.bpf.unstripped.o',
+ input : 'socket-bind.bpf.c',
+ output : 'socket-bind.bpf.unstripped.o',
+ command : bpf_o_unstripped_cmd)
+
+socket_bind_bpf_o = custom_target(
+ 'socket-bind.bpf.o',
+ input : socket_bind_bpf_o_unstripped,
+ output : 'socket-bind.bpf.o',
+ command : bpf_o_cmd)
+
+socket_bind_skel_h = custom_target(
+ 'socket-bind.skel.h',
+ input : socket_bind_bpf_o,
+ output : 'socket-bind.skel.h',
+ command : skel_h_cmd,
+ capture : true)
diff --git a/src/core/bpf/socket_bind/socket-bind-api.bpf.h b/src/core/bpf/socket_bind/socket-bind-api.bpf.h
new file mode 100644
index 0000000..277b9bb
--- /dev/null
+++ b/src/core/bpf/socket_bind/socket-bind-api.bpf.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* The SPDX header above is actually correct in claiming this was
+ * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that
+ * compatible with GPL we will claim this to be GPL however, which should be
+ * fine given that LGPL-2.1-or-later downgrades to GPL if needed.
+ */
+
+#include <linux/types.h>
+
+/*
+ * Bind rule is matched with socket fields accessible to cgroup/bind{4,6} hook
+ * through bpf_sock_addr struct.
+ * 'address_family' is expected to be one of AF_UNSPEC, AF_INET or AF_INET6.
+ * Matching by family is bypassed for rules with AF_UNSPEC set, which makes the
+ * rest of a rule applicable for both IPv4 and IPv6 addresses.
+ * If matching by family is either successful or bypassed, a rule and a socket
+ * are matched by ip protocol.
+ * If 'protocol' is 0, matching is bypassed.
+ * 'nr_ports' and 'port_min' fields specify a set of ports to match a user port
+ * with.
+ * If 'nr_ports' is 0, matching by port is bypassed, making that rule applicable
+ * for all possible ports, e.g. [1, 65535] range. Thus a rule with
+ * 'address_family', 'protocol' and 'nr_ports' equal to AF_UNSPEC, 0 and 0
+ * correspondingly forms 'allow any' or 'deny any' cases.
+ * For positive 'nr_ports', a user_port lying in a range from 'port_min' to
+ * 'port_min' + 'nr_ports' exclusively is considered to be a match. 'nr_ports'
+ * equalling to 1 forms a rule for a single port.
+ * Ports are in host order.
+ *
+ * Examples:
+ * AF_UNSPEC, 1, 0, 7777: match IPv4 and IPv6 addresses with 7777 user port;
+ *
+ * AF_INET, 1023, 0, 1: match IPv4 addresses with user port in [1, 1023]
+ * range inclusively;
+ *
+ * AF_INET6, 0, 0, 0: match IPv6 addresses;
+ *
+ * AF_UNSPEC, 0, 0, 0: match IPv4 and IPv6 addresses;
+ *
+ * AF_INET6, IPPROTO_TCP, 0, 0: match IPv6/TCP addresses.
+ */
+
+struct socket_bind_rule {
+ __u32 address_family;
+ __u32 protocol;
+ __u16 nr_ports;
+ __u16 port_min;
+};
+
+#define SOCKET_BIND_MAX_RULES 128
diff --git a/src/core/bpf/socket_bind/socket-bind-skel.h b/src/core/bpf/socket_bind/socket-bind-skel.h
new file mode 100644
index 0000000..e0d1626
--- /dev/null
+++ b/src/core/bpf/socket_bind/socket-bind-skel.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* The SPDX header above is actually correct in claiming this was
+ * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that
+ * compatible with GPL we will claim this to be GPL however, which should be
+ * fine given that LGPL-2.1-or-later downgrades to GPL if needed.
+ */
+
+/* libbpf is used via dlopen(), so rename symbols */
+#define bpf_object__open_skeleton sym_bpf_object__open_skeleton
+#define bpf_object__load_skeleton sym_bpf_object__load_skeleton
+#define bpf_object__destroy_skeleton sym_bpf_object__destroy_skeleton
+
+#include "bpf/socket_bind/socket-bind.skel.h"
diff --git a/src/core/bpf/socket_bind/socket-bind.bpf.c b/src/core/bpf/socket_bind/socket-bind.bpf.c
new file mode 100644
index 0000000..b7972a8
--- /dev/null
+++ b/src/core/bpf/socket_bind/socket-bind.bpf.c
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* The SPDX header above is actually correct in claiming this was
+ * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that
+ * compatible with GPL we will claim this to be GPL however, which should be
+ * fine given that LGPL-2.1-or-later downgrades to GPL if needed.
+ */
+
+#include "socket-bind-api.bpf.h"
+/* <linux/types.h> must precede <bpf/bpf_helpers.h> because
+ * <bpf/bpf_helpers.h> does not depend on the type header by design.
+ */
+#include <linux/types.h>
+#include <bpf/bpf_endian.h>
+#include <bpf/bpf_helpers.h>
+#include <linux/bpf.h>
+#include <netinet/in.h>
+#include <stdbool.h>
+
+/*
+ * max_entries is set from user space with bpf_map__set_max_entries helper.
+ */
+struct socket_bind_map_t {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, __u32);
+ __type(value, struct socket_bind_rule);
+};
+
+enum socket_bind_action {
+ SOCKET_BIND_DENY = 0,
+ SOCKET_BIND_ALLOW = 1,
+};
+
+struct socket_bind_map_t sd_bind_allow SEC(".maps");
+struct socket_bind_map_t sd_bind_deny SEC(".maps");
+
+static __always_inline bool match_af(
+ __u8 address_family, const struct socket_bind_rule *r) {
+ return r->address_family == AF_UNSPEC || address_family == r->address_family;
+}
+
+static __always_inline bool match_protocol(
+ __u32 protocol, const struct socket_bind_rule *r) {
+ return r->protocol == 0 || r->protocol == protocol;
+}
+
+static __always_inline bool match_user_port(
+ __u16 port, const struct socket_bind_rule *r) {
+ return r->nr_ports == 0 ||
+ (port >= r->port_min && port < r->port_min + (__u32) r->nr_ports);
+}
+
+static __always_inline bool match(
+ __u8 address_family,
+ __u32 protocol,
+ __u16 port,
+ const struct socket_bind_rule *r) {
+ return match_af(address_family, r) &&
+ match_protocol(protocol, r) &&
+ match_user_port(port, r);
+}
+
+static __always_inline bool match_rules(
+ struct bpf_sock_addr *ctx,
+ struct socket_bind_map_t *rules) {
+ volatile __u32 user_port = ctx->user_port;
+ __u16 port = (__u16)bpf_ntohs(user_port);
+
+ for (__u32 i = 0; i < SOCKET_BIND_MAX_RULES; ++i) {
+ const __u32 key = i;
+ const struct socket_bind_rule *rule = bpf_map_lookup_elem(rules, &key);
+
+ /* Lookup returns NULL if iterator is advanced past the last
+ * element put in the map. */
+ if (!rule)
+ break;
+
+ if (match(ctx->user_family, ctx->protocol, port, rule))
+ return true;
+ }
+
+ return false;
+}
+
+static __always_inline int bind_socket(struct bpf_sock_addr *ctx) {
+ if (match_rules(ctx, &sd_bind_allow))
+ return SOCKET_BIND_ALLOW;
+
+ if (match_rules(ctx, &sd_bind_deny))
+ return SOCKET_BIND_DENY;
+
+ return SOCKET_BIND_ALLOW;
+}
+
+SEC("cgroup/bind4")
+int sd_bind4(struct bpf_sock_addr *ctx) {
+ if (ctx->user_family != AF_INET || ctx->family != AF_INET)
+ return SOCKET_BIND_ALLOW;
+
+ return bind_socket(ctx);
+}
+
+SEC("cgroup/bind6")
+int sd_bind6(struct bpf_sock_addr *ctx) {
+ if (ctx->user_family != AF_INET6 || ctx->family != AF_INET6)
+ return SOCKET_BIND_ALLOW;
+
+ return bind_socket(ctx);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/src/core/cgroup.c b/src/core/cgroup.c
new file mode 100644
index 0000000..61ac4df
--- /dev/null
+++ b/src/core/cgroup.c
@@ -0,0 +1,4665 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fcntl.h>
+
+#include "sd-messages.h"
+
+#include "af-list.h"
+#include "alloc-util.h"
+#include "blockdev-util.h"
+#include "bpf-devices.h"
+#include "bpf-firewall.h"
+#include "bpf-foreign.h"
+#include "bpf-socket-bind.h"
+#include "btrfs-util.h"
+#include "bus-error.h"
+#include "bus-locator.h"
+#include "cgroup-setup.h"
+#include "cgroup-util.h"
+#include "cgroup.h"
+#include "devnum-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "firewall-util.h"
+#include "in-addr-prefix-util.h"
+#include "inotify-util.h"
+#include "io-util.h"
+#include "ip-protocol-list.h"
+#include "limits-util.h"
+#include "nulstr-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "percent-util.h"
+#include "process-util.h"
+#include "procfs-util.h"
+#include "restrict-ifaces.h"
+#include "special.h"
+#include "stdio-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "virt.h"
+
+#if BPF_FRAMEWORK
+#include "bpf-dlopen.h"
+#include "bpf-link.h"
+#include "bpf/restrict_fs/restrict-fs-skel.h"
+#endif
+
+#define CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
+
+/* Returns the log level to use when cgroup attribute writes fail. When an attribute is missing or we have access
+ * problems we downgrade to LOG_DEBUG. This is supposed to be nice to container managers and kernels which want to mask
+ * out specific attributes from us. */
+#define LOG_LEVEL_CGROUP_WRITE(r) (IN_SET(abs(r), ENOENT, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING)
+
+uint64_t cgroup_tasks_max_resolve(const CGroupTasksMax *tasks_max) {
+ if (tasks_max->scale == 0)
+ return tasks_max->value;
+
+ return system_tasks_max_scale(tasks_max->value, tasks_max->scale);
+}
+
+bool manager_owns_host_root_cgroup(Manager *m) {
+ assert(m);
+
+ /* Returns true if we are managing the root cgroup. Note that it isn't sufficient to just check whether the
+ * group root path equals "/" since that will also be the case if CLONE_NEWCGROUP is in the mix. Since there
+ * appears to be no nice way to detect whether we are in a CLONE_NEWCGROUP namespace we instead just check if
+ * we run in any kind of container virtualization. */
+
+ if (MANAGER_IS_USER(m))
+ return false;
+
+ if (detect_container() > 0)
+ return false;
+
+ return empty_or_root(m->cgroup_root);
+}
+
+bool unit_has_startup_cgroup_constraints(Unit *u) {
+ assert(u);
+
+ /* Returns true if this unit has any directives which apply during
+ * startup/shutdown phases. */
+
+ CGroupContext *c;
+
+ c = unit_get_cgroup_context(u);
+ if (!c)
+ return false;
+
+ return c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID ||
+ c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
+ c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
+ c->startup_cpuset_cpus.set ||
+ c->startup_cpuset_mems.set ||
+ c->startup_memory_high_set ||
+ c->startup_memory_max_set ||
+ c->startup_memory_swap_max_set||
+ c->startup_memory_zswap_max_set ||
+ c->startup_memory_low_set;
+}
+
+bool unit_has_host_root_cgroup(Unit *u) {
+ assert(u);
+
+ /* Returns whether this unit manages the root cgroup. This will return true if this unit is the root slice and
+ * the manager manages the root cgroup. */
+
+ if (!manager_owns_host_root_cgroup(u->manager))
+ return false;
+
+ return unit_has_name(u, SPECIAL_ROOT_SLICE);
+}
+
+static int set_attribute_and_warn(Unit *u, const char *controller, const char *attribute, const char *value) {
+ int r;
+
+ r = cg_set_attribute(controller, u->cgroup_path, attribute, value);
+ if (r < 0)
+ log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to set '%s' attribute on '%s' to '%.*s': %m",
+ strna(attribute), empty_to_root(u->cgroup_path), (int) strcspn(value, NEWLINE), value);
+
+ return r;
+}
+
+static void cgroup_compat_warn(void) {
+ static bool cgroup_compat_warned = false;
+
+ if (cgroup_compat_warned)
+ return;
+
+ log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. "
+ "See cgroup-compat debug messages for details.");
+
+ cgroup_compat_warned = true;
+}
+
+#define log_cgroup_compat(unit, fmt, ...) do { \
+ cgroup_compat_warn(); \
+ log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__); \
+ } while (false)
+
+void cgroup_context_init(CGroupContext *c) {
+ assert(c);
+
+ /* Initialize everything to the kernel defaults. When initializing a bool member to 'true', make
+ * sure to serialize in execute-serialize.c using serialize_bool() instead of
+ * serialize_bool_elide(), as sd-executor will initialize here to 'true', but serialize_bool_elide()
+ * skips serialization if the value is 'false' (as that's the common default), so if the value at
+ * runtime is zero it would be lost after deserialization. Same when initializing uint64_t and other
+ * values, update/add a conditional serialization check. This is to minimize the amount of
+ * serialized data that is sent to the sd-executor, so that there is less work to do on the default
+ * cases. */
+
+ *c = (CGroupContext) {
+ .cpu_weight = CGROUP_WEIGHT_INVALID,
+ .startup_cpu_weight = CGROUP_WEIGHT_INVALID,
+ .cpu_quota_per_sec_usec = USEC_INFINITY,
+ .cpu_quota_period_usec = USEC_INFINITY,
+
+ .cpu_shares = CGROUP_CPU_SHARES_INVALID,
+ .startup_cpu_shares = CGROUP_CPU_SHARES_INVALID,
+
+ .memory_high = CGROUP_LIMIT_MAX,
+ .startup_memory_high = CGROUP_LIMIT_MAX,
+ .memory_max = CGROUP_LIMIT_MAX,
+ .startup_memory_max = CGROUP_LIMIT_MAX,
+ .memory_swap_max = CGROUP_LIMIT_MAX,
+ .startup_memory_swap_max = CGROUP_LIMIT_MAX,
+ .memory_zswap_max = CGROUP_LIMIT_MAX,
+ .startup_memory_zswap_max = CGROUP_LIMIT_MAX,
+
+ .memory_limit = CGROUP_LIMIT_MAX,
+
+ .io_weight = CGROUP_WEIGHT_INVALID,
+ .startup_io_weight = CGROUP_WEIGHT_INVALID,
+
+ .blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID,
+ .startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID,
+
+ .tasks_max = CGROUP_TASKS_MAX_UNSET,
+
+ .moom_swap = MANAGED_OOM_AUTO,
+ .moom_mem_pressure = MANAGED_OOM_AUTO,
+ .moom_preference = MANAGED_OOM_PREFERENCE_NONE,
+
+ .memory_pressure_watch = _CGROUP_PRESSURE_WATCH_INVALID,
+ .memory_pressure_threshold_usec = USEC_INFINITY,
+ };
+}
+
+void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
+ assert(c);
+ assert(a);
+
+ LIST_REMOVE(device_allow, c->device_allow, a);
+ free(a->path);
+ free(a);
+}
+
+void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
+ assert(c);
+ assert(w);
+
+ LIST_REMOVE(device_weights, c->io_device_weights, w);
+ free(w->path);
+ free(w);
+}
+
+void cgroup_context_free_io_device_latency(CGroupContext *c, CGroupIODeviceLatency *l) {
+ assert(c);
+ assert(l);
+
+ LIST_REMOVE(device_latencies, c->io_device_latencies, l);
+ free(l->path);
+ free(l);
+}
+
+void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
+ assert(c);
+ assert(l);
+
+ LIST_REMOVE(device_limits, c->io_device_limits, l);
+ free(l->path);
+ free(l);
+}
+
+void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
+ assert(c);
+ assert(w);
+
+ LIST_REMOVE(device_weights, c->blockio_device_weights, w);
+ free(w->path);
+ free(w);
+}
+
+void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
+ assert(c);
+ assert(b);
+
+ LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
+ free(b->path);
+ free(b);
+}
+
+void cgroup_context_remove_bpf_foreign_program(CGroupContext *c, CGroupBPFForeignProgram *p) {
+ assert(c);
+ assert(p);
+
+ LIST_REMOVE(programs, c->bpf_foreign_programs, p);
+ free(p->bpffs_path);
+ free(p);
+}
+
+void cgroup_context_remove_socket_bind(CGroupSocketBindItem **head) {
+ assert(head);
+
+ LIST_CLEAR(socket_bind_items, *head, free);
+}
+
+void cgroup_context_done(CGroupContext *c) {
+ assert(c);
+
+ while (c->io_device_weights)
+ cgroup_context_free_io_device_weight(c, c->io_device_weights);
+
+ while (c->io_device_latencies)
+ cgroup_context_free_io_device_latency(c, c->io_device_latencies);
+
+ while (c->io_device_limits)
+ cgroup_context_free_io_device_limit(c, c->io_device_limits);
+
+ while (c->blockio_device_weights)
+ cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
+
+ while (c->blockio_device_bandwidths)
+ cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
+
+ while (c->device_allow)
+ cgroup_context_free_device_allow(c, c->device_allow);
+
+ cgroup_context_remove_socket_bind(&c->socket_bind_allow);
+ cgroup_context_remove_socket_bind(&c->socket_bind_deny);
+
+ c->ip_address_allow = set_free(c->ip_address_allow);
+ c->ip_address_deny = set_free(c->ip_address_deny);
+
+ c->ip_filters_ingress = strv_free(c->ip_filters_ingress);
+ c->ip_filters_egress = strv_free(c->ip_filters_egress);
+
+ while (c->bpf_foreign_programs)
+ cgroup_context_remove_bpf_foreign_program(c, c->bpf_foreign_programs);
+
+ c->restrict_network_interfaces = set_free_free(c->restrict_network_interfaces);
+
+ cpu_set_reset(&c->cpuset_cpus);
+ cpu_set_reset(&c->startup_cpuset_cpus);
+ cpu_set_reset(&c->cpuset_mems);
+ cpu_set_reset(&c->startup_cpuset_mems);
+
+ c->delegate_subgroup = mfree(c->delegate_subgroup);
+
+ nft_set_context_clear(&c->nft_set_context);
+}
+
+static int unit_get_kernel_memory_limit(Unit *u, const char *file, uint64_t *ret) {
+ assert(u);
+
+ if (!u->cgroup_realized)
+ return -EOWNERDEAD;
+
+ return cg_get_attribute_as_uint64("memory", u->cgroup_path, file, ret);
+}
+
+static int unit_compare_memory_limit(Unit *u, const char *property_name, uint64_t *ret_unit_value, uint64_t *ret_kernel_value) {
+ CGroupContext *c;
+ CGroupMask m;
+ const char *file;
+ uint64_t unit_value;
+ int r;
+
+ /* Compare kernel memcg configuration against our internal systemd state. Unsupported (and will
+ * return -ENODATA) on cgroup v1.
+ *
+ * Returns:
+ *
+ * <0: On error.
+ * 0: If the kernel memory setting doesn't match our configuration.
+ * >0: If the kernel memory setting matches our configuration.
+ *
+ * The following values are only guaranteed to be populated on return >=0:
+ *
+ * - ret_unit_value will contain our internal expected value for the unit, page-aligned.
+ * - ret_kernel_value will contain the actual value presented by the kernel. */
+
+ assert(u);
+
+ r = cg_all_unified();
+ if (r < 0)
+ return log_debug_errno(r, "Failed to determine cgroup hierarchy version: %m");
+
+ /* Unsupported on v1.
+ *
+ * We don't return ENOENT, since that could actually mask a genuine problem where somebody else has
+ * silently masked the controller. */
+ if (r == 0)
+ return -ENODATA;
+
+ /* The root slice doesn't have any controller files, so we can't compare anything. */
+ if (unit_has_name(u, SPECIAL_ROOT_SLICE))
+ return -ENODATA;
+
+ /* It's possible to have MemoryFoo set without systemd wanting to have the memory controller enabled,
+ * for example, in the case of DisableControllers= or cgroup_disable on the kernel command line. To
+ * avoid specious errors in these scenarios, check that we even expect the memory controller to be
+ * enabled at all. */
+ m = unit_get_target_mask(u);
+ if (!FLAGS_SET(m, CGROUP_MASK_MEMORY))
+ return -ENODATA;
+
+ assert_se(c = unit_get_cgroup_context(u));
+
+ bool startup = u->manager && IN_SET(manager_state(u->manager), MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING);
+
+ if (streq(property_name, "MemoryLow")) {
+ unit_value = unit_get_ancestor_memory_low(u);
+ file = "memory.low";
+ } else if (startup && streq(property_name, "StartupMemoryLow")) {
+ unit_value = unit_get_ancestor_startup_memory_low(u);
+ file = "memory.low";
+ } else if (streq(property_name, "MemoryMin")) {
+ unit_value = unit_get_ancestor_memory_min(u);
+ file = "memory.min";
+ } else if (streq(property_name, "MemoryHigh")) {
+ unit_value = c->memory_high;
+ file = "memory.high";
+ } else if (startup && streq(property_name, "StartupMemoryHigh")) {
+ unit_value = c->startup_memory_high;
+ file = "memory.high";
+ } else if (streq(property_name, "MemoryMax")) {
+ unit_value = c->memory_max;
+ file = "memory.max";
+ } else if (startup && streq(property_name, "StartupMemoryMax")) {
+ unit_value = c->startup_memory_max;
+ file = "memory.max";
+ } else if (streq(property_name, "MemorySwapMax")) {
+ unit_value = c->memory_swap_max;
+ file = "memory.swap.max";
+ } else if (startup && streq(property_name, "StartupMemorySwapMax")) {
+ unit_value = c->startup_memory_swap_max;
+ file = "memory.swap.max";
+ } else if (streq(property_name, "MemoryZSwapMax")) {
+ unit_value = c->memory_zswap_max;
+ file = "memory.zswap.max";
+ } else if (startup && streq(property_name, "StartupMemoryZSwapMax")) {
+ unit_value = c->startup_memory_zswap_max;
+ file = "memory.zswap.max";
+ } else
+ return -EINVAL;
+
+ r = unit_get_kernel_memory_limit(u, file, ret_kernel_value);
+ if (r < 0)
+ return log_unit_debug_errno(u, r, "Failed to parse %s: %m", file);
+
+ /* It's intended (soon) in a future kernel to not expose cgroup memory limits rounded to page
+ * boundaries, but instead separate the user-exposed limit, which is whatever userspace told us, from
+ * our internal page-counting. To support those future kernels, just check the value itself first
+ * without any page-alignment. */
+ if (*ret_kernel_value == unit_value) {
+ *ret_unit_value = unit_value;
+ return 1;
+ }
+
+ /* The current kernel behaviour, by comparison, is that even if you write a particular number of
+ * bytes into a cgroup memory file, it always returns that number page-aligned down (since the kernel
+ * internally stores cgroup limits in pages). As such, so long as it aligns properly, everything is
+ * cricket. */
+ if (unit_value != CGROUP_LIMIT_MAX)
+ unit_value = PAGE_ALIGN_DOWN(unit_value);
+
+ *ret_unit_value = unit_value;
+
+ return *ret_kernel_value == *ret_unit_value;
+}
+
+#define FORMAT_CGROUP_DIFF_MAX 128
+
+static char *format_cgroup_memory_limit_comparison(char *buf, size_t l, Unit *u, const char *property_name) {
+ uint64_t kval, sval;
+ int r;
+
+ assert(u);
+ assert(buf);
+ assert(l > 0);
+
+ r = unit_compare_memory_limit(u, property_name, &sval, &kval);
+
+ /* memory.swap.max is special in that it relies on CONFIG_MEMCG_SWAP (and the default swapaccount=1).
+ * In the absence of reliably being able to detect whether memcg swap support is available or not,
+ * only complain if the error is not ENOENT. This is similarly the case for memory.zswap.max relying
+ * on CONFIG_ZSWAP. */
+ if (r > 0 || IN_SET(r, -ENODATA, -EOWNERDEAD) ||
+ (r == -ENOENT && STR_IN_SET(property_name,
+ "MemorySwapMax",
+ "StartupMemorySwapMax",
+ "MemoryZSwapMax",
+ "StartupMemoryZSwapMax")))
+ buf[0] = 0;
+ else if (r < 0) {
+ errno = -r;
+ (void) snprintf(buf, l, " (error getting kernel value: %m)");
+ } else
+ (void) snprintf(buf, l, " (different value in kernel: %" PRIu64 ")", kval);
+
+ return buf;
+}
+
+const char *cgroup_device_permissions_to_string(CGroupDevicePermissions p) {
+ static const char *table[_CGROUP_DEVICE_PERMISSIONS_MAX] = {
+ /* Let's simply define a table with every possible combination. As long as those are just 8 we
+ * can get away with it. If this ever grows to more we need to revisit this logic though. */
+ [0] = "",
+ [CGROUP_DEVICE_READ] = "r",
+ [CGROUP_DEVICE_WRITE] = "w",
+ [CGROUP_DEVICE_MKNOD] = "m",
+ [CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE] = "rw",
+ [CGROUP_DEVICE_READ|CGROUP_DEVICE_MKNOD] = "rm",
+ [CGROUP_DEVICE_WRITE|CGROUP_DEVICE_MKNOD] = "wm",
+ [CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE|CGROUP_DEVICE_MKNOD] = "rwm",
+ };
+
+ if (p < 0 || p >= _CGROUP_DEVICE_PERMISSIONS_MAX)
+ return NULL;
+
+ return table[p];
+}
+
+CGroupDevicePermissions cgroup_device_permissions_from_string(const char *s) {
+ CGroupDevicePermissions p = 0;
+
+ if (!s)
+ return _CGROUP_DEVICE_PERMISSIONS_INVALID;
+
+ for (const char *c = s; *c; c++) {
+ if (*c == 'r')
+ p |= CGROUP_DEVICE_READ;
+ else if (*c == 'w')
+ p |= CGROUP_DEVICE_WRITE;
+ else if (*c == 'm')
+ p |= CGROUP_DEVICE_MKNOD;
+ else
+ return _CGROUP_DEVICE_PERMISSIONS_INVALID;
+ }
+
+ return p;
+}
+
+void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
+ _cleanup_free_ char *disable_controllers_str = NULL, *delegate_controllers_str = NULL, *cpuset_cpus = NULL, *cpuset_mems = NULL, *startup_cpuset_cpus = NULL, *startup_cpuset_mems = NULL;
+ CGroupContext *c;
+ struct in_addr_prefix *iaai;
+
+ char cda[FORMAT_CGROUP_DIFF_MAX];
+ char cdb[FORMAT_CGROUP_DIFF_MAX];
+ char cdc[FORMAT_CGROUP_DIFF_MAX];
+ char cdd[FORMAT_CGROUP_DIFF_MAX];
+ char cde[FORMAT_CGROUP_DIFF_MAX];
+ char cdf[FORMAT_CGROUP_DIFF_MAX];
+ char cdg[FORMAT_CGROUP_DIFF_MAX];
+ char cdh[FORMAT_CGROUP_DIFF_MAX];
+ char cdi[FORMAT_CGROUP_DIFF_MAX];
+ char cdj[FORMAT_CGROUP_DIFF_MAX];
+ char cdk[FORMAT_CGROUP_DIFF_MAX];
+
+ assert(u);
+ assert(f);
+
+ assert_se(c = unit_get_cgroup_context(u));
+
+ prefix = strempty(prefix);
+
+ (void) cg_mask_to_string(c->disable_controllers, &disable_controllers_str);
+ (void) cg_mask_to_string(c->delegate_controllers, &delegate_controllers_str);
+
+ /* "Delegate=" means "yes, but no controllers". Show this as "(none)". */
+ const char *delegate_str = delegate_controllers_str ?: c->delegate ? "(none)" : "no";
+
+ cpuset_cpus = cpu_set_to_range_string(&c->cpuset_cpus);
+ startup_cpuset_cpus = cpu_set_to_range_string(&c->startup_cpuset_cpus);
+ cpuset_mems = cpu_set_to_range_string(&c->cpuset_mems);
+ startup_cpuset_mems = cpu_set_to_range_string(&c->startup_cpuset_mems);
+
+ fprintf(f,
+ "%sCPUAccounting: %s\n"
+ "%sIOAccounting: %s\n"
+ "%sBlockIOAccounting: %s\n"
+ "%sMemoryAccounting: %s\n"
+ "%sTasksAccounting: %s\n"
+ "%sIPAccounting: %s\n"
+ "%sCPUWeight: %" PRIu64 "\n"
+ "%sStartupCPUWeight: %" PRIu64 "\n"
+ "%sCPUShares: %" PRIu64 "\n"
+ "%sStartupCPUShares: %" PRIu64 "\n"
+ "%sCPUQuotaPerSecSec: %s\n"
+ "%sCPUQuotaPeriodSec: %s\n"
+ "%sAllowedCPUs: %s\n"
+ "%sStartupAllowedCPUs: %s\n"
+ "%sAllowedMemoryNodes: %s\n"
+ "%sStartupAllowedMemoryNodes: %s\n"
+ "%sIOWeight: %" PRIu64 "\n"
+ "%sStartupIOWeight: %" PRIu64 "\n"
+ "%sBlockIOWeight: %" PRIu64 "\n"
+ "%sStartupBlockIOWeight: %" PRIu64 "\n"
+ "%sDefaultMemoryMin: %" PRIu64 "\n"
+ "%sDefaultMemoryLow: %" PRIu64 "\n"
+ "%sMemoryMin: %" PRIu64 "%s\n"
+ "%sMemoryLow: %" PRIu64 "%s\n"
+ "%sStartupMemoryLow: %" PRIu64 "%s\n"
+ "%sMemoryHigh: %" PRIu64 "%s\n"
+ "%sStartupMemoryHigh: %" PRIu64 "%s\n"
+ "%sMemoryMax: %" PRIu64 "%s\n"
+ "%sStartupMemoryMax: %" PRIu64 "%s\n"
+ "%sMemorySwapMax: %" PRIu64 "%s\n"
+ "%sStartupMemorySwapMax: %" PRIu64 "%s\n"
+ "%sMemoryZSwapMax: %" PRIu64 "%s\n"
+ "%sStartupMemoryZSwapMax: %" PRIu64 "%s\n"
+ "%sMemoryLimit: %" PRIu64 "\n"
+ "%sTasksMax: %" PRIu64 "\n"
+ "%sDevicePolicy: %s\n"
+ "%sDisableControllers: %s\n"
+ "%sDelegate: %s\n"
+ "%sManagedOOMSwap: %s\n"
+ "%sManagedOOMMemoryPressure: %s\n"
+ "%sManagedOOMMemoryPressureLimit: " PERMYRIAD_AS_PERCENT_FORMAT_STR "\n"
+ "%sManagedOOMPreference: %s\n"
+ "%sMemoryPressureWatch: %s\n"
+ "%sCoredumpReceive: %s\n",
+ prefix, yes_no(c->cpu_accounting),
+ prefix, yes_no(c->io_accounting),
+ prefix, yes_no(c->blockio_accounting),
+ prefix, yes_no(c->memory_accounting),
+ prefix, yes_no(c->tasks_accounting),
+ prefix, yes_no(c->ip_accounting),
+ prefix, c->cpu_weight,
+ prefix, c->startup_cpu_weight,
+ prefix, c->cpu_shares,
+ prefix, c->startup_cpu_shares,
+ prefix, FORMAT_TIMESPAN(c->cpu_quota_per_sec_usec, 1),
+ prefix, FORMAT_TIMESPAN(c->cpu_quota_period_usec, 1),
+ prefix, strempty(cpuset_cpus),
+ prefix, strempty(startup_cpuset_cpus),
+ prefix, strempty(cpuset_mems),
+ prefix, strempty(startup_cpuset_mems),
+ prefix, c->io_weight,
+ prefix, c->startup_io_weight,
+ prefix, c->blockio_weight,
+ prefix, c->startup_blockio_weight,
+ prefix, c->default_memory_min,
+ prefix, c->default_memory_low,
+ prefix, c->memory_min, format_cgroup_memory_limit_comparison(cda, sizeof(cda), u, "MemoryMin"),
+ prefix, c->memory_low, format_cgroup_memory_limit_comparison(cdb, sizeof(cdb), u, "MemoryLow"),
+ prefix, c->startup_memory_low, format_cgroup_memory_limit_comparison(cdc, sizeof(cdc), u, "StartupMemoryLow"),
+ prefix, c->memory_high, format_cgroup_memory_limit_comparison(cdd, sizeof(cdd), u, "MemoryHigh"),
+ prefix, c->startup_memory_high, format_cgroup_memory_limit_comparison(cde, sizeof(cde), u, "StartupMemoryHigh"),
+ prefix, c->memory_max, format_cgroup_memory_limit_comparison(cdf, sizeof(cdf), u, "MemoryMax"),
+ prefix, c->startup_memory_max, format_cgroup_memory_limit_comparison(cdg, sizeof(cdg), u, "StartupMemoryMax"),
+ prefix, c->memory_swap_max, format_cgroup_memory_limit_comparison(cdh, sizeof(cdh), u, "MemorySwapMax"),
+ prefix, c->startup_memory_swap_max, format_cgroup_memory_limit_comparison(cdi, sizeof(cdi), u, "StartupMemorySwapMax"),
+ prefix, c->memory_zswap_max, format_cgroup_memory_limit_comparison(cdj, sizeof(cdj), u, "MemoryZSwapMax"),
+ prefix, c->startup_memory_zswap_max, format_cgroup_memory_limit_comparison(cdk, sizeof(cdk), u, "StartupMemoryZSwapMax"),
+ prefix, c->memory_limit,
+ prefix, cgroup_tasks_max_resolve(&c->tasks_max),
+ prefix, cgroup_device_policy_to_string(c->device_policy),
+ prefix, strempty(disable_controllers_str),
+ prefix, delegate_str,
+ prefix, managed_oom_mode_to_string(c->moom_swap),
+ prefix, managed_oom_mode_to_string(c->moom_mem_pressure),
+ prefix, PERMYRIAD_AS_PERCENT_FORMAT_VAL(UINT32_SCALE_TO_PERMYRIAD(c->moom_mem_pressure_limit)),
+ prefix, managed_oom_preference_to_string(c->moom_preference),
+ prefix, cgroup_pressure_watch_to_string(c->memory_pressure_watch),
+ prefix, yes_no(c->coredump_receive));
+
+ if (c->delegate_subgroup)
+ fprintf(f, "%sDelegateSubgroup: %s\n",
+ prefix, c->delegate_subgroup);
+
+ if (c->memory_pressure_threshold_usec != USEC_INFINITY)
+ fprintf(f, "%sMemoryPressureThresholdSec: %s\n",
+ prefix, FORMAT_TIMESPAN(c->memory_pressure_threshold_usec, 1));
+
+ LIST_FOREACH(device_allow, a, c->device_allow)
+ /* strna() below should be redundant, for avoiding -Werror=format-overflow= error. See #30223. */
+ fprintf(f,
+ "%sDeviceAllow: %s %s\n",
+ prefix,
+ a->path,
+ strna(cgroup_device_permissions_to_string(a->permissions)));
+
+ LIST_FOREACH(device_weights, iw, c->io_device_weights)
+ fprintf(f,
+ "%sIODeviceWeight: %s %" PRIu64 "\n",
+ prefix,
+ iw->path,
+ iw->weight);
+
+ LIST_FOREACH(device_latencies, l, c->io_device_latencies)
+ fprintf(f,
+ "%sIODeviceLatencyTargetSec: %s %s\n",
+ prefix,
+ l->path,
+ FORMAT_TIMESPAN(l->target_usec, 1));
+
+ LIST_FOREACH(device_limits, il, c->io_device_limits)
+ for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
+ if (il->limits[type] != cgroup_io_limit_defaults[type])
+ fprintf(f,
+ "%s%s: %s %s\n",
+ prefix,
+ cgroup_io_limit_type_to_string(type),
+ il->path,
+ FORMAT_BYTES(il->limits[type]));
+
+ LIST_FOREACH(device_weights, w, c->blockio_device_weights)
+ fprintf(f,
+ "%sBlockIODeviceWeight: %s %" PRIu64,
+ prefix,
+ w->path,
+ w->weight);
+
+ LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
+ if (b->rbps != CGROUP_LIMIT_MAX)
+ fprintf(f,
+ "%sBlockIOReadBandwidth: %s %s\n",
+ prefix,
+ b->path,
+ FORMAT_BYTES(b->rbps));
+ if (b->wbps != CGROUP_LIMIT_MAX)
+ fprintf(f,
+ "%sBlockIOWriteBandwidth: %s %s\n",
+ prefix,
+ b->path,
+ FORMAT_BYTES(b->wbps));
+ }
+
+ SET_FOREACH(iaai, c->ip_address_allow)
+ fprintf(f, "%sIPAddressAllow: %s\n", prefix,
+ IN_ADDR_PREFIX_TO_STRING(iaai->family, &iaai->address, iaai->prefixlen));
+ SET_FOREACH(iaai, c->ip_address_deny)
+ fprintf(f, "%sIPAddressDeny: %s\n", prefix,
+ IN_ADDR_PREFIX_TO_STRING(iaai->family, &iaai->address, iaai->prefixlen));
+
+ STRV_FOREACH(path, c->ip_filters_ingress)
+ fprintf(f, "%sIPIngressFilterPath: %s\n", prefix, *path);
+ STRV_FOREACH(path, c->ip_filters_egress)
+ fprintf(f, "%sIPEgressFilterPath: %s\n", prefix, *path);
+
+ LIST_FOREACH(programs, p, c->bpf_foreign_programs)
+ fprintf(f, "%sBPFProgram: %s:%s",
+ prefix, bpf_cgroup_attach_type_to_string(p->attach_type), p->bpffs_path);
+
+ if (c->socket_bind_allow) {
+ fprintf(f, "%sSocketBindAllow: ", prefix);
+ cgroup_context_dump_socket_bind_items(c->socket_bind_allow, f);
+ fputc('\n', f);
+ }
+
+ if (c->socket_bind_deny) {
+ fprintf(f, "%sSocketBindDeny: ", prefix);
+ cgroup_context_dump_socket_bind_items(c->socket_bind_deny, f);
+ fputc('\n', f);
+ }
+
+ if (c->restrict_network_interfaces) {
+ char *iface;
+ SET_FOREACH(iface, c->restrict_network_interfaces)
+ fprintf(f, "%sRestrictNetworkInterfaces: %s\n", prefix, iface);
+ }
+
+ FOREACH_ARRAY(nft_set, c->nft_set_context.sets, c->nft_set_context.n_sets)
+ fprintf(f, "%sNFTSet: %s:%s:%s:%s\n", prefix, nft_set_source_to_string(nft_set->source),
+ nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set);
+}
+
+/* Render one SocketBind{Allow,Deny}= item as "[family:][protocol:]<any|port|min-max>". */
+void cgroup_context_dump_socket_bind_item(const CGroupSocketBindItem *item, FILE *f) {
+        const char *family;
+
+        /* Address family prefix is omitted entirely when unrestricted. */
+        family = strempty(af_to_ipv4_ipv6(item->address_family));
+        if (!isempty(family))
+                fprintf(f, "%s:", family);
+
+        /* Protocol prefix only when a specific IP protocol is configured. */
+        if (item->ip_protocol != 0)
+                fprintf(f, "%s:", ip_protocol_to_tcp_udp(item->ip_protocol));
+
+        if (item->nr_ports == 0)
+                fputs("any", f);
+        else if (item->nr_ports == 1)
+                fprintf(f, "%" PRIu16, item->port_min);
+        else
+                /* Inclusive port range */
+                fprintf(f, "%" PRIu16 "-%" PRIu16, item->port_min,
+                        (uint16_t) (item->port_min + item->nr_ports - 1));
+}
+
+/* Render a whole list of socket-bind items, space-separated, no trailing separator. */
+void cgroup_context_dump_socket_bind_items(const CGroupSocketBindItem *items, FILE *f) {
+        const char *sep = "";
+
+        LIST_FOREACH(socket_bind_items, bi, items) {
+                fputs(sep, f);
+                sep = " ";
+
+                cgroup_context_dump_socket_bind_item(bi, f);
+        }
+}
+
+/* Append a DeviceAllow= entry to the context. A zero permission mask is
+ * interpreted as "all permissions". Returns 0 or -ENOMEM. */
+int cgroup_context_add_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p) {
+        _cleanup_free_ char *d = NULL;
+        _cleanup_free_ CGroupDeviceAllow *a = NULL;
+
+        assert(c);
+        assert(dev);
+        assert(p >= 0 && p < _CGROUP_DEVICE_PERMISSIONS_MAX);
+
+        if (p == 0)
+                p = _CGROUP_DEVICE_PERMISSIONS_ALL;
+
+        d = strdup(dev);
+        if (!d)
+                return -ENOMEM;
+
+        a = new(CGroupDeviceAllow, 1);
+        if (!a)
+                return -ENOMEM;
+
+        *a = (CGroupDeviceAllow) {
+                .path = TAKE_PTR(d),
+                .permissions = p,
+        };
+
+        /* The list takes ownership, hence disarm the cleanup handler. */
+        LIST_PREPEND(device_allow, c->device_allow, a);
+        TAKE_PTR(a);
+
+        return 0;
+}
+
+/* Like cgroup_context_add_device_allow(), but replaces the permissions of an
+ * already-listed device instead of adding a duplicate entry. */
+int cgroup_context_add_or_update_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p) {
+        assert(c);
+        assert(dev);
+        assert(p >= 0 && p < _CGROUP_DEVICE_PERMISSIONS_MAX);
+
+        /* Zero means "all permissions", mirroring the add path. */
+        if (p == 0)
+                p = _CGROUP_DEVICE_PERMISSIONS_ALL;
+
+        LIST_FOREACH(device_allow, existing, c->device_allow)
+                if (path_equal(existing->path, dev)) {
+                        existing->permissions = p;
+                        return 0;
+                }
+
+        return cgroup_context_add_device_allow(c, dev, p);
+}
+
+/* Register a foreign BPF program (BPFProgram=) by its bpffs pin path and attach
+ * type. Returns 0, or a negative errno on invalid path/OOM (already logged). */
+int cgroup_context_add_bpf_foreign_program(CGroupContext *c, uint32_t attach_type, const char *bpffs_path) {
+        CGroupBPFForeignProgram *p;
+        _cleanup_free_ char *d = NULL;
+
+        assert(c);
+        assert(bpffs_path);
+
+        /* Require an absolute, normalized pin path. */
+        if (!path_is_normalized(bpffs_path) || !path_is_absolute(bpffs_path))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Path is not normalized: %m");
+
+        d = strdup(bpffs_path);
+        if (!d)
+                return log_oom();
+
+        p = new(CGroupBPFForeignProgram, 1);
+        if (!p)
+                return log_oom();
+
+        *p = (CGroupBPFForeignProgram) {
+                .attach_type = attach_type,
+                .bpffs_path = TAKE_PTR(d),
+        };
+
+        /* List takes ownership of the entry. */
+        LIST_PREPEND(programs, c->bpf_foreign_programs, TAKE_PTR(p));
+
+        return 0;
+}
+
+/* Defines unit_get_ancestor_<entry>(): resolves the effective value of a memory
+ * protection setting by walking up the slice hierarchy, falling back to the
+ * kernel default (CGROUP_LIMIT_MIN) when nothing is configured anywhere. */
+#define UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(entry)                       \
+        uint64_t unit_get_ancestor_##entry(Unit *u) {                   \
+                CGroupContext *c;                                       \
+                                                                        \
+                /* 1. Is entry set in this unit? If so, use that.       \
+                 * 2. Is the default for this entry set in any          \
+                 *    ancestor? If so, use that.                        \
+                 * 3. Otherwise, return CGROUP_LIMIT_MIN. */            \
+                                                                        \
+                assert(u);                                              \
+                                                                        \
+                c = unit_get_cgroup_context(u);                         \
+                if (c && c->entry##_set)                                \
+                        return c->entry;                                \
+                                                                        \
+                while ((u = UNIT_GET_SLICE(u))) {                       \
+                        c = unit_get_cgroup_context(u);                 \
+                        if (c && c->default_##entry##_set)              \
+                                return c->default_##entry;              \
+                }                                                       \
+                                                                        \
+                /* We've reached the root, but nobody had default for   \
+                 * this entry set, so set it to the kernel default. */  \
+                return CGROUP_LIMIT_MIN;                                \
+}
+
+/* Effective-value lookups for MemoryLow=, StartupMemoryLow= and MemoryMin=. */
+UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_low);
+UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(startup_memory_low);
+UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_min);
+
+/* Best-effort: set an xattr on the unit's cgroup. A missing cgroup path is a
+ * no-op; write failures are only logged at debug level. */
+static void unit_set_xattr_graceful(Unit *u, const char *name, const void *data, size_t size) {
+        int r;
+
+        assert(u);
+        assert(name);
+
+        if (!u->cgroup_path)
+                return;
+
+        r = cg_set_xattr(u->cgroup_path, name, data, size, 0);
+        if (r < 0)
+                log_unit_debug_errno(u, r, "Failed to set '%s' xattr on control group %s, ignoring: %m", name, empty_to_root(u->cgroup_path));
+}
+
+/* Best-effort: remove an xattr from the unit's cgroup. An already-absent xattr
+ * is not an error; other failures are only logged at debug level. */
+static void unit_remove_xattr_graceful(Unit *u, const char *name) {
+        int r;
+
+        assert(u);
+        assert(name);
+
+        if (!u->cgroup_path)
+                return;
+
+        r = cg_remove_xattr(u->cgroup_path, name);
+        if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
+                log_unit_debug_errno(u, r, "Failed to remove '%s' xattr flag on control group %s, ignoring: %m", name, empty_to_root(u->cgroup_path));
+}
+
+/* Mirror ManagedOOMPreference= onto the cgroup as marker xattrs for
+ * systemd-oomd: set the marker matching the preference, drop the stale one. */
+static void cgroup_oomd_xattr_apply(Unit *u) {
+        CGroupContext *c;
+
+        assert(u);
+
+        c = unit_get_cgroup_context(u);
+        if (!c)
+                return;
+
+        bool omit = c->moom_preference == MANAGED_OOM_PREFERENCE_OMIT,
+             avoid = c->moom_preference == MANAGED_OOM_PREFERENCE_AVOID;
+
+        if (omit)
+                unit_set_xattr_graceful(u, "user.oomd_omit", "1", 1);
+        if (avoid)
+                unit_set_xattr_graceful(u, "user.oomd_avoid", "1", 1);
+        if (!avoid)
+                unit_remove_xattr_graceful(u, "user.oomd_avoid");
+        if (!omit)
+                unit_remove_xattr_graceful(u, "user.oomd_omit");
+}
+
+/* Encode LogFilterPatterns= into the "user.journald_log_filter_patterns" xattr:
+ * the allowed and denied pattern sets are serialized as NUL-separated lists
+ * (without trailing NUL), joined by a single 0xFF byte. The xattr is removed
+ * when no patterns are configured. Returns 0, or negative errno on failure. */
+static int cgroup_log_xattr_apply(Unit *u) {
+        ExecContext *c;
+        size_t len, allowed_patterns_len, denied_patterns_len;
+        _cleanup_free_ char *patterns = NULL, *allowed_patterns = NULL, *denied_patterns = NULL;
+        char *last;
+        int r;
+
+        assert(u);
+
+        c = unit_get_exec_context(u);
+        if (!c)
+                /* Some unit types have a cgroup context but no exec context, so we do not log
+                 * any error here to avoid confusion. */
+                return 0;
+
+        if (set_isempty(c->log_filter_allowed_patterns) && set_isempty(c->log_filter_denied_patterns)) {
+                unit_remove_xattr_graceful(u, "user.journald_log_filter_patterns");
+                return 0;
+        }
+
+        r = set_make_nulstr(c->log_filter_allowed_patterns, &allowed_patterns, &allowed_patterns_len);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to make nulstr from set: %m");
+
+        r = set_make_nulstr(c->log_filter_denied_patterns, &denied_patterns, &denied_patterns_len);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to make nulstr from set: %m");
+
+        /* Use nul character separated strings without trailing nul */
+        allowed_patterns_len = LESS_BY(allowed_patterns_len, 1u);
+        denied_patterns_len = LESS_BY(denied_patterns_len, 1u);
+
+        /* +1 for the 0xFF separator between the two lists */
+        len = allowed_patterns_len + 1 + denied_patterns_len;
+        patterns = new(char, len);
+        if (!patterns)
+                return log_oom_debug();
+
+        last = mempcpy_safe(patterns, allowed_patterns, allowed_patterns_len);
+        /* 0xFF as separator — presumably chosen because it cannot appear in valid UTF-8 text */
+        *(last++) = '\xff';
+        memcpy_safe(last, denied_patterns, denied_patterns_len);
+
+        unit_set_xattr_graceful(u, "user.journald_log_filter_patterns", patterns, len);
+
+        return 0;
+}
+
+/* Publish the unit's invocation ID on its cgroup, or remove the xattrs when the
+ * ID is unset. Both 'trusted.*' (historical, privileged) and 'user.*'
+ * (world-readable) variants are maintained. */
+static void cgroup_invocation_id_xattr_apply(Unit *u) {
+        bool b;
+
+        assert(u);
+
+        b = !sd_id128_is_null(u->invocation_id);
+        FOREACH_STRING(xn, "trusted.invocation_id", "user.invocation_id") {
+                if (b)
+                        /* 32 = length of the hex-formatted 128-bit ID, without NUL terminator */
+                        unit_set_xattr_graceful(u, xn, SD_ID128_TO_STRING(u->invocation_id), 32);
+                else
+                        unit_remove_xattr_graceful(u, xn);
+        }
+}
+
+/* Maintain the "user.coredump_receive" marker: present only for delegated
+ * cgroups with CoredumpReceive= enabled, removed otherwise. */
+static void cgroup_coredump_xattr_apply(Unit *u) {
+        CGroupContext *c;
+
+        assert(u);
+
+        c = unit_get_cgroup_context(u);
+        if (!c)
+                return;
+
+        if (unit_cgroup_delegate(u) && c->coredump_receive)
+                unit_set_xattr_graceful(u, "user.coredump_receive", "1", 1);
+        else
+                unit_remove_xattr_graceful(u, "user.coredump_receive");
+}
+
+/* Mark delegated cgroups via 'trusted.delegate' and 'user.delegate' xattrs, or
+ * remove both when delegation is off. */
+static void cgroup_delegate_xattr_apply(Unit *u) {
+        bool b;
+
+        assert(u);
+
+        /* Indicate on the cgroup whether delegation is on, via an xattr. This is best-effort, as old kernels
+         * didn't support xattrs on cgroups at all. Later they got support for setting 'trusted.*' xattrs,
+         * and even later 'user.*' xattrs. We started setting this field when 'trusted.*' was added, and
+         * given this is now pretty much API, let's continue to support that. But also set 'user.*' as well,
+         * since it is readable by any user, not just CAP_SYS_ADMIN. This hence comes with slightly weaker
+         * security (as users who got delegated cgroups could turn it off if they like), but this shouldn't
+         * be a big problem given this communicates delegation state to clients, but the manager never reads
+         * it. */
+        b = unit_cgroup_delegate(u);
+        FOREACH_STRING(xn, "trusted.delegate", "user.delegate") {
+                if (b)
+                        unit_set_xattr_graceful(u, xn, "1", 1);
+                else
+                        unit_remove_xattr_graceful(u, xn);
+        }
+}
+
+/* Maintain the "survive_final_kill_signal" marker xattr. Unlike the other
+ * appliers this calls cg_set_xattr() directly (not the graceful helper) so it
+ * can fall back from 'user.*' to 'trusted.*' on old kernels.
+ * NOTE(review): no u->cgroup_path NULL check here, unlike unit_set_xattr_graceful() —
+ * presumably callers only run this on realized cgroups; confirm. */
+static void cgroup_survive_xattr_apply(Unit *u) {
+        int r;
+
+        assert(u);
+
+        if (u->survive_final_kill_signal) {
+                r = cg_set_xattr(
+                                u->cgroup_path,
+                                "user.survive_final_kill_signal",
+                                "1",
+                                1,
+                                /* flags= */ 0);
+                /* user xattr support was added in kernel v5.7 */
+                if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
+                        r = cg_set_xattr(
+                                        u->cgroup_path,
+                                        "trusted.survive_final_kill_signal",
+                                        "1",
+                                        1,
+                                        /* flags= */ 0);
+                if (r < 0)
+                        log_unit_debug_errno(u,
+                                             r,
+                                             "Failed to set 'survive_final_kill_signal' xattr on control "
+                                             "group %s, ignoring: %m",
+                                             empty_to_root(u->cgroup_path));
+        } else {
+                /* Remove both flavors, whichever one an earlier run may have set. */
+                unit_remove_xattr_graceful(u, "user.survive_final_kill_signal");
+                unit_remove_xattr_graceful(u, "trusted.survive_final_kill_signal");
+        }
+}
+
+/* Refresh all xattrs systemd maintains on the unit's cgroup. The 'trusted.*'
+ * flavors require privileges, hence the system-manager-only tail. */
+static void cgroup_xattr_apply(Unit *u) {
+        assert(u);
+
+        /* The 'user.*' xattrs can be set from a user manager. */
+        cgroup_oomd_xattr_apply(u);
+        cgroup_log_xattr_apply(u);
+        cgroup_coredump_xattr_apply(u);
+
+        if (!MANAGER_IS_SYSTEM(u->manager))
+                return;
+
+        cgroup_invocation_id_xattr_apply(u);
+        cgroup_delegate_xattr_apply(u);
+        cgroup_survive_xattr_apply(u);
+}
+
+/* Resolve a path to the whole-disk block device backing it: parse it as a device
+ * node specification first; otherwise stat() it and use the device of the file
+ * system it lives on (with a btrfs fallback), then peel off DM/LUKS layers and
+ * partitions. Returns 0 with *ret set, or negative errno (already logged). */
+static int lookup_block_device(const char *p, dev_t *ret) {
+        dev_t rdev, dev = 0;
+        mode_t mode;
+        int r;
+
+        assert(p);
+        assert(ret);
+
+        r = device_path_parse_major_minor(p, &mode, &rdev);
+        if (r == -ENODEV) { /* not a parsable device node, need to go to disk */
+                struct stat st;
+
+                if (stat(p, &st) < 0)
+                        return log_warning_errno(errno, "Couldn't stat device '%s': %m", p);
+
+                mode = st.st_mode;
+                rdev = st.st_rdev;
+                dev = st.st_dev;
+        } else if (r < 0)
+                return log_warning_errno(r, "Failed to parse major/minor from path '%s': %m", p);
+
+        if (S_ISCHR(mode))
+                return log_warning_errno(SYNTHETIC_ERRNO(ENOTBLK),
+                                         "Device node '%s' is a character device, but block device needed.", p);
+        if (S_ISBLK(mode))
+                *ret = rdev;
+        else if (major(dev) != 0)
+                *ret = dev; /* If this is not a device node then use the block device this file is stored on */
+        else {
+                /* If this is btrfs, getting the backing block device is a bit harder */
+                r = btrfs_get_block_device(p, ret);
+                if (r == -ENOTTY)
+                        return log_warning_errno(SYNTHETIC_ERRNO(ENODEV),
+                                                 "'%s' is not a block device node, and file system block device cannot be determined or is not local.", p);
+                if (r < 0)
+                        return log_warning_errno(r, "Failed to determine block device backing btrfs file system '%s': %m", p);
+        }
+
+        /* If this is a LUKS/DM device, recursively try to get the originating block device */
+        while (block_get_originating(*ret, ret) > 0);
+
+        /* If this is a partition, try to get the originating block device */
+        (void) block_get_whole_disk(*ret, ret);
+        return 0;
+}
+
+/* True if CPUWeight= or StartupCPUWeight= was explicitly configured. */
+static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
+        return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
+                c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
+}
+
+/* True if the legacy CPUShares= or StartupCPUShares= was explicitly configured. */
+static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
+        return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
+                c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
+}
+
+/* True if AllowedCPUs= or StartupAllowedCPUs= was configured. */
+static bool cgroup_context_has_allowed_cpus(CGroupContext *c) {
+        return c->cpuset_cpus.set || c->startup_cpuset_cpus.set;
+}
+
+/* True if AllowedMemoryNodes= or StartupAllowedMemoryNodes= was configured. */
+static bool cgroup_context_has_allowed_mems(CGroupContext *c) {
+        return c->cpuset_mems.set || c->startup_cpuset_mems.set;
+}
+
+/* Effective CPU weight for the given manager state: StartupCPUWeight= wins
+ * during startup/shutdown, then CPUWeight=, then the kernel default. */
+uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
+        assert(c);
+
+        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
+            c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
+                return c->startup_cpu_weight;
+
+        if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
+                return c->cpu_weight;
+
+        return CGROUP_WEIGHT_DEFAULT;
+}
+
+/* Effective legacy CPU shares: StartupCPUShares= wins during startup/shutdown,
+ * then CPUShares=, then the kernel default. */
+static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
+        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
+            c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
+                return c->startup_cpu_shares;
+
+        if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
+                return c->cpu_shares;
+
+        return CGROUP_CPU_SHARES_DEFAULT;
+}
+
+/* Effective CPU set: StartupAllowedCPUs= wins during startup/shutdown, if set. */
+static CPUSet *cgroup_context_allowed_cpus(CGroupContext *c, ManagerState state) {
+        bool startup = IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING);
+
+        return startup && c->startup_cpuset_cpus.set ? &c->startup_cpuset_cpus : &c->cpuset_cpus;
+}
+
+/* Effective memory-node set: StartupAllowedMemoryNodes= wins during startup/shutdown, if set. */
+static CPUSet *cgroup_context_allowed_mems(CGroupContext *c, ManagerState state) {
+        bool startup = IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING);
+
+        return startup && c->startup_cpuset_mems.set ? &c->startup_cpuset_mems : &c->cpuset_mems;
+}
+
+usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period) {
+        /* kernel uses a minimum resolution of 1ms, so both period and (quota * period)
+         * need to be higher than that boundary. quota is specified in USecPerSec.
+         * Additionally, period must be at most max_period. */
+        assert(quota > 0);
+
+        /* Raise the period until both it and the effective quota are representable... */
+        usec_t lower_bound = MAX3(period, resolution, resolution * USEC_PER_SEC / quota);
+
+        /* ...then cap it at the allowed maximum. */
+        return MIN(lower_bound, max_period);
+}
+
+/* Clamp the CPU quota period into [1ms, 1s] and warn (first time at warning
+ * level, subsequently at debug) when an adjustment was necessary. */
+static usec_t cgroup_cpu_adjust_period_and_log(Unit *u, usec_t period, usec_t quota) {
+        usec_t new_period;
+
+        if (quota == USEC_INFINITY)
+                /* Always use default period for infinity quota. */
+                return CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;
+
+        if (period == USEC_INFINITY)
+                /* Default period was requested. */
+                period = CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;
+
+        /* Clamp to interval [1ms, 1s] */
+        new_period = cgroup_cpu_adjust_period(period, quota, USEC_PER_MSEC, USEC_PER_SEC);
+
+        if (new_period != period) {
+                log_unit_full(u, u->warned_clamping_cpu_quota_period ? LOG_DEBUG : LOG_WARNING,
+                              "Clamping CPU interval for cpu.max: period is now %s",
+                              FORMAT_TIMESPAN(new_period, 1));
+                u->warned_clamping_cpu_quota_period = true;
+        }
+
+        return new_period;
+}
+
+/* Write "cpu.weight" (cgroup v2). CGROUP_WEIGHT_IDLE is expressed via
+ * "cpu.idle" instead, hence skipped here. */
+static void cgroup_apply_unified_cpu_weight(Unit *u, uint64_t weight) {
+        char buf[DECIMAL_STR_MAX(uint64_t) + 2];
+
+        if (weight == CGROUP_WEIGHT_IDLE)
+                return;
+        xsprintf(buf, "%" PRIu64 "\n", weight);
+        (void) set_attribute_and_warn(u, "cpu", "cpu.weight", buf);
+}
+
+/* Toggle "cpu.idle": "1" iff the weight is CGROUP_WEIGHT_IDLE. ENOENT is
+ * tolerated when disabling (old kernels lack the attribute), but is an error
+ * when idle was explicitly requested. */
+static void cgroup_apply_unified_cpu_idle(Unit *u, uint64_t weight) {
+        int r;
+        bool is_idle;
+        const char *idle_val;
+
+        is_idle = weight == CGROUP_WEIGHT_IDLE;
+        idle_val = one_zero(is_idle);
+        r = cg_set_attribute("cpu", u->cgroup_path, "cpu.idle", idle_val);
+        if (r < 0 && (r != -ENOENT || is_idle))
+                log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to set '%s' attribute on '%s' to '%s': %m",
+                                    "cpu.idle", empty_to_root(u->cgroup_path), idle_val);
+}
+
+/* Write "cpu.max" as "<quota> <period>" (or "max <period>" for unlimited).
+ * The quota is scaled from its per-second value and floored at 1ms. */
+static void cgroup_apply_unified_cpu_quota(Unit *u, usec_t quota, usec_t period) {
+        char buf[(DECIMAL_STR_MAX(usec_t) + 1) * 2 + 1];
+
+        period = cgroup_cpu_adjust_period_and_log(u, period, quota);
+        if (quota != USEC_INFINITY)
+                xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
+                         MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC), period);
+        else
+                xsprintf(buf, "max " USEC_FMT "\n", period);
+        (void) set_attribute_and_warn(u, "cpu", "cpu.max", buf);
+}
+
+/* Write the legacy (cgroup v1) "cpu.shares" attribute. */
+static void cgroup_apply_legacy_cpu_shares(Unit *u, uint64_t shares) {
+        char buf[DECIMAL_STR_MAX(uint64_t) + 2];
+
+        xsprintf(buf, "%" PRIu64 "\n", shares);
+        (void) set_attribute_and_warn(u, "cpu", "cpu.shares", buf);
+}
+
+/* Write the legacy (cgroup v1) CFS bandwidth attributes: the period always,
+ * and the quota scaled from its per-second value ("-1" means unlimited). */
+static void cgroup_apply_legacy_cpu_quota(Unit *u, usec_t quota, usec_t period) {
+        char buf[DECIMAL_STR_MAX(usec_t) + 2];
+
+        period = cgroup_cpu_adjust_period_and_log(u, period, quota);
+
+        xsprintf(buf, USEC_FMT "\n", period);
+        (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_period_us", buf);
+
+        if (quota != USEC_INFINITY) {
+                xsprintf(buf, USEC_FMT "\n", MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC));
+                (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", buf);
+        } else
+                (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", "-1\n");
+}
+
+/* Map a legacy CPU shares value onto the cgroup v2 weight scale, preserving
+ * the ratio to the respective defaults and clamping to the valid range. */
+static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
+        return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
+                     CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
+}
+
+/* Inverse mapping: cgroup v2 weight to legacy CPU shares. The special idle
+ * weight maps to the minimum shares, as v1 has no idle concept. */
+static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
+        /* we don't support idle in cgroupv1 */
+        if (weight == CGROUP_WEIGHT_IDLE)
+                return CGROUP_CPU_SHARES_MIN;
+
+        return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
+                     CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
+}
+
+/* Write a cpuset attribute ("cpuset.cpus" or "cpuset.mems") as a range string;
+ * OOM while formatting is logged and the write skipped. */
+static void cgroup_apply_unified_cpuset(Unit *u, const CPUSet *cpus, const char *name) {
+        _cleanup_free_ char *buf = NULL;
+
+        buf = cpu_set_to_range_string(cpus);
+        if (!buf) {
+                log_oom();
+                return;
+        }
+
+        (void) set_attribute_and_warn(u, "cpuset", name, buf);
+}
+
+/* True if any cgroup v2 IO setting (accounting, weights, latencies, limits) was configured. */
+static bool cgroup_context_has_io_config(CGroupContext *c) {
+        return c->io_accounting ||
+                c->io_weight != CGROUP_WEIGHT_INVALID ||
+                c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
+                c->io_device_weights ||
+                c->io_device_latencies ||
+                c->io_device_limits;
+}
+
+/* True if any legacy (cgroup v1) BlockIO setting was configured. */
+static bool cgroup_context_has_blockio_config(CGroupContext *c) {
+        return c->blockio_accounting ||
+                c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
+                c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
+                c->blockio_device_weights ||
+                c->blockio_device_bandwidths;
+}
+
+/* Effective IO weight: StartupIOWeight= wins during startup/shutdown, then
+ * IOWeight=, then the kernel default. */
+static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
+        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
+            c->startup_io_weight != CGROUP_WEIGHT_INVALID)
+                return c->startup_io_weight;
+        else if (c->io_weight != CGROUP_WEIGHT_INVALID)
+                return c->io_weight;
+        else
+                return CGROUP_WEIGHT_DEFAULT;
+}
+
+/* Effective legacy BlockIO weight: StartupBlockIOWeight= wins during
+ * startup/shutdown, then BlockIOWeight=, then the kernel default. */
+static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
+        if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING) &&
+            c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
+                return c->startup_blockio_weight;
+        else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
+                return c->blockio_weight;
+        else
+                return CGROUP_BLKIO_WEIGHT_DEFAULT;
+}
+
+/* Map a legacy BlockIO weight onto the cgroup v2 IO weight scale, preserving
+ * the ratio to the respective defaults and clamping to the valid range. */
+static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
+        return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
+                     CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
+}
+
+/* Inverse mapping: cgroup v2 IO weight to legacy BlockIO weight. */
+static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
+        return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
+                     CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
+}
+
+/* Write the BFQ scheduler weight file ("io.bfq.weight" or "blkio.bfq.weight"),
+ * either per-device (major(dev) > 0) or as the cgroup default (devnum 0:0).
+ * Returns the write result; -EOPNOTSUPP masks kernels that reject the
+ * per-device syntax. */
+static int set_bfq_weight(Unit *u, const char *controller, dev_t dev, uint64_t io_weight) {
+        static const char * const prop_names[] = {
+                "IOWeight",
+                "BlockIOWeight",
+                "IODeviceWeight",
+                "BlockIODeviceWeight",
+        };
+        static bool warned = false;
+        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+STRLEN("\n")];
+        const char *p;
+        uint64_t bfq_weight;
+        int r;
+
+        /* FIXME: drop this function when distro kernels properly support BFQ through "io.weight"
+         * See also: https://github.com/systemd/systemd/pull/13335 and
+         * https://github.com/torvalds/linux/commit/65752aef0a407e1ef17ec78a7fc31ba4e0b360f9. */
+        p = strjoina(controller, ".bfq.weight");
+        /* Adjust to kernel range is 1..1000, the default is 100. */
+        bfq_weight = BFQ_WEIGHT(io_weight);
+
+        if (major(dev) > 0)
+                xsprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), bfq_weight);
+        else
+                xsprintf(buf, "%" PRIu64 "\n", bfq_weight);
+
+        r = cg_set_attribute(controller, u->cgroup_path, p, buf);
+
+        /* FIXME: drop this when kernels prior
+         * 795fe54c2a82 ("bfq: Add per-device weight") v5.4
+         * are not interesting anymore. Old kernels will fail with EINVAL, while new kernels won't return
+         * EINVAL on properly formatted input by us. Treat EINVAL accordingly. */
+        if (r == -EINVAL && major(dev) > 0) {
+                if (!warned) {
+                        log_unit_warning(u, "Kernel version does not accept per-device setting in %s.", p);
+                        warned = true;
+                }
+                r = -EOPNOTSUPP; /* mask as unconfigured device */
+        } else if (r >= 0 && io_weight != bfq_weight)
+                /* Index into prop_names: +2 for the per-device variants, +1 for the legacy "blkio" names. */
+                log_unit_debug(u, "%s=%" PRIu64 " scaled to %s=%" PRIu64,
+                               prop_names[2*(major(dev) > 0) + streq(controller, "blkio")],
+                               io_weight, p, bfq_weight);
+        return r;
+}
+
+/* Apply a per-device IO weight via both the BFQ attribute and "io.weight".
+ * When both writes fail, the "io.weight" error is reported, unless BFQ was the
+ * only configured path (-EOPNOTSUPP from io.weight). */
+static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
+        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
+        dev_t dev;
+        int r, r1, r2;
+
+        if (lookup_block_device(dev_path, &dev) < 0)
+                return;
+
+        r1 = set_bfq_weight(u, "io", dev, io_weight);
+
+        xsprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), io_weight);
+        r2 = cg_set_attribute("io", u->cgroup_path, "io.weight", buf);
+
+        /* Look at the configured device, when both fail, prefer io.weight errno. */
+        r = r2 == -EOPNOTSUPP ? r1 : r2;
+
+        if (r < 0)
+                log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r),
+                                    r, "Failed to set 'io[.bfq].weight' attribute on '%s' to '%.*s': %m",
+                                    empty_to_root(u->cgroup_path), (int) strcspn(buf, NEWLINE), buf);
+}
+
+/* Apply a legacy (cgroup v1) per-device BlockIO weight via "blkio.weight_device". */
+static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
+        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
+        dev_t dev;
+        int r;
+
+        r = lookup_block_device(dev_path, &dev);
+        if (r < 0)
+                return;
+
+        xsprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), blkio_weight);
+        (void) set_attribute_and_warn(u, "blkio", "blkio.weight_device", buf);
+}
+
+/* Apply an IODeviceLatencyTargetSec= value via "io.latency"; USEC_INFINITY maps
+ * to the "max" keyword. */
+static void cgroup_apply_io_device_latency(Unit *u, const char *dev_path, usec_t target) {
+        char buf[DECIMAL_STR_MAX(dev_t)*2+2+7+DECIMAL_STR_MAX(uint64_t)+1];
+        dev_t dev;
+        int r;
+
+        r = lookup_block_device(dev_path, &dev);
+        if (r < 0)
+                return;
+
+        if (target != USEC_INFINITY)
+                xsprintf(buf, DEVNUM_FORMAT_STR " target=%" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), target);
+        else
+                xsprintf(buf, DEVNUM_FORMAT_STR " target=max\n", DEVNUM_FORMAT_VAL(dev));
+
+        (void) set_attribute_and_warn(u, "io", "io.latency", buf);
+}
+
+/* Apply all four per-device IO limits (rbps/wbps/riops/wiops) in one "io.max"
+ * write. Limits left at their default are written as "max" (or "0" where the
+ * default itself is not "max"). */
+static void cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
+        char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)],
+             buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
+        dev_t dev;
+
+        if (lookup_block_device(dev_path, &dev) < 0)
+                return;
+
+        for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
+                if (limits[type] != cgroup_io_limit_defaults[type])
+                        xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
+                else
+                        xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
+
+        xsprintf(buf, DEVNUM_FORMAT_STR " rbps=%s wbps=%s riops=%s wiops=%s\n", DEVNUM_FORMAT_VAL(dev),
+                 limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
+                 limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
+        (void) set_attribute_and_warn(u, "io", "io.max", buf);
+}
+
+/* Apply legacy (cgroup v1) per-device throttling limits, in bytes per second.
+ * Best-effort: lookup and write failures are logged by the callees and ignored. */
+static void cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
+        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
+        dev_t dev;
+
+        if (lookup_block_device(dev_path, &dev) < 0)
+                return;
+
+        /* Use xsprintf() like every sibling helper in this file, so an (unexpected)
+         * overflow of the sized buffer asserts instead of silently corrupting the
+         * stack as plain sprintf() would. */
+        xsprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), rbps);
+        (void) set_attribute_and_warn(u, "blkio", "blkio.throttle.read_bps_device", buf);
+
+        xsprintf(buf, DEVNUM_FORMAT_STR " %" PRIu64 "\n", DEVNUM_FORMAT_VAL(dev), wbps);
+        (void) set_attribute_and_warn(u, "blkio", "blkio.throttle.write_bps_device", buf);
+}
+
+/* True if any cgroup v2 memory protection or limit knob deviates from its
+ * default, taking ancestor defaults for min/low into account. */
+static bool unit_has_unified_memory_config(Unit *u) {
+        CGroupContext *c;
+
+        assert(u);
+
+        assert_se(c = unit_get_cgroup_context(u));
+
+        if (unit_get_ancestor_memory_min(u) > 0)
+                return true;
+        if (unit_get_ancestor_memory_low(u) > 0 || unit_get_ancestor_startup_memory_low(u) > 0)
+                return true;
+        if (c->memory_high != CGROUP_LIMIT_MAX || c->startup_memory_high_set)
+                return true;
+        if (c->memory_max != CGROUP_LIMIT_MAX || c->startup_memory_max_set)
+                return true;
+        if (c->memory_swap_max != CGROUP_LIMIT_MAX || c->startup_memory_swap_max_set)
+                return true;
+
+        return c->memory_zswap_max != CGROUP_LIMIT_MAX || c->startup_memory_zswap_max_set;
+}
+
+/* Write a cgroup v2 memory attribute (e.g. "memory.max"): the numeric value,
+ * or the "max" keyword for CGROUP_LIMIT_MAX. */
+static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
+        char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max\n";
+
+        if (v != CGROUP_LIMIT_MAX)
+                xsprintf(buf, "%" PRIu64 "\n", v);
+
+        (void) set_attribute_and_warn(u, "memory", file, buf);
+}
+
+/* Compile and install the unit's BPF firewall (IP accounting/filtering).
+ * Entirely best-effort: a failed compile skips installation. */
+static void cgroup_apply_firewall(Unit *u) {
+        assert(u);
+
+        /* Best-effort: let's apply IP firewalling and/or accounting if that's enabled */
+
+        if (bpf_firewall_compile(u) < 0)
+                return;
+
+        (void) bpf_firewall_load_custom(u);
+        (void) bpf_firewall_install(u);
+}
+
+/* Add (add=true) or delete the unit's cgroup ID in every configured NFTSet=
+ * entry with source "cgroup". System manager + pure cgroup v2 only, and only
+ * once the unit has a cgroup ID. Best-effort: failures are logged and ignored. */
+void unit_modify_nft_set(Unit *u, bool add) {
+        int r;
+
+        assert(u);
+
+        if (!MANAGER_IS_SYSTEM(u->manager))
+                return;
+
+        if (!UNIT_HAS_CGROUP_CONTEXT(u))
+                return;
+
+        /* cgroup IDs as nftables keys require the unified hierarchy. */
+        if (cg_all_unified() <= 0)
+                return;
+
+        if (u->cgroup_id == 0)
+                return;
+
+        /* Lazily create the firewall context, without pre-initializing tables. */
+        if (!u->manager->fw_ctx) {
+                r = fw_ctx_new_full(&u->manager->fw_ctx, /* init_tables= */ false);
+                if (r < 0)
+                        return;
+
+                assert(u->manager->fw_ctx);
+        }
+
+        CGroupContext *c = ASSERT_PTR(unit_get_cgroup_context(u));
+
+        FOREACH_ARRAY(nft_set, c->nft_set_context.sets, c->nft_set_context.n_sets) {
+                if (nft_set->source != NFT_SET_SOURCE_CGROUP)
+                        continue;
+
+                uint64_t element = u->cgroup_id;
+
+                r = nft_set_element_modify_any(u->manager->fw_ctx, add, nft_set->nfproto, nft_set->table, nft_set->set, &element, sizeof(element));
+                if (r < 0)
+                        log_warning_errno(r, "Failed to %s NFT set: family %s, table %s, set %s, cgroup %" PRIu64 ", ignoring: %m",
+                                          add? "add" : "delete", nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set, u->cgroup_id);
+                else
+                        log_debug("%s NFT set: family %s, table %s, set %s, cgroup %" PRIu64,
+                                  add? "Added" : "Deleted", nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set, u->cgroup_id);
+        }
+}
+
+/* Install the SocketBind{Allow,Deny}= BPF program, best-effort. */
+static void cgroup_apply_socket_bind(Unit *u) {
+        assert(u);
+
+        (void) bpf_socket_bind_install(u);
+}
+
+/* Install the RestrictNetworkInterfaces= BPF program, best-effort. */
+static void cgroup_apply_restrict_network_interfaces(Unit *u) {
+        assert(u);
+
+        (void) restrict_network_interfaces_install(u);
+}
+
+/* Apply DevicePolicy=/DeviceAllow= to the unit's cgroup: on the unified
+ * hierarchy via a BPF device-control program, on legacy via the "devices"
+ * controller files. Returns the result of installing the policy (<0 on
+ * failure, in which case all devices remain accessible). */
+static int cgroup_apply_devices(Unit *u) {
+        _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
+        const char *path;
+        CGroupContext *c;
+        CGroupDevicePolicy policy;
+        int r;
+
+        assert_se(c = unit_get_cgroup_context(u));
+        assert_se(path = u->cgroup_path);
+
+        policy = c->device_policy;
+
+        if (cg_all_unified() > 0) {
+                r = bpf_devices_cgroup_init(&prog, policy, c->device_allow);
+                if (r < 0)
+                        return log_unit_warning_errno(u, r, "Failed to initialize device control bpf program: %m");
+
+        } else {
+                /* Changing the devices list of a populated cgroup might result in EINVAL, hence ignore
+                 * EINVAL here. */
+
+                /* Start from deny-all when a policy or allow list is configured, allow-all otherwise. */
+                if (c->device_allow || policy != CGROUP_DEVICE_POLICY_AUTO)
+                        r = cg_set_attribute("devices", path, "devices.deny", "a");
+                else
+                        r = cg_set_attribute("devices", path, "devices.allow", "a");
+                if (r < 0)
+                        log_unit_full_errno(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
+                                            "Failed to reset devices.allow/devices.deny: %m");
+        }
+
+        /* "closed" policy, or "auto" with an explicit allow list, pulls in the static
+         * allow list of basic device nodes. */
+        bool allow_list_static = policy == CGROUP_DEVICE_POLICY_CLOSED ||
+                (policy == CGROUP_DEVICE_POLICY_AUTO && c->device_allow);
+        if (allow_list_static)
+                (void) bpf_devices_allow_list_static(prog, path);
+
+        bool any = allow_list_static;
+        LIST_FOREACH(device_allow, a, c->device_allow) {
+                const char *val;
+
+                if (a->permissions == 0)
+                        continue;
+
+                /* DeviceAllow= entries are either device node paths or "block-"/"char-"
+                 * major-name specifications. */
+                if (path_startswith(a->path, "/dev/"))
+                        r = bpf_devices_allow_list_device(prog, path, a->path, a->permissions);
+                else if ((val = startswith(a->path, "block-")))
+                        r = bpf_devices_allow_list_major(prog, path, val, 'b', a->permissions);
+                else if ((val = startswith(a->path, "char-")))
+                        r = bpf_devices_allow_list_major(prog, path, val, 'c', a->permissions);
+                else {
+                        log_unit_debug(u, "Ignoring device '%s' while writing cgroup attribute.", a->path);
+                        continue;
+                }
+
+                if (r >= 0)
+                        any = true;
+        }
+
+        if (prog && !any) {
+                log_unit_warning_errno(u, SYNTHETIC_ERRNO(ENODEV), "No devices matched by device filter.");
+
+                /* The kernel verifier would reject a program we would build with the normal intro and outro
+                   but no allow-listing rules (outro would contain an unreachable instruction for successful
+                   return). */
+                policy = CGROUP_DEVICE_POLICY_STRICT;
+        }
+
+        r = bpf_devices_apply_policy(&prog, policy, any, path, &u->bpf_device_control_installed);
+        if (r < 0) {
+                static bool warned = false;
+
+                log_full_errno(warned ? LOG_DEBUG : LOG_WARNING, r,
+                               "Unit %s configures device ACL, but the local system doesn't seem to support the BPF-based device controller.\n"
+                               "Proceeding WITHOUT applying ACL (all devices will be accessible)!\n"
+                               "(This warning is only shown for the first loaded unit using device ACL.)", u->id);
+
+                warned = true;
+        }
+        return r;
+}
+
+/* Set the cgroup's default IO weight: via the BFQ attribute (devnum 0:0 selects
+ * the non-per-device form in set_bfq_weight()) and via "io.weight". */
+static void set_io_weight(Unit *u, uint64_t weight) {
+        char buf[STRLEN("default \n")+DECIMAL_STR_MAX(uint64_t)];
+
+        assert(u);
+
+        (void) set_bfq_weight(u, "io", makedev(0, 0), weight);
+
+        xsprintf(buf, "default %" PRIu64 "\n", weight);
+        (void) set_attribute_and_warn(u, "io", "io.weight", buf);
+}
+
+/* Set the cgroup's default legacy BlockIO weight: via the BFQ attribute (devnum
+ * 0:0 selects the non-per-device form) and via "blkio.weight". */
+static void set_blkio_weight(Unit *u, uint64_t weight) {
+        char buf[STRLEN("\n")+DECIMAL_STR_MAX(uint64_t)];
+
+        assert(u);
+
+        (void) set_bfq_weight(u, "blkio", makedev(0, 0), weight);
+
+        xsprintf(buf, "%" PRIu64 "\n", weight);
+        (void) set_attribute_and_warn(u, "blkio", "blkio.weight", buf);
+}
+
+/* Install the configured foreign BPF programs (BPFProgram=), best-effort. */
+static void cgroup_apply_bpf_foreign_program(Unit *u) {
+        assert(u);
+
+        (void) bpf_foreign_install(u);
+}
+
+static void cgroup_context_apply(
+ Unit *u,
+ CGroupMask apply_mask,
+ ManagerState state) {
+
+ const char *path;
+ CGroupContext *c;
+ bool is_host_root, is_local_root;
+ int r;
+
+ assert(u);
+
+ /* Nothing to do? Exit early! */
+ if (apply_mask == 0)
+ return;
+
+ /* Some cgroup attributes are not supported on the host root cgroup, hence silently ignore them here. And other
+ * attributes should only be managed for cgroups further down the tree. */
+ is_local_root = unit_has_name(u, SPECIAL_ROOT_SLICE);
+ is_host_root = unit_has_host_root_cgroup(u);
+
+ assert_se(c = unit_get_cgroup_context(u));
+ assert_se(path = u->cgroup_path);
+
+ if (is_local_root) /* Make sure we don't try to display messages with an empty path. */
+ path = "/";
+
+ /* We generally ignore errors caused by read-only mounted cgroup trees (assuming we are running in a container
+ * then), and missing cgroups, i.e. EROFS and ENOENT. */
+
+ /* In fully unified mode these attributes don't exist on the host cgroup root. On legacy the weights exist, but
+ * setting the weight makes very little sense on the host root cgroup, as there are no other cgroups at this
+ * level. The quota exists there too, but any attempt to write to it is refused with EINVAL. Inside of
+ * containers we want to leave control of these to the container manager (and if cgroup v2 delegation is used
+ * we couldn't even write to them if we wanted to). */
+ if ((apply_mask & CGROUP_MASK_CPU) && !is_local_root) {
+
+ if (cg_all_unified() > 0) {
+ uint64_t weight;
+
+ if (cgroup_context_has_cpu_weight(c))
+ weight = cgroup_context_cpu_weight(c, state);
+ else if (cgroup_context_has_cpu_shares(c)) {
+ uint64_t shares;
+
+ shares = cgroup_context_cpu_shares(c, state);
+ weight = cgroup_cpu_shares_to_weight(shares);
+
+ log_cgroup_compat(u, "Applying [Startup]CPUShares=%" PRIu64 " as [Startup]CPUWeight=%" PRIu64 " on %s",
+ shares, weight, path);
+ } else
+ weight = CGROUP_WEIGHT_DEFAULT;
+
+ cgroup_apply_unified_cpu_idle(u, weight);
+ cgroup_apply_unified_cpu_weight(u, weight);
+ cgroup_apply_unified_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec);
+
+ } else {
+ uint64_t shares;
+
+ if (cgroup_context_has_cpu_weight(c)) {
+ uint64_t weight;
+
+ weight = cgroup_context_cpu_weight(c, state);
+ shares = cgroup_cpu_weight_to_shares(weight);
+
+ log_cgroup_compat(u, "Applying [Startup]CPUWeight=%" PRIu64 " as [Startup]CPUShares=%" PRIu64 " on %s",
+ weight, shares, path);
+ } else if (cgroup_context_has_cpu_shares(c))
+ shares = cgroup_context_cpu_shares(c, state);
+ else
+ shares = CGROUP_CPU_SHARES_DEFAULT;
+
+ cgroup_apply_legacy_cpu_shares(u, shares);
+ cgroup_apply_legacy_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec);
+ }
+ }
+
+ if ((apply_mask & CGROUP_MASK_CPUSET) && !is_local_root) {
+ cgroup_apply_unified_cpuset(u, cgroup_context_allowed_cpus(c, state), "cpuset.cpus");
+ cgroup_apply_unified_cpuset(u, cgroup_context_allowed_mems(c, state), "cpuset.mems");
+ }
+
+ /* The 'io' controller attributes are not exported on the host's root cgroup (being a pure cgroup v2
+ * controller), and in case of containers we want to leave control of these attributes to the container manager
+ * (and we couldn't access that stuff anyway, even if we tried if proper delegation is used). */
+ if ((apply_mask & CGROUP_MASK_IO) && !is_local_root) {
+ bool has_io, has_blockio;
+ uint64_t weight;
+
+ has_io = cgroup_context_has_io_config(c);
+ has_blockio = cgroup_context_has_blockio_config(c);
+
+ if (has_io)
+ weight = cgroup_context_io_weight(c, state);
+ else if (has_blockio) {
+ uint64_t blkio_weight;
+
+ blkio_weight = cgroup_context_blkio_weight(c, state);
+ weight = cgroup_weight_blkio_to_io(blkio_weight);
+
+ log_cgroup_compat(u, "Applying [Startup]BlockIOWeight=%" PRIu64 " as [Startup]IOWeight=%" PRIu64,
+ blkio_weight, weight);
+ } else
+ weight = CGROUP_WEIGHT_DEFAULT;
+
+ set_io_weight(u, weight);
+
+ if (has_io) {
+ LIST_FOREACH(device_weights, w, c->io_device_weights)
+ cgroup_apply_io_device_weight(u, w->path, w->weight);
+
+ LIST_FOREACH(device_limits, limit, c->io_device_limits)
+ cgroup_apply_io_device_limit(u, limit->path, limit->limits);
+
+ LIST_FOREACH(device_latencies, latency, c->io_device_latencies)
+ cgroup_apply_io_device_latency(u, latency->path, latency->target_usec);
+
+ } else if (has_blockio) {
+ LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
+ weight = cgroup_weight_blkio_to_io(w->weight);
+
+ log_cgroup_compat(u, "Applying BlockIODeviceWeight=%" PRIu64 " as IODeviceWeight=%" PRIu64 " for %s",
+ w->weight, weight, w->path);
+
+ cgroup_apply_io_device_weight(u, w->path, weight);
+ }
+
+ LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
+ uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
+
+ for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
+ limits[type] = cgroup_io_limit_defaults[type];
+
+ limits[CGROUP_IO_RBPS_MAX] = b->rbps;
+ limits[CGROUP_IO_WBPS_MAX] = b->wbps;
+
+ log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax= for %s",
+ b->rbps, b->wbps, b->path);
+
+ cgroup_apply_io_device_limit(u, b->path, limits);
+ }
+ }
+ }
+
+ if (apply_mask & CGROUP_MASK_BLKIO) {
+ bool has_io, has_blockio;
+
+ has_io = cgroup_context_has_io_config(c);
+ has_blockio = cgroup_context_has_blockio_config(c);
+
+ /* Applying a 'weight' never makes sense for the host root cgroup, and for containers this should be
+ * left to our container manager, too. */
+ if (!is_local_root) {
+ uint64_t weight;
+
+ if (has_io) {
+ uint64_t io_weight;
+
+ io_weight = cgroup_context_io_weight(c, state);
+ weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state));
+
+ log_cgroup_compat(u, "Applying [Startup]IOWeight=%" PRIu64 " as [Startup]BlockIOWeight=%" PRIu64,
+ io_weight, weight);
+ } else if (has_blockio)
+ weight = cgroup_context_blkio_weight(c, state);
+ else
+ weight = CGROUP_BLKIO_WEIGHT_DEFAULT;
+
+ set_blkio_weight(u, weight);
+
+ if (has_io)
+ LIST_FOREACH(device_weights, w, c->io_device_weights) {
+ weight = cgroup_weight_io_to_blkio(w->weight);
+
+ log_cgroup_compat(u, "Applying IODeviceWeight=%" PRIu64 " as BlockIODeviceWeight=%" PRIu64 " for %s",
+ w->weight, weight, w->path);
+
+ cgroup_apply_blkio_device_weight(u, w->path, weight);
+ }
+ else if (has_blockio)
+ LIST_FOREACH(device_weights, w, c->blockio_device_weights)
+ cgroup_apply_blkio_device_weight(u, w->path, w->weight);
+ }
+
+ /* The bandwidth limits are something that make sense to be applied to the host's root but not container
+ * roots, as there we want the container manager to handle it */
+ if (is_host_root || !is_local_root) {
+ if (has_io)
+ LIST_FOREACH(device_limits, l, c->io_device_limits) {
+ log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax= for %s",
+ l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
+
+ cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]);
+ }
+ else if (has_blockio)
+ LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths)
+ cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps);
+ }
+ }
+
+ /* In unified mode 'memory' attributes do not exist on the root cgroup. In legacy mode 'memory.limit_in_bytes'
+ * exists on the root cgroup, but any writes to it are refused with EINVAL. And if we run in a container we
+ * want to leave control to the container manager (and if proper cgroup v2 delegation is used we couldn't even
+ * write to this if we wanted to.) */
+ if ((apply_mask & CGROUP_MASK_MEMORY) && !is_local_root) {
+
+ if (cg_all_unified() > 0) {
+ uint64_t max, swap_max = CGROUP_LIMIT_MAX, zswap_max = CGROUP_LIMIT_MAX, high = CGROUP_LIMIT_MAX;
+
+ if (unit_has_unified_memory_config(u)) {
+ bool startup = IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING, MANAGER_STOPPING);
+
+ high = startup && c->startup_memory_high_set ? c->startup_memory_high : c->memory_high;
+ max = startup && c->startup_memory_max_set ? c->startup_memory_max : c->memory_max;
+ swap_max = startup && c->startup_memory_swap_max_set ? c->startup_memory_swap_max : c->memory_swap_max;
+ zswap_max = startup && c->startup_memory_zswap_max_set ? c->startup_memory_zswap_max : c->memory_zswap_max;
+ } else {
+ max = c->memory_limit;
+
+ if (max != CGROUP_LIMIT_MAX)
+ log_cgroup_compat(u, "Applying MemoryLimit=%" PRIu64 " as MemoryMax=", max);
+ }
+
+ cgroup_apply_unified_memory_limit(u, "memory.min", unit_get_ancestor_memory_min(u));
+ cgroup_apply_unified_memory_limit(u, "memory.low", unit_get_ancestor_memory_low(u));
+ cgroup_apply_unified_memory_limit(u, "memory.high", high);
+ cgroup_apply_unified_memory_limit(u, "memory.max", max);
+ cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
+ cgroup_apply_unified_memory_limit(u, "memory.zswap.max", zswap_max);
+
+ (void) set_attribute_and_warn(u, "memory", "memory.oom.group", one_zero(c->memory_oom_group));
+
+ } else {
+ char buf[DECIMAL_STR_MAX(uint64_t) + 1];
+ uint64_t val;
+
+ if (unit_has_unified_memory_config(u)) {
+ val = c->memory_max;
+ if (val != CGROUP_LIMIT_MAX)
+ log_cgroup_compat(u, "Applying MemoryMax=%" PRIu64 " as MemoryLimit=", val);
+ } else
+ val = c->memory_limit;
+
+ if (val == CGROUP_LIMIT_MAX)
+ strncpy(buf, "-1\n", sizeof(buf));
+ else
+ xsprintf(buf, "%" PRIu64 "\n", val);
+
+ (void) set_attribute_and_warn(u, "memory", "memory.limit_in_bytes", buf);
+ }
+ }
+
+ /* On cgroup v2 we can apply BPF everywhere. On cgroup v1 we apply it everywhere except for the root of
+ * containers, where we leave this to the manager */
+ if ((apply_mask & (CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES)) &&
+ (is_host_root || cg_all_unified() > 0 || !is_local_root))
+ (void) cgroup_apply_devices(u);
+
+ if (apply_mask & CGROUP_MASK_PIDS) {
+
+ if (is_host_root) {
+ /* So, the "pids" controller does not expose anything on the root cgroup, in order not to
+ * replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when
+ * the knobs of the root cgroup are modified propagate this to the relevant sysctls. There's a
+ * non-obvious asymmetry however: unlike the cgroup properties we don't really want to take
+ * exclusive ownership of the sysctls, but we still want to honour things if the user sets
+ * limits. Hence we employ sort of a one-way strategy: when the user sets a bounded limit
+ * through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded)
+ * it also counts. But if the user never set a limit through us (i.e. we are the default of
+ * "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on
+ * the first time we set a limit. Note that this boolean is flushed out on manager reload,
+ * which is desirable so that there's an official way to release control of the sysctl from
+ * systemd: set the limit to unbounded and reload. */
+
+ if (cgroup_tasks_max_isset(&c->tasks_max)) {
+ u->manager->sysctl_pid_max_changed = true;
+ r = procfs_tasks_set_limit(cgroup_tasks_max_resolve(&c->tasks_max));
+ } else if (u->manager->sysctl_pid_max_changed)
+ r = procfs_tasks_set_limit(TASKS_MAX);
+ else
+ r = 0;
+ if (r < 0)
+ log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r,
+ "Failed to write to tasks limit sysctls: %m");
+ }
+
+ /* The attribute itself is not available on the host root cgroup, and in the container case we want to
+ * leave it for the container manager. */
+ if (!is_local_root) {
+ if (cgroup_tasks_max_isset(&c->tasks_max)) {
+ char buf[DECIMAL_STR_MAX(uint64_t) + 1];
+
+ xsprintf(buf, "%" PRIu64 "\n", cgroup_tasks_max_resolve(&c->tasks_max));
+ (void) set_attribute_and_warn(u, "pids", "pids.max", buf);
+ } else
+ (void) set_attribute_and_warn(u, "pids", "pids.max", "max\n");
+ }
+ }
+
+ if (apply_mask & CGROUP_MASK_BPF_FIREWALL)
+ cgroup_apply_firewall(u);
+
+ if (apply_mask & CGROUP_MASK_BPF_FOREIGN)
+ cgroup_apply_bpf_foreign_program(u);
+
+ if (apply_mask & CGROUP_MASK_BPF_SOCKET_BIND)
+ cgroup_apply_socket_bind(u);
+
+ if (apply_mask & CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES)
+ cgroup_apply_restrict_network_interfaces(u);
+
+ unit_modify_nft_set(u, /* add = */ true);
+}
+
+/* Returns true if this unit requires the BPF firewall machinery: either the unit itself enables IP
+ * accounting/filtering, or one of its ancestor slices defines an IP access list (which then applies
+ * to this unit as well). If a unit in the slice chain has no cgroup context, we stop and report false. */
+static bool unit_get_needs_bpf_firewall(Unit *u) {
+        assert(u);
+
+        CGroupContext *c = unit_get_cgroup_context(u);
+        if (!c)
+                return false;
+
+        /* The unit itself asks for IP accounting or filtering? */
+        if (c->ip_accounting)
+                return true;
+        if (!set_isempty(c->ip_address_allow) || !set_isempty(c->ip_address_deny))
+                return true;
+        if (c->ip_filters_ingress || c->ip_filters_egress)
+                return true;
+
+        /* An IP access list defined on any parent slice applies to us too. */
+        for (Unit *p = UNIT_GET_SLICE(u); p; p = UNIT_GET_SLICE(p)) {
+                CGroupContext *pc = unit_get_cgroup_context(p);
+                if (!pc)
+                        return false;
+
+                if (!set_isempty(pc->ip_address_allow) || !set_isempty(pc->ip_address_deny))
+                        return true;
+        }
+
+        return false;
+}
+
+/* Returns true if the unit's cgroup context lists any foreign (user-supplied) BPF programs. */
+static bool unit_get_needs_bpf_foreign_program(Unit *u) {
+        assert(u);
+
+        CGroupContext *c = unit_get_cgroup_context(u);
+        return c && c->bpf_foreign_programs;
+}
+
+/* Returns true if the unit configures a socket-bind allow or deny list (SocketBindAllow=/SocketBindDeny=),
+ * i.e. needs the socket-bind BPF program attached. */
+static bool unit_get_needs_socket_bind(Unit *u) {
+        assert(u);
+
+        CGroupContext *c = unit_get_cgroup_context(u);
+        if (!c)
+                return false;
+
+        return c->socket_bind_allow || c->socket_bind_deny;
+}
+
+/* Returns true if the unit restricts which network interfaces it may use
+ * (RestrictNetworkInterfaces=), i.e. needs the corresponding BPF program. */
+static bool unit_get_needs_restrict_network_interfaces(Unit *u) {
+        assert(u);
+
+        CGroupContext *c = unit_get_cgroup_context(u);
+        if (!c)
+                return false;
+
+        return !set_isempty(c->restrict_network_interfaces);
+}
+
+/* Computes the set of "real" cgroup controllers (v1 + v2) this unit's own configuration requires,
+ * derived purely from its cgroup context. BPF pseudo-controllers are handled separately in
+ * unit_get_bpf_mask(). The context must exist (asserted below). */
+static CGroupMask unit_get_cgroup_mask(Unit *u) {
+ CGroupMask mask = 0;
+ CGroupContext *c;
+
+ assert(u);
+
+ assert_se(c = unit_get_cgroup_context(u));
+
+ /* Figure out which controllers we need, based on the cgroup context object */
+
+ if (c->cpu_accounting)
+ mask |= get_cpu_accounting_mask();
+
+ if (cgroup_context_has_cpu_weight(c) ||
+ cgroup_context_has_cpu_shares(c) ||
+ c->cpu_quota_per_sec_usec != USEC_INFINITY)
+ mask |= CGROUP_MASK_CPU;
+
+ if (cgroup_context_has_allowed_cpus(c) || cgroup_context_has_allowed_mems(c))
+ mask |= CGROUP_MASK_CPUSET;
+
+ /* IO and block-IO configuration each pull in both controllers, since either may be translated
+ * into the other depending on which cgroup hierarchy version is in use. */
+ if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
+ mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
+
+ if (c->memory_accounting ||
+ c->memory_limit != CGROUP_LIMIT_MAX ||
+ unit_has_unified_memory_config(u))
+ mask |= CGROUP_MASK_MEMORY;
+
+ if (c->device_allow ||
+ c->device_policy != CGROUP_DEVICE_POLICY_AUTO)
+ mask |= CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES;
+
+ if (c->tasks_accounting ||
+ cgroup_tasks_max_isset(&c->tasks_max))
+ mask |= CGROUP_MASK_PIDS;
+
+ /* Extend the mask so that controllers that are "joined" with any of the above are included too. */
+ return CGROUP_MASK_EXTEND_JOINED(mask);
+}
+
+/* Computes the set of BPF pseudo-controllers this unit needs, based on its cgroup context
+ * (and, for the firewall case, the contexts of its ancestor slices). */
+static CGroupMask unit_get_bpf_mask(Unit *u) {
+
+        /* Figure out which controllers we need, based on the cgroup context, possibly taking into account children
+         * too. */
+
+        return (unit_get_needs_bpf_firewall(u) ? CGROUP_MASK_BPF_FIREWALL : 0) |
+               (unit_get_needs_bpf_foreign_program(u) ? CGROUP_MASK_BPF_FOREIGN : 0) |
+               (unit_get_needs_socket_bind(u) ? CGROUP_MASK_BPF_SOCKET_BIND : 0) |
+               (unit_get_needs_restrict_network_interfaces(u) ? CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES : 0);
+}
+
+/* Returns the mask of controllers the unit needs for itself: regular controllers, BPF
+ * pseudo-controllers, and anything delegated. If the unit is not properly loaded or has no
+ * cgroup context, return an empty mask, as we shouldn't reflect it in the cgroup hierarchy then. */
+CGroupMask unit_get_own_mask(Unit *u) {
+
+        if (u->load_state != UNIT_LOADED)
+                return 0;
+
+        if (!unit_get_cgroup_context(u))
+                return 0;
+
+        return unit_get_cgroup_mask(u) | unit_get_bpf_mask(u) | unit_get_delegate_mask(u);
+}
+
+/* Returns the mask of controllers to hand over to the unit's own processes via delegation,
+ * or 0 if delegation is off or unsafe for this unit. */
+CGroupMask unit_get_delegate_mask(Unit *u) {
+ CGroupContext *c;
+
+ /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
+ * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
+ *
+ * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */
+
+ if (!unit_cgroup_delegate(u))
+ return 0;
+
+ /* On the legacy (v1) hierarchy, refuse delegation if the forked process drops privileges. */
+ if (cg_all_unified() <= 0) {
+ ExecContext *e;
+
+ e = unit_get_exec_context(u);
+ if (e && !exec_context_maintains_privileges(e))
+ return 0;
+ }
+
+ assert_se(c = unit_get_cgroup_context(u));
+ return CGROUP_MASK_EXTEND_JOINED(c->delegate_controllers);
+}
+
+/* Returns the controller mask for the whole subtree rooted at this unit:
+ * the unit's own requirements plus everything its children need. */
+static CGroupMask unit_get_subtree_mask(Unit *u) {
+        return unit_get_own_mask(u) | unit_get_members_mask(u);
+}
+
+/* Returns the merged controller mask required by all of the unit's children. The result is
+ * cached in u->cgroup_members_mask; unit_invalidate_cgroup_members_masks() flushes the cache. */
+CGroupMask unit_get_members_mask(Unit *u) {
+ assert(u);
+
+ /* Returns the mask of controllers all of the unit's children require, merged */
+
+ if (u->cgroup_members_mask_valid)
+ return u->cgroup_members_mask; /* Use cached value if possible */
+
+ u->cgroup_members_mask = 0;
+
+ /* Only slices have members; all other unit types report an empty mask. */
+ if (u->type == UNIT_SLICE) {
+ Unit *member;
+
+ UNIT_FOREACH_DEPENDENCY(member, u, UNIT_ATOM_SLICE_OF)
+ u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
+ }
+
+ u->cgroup_members_mask_valid = true;
+ return u->cgroup_members_mask;
+}
+
+/* Returns the mask of controllers all of the unit's siblings require, i.e. the members mask of
+ * the unit's parent slice if there is one; otherwise (top-level slice) the unit's own subtree mask. */
+CGroupMask unit_get_siblings_mask(Unit *u) {
+        assert(u);
+
+        Unit *slice = UNIT_GET_SLICE(u);
+
+        return slice ? unit_get_members_mask(slice)
+                     : unit_get_subtree_mask(u); /* we are the top-level slice */
+}
+
+/* Returns the controllers this unit explicitly disables (DisableControllers=),
+ * or 0 if it has no cgroup context. */
+static CGroupMask unit_get_disable_mask(Unit *u) {
+        CGroupContext *c = unit_get_cgroup_context(u);
+
+        return c ? c->disable_controllers : 0;
+}
+
+/* Returns the mask of controllers which are marked as forcibly disabled in the unit itself
+ * or in any of its ancestor slices. */
+CGroupMask unit_get_ancestor_disable_mask(Unit *u) {
+        CGroupMask mask = 0;
+
+        assert(u);
+
+        /* Walk from the unit up through its slice chain, accumulating each disable mask. */
+        for (Unit *p = u; p; p = UNIT_GET_SLICE(p))
+                mask |= unit_get_disable_mask(p);
+
+        return mask;
+}
+
+/* Returns the full controller mask to realize for this unit's cgroup, clamped to what the
+ * manager supports and minus anything forcibly disabled by an ancestor. */
+CGroupMask unit_get_target_mask(Unit *u) {
+ CGroupMask own_mask, mask;
+
+ /* This returns the cgroup mask of all controllers to enable for a specific cgroup, i.e. everything
+ * it needs itself, plus all that its children need, plus all that its siblings need. This is
+ * primarily useful on the legacy cgroup hierarchy, where we need to duplicate each cgroup in each
+ * hierarchy that shall be enabled for it. */
+
+ own_mask = unit_get_own_mask(u);
+
+ /* If the unit wants the BPF firewall but the system can't provide it, warn the user once. */
+ if (own_mask & CGROUP_MASK_BPF_FIREWALL & ~u->manager->cgroup_supported)
+ emit_bpf_firewall_warning(u);
+
+ mask = own_mask | unit_get_members_mask(u) | unit_get_siblings_mask(u);
+
+ mask &= u->manager->cgroup_supported;
+ mask &= ~unit_get_ancestor_disable_mask(u);
+
+ return mask;
+}
+
+/* Returns the cgroup mask of all controllers to enable for the children of a specific cgroup.
+ * This is primarily useful for the unified cgroup hierarchy, where each cgroup controls which
+ * controllers are enabled for its children (via cgroup.subtree_control). */
+CGroupMask unit_get_enable_mask(Unit *u) {
+        CGroupMask mask = unit_get_members_mask(u);
+
+        /* Clamp to supported controllers, then drop anything forcibly disabled by an ancestor. */
+        mask &= u->manager->cgroup_supported;
+
+        return mask & ~unit_get_ancestor_disable_mask(u);
+}
+
+/* Invalidates the cached members-mask of this unit and of every ancestor slice, so that the
+ * next unit_get_members_mask() call recomputes them. */
+void unit_invalidate_cgroup_members_masks(Unit *u) {
+        assert(u);
+
+        /* Walk all the way up the slice chain, flushing each cache entry. */
+        for (Unit *p = u; p; p = UNIT_GET_SLICE(p))
+                p->cgroup_members_mask_valid = false;
+}
+
+/* Returns the realized cgroup path of the specified unit where all specified controllers are
+ * available, walking up the slice chain if the unit itself doesn't qualify; NULL if none does. */
+const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask) {
+
+        for (Unit *p = u; p; p = UNIT_GET_SLICE(p))
+                if (p->cgroup_path &&
+                    p->cgroup_realized &&
+                    FLAGS_SET(p->cgroup_realized_mask, mask))
+                        return p->cgroup_path;
+
+        return NULL;
+}
+
+/* Callback for cg_migrate_v1_controllers(): userdata is the Unit* whose realized path we look up. */
+static const char *migrate_callback(CGroupMask mask, void *userdata) {
+ /* If not realized at all, migrate to root ("").
+ * It may happen if we're upgrading from older version that didn't clean up.
+ */
+ return strempty(unit_get_realized_cgroup_path(userdata, mask));
+}
+
+/* Computes the default cgroup path for a unit: the manager's cgroup root joined with the escaped
+ * slice path and the escaped unit name. Returns 0 on success and stores the path in *ret, or a
+ * negative errno (e.g. -ENOMEM) on failure. */
+int unit_default_cgroup_path(const Unit *u, char **ret) {
+ _cleanup_free_ char *p = NULL;
+ int r;
+
+ assert(u);
+ assert(ret);
+
+ /* The root slice lives directly at the cgroup root, without any extra component. */
+ if (unit_has_name(u, SPECIAL_ROOT_SLICE))
+ p = strdup(u->manager->cgroup_root);
+ else {
+ _cleanup_free_ char *escaped = NULL, *slice_path = NULL;
+ Unit *slice;
+
+ slice = UNIT_GET_SLICE(u);
+ if (slice && !unit_has_name(slice, SPECIAL_ROOT_SLICE)) {
+ r = cg_slice_to_path(slice->id, &slice_path);
+ if (r < 0)
+ return r;
+ }
+
+ /* Escape the unit name so it is safe as a cgroup directory name. */
+ r = cg_escape(u->id, &escaped);
+ if (r < 0)
+ return r;
+
+ p = path_join(empty_to_root(u->manager->cgroup_root), slice_path, escaped);
+ }
+ if (!p)
+ return -ENOMEM;
+
+ *ret = TAKE_PTR(p);
+ return 0;
+}
+
+/* Sets the unit's cgroup path and registers it in the manager's cgroup→unit hashmap.
+ * Returns 0 if the path was already set to the same value, 1 if it was changed, and a negative
+ * errno on failure (notably -EEXIST if another unit already claims this path). */
+int unit_set_cgroup_path(Unit *u, const char *path) {
+ _cleanup_free_ char *p = NULL;
+ int r;
+
+ assert(u);
+
+ if (streq_ptr(u->cgroup_path, path))
+ return 0;
+
+ if (path) {
+ p = strdup(path);
+ if (!p)
+ return -ENOMEM;
+ }
+
+ /* Register the new path first, so that on failure the old registration stays intact. */
+ if (p) {
+ r = hashmap_put(u->manager->cgroup_unit, p, u);
+ if (r < 0)
+ return r;
+ }
+
+ /* Drop the old path (and its hashmap entry) only after the new one is registered. */
+ unit_release_cgroup(u);
+ u->cgroup_path = TAKE_PTR(p);
+
+ return 1;
+}
+
+/* Installs an inotify watch on the unit cgroup's "cgroup.events" attribute so we learn when the
+ * cgroup runs empty. Only effective on the unified hierarchy; a no-op otherwise. Returns 0 on
+ * success or when no watch is needed, negative errno on failure. */
+int unit_watch_cgroup(Unit *u) {
+ _cleanup_free_ char *events = NULL;
+ int r;
+
+ assert(u);
+
+ /* Watches the "cgroups.events" attribute of this unit's cgroup for "empty" events, but only if
+ * cgroupv2 is available. */
+
+ if (!u->cgroup_path)
+ return 0;
+
+ /* Already watching? Nothing to do. */
+ if (u->cgroup_control_inotify_wd >= 0)
+ return 0;
+
+ /* Only applies to the unified hierarchy */
+ r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
+ if (r == 0)
+ return 0;
+
+ /* No point in watching the top-level slice, it's never going to run empty. */
+ if (unit_has_name(u, SPECIAL_ROOT_SLICE))
+ return 0;
+
+ r = hashmap_ensure_allocated(&u->manager->cgroup_control_inotify_wd_unit, &trivial_hash_ops);
+ if (r < 0)
+ return log_oom();
+
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
+ if (r < 0)
+ return log_oom();
+
+ u->cgroup_control_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
+ if (u->cgroup_control_inotify_wd < 0) {
+
+ if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this
+ * is not an error */
+ return 0;
+
+ return log_unit_error_errno(u, errno, "Failed to add control inotify watch descriptor for control group %s: %m", empty_to_root(u->cgroup_path));
+ }
+
+ /* Map the watch descriptor back to the unit so event dispatch can find us. */
+ r = hashmap_put(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd), u);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to add control inotify watch descriptor for control group %s to hash map: %m", empty_to_root(u->cgroup_path));
+
+ return 0;
+}
+
+/* Installs an inotify watch on the unit cgroup's "memory.events" attribute so we notice OOM-kill
+ * events. Only applies on the unified hierarchy, only when memory accounting is on, and never for
+ * slices. Returns 0 on success or when no watch is needed, negative errno on failure. */
+int unit_watch_cgroup_memory(Unit *u) {
+ _cleanup_free_ char *events = NULL;
+ CGroupContext *c;
+ int r;
+
+ assert(u);
+
+ /* Watches the "memory.events" attribute of this unit's cgroup for "oom_kill" events, but only if
+ * cgroupv2 is available. */
+
+ if (!u->cgroup_path)
+ return 0;
+
+ c = unit_get_cgroup_context(u);
+ if (!c)
+ return 0;
+
+ /* The "memory.events" attribute is only available if the memory controller is on. Let's hence tie
+ * this to memory accounting, in a way watching for OOM kills is a form of memory accounting after
+ * all. */
+ if (!c->memory_accounting)
+ return 0;
+
+ /* Don't watch inner nodes, as the kernel doesn't report oom_kill events recursively currently, and
+ * we also don't want to generate a log message for each parent cgroup of a process. */
+ if (u->type == UNIT_SLICE)
+ return 0;
+
+ /* Already watching? Nothing to do. */
+ if (u->cgroup_memory_inotify_wd >= 0)
+ return 0;
+
+ /* Only applies to the unified hierarchy */
+ r = cg_all_unified();
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine whether the memory controller is unified: %m");
+ if (r == 0)
+ return 0;
+
+ r = hashmap_ensure_allocated(&u->manager->cgroup_memory_inotify_wd_unit, &trivial_hash_ops);
+ if (r < 0)
+ return log_oom();
+
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "memory.events", &events);
+ if (r < 0)
+ return log_oom();
+
+ u->cgroup_memory_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
+ if (u->cgroup_memory_inotify_wd < 0) {
+
+ if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this
+ * is not an error */
+ return 0;
+
+ return log_unit_error_errno(u, errno, "Failed to add memory inotify watch descriptor for control group %s: %m", empty_to_root(u->cgroup_path));
+ }
+
+ /* Map the watch descriptor back to the unit so event dispatch can find us. */
+ r = hashmap_put(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd), u);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to add memory inotify watch descriptor for control group %s to hash map: %m", empty_to_root(u->cgroup_path));
+
+ return 0;
+}
+
+/* Ensures the unit has a cgroup path assigned, computing and registering the default one if
+ * necessary. Returns 0 on success (including when a path was already set), -EINVAL for unit types
+ * without a cgroup context, other negative errno on failure. */
+int unit_pick_cgroup_path(Unit *u) {
+ _cleanup_free_ char *path = NULL;
+ int r;
+
+ assert(u);
+
+ if (u->cgroup_path)
+ return 0;
+
+ if (!UNIT_HAS_CGROUP_CONTEXT(u))
+ return -EINVAL;
+
+ r = unit_default_cgroup_path(u, &path);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to generate default cgroup path: %m");
+
+ /* -EEXIST means another unit already owns this path; report that specially. */
+ r = unit_set_cgroup_path(u, path);
+ if (r == -EEXIST)
+ return log_unit_error_errno(u, r, "Control group %s exists already.", empty_to_root(path));
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", empty_to_root(path));
+
+ return 0;
+}
+
+/* Realizes the unit's cgroup: creates it in all needed hierarchies, enables controllers, migrates
+ * processes on v1 where controller membership changed, and applies the configured attributes.
+ * target_mask is the set of controllers to realize, enable_mask the set to enable for children.
+ * Returns 0 on success, negative errno on hard failure (most sub-failures are logged and ignored). */
+static int unit_update_cgroup(
+ Unit *u,
+ CGroupMask target_mask,
+ CGroupMask enable_mask,
+ ManagerState state) {
+
+ bool created, is_root_slice;
+ CGroupMask migrate_mask = 0;
+ _cleanup_free_ char *cgroup_full_path = NULL;
+ int r;
+
+ assert(u);
+
+ if (!UNIT_HAS_CGROUP_CONTEXT(u))
+ return 0;
+
+ /* Figure out our cgroup path */
+ r = unit_pick_cgroup_path(u);
+ if (r < 0)
+ return r;
+
+ /* First, create our own group */
+ r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", empty_to_root(u->cgroup_path));
+ created = r;
+
+ /* On the unified hierarchy also resolve and cache the numeric cgroup ID (best effort). */
+ if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
+ uint64_t cgroup_id = 0;
+
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &cgroup_full_path);
+ if (r == 0) {
+ r = cg_path_get_cgroupid(cgroup_full_path, &cgroup_id);
+ if (r < 0)
+ log_unit_full_errno(u, ERRNO_IS_NOT_SUPPORTED(r) ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to get cgroup ID of cgroup %s, ignoring: %m", cgroup_full_path);
+ } else
+ log_unit_warning_errno(u, r, "Failed to get full cgroup path on cgroup %s, ignoring: %m", empty_to_root(u->cgroup_path));
+
+ u->cgroup_id = cgroup_id;
+ }
+
+ /* Start watching it */
+ (void) unit_watch_cgroup(u);
+ (void) unit_watch_cgroup_memory(u);
+
+ /* For v2 we preserve enabled controllers in delegated units, adjust others,
+ * for v1 we figure out which controller hierarchies need migration. */
+ if (created || !u->cgroup_realized || !unit_cgroup_delegate(u)) {
+ CGroupMask result_mask = 0;
+
+ /* Enable all controllers we need */
+ r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path, &result_mask);
+ if (r < 0)
+ log_unit_warning_errno(u, r, "Failed to enable/disable controllers on cgroup %s, ignoring: %m", empty_to_root(u->cgroup_path));
+
+ /* Remember what's actually enabled now */
+ u->cgroup_enabled_mask = result_mask;
+
+ /* Controllers whose realization state changed (added or removed) need v1 migration. */
+ migrate_mask = u->cgroup_realized_mask ^ target_mask;
+ }
+
+ /* Keep track that this is now realized */
+ u->cgroup_realized = true;
+ u->cgroup_realized_mask = target_mask;
+
+ /* Migrate processes in controller hierarchies both downwards (enabling) and upwards (disabling).
+ *
+ * Unnecessary controller cgroups are trimmed (after emptied by upward migration).
+ * We perform migration also with whole slices for cases when users don't care about leave
+ * granularity. Since delegated_mask is subset of target mask, we won't trim slice subtree containing
+ * delegated units.
+ */
+ if (cg_all_unified() == 0) {
+ r = cg_migrate_v1_controllers(u->manager->cgroup_supported, migrate_mask, u->cgroup_path, migrate_callback, u);
+ if (r < 0)
+ log_unit_warning_errno(u, r, "Failed to migrate controller cgroups from %s, ignoring: %m", empty_to_root(u->cgroup_path));
+
+ is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
+ r = cg_trim_v1_controllers(u->manager->cgroup_supported, ~target_mask, u->cgroup_path, !is_root_slice);
+ if (r < 0)
+ log_unit_warning_errno(u, r, "Failed to delete controller cgroups %s, ignoring: %m", empty_to_root(u->cgroup_path));
+ }
+
+ /* Set attributes */
+ cgroup_context_apply(u, target_mask, state);
+ cgroup_xattr_apply(u);
+
+ /* For most units we expect that memory monitoring is set up before the unit is started and we won't
+ * touch it after. For PID 1 this is different though, because we couldn't possibly do that given
+ * that PID 1 runs before init.scope is even set up. Hence, whenever init.scope is realized, let's
+ * try to open the memory pressure interface anew. */
+ if (unit_has_name(u, SPECIAL_INIT_SCOPE))
+ (void) manager_setup_memory_pressure_event_source(u->manager);
+
+ return 0;
+}
+
+/* Asks the privileged system manager, via D-Bus, to move the given PID into this unit's cgroup
+ * (plus optional suffix path). Only valid for user-manager instances with a system bus connection.
+ * Returns 0 on success, negative errno on failure. */
+static int unit_attach_pid_to_cgroup_via_bus(Unit *u, pid_t pid, const char *suffix_path) {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ char *pp;
+ int r;
+
+ assert(u);
+
+ /* The system manager must not call itself; this path is for user managers only. */
+ if (MANAGER_IS_SYSTEM(u->manager))
+ return -EINVAL;
+
+ if (!u->manager->system_bus)
+ return -EIO;
+
+ if (!u->cgroup_path)
+ return -EINVAL;
+
+ /* Determine this unit's cgroup path relative to our cgroup root */
+ pp = path_startswith(u->cgroup_path, u->manager->cgroup_root);
+ if (!pp)
+ return -EINVAL;
+
+ pp = strjoina("/", pp, suffix_path);
+ path_simplify(pp);
+
+ r = bus_call_method(u->manager->system_bus,
+ bus_systemd_mgr,
+ "AttachProcessesToUnit",
+ &error, NULL,
+ "ssau",
+ NULL /* empty unit name means client's unit, i.e. us */, pp, 1, (uint32_t) pid);
+ if (r < 0)
+ return log_unit_debug_errno(u, r, "Failed to attach unit process " PID_FMT " via the bus: %s", pid, bus_error_message(&error, r));
+
+ return 0;
+}
+
+/* Moves a set of PIDs into this unit's cgroup (optionally a sub-path of it), realizing the cgroup
+ * first. On permission failure in a user manager, falls back to asking the system manager via the
+ * bus. On cgroup v1, additionally attaches each PID in every supported controller hierarchy.
+ * Returns the number of successfully attached PIDs, or the first error if none succeeded. */
+int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) {
+ _cleanup_free_ char *joined = NULL;
+ CGroupMask delegated_mask;
+ const char *p;
+ PidRef *pid;
+ int ret, r;
+
+ assert(u);
+
+ if (!UNIT_HAS_CGROUP_CONTEXT(u))
+ return -EINVAL;
+
+ if (set_isempty(pids))
+ return 0;
+
+ /* Load any custom firewall BPF programs here once to test if they are existing and actually loadable.
+ * Fail here early since later errors in the call chain unit_realize_cgroup to cgroup_context_apply are ignored. */
+ r = bpf_firewall_load_custom(u);
+ if (r < 0)
+ return r;
+
+ r = unit_realize_cgroup(u);
+ if (r < 0)
+ return r;
+
+ /* Compute the target path: the unit's cgroup, plus the caller's suffix if any. */
+ if (isempty(suffix_path))
+ p = u->cgroup_path;
+ else {
+ joined = path_join(u->cgroup_path, suffix_path);
+ if (!joined)
+ return -ENOMEM;
+
+ p = joined;
+ }
+
+ delegated_mask = unit_get_delegate_mask(u);
+
+ ret = 0;
+ SET_FOREACH(pid, pids) {
+
+ /* Unfortunately we cannot add pids by pidfd to a cgroup. Hence we have to use PIDs instead,
+ * which of course is racy. Let's shorten the race a bit though, and re-validate the PID
+ * before we use it */
+ r = pidref_verify(pid);
+ if (r < 0) {
+ log_unit_info_errno(u, r, "PID " PID_FMT " vanished before we could move it to target cgroup '%s', skipping: %m", pid->pid, empty_to_root(p));
+ continue;
+ }
+
+ /* First, attach the PID to the main cgroup hierarchy */
+ r = cg_attach(SYSTEMD_CGROUP_CONTROLLER, p, pid->pid);
+ if (r < 0) {
+ bool again = MANAGER_IS_USER(u->manager) && ERRNO_IS_PRIVILEGE(r);
+
+ log_unit_full_errno(u, again ? LOG_DEBUG : LOG_INFO, r,
+ "Couldn't move process "PID_FMT" to%s requested cgroup '%s': %m",
+ pid->pid, again ? " directly" : "", empty_to_root(p));
+
+ if (again) {
+ int z;
+
+ /* If we are in a user instance, and we can't move the process ourselves due
+ * to permission problems, let's ask the system instance about it instead.
+ * Since it's more privileged it might be able to move the process across the
+ * leaves of a subtree whose top node is not owned by us. */
+
+ z = unit_attach_pid_to_cgroup_via_bus(u, pid->pid, suffix_path);
+ if (z < 0)
+ log_unit_info_errno(u, z, "Couldn't move process "PID_FMT" to requested cgroup '%s' (directly or via the system bus): %m", pid->pid, empty_to_root(p));
+ else {
+ if (ret >= 0)
+ ret++; /* Count successful additions */
+ continue; /* When the bus thing worked via the bus we are fully done for this PID. */
+ }
+ }
+
+ if (ret >= 0)
+ ret = r; /* Remember first error */
+
+ continue;
+ } else if (ret >= 0)
+ ret++; /* Count successful additions */
+
+ /* On a fully unified setup there are no per-controller v1 hierarchies to handle. */
+ r = cg_all_unified();
+ if (r < 0)
+ return r;
+ if (r > 0)
+ continue;
+
+ /* In the legacy hierarchy, attach the process to the requested cgroup if possible, and if not to the
+ * innermost realized one */
+
+ for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
+ CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
+ const char *realized;
+
+ if (!(u->manager->cgroup_supported & bit))
+ continue;
+
+ /* If this controller is delegated and realized, honour the caller's request for the cgroup suffix. */
+ if (delegated_mask & u->cgroup_realized_mask & bit) {
+ r = cg_attach(cgroup_controller_to_string(c), p, pid->pid);
+ if (r >= 0)
+ continue; /* Success! */
+
+ log_unit_debug_errno(u, r, "Failed to attach PID " PID_FMT " to requested cgroup %s in controller %s, falling back to unit's cgroup: %m",
+ pid->pid, empty_to_root(p), cgroup_controller_to_string(c));
+ }
+
+ /* So this controller is either not delegated or not realized, or something else weird happened. In
+ * that case let's attach the PID at least to the closest cgroup up the tree that is
+ * realized. */
+ realized = unit_get_realized_cgroup_path(u, bit);
+ if (!realized)
+ continue; /* Not even realized in the root slice? Then let's not bother */
+
+ r = cg_attach(cgroup_controller_to_string(c), realized, pid->pid);
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to attach PID " PID_FMT " to realized cgroup %s in controller %s, ignoring: %m",
+ pid->pid, realized, cgroup_controller_to_string(c));
+ }
+ }
+
+ return ret;
+}
+
+static bool unit_has_mask_realized(
+ Unit *u,
+ CGroupMask target_mask,
+ CGroupMask enable_mask) {
+
+ assert(u);
+
+ /* Returns true if this unit is fully realized. We check four things:
+ *
+ * 1. Whether the cgroup was created at all
+ * 2. Whether the cgroup was created in all the hierarchies we need it to be created in (in case of cgroup v1)
+ * 3. Whether the cgroup has all the right controllers enabled (in case of cgroup v2)
+ * 4. Whether the invalidation mask is currently zero
+ *
+ * If you wonder why we mask the target realization and enable mask with CGROUP_MASK_V1/CGROUP_MASK_V2: note
+ * that there are three sets of bitmasks: CGROUP_MASK_V1 (for real cgroup v1 controllers), CGROUP_MASK_V2 (for
+ * real cgroup v2 controllers) and CGROUP_MASK_BPF (for BPF-based pseudo-controllers). Now, cgroup_realized_mask
+ * only matters for cgroup v1 controllers, and cgroup_enabled_mask is only used for cgroup v2, and if they
+ * differ in the others, we don't really care. (After all, the cgroup_enabled_mask tracks which controllers are
+ * enabled through cgroup.subtree_control, and since the BPF pseudo-controllers don't show up there, they
+ * simply don't matter.) */
+
+ return u->cgroup_realized &&
+ ((u->cgroup_realized_mask ^ target_mask) & CGROUP_MASK_V1) == 0 &&
+ ((u->cgroup_enabled_mask ^ enable_mask) & CGROUP_MASK_V2) == 0 &&
+ u->cgroup_invalidated_mask == 0;
+}
+
+static bool unit_has_mask_disables_realized(
+                Unit *u,
+                CGroupMask target_mask,
+                CGroupMask enable_mask) {
+
+        assert(u);
+
+        /* Returns true if every controller that shall be disabled is in fact already disabled.
+         *
+         * Unlike unit_has_mask_realized(), we don't care what else is enabled — only that nothing we want
+         * to remove is still around. */
+
+        /* A cgroup that was never realized holds no controllers at all. */
+        if (!u->cgroup_realized)
+                return true;
+
+        /* Check that the wanted v1 bits resp. v2 bits are all present in the current state. */
+        return (u->cgroup_realized_mask & target_mask & CGROUP_MASK_V1) == (target_mask & CGROUP_MASK_V1) &&
+               (u->cgroup_enabled_mask & enable_mask & CGROUP_MASK_V2) == (enable_mask & CGROUP_MASK_V2);
+}
+
+static bool unit_has_mask_enables_realized(
+                Unit *u,
+                CGroupMask target_mask,
+                CGroupMask enable_mask) {
+
+        assert(u);
+
+        /* Returns true if every controller that shall be enabled is in fact already enabled.
+         *
+         * Unlike unit_has_mask_realized(), we don't care about controllers that are absent — only that
+         * everything we want to add is already there. In other words, the wanted v1 bits must be a subset
+         * of the realized mask, and the wanted v2 bits a subset of the enabled mask. */
+
+        return u->cgroup_realized &&
+               FLAGS_SET(u->cgroup_realized_mask, target_mask & CGROUP_MASK_V1) &&
+               FLAGS_SET(u->cgroup_enabled_mask, enable_mask & CGROUP_MASK_V2);
+}
+
+void unit_add_to_cgroup_realize_queue(Unit *u) {
+        assert(u);
+
+        /* Enqueues the unit for cgroup realization; a no-op if it is queued already. */
+
+        if (u->in_cgroup_realize_queue)
+                return;
+
+        u->in_cgroup_realize_queue = true;
+        LIST_APPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
+}
+
+static void unit_remove_from_cgroup_realize_queue(Unit *u) {
+        assert(u);
+
+        /* Reverse of unit_add_to_cgroup_realize_queue(); a no-op if the unit isn't queued. */
+
+        if (!u->in_cgroup_realize_queue)
+                return;
+
+        u->in_cgroup_realize_queue = false;
+        LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
+}
+
+/* Controllers can only be enabled breadth-first, from the root of the
+ * hierarchy downwards to the unit in question. */
+static int unit_realize_cgroup_now_enable(Unit *u, ManagerState state) {
+        CGroupMask target_mask, enable_mask;
+        Unit *slice;
+        int r;
+
+        assert(u);
+
+        /* Recurse into the parent slice first: we cannot enable any new controllers at this layer before
+         * every ancestor has them enabled. */
+        slice = UNIT_GET_SLICE(u);
+        if (slice) {
+                r = unit_realize_cgroup_now_enable(slice, state);
+                if (r < 0)
+                        return r;
+        }
+
+        target_mask = unit_get_target_mask(u);
+        enable_mask = unit_get_enable_mask(u);
+
+        /* This path only ever turns controllers on, never off. Nothing to do if everything we want is
+         * already enabled. */
+        if (unit_has_mask_enables_realized(u, target_mask, enable_mask))
+                return 0;
+
+        /* OR-ing the wanted bits onto the current masks ensures bits are only ever added here. */
+        return unit_update_cgroup(u,
+                                  u->cgroup_realized_mask | target_mask,
+                                  u->cgroup_enabled_mask | enable_mask,
+                                  state);
+}
+
+/* Controllers can only be disabled depth-first, from the leaves of the
+ * hierarchy upwards to the unit in question. */
+static int unit_realize_cgroup_now_disable(Unit *u, ManagerState state) {
+        Unit *m;
+
+        assert(u);
+
+        /* Only slices have child units below them, hence only they can hold controllers open downwards. */
+        if (u->type != UNIT_SLICE)
+                return 0;
+
+        UNIT_FOREACH_DEPENDENCY(m, u, UNIT_ATOM_SLICE_OF) {
+                CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask;
+                int r;
+
+                /* The cgroup for this unit might not actually be fully realised yet, in which case it isn't
+                 * holding any controllers open anyway. */
+                if (!m->cgroup_realized)
+                        continue;
+
+                /* We must disable those below us first in order to release the controller. */
+                if (m->type == UNIT_SLICE)
+                        (void) unit_realize_cgroup_now_disable(m, state);
+
+                target_mask = unit_get_target_mask(m);
+                enable_mask = unit_get_enable_mask(m);
+
+                /* We can only disable in this direction, don't try to enable anything. */
+                if (unit_has_mask_disables_realized(m, target_mask, enable_mask))
+                        continue;
+
+                /* AND-ing with the current masks ensures bits are only ever cleared here, never set. */
+                new_target_mask = m->cgroup_realized_mask & target_mask;
+                new_enable_mask = m->cgroup_enabled_mask & enable_mask;
+
+                r = unit_update_cgroup(m, new_target_mask, new_enable_mask, state);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+/* Check if necessary controllers and attributes for a unit are in place.
+ *
+ * - If so, do nothing.
+ * - If not, create paths, move processes over, and set attributes.
+ *
+ * Controllers can only be *enabled* in a breadth-first way, and *disabled* in
+ * a depth-first way. As such the process looks like this:
+ *
+ * Suppose we have a cgroup hierarchy which looks like this:
+ *
+ *             root
+ *            /    \
+ *           /      \
+ *          /        \
+ *         a          b
+ *        / \        / \
+ *       /   \      /   \
+ *      c     d    e     f
+ *     / \   / \  / \   / \
+ *    h   i j   k l  m n   o
+ *
+ * 1. We want to realise cgroup "d" now.
+ * 2. cgroup "a" has DisableControllers=cpu in the associated unit.
+ * 3. cgroup "k" just started requesting the memory controller.
+ *
+ * To make this work we must do the following in order:
+ *
+ * 1. Disable CPU controller in k, j
+ * 2. Disable CPU controller in d
+ * 3. Enable memory controller in root
+ * 4. Enable memory controller in a
+ * 5. Enable memory controller in d
+ * 6. Enable memory controller in k
+ *
+ * Notice that we need to touch j in one direction, but not the other. We also
+ * don't go beyond d when disabling -- it's up to "a" to get realized if it
+ * wants to disable further. The basic rules are therefore:
+ *
+ * - If you're disabling something, you need to realise all of the cgroups from
+ *   your recursive descendants to the root. This starts from the leaves.
+ * - If you're enabling something, you need to realise from the root cgroup
+ *   downwards, but you don't need to iterate your recursive descendants.
+ *
+ * Returns 0 on success and < 0 on failure. */
+static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
+        CGroupMask target_mask, enable_mask;
+        Unit *slice;
+        int r;
+
+        assert(u);
+
+        /* We are realizing the unit synchronously now, so there's no need to keep it queued anymore. */
+        unit_remove_from_cgroup_realize_queue(u);
+
+        target_mask = unit_get_target_mask(u);
+        enable_mask = unit_get_enable_mask(u);
+
+        /* Fast path: everything is already in place. */
+        if (unit_has_mask_realized(u, target_mask, enable_mask))
+                return 0;
+
+        /* Disable controllers below us, if there are any */
+        r = unit_realize_cgroup_now_disable(u, state);
+        if (r < 0)
+                return r;
+
+        /* Enable controllers above us, if there are any */
+        slice = UNIT_GET_SLICE(u);
+        if (slice) {
+                r = unit_realize_cgroup_now_enable(slice, state);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Now actually deal with the cgroup we were trying to realise and set attributes */
+        r = unit_update_cgroup(u, target_mask, enable_mask, state);
+        if (r < 0)
+                return r;
+
+        /* Now, reset the invalidation mask */
+        u->cgroup_invalidated_mask = 0;
+        return 0;
+}
+
+unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
+        unsigned n = 0;
+        ManagerState state;
+        Unit *u;
+
+        assert(m);
+
+        /* Drains the cgroup realization queue, realizing each queued unit in turn. Returns the number of
+         * units that were actually dispatched. */
+
+        state = manager_state(m);
+
+        while ((u = m->cgroup_realize_queue)) {
+                int r;
+
+                assert(u->in_cgroup_realize_queue);
+
+                /* The unit might have been deactivated in the meantime, in which case there's nothing left
+                 * to realize; just drop it from the queue without counting it. */
+                if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u))) {
+                        unit_remove_from_cgroup_realize_queue(u);
+                        continue;
+                }
+
+                /* This also removes the unit from the queue. */
+                r = unit_realize_cgroup_now(u, state);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", u->id);
+
+                n++;
+        }
+
+        return n;
+}
+
+void unit_add_family_to_cgroup_realize_queue(Unit *u) {
+        assert(u);
+        assert(u->type == UNIT_SLICE);
+
+        /* The family of a unit is defined as the (immediate) children of the unit and the immediate
+         * children of all its ancestors.
+         *
+         * Ideally we would enqueue the ancestor path only (bottom up). However, on cgroup-v1 scheduling
+         * becomes very weird if two units that own processes reside in the same slice, but one is realized
+         * in the "cpu" hierarchy and one is not (for example because one has CPUWeight= set and the other
+         * does not), because that means individual processes need to be scheduled against whole cgroups.
+         * Let's avoid this asymmetry by always ensuring that siblings of a unit are always realized in
+         * their v1 controller hierarchies too (if the unit requires the controller to be realized).
+         *
+         * The function must invalidate cgroup_members_mask of all ancestors in order to calculate up to
+         * date masks. */
+
+        /* Walk upwards from u to the root slice, enqueueing eligible children at each level. */
+        do {
+                Unit *m;
+
+                /* Children of u likely changed when we're called */
+                u->cgroup_members_mask_valid = false;
+
+                UNIT_FOREACH_DEPENDENCY(m, u, UNIT_ATOM_SLICE_OF) {
+
+                        /* No point in doing cgroup application for units without active processes. */
+                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
+                                continue;
+
+                        /* We only enqueue siblings if they were realized once at least, in the main
+                         * hierarchy. */
+                        if (!m->cgroup_realized)
+                                continue;
+
+                        /* If the unit doesn't need any new controllers and has current ones
+                         * realized, it doesn't need any changes. */
+                        if (unit_has_mask_realized(m,
+                                                   unit_get_target_mask(m),
+                                                   unit_get_enable_mask(m)))
+                                continue;
+
+                        unit_add_to_cgroup_realize_queue(m);
+                }
+
+                /* Parent comes after children */
+                unit_add_to_cgroup_realize_queue(u);
+
+                u = UNIT_GET_SLICE(u);
+        } while (u);
+}
+
+int unit_realize_cgroup(Unit *u) {
+        Unit *slice;
+
+        assert(u);
+
+        /* Units without a cgroup context have no cgroup to realize. */
+        if (!UNIT_HAS_CGROUP_CONTEXT(u))
+                return 0;
+
+        /* Realizing the cgroup for this unit requires all parent cgroups to exist first. Beyond that, for
+         * the weight-based controllers all siblings (i.e. units sharing our slice) need cgroups too, and
+         * conversely, when a controller drops out of the realized set, siblings and ancestors may have to
+         * be (de)realized as well.
+         *
+         * Hence: the sibling/ancestor work is deferred to the next event loop iteration via the realize
+         * queue, while the parents and this unit itself are created synchronously below via
+         * unit_realize_cgroup_now(). */
+
+        slice = UNIT_GET_SLICE(u);
+        if (slice)
+                unit_add_family_to_cgroup_realize_queue(slice);
+
+        /* And realize this one now (and apply the values) */
+        return unit_realize_cgroup_now(u, manager_state(u->manager));
+}
+
+void unit_release_cgroup(Unit *u) {
+        assert(u);
+
+        /* Forgets all cgroup details for this cgroup — but does *not* destroy the cgroup. This is hence OK to call
+         * when we close down everything for reexecution, where we really want to leave the cgroup in place. */
+
+        if (u->cgroup_path) {
+                /* Drop the path → unit mapping before freeing the path it is keyed by. */
+                (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
+                u->cgroup_path = mfree(u->cgroup_path);
+        }
+
+        /* Remove the inotify watch on the cgroup's control file, and its wd → unit mapping. Removal of the
+         * watch may fail (e.g. if the cgroup is already gone), which is fine — hence debug level only. */
+        if (u->cgroup_control_inotify_wd >= 0) {
+                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_control_inotify_wd) < 0)
+                        log_unit_debug_errno(u, errno, "Failed to remove cgroup control inotify watch %i for %s, ignoring: %m", u->cgroup_control_inotify_wd, u->id);
+
+                (void) hashmap_remove(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd));
+                u->cgroup_control_inotify_wd = -1;
+        }
+
+        /* Same for the memory events watch (its wd map feeds the OOM queue, see on_cgroup_inotify_event()). */
+        if (u->cgroup_memory_inotify_wd >= 0) {
+                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_memory_inotify_wd) < 0)
+                        log_unit_debug_errno(u, errno, "Failed to remove cgroup memory inotify watch %i for %s, ignoring: %m", u->cgroup_memory_inotify_wd, u->id);
+
+                (void) hashmap_remove(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd));
+                u->cgroup_memory_inotify_wd = -1;
+        }
+}
+
+bool unit_maybe_release_cgroup(Unit *u) {
+        int r;
+
+        assert(u);
+
+        /* Releases the unit's cgroup bookkeeping if (and only if) the cgroup is empty. Returns true when
+         * nothing is tracked anymore, false when the cgroup is kept because it still has processes. */
+
+        if (!u->cgroup_path)
+                return true;
+
+        /* Don't release the cgroup if there are still processes under it. If we get notified later when all the
+         * processes exit (e.g. the processes were in D-state and exited after the unit was marked as failed)
+         * we need the cgroup paths to continue to be tracked by the manager so they can be looked up and cleaned
+         * up later. */
+        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
+        if (r < 0) {
+                log_unit_debug_errno(u, r, "Error checking if the cgroup is recursively empty, ignoring: %m");
+                return false;
+        }
+        if (r == 1) {
+                unit_release_cgroup(u);
+                return true;
+        }
+
+        return false;
+}
+
+void unit_prune_cgroup(Unit *u) {
+        int r;
+        bool is_root_slice;
+
+        assert(u);
+
+        /* Removes the cgroup, if empty and possible, and stops watching it. */
+
+        if (!u->cgroup_path)
+                return;
+
+        (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */
+
+#if BPF_FRAMEWORK
+        (void) lsm_bpf_cleanup(u); /* Remove cgroup from the global LSM BPF map */
+#endif
+
+        /* Remove this unit's entries from any configured NFT sets. */
+        unit_modify_nft_set(u, /* add = */ false);
+
+        /* The root slice's cgroup is the root of the whole tree — trim below it, but never remove it. */
+        is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
+
+        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
+        if (r < 0)
+                /* One reason we could have failed here is, that the cgroup still contains a process.
+                 * However, if the cgroup becomes removable at a later time, it might be removed when
+                 * the containing slice is stopped. So even if we failed now, this unit shouldn't assume
+                 * that the cgroup is still realized the next time it is started. Do not return early
+                 * on error, continue cleanup. */
+                log_unit_full_errno(u, r == -EBUSY ? LOG_DEBUG : LOG_WARNING, r, "Failed to destroy cgroup %s, ignoring: %m", empty_to_root(u->cgroup_path));
+
+        if (is_root_slice)
+                return;
+
+        if (!unit_maybe_release_cgroup(u)) /* Returns true if the cgroup was released */
+                return;
+
+        /* The cgroup is no longer tracked — reset all realization state, so the next start recreates it. */
+        u->cgroup_realized = false;
+        u->cgroup_realized_mask = 0;
+        u->cgroup_enabled_mask = 0;
+
+        u->bpf_device_control_installed = bpf_program_free(u->bpf_device_control_installed);
+}
+
+int unit_search_main_pid(Unit *u, PidRef *ret) {
+        _cleanup_free_ char *pidref = PIDREF_NULL — note: cleanup handlers release on every exit path */
+        _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        int r;
+
+        assert(u);
+        assert(ret);
+
+        /* Tries to deduce the unit's main PID by scanning its cgroup: succeeds only if exactly one process
+         * in there is our immediate child. Returns -ENXIO if the unit has no cgroup, -ENODATA if there is
+         * no such process or more than one candidate. */
+
+        if (!u->cgroup_path)
+                return -ENXIO;
+
+        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
+        if (r < 0)
+                return r;
+
+        for (;;) {
+                _cleanup_(pidref_done) PidRef npidref = PIDREF_NULL;
+
+                r = cg_read_pidref(f, &npidref);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        break;
+
+                if (pidref_equal(&pidref, &npidref)) /* seen already, cgroupfs reports duplicates! */
+                        continue;
+
+                if (pidref_is_my_child(&npidref) <= 0) /* ignore processes further down the tree */
+                        continue;
+
+                if (pidref_is_set(&pidref) != 0)
+                        /* Dang, there's more than one daemonized PID in this group, so we don't know what
+                         * process is the main process. */
+                        return -ENODATA;
+
+                pidref = TAKE_PIDREF(npidref);
+        }
+
+        if (!pidref_is_set(&pidref))
+                return -ENODATA;
+
+        *ret = TAKE_PIDREF(pidref);
+        return 0;
+}
+
+static int unit_watch_pids_in_path(Unit *u, const char *path) {
+        _cleanup_closedir_ DIR *d = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        int ret = 0, r;
+
+        assert(u);
+        assert(path);
+
+        /* Recursively adds every process found in the cgroup 'path' and its subgroups to the set of PIDs
+         * this unit watches. Errors are collected with RET_GATHER so that one failing subtree doesn't stop
+         * enumeration of the others; the first error encountered is what gets returned in the end. */
+
+        /* First, the processes located directly in this cgroup... */
+        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
+        if (r < 0)
+                RET_GATHER(ret, r);
+        else {
+                for (;;) {
+                        _cleanup_(pidref_done) PidRef pid = PIDREF_NULL;
+
+                        r = cg_read_pidref(f, &pid);
+                        if (r == 0)
+                                break;
+                        if (r < 0) {
+                                RET_GATHER(ret, r);
+                                break;
+                        }
+
+                        RET_GATHER(ret, unit_watch_pidref(u, &pid, /* exclusive= */ false));
+                }
+        }
+
+        /* ...then recurse into each child cgroup. */
+        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
+        if (r < 0)
+                RET_GATHER(ret, r);
+        else {
+                for (;;) {
+                        _cleanup_free_ char *fn = NULL, *p = NULL;
+
+                        r = cg_read_subgroup(d, &fn);
+                        if (r == 0)
+                                break;
+                        if (r < 0) {
+                                RET_GATHER(ret, r);
+                                break;
+                        }
+
+                        p = path_join(empty_to_root(path), fn);
+                        if (!p)
+                                return -ENOMEM;
+
+                        RET_GATHER(ret, unit_watch_pids_in_path(u, p));
+                }
+        }
+
+        return ret;
+}
+
+int unit_synthesize_cgroup_empty_event(Unit *u) {
+        int r;
+
+        assert(u);
+
+        /* Enqueues a synthetic cgroup empty event if this unit no longer watches any PIDs. This is
+         * compatibility support for non-unified systems where notifications aren't reliable, and hence we
+         * need to take whatever we can get as notification source as soon as we have no useful PIDs left
+         * to watch for. */
+
+        if (!u->cgroup_path)
+                return -ENOENT;
+
+        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
+        if (r < 0)
+                return r;
+        if (r > 0) /* On unified we have reliable notifications, and don't need this */
+                return 0;
+
+        /* Only synthesize the event once the watched PID set has run empty. */
+        if (set_isempty(u->pids))
+                unit_add_to_cgroup_empty_queue(u);
+
+        return 0;
+}
+
+int unit_watch_all_pids(Unit *u) {
+        int r;
+
+        assert(u);
+
+        /* Fallback for setups without reliable cgroup empty notifications: add every PID from our cgroup
+         * to the watched set, so that SIGCHLD can serve as a replacement notification source. */
+
+        if (!u->cgroup_path)
+                return -ENOENT;
+
+        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
+        if (r < 0)
+                return r;
+        if (r > 0)
+                return 0; /* On unified we can use proper notifications */
+
+        return unit_watch_pids_in_path(u, u->cgroup_path);
+}
+
+static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
+        Manager *m = ASSERT_PTR(userdata);
+        Unit *u;
+        int r;
+
+        assert(s);
+
+        /* Deferred-event callback: processes exactly one unit from the cgroup empty queue per dispatch,
+         * re-arming itself (oneshot) as long as more units are queued. */
+
+        u = m->cgroup_empty_queue;
+        if (!u)
+                return 0;
+
+        /* Pop the head of the queue. */
+        assert(u->in_cgroup_empty_queue);
+        u->in_cgroup_empty_queue = false;
+        LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
+
+        if (m->cgroup_empty_queue) {
+                /* More stuff queued, let's make sure we remain enabled */
+                r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to reenable cgroup empty event source, ignoring: %m");
+        }
+
+        /* Update state based on OOM kills before we notify about cgroup empty event */
+        (void) unit_check_oom(u);
+        (void) unit_check_oomd_kill(u);
+
+        unit_add_to_gc_queue(u);
+
+        /* If the unit is already inactive or failed we can prune the cgroup right away; otherwise the unit
+         * type gets a chance to react to the empty notification. */
+        if (IN_SET(unit_active_state(u), UNIT_INACTIVE, UNIT_FAILED))
+                unit_prune_cgroup(u);
+        else if (UNIT_VTABLE(u)->notify_cgroup_empty)
+                UNIT_VTABLE(u)->notify_cgroup_empty(u);
+
+        return 0;
+}
+
+void unit_add_to_cgroup_empty_queue(Unit *u) {
+        int r;
+
+        assert(u);
+
+        /* Note that there are four different ways how cgroup empty events reach us:
+         *
+         * 1. On the unified hierarchy we get an inotify event on the cgroup
+         *
+         * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
+         *
+         * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
+         *
+         * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
+         *    soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
+         *
+         * Regardless which way we got the notification, we'll verify it here, and then add it to a separate
+         * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
+         * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
+         * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
+         * case for scope units). */
+
+        if (u->in_cgroup_empty_queue)
+                return;
+
+        /* Let's verify that the cgroup is really empty */
+        if (!u->cgroup_path)
+                return;
+
+        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
+        if (r < 0) {
+                log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", empty_to_root(u->cgroup_path));
+                return;
+        }
+        /* r == 0: the cgroup still contains processes, hence the notification is stale — ignore it. */
+        if (r == 0)
+                return;
+
+        LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
+        u->in_cgroup_empty_queue = true;
+
+        /* Trigger the defer event */
+        r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
+        if (r < 0)
+                log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
+}
+
+static void unit_remove_from_cgroup_empty_queue(Unit *u) {
+        assert(u);
+
+        /* Drops the unit from the cgroup empty queue again; a no-op if it isn't queued. */
+
+        if (!u->in_cgroup_empty_queue)
+                return;
+
+        u->in_cgroup_empty_queue = false;
+        LIST_REMOVE(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
+}
+
+int unit_check_oomd_kill(Unit *u) {
+        _cleanup_free_ char *value = NULL;
+        bool increased;
+        uint64_t n = 0;
+        int r;
+
+        assert(u);
+
+        /* Checks whether systemd-oomd has killed processes in this unit's cgroup since the last check, by
+         * reading the "user.oomd_ooms" xattr counter maintained on the cgroup. If so, logs the event and
+         * notifies the unit. Returns > 0 if a new kill was detected, 0 if not (or if not applicable),
+         * negative on error. */
+
+        if (!u->cgroup_path)
+                return 0;
+
+        /* The xattrs only exist on the unified hierarchy. */
+        r = cg_all_unified();
+        if (r < 0)
+                return log_unit_debug_errno(u, r, "Couldn't determine whether we are in all unified mode: %m");
+        else if (r == 0)
+                return 0;
+
+        /* The xattr might legitimately be absent (e.g. systemd-oomd never touched this cgroup). */
+        r = cg_get_xattr_malloc(u->cgroup_path, "user.oomd_ooms", &value);
+        if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
+                return r;
+
+        if (!isempty(value)) {
+                r = safe_atou64(value, &n);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Only react if the counter actually went up since we last looked. */
+        increased = n > u->managed_oom_kill_last;
+        u->managed_oom_kill_last = n;
+
+        if (!increased)
+                return 0;
+
+        /* Try to also fetch the number of killed processes, for a more informative log message. */
+        n = 0;
+        value = mfree(value);
+        r = cg_get_xattr_malloc(u->cgroup_path, "user.oomd_kill", &value);
+        if (r >= 0 && !isempty(value))
+                (void) safe_atou64(value, &n);
+
+        if (n > 0)
+                log_unit_struct(u, LOG_NOTICE,
+                                "MESSAGE_ID=" SD_MESSAGE_UNIT_OOMD_KILL_STR,
+                                LOG_UNIT_INVOCATION_ID(u),
+                                LOG_UNIT_MESSAGE(u, "systemd-oomd killed %"PRIu64" process(es) in this unit.", n),
+                                "N_PROCESSES=%" PRIu64, n);
+        else
+                log_unit_struct(u, LOG_NOTICE,
+                                "MESSAGE_ID=" SD_MESSAGE_UNIT_OOMD_KILL_STR,
+                                LOG_UNIT_INVOCATION_ID(u),
+                                LOG_UNIT_MESSAGE(u, "systemd-oomd killed some process(es) in this unit."));
+
+        unit_notify_cgroup_oom(u, /* ManagedOOM= */ true);
+
+        return 1;
+}
+
+int unit_check_oom(Unit *u) {
+        _cleanup_free_ char *oom_kill = NULL;
+        bool increased;
+        uint64_t c;
+        int r;
+
+        assert(u);
+
+        /* Checks whether the kernel OOM killer struck in this unit's cgroup since the last check, by
+         * comparing the "oom_kill" counter from memory.events against the cached value. Returns > 0 if a
+         * new kill was detected (after logging and notifying), 0 if not, negative on error. */
+
+        if (!u->cgroup_path)
+                return 0;
+
+        r = cg_get_keyed_attribute("memory", u->cgroup_path, "memory.events", STRV_MAKE("oom_kill"), &oom_kill);
+        if (IN_SET(r, -ENOENT, -ENXIO)) /* Handle gracefully if cgroup or oom_kill attribute don't exist */
+                c = 0;
+        else if (r < 0)
+                return log_unit_debug_errno(u, r, "Failed to read oom_kill field of memory.events cgroup attribute: %m");
+        else {
+                r = safe_atou64(oom_kill, &c);
+                if (r < 0)
+                        return log_unit_debug_errno(u, r, "Failed to parse oom_kill field: %m");
+        }
+
+        /* Only react if the counter actually went up since we last looked. */
+        increased = c > u->oom_kill_last;
+        u->oom_kill_last = c;
+
+        if (!increased)
+                return 0;
+
+        log_unit_struct(u, LOG_NOTICE,
+                        "MESSAGE_ID=" SD_MESSAGE_UNIT_OUT_OF_MEMORY_STR,
+                        LOG_UNIT_INVOCATION_ID(u),
+                        LOG_UNIT_MESSAGE(u, "A process of this unit has been killed by the OOM killer."));
+
+        unit_notify_cgroup_oom(u, /* ManagedOOM= */ false);
+
+        return 1;
+}
+
+static int on_cgroup_oom_event(sd_event_source *s, void *userdata) {
+        Manager *m = ASSERT_PTR(userdata);
+        Unit *u;
+        int r;
+
+        assert(s);
+
+        /* Deferred-event callback: handles one unit from the cgroup OOM queue per dispatch, re-arming
+         * itself (oneshot) as long as more units are queued. */
+
+        u = m->cgroup_oom_queue;
+        if (!u)
+                return 0;
+
+        /* Pop the head of the queue. */
+        assert(u->in_cgroup_oom_queue);
+        u->in_cgroup_oom_queue = false;
+        LIST_REMOVE(cgroup_oom_queue, m->cgroup_oom_queue, u);
+
+        if (m->cgroup_oom_queue) {
+                /* More stuff queued, let's make sure we remain enabled */
+                r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to reenable cgroup oom event source, ignoring: %m");
+        }
+
+        (void) unit_check_oom(u);
+        unit_add_to_gc_queue(u);
+
+        return 0;
+}
+
+static void unit_add_to_cgroup_oom_queue(Unit *u) {
+        int r;
+
+        assert(u);
+
+        /* Queues the unit for OOM processing (see on_cgroup_oom_event()). A no-op if it is queued already
+         * or has no cgroup. */
+
+        if (u->in_cgroup_oom_queue)
+                return;
+        if (!u->cgroup_path)
+                return;
+
+        LIST_PREPEND(cgroup_oom_queue, u->manager->cgroup_oom_queue, u);
+        u->in_cgroup_oom_queue = true;
+
+        /* Trigger the defer event */
+        if (!u->manager->cgroup_oom_event_source) {
+                _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
+
+                /* The event source is allocated lazily here, on first use. */
+                r = sd_event_add_defer(u->manager->event, &s, on_cgroup_oom_event, u->manager);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to create cgroup oom event source: %m");
+                        return;
+                }
+
+                r = sd_event_source_set_priority(s, SD_EVENT_PRIORITY_NORMAL-8);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to set priority of cgroup oom event source: %m");
+                        return;
+                }
+
+                (void) sd_event_source_set_description(s, "cgroup-oom");
+                u->manager->cgroup_oom_event_source = TAKE_PTR(s);
+        }
+
+        r = sd_event_source_set_enabled(u->manager->cgroup_oom_event_source, SD_EVENT_ONESHOT);
+        if (r < 0)
+                log_error_errno(r, "Failed to enable cgroup oom event source: %m");
+}
+
+static int unit_check_cgroup_events(Unit *u) {
+        char *values[2] = {};
+        int r;
+
+        assert(u);
+
+        /* Reads the "populated" and "frozen" keys from the unit's cgroup.events file and translates them
+         * into empty-queue membership and freezer-state updates. */
+
+        if (!u->cgroup_path)
+                return 0;
+
+        r = cg_get_keyed_attribute_graceful(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events",
+                                            STRV_MAKE("populated", "frozen"), values);
+        if (r < 0)
+                return r;
+
+        /* The cgroup.events notifications can be merged together so act as we saw the given state for the
+         * first time. The functions we call to handle given state are idempotent, which makes them
+         * effectively remember the previous state. */
+        if (values[0]) {
+                /* "populated" == 1 means there are processes, hence the cgroup is not empty. */
+                if (streq(values[0], "1"))
+                        unit_remove_from_cgroup_empty_queue(u);
+                else
+                        unit_add_to_cgroup_empty_queue(u);
+        }
+
+        /* Disregard freezer state changes due to operations not initiated by us */
+        if (values[1] && IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_THAWING)) {
+                if (streq(values[1], "0"))
+                        unit_thawed(u);
+                else
+                        unit_frozen(u);
+        }
+
+        free(values[0]);
+        free(values[1]);
+
+        return 0;
+}
+
+static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
+        Manager *m = ASSERT_PTR(userdata);
+
+        assert(s);
+        assert(fd >= 0);
+
+        /* IO callback for the unified-hierarchy inotify fd: drains all pending events and dispatches each
+         * one to the unit watching the corresponding file — cgroup.events changes via the control-wd map,
+         * memory event changes via the memory-wd map. */
+
+        for (;;) {
+                union inotify_event_buffer buffer;
+                ssize_t l;
+
+                l = read(fd, &buffer, sizeof(buffer));
+                if (l < 0) {
+                        /* EAGAIN & friends: fully drained for now. */
+                        if (ERRNO_IS_TRANSIENT(errno))
+                                return 0;
+
+                        return log_error_errno(errno, "Failed to read control group inotify events: %m");
+                }
+
+                FOREACH_INOTIFY_EVENT_WARN(e, buffer, l) {
+                        Unit *u;
+
+                        if (e->wd < 0)
+                                /* Queue overflow has no watch descriptor */
+                                continue;
+
+                        if (e->mask & IN_IGNORED)
+                                /* The watch was just removed */
+                                continue;
+
+                        /* Note that inotify might deliver events for a watch even after it was removed,
+                         * because it was queued before the removal. Let's ignore this here safely. */
+
+                        u = hashmap_get(m->cgroup_control_inotify_wd_unit, INT_TO_PTR(e->wd));
+                        if (u)
+                                unit_check_cgroup_events(u);
+
+                        u = hashmap_get(m->cgroup_memory_inotify_wd_unit, INT_TO_PTR(e->wd));
+                        if (u)
+                                unit_add_to_cgroup_oom_queue(u);
+                }
+        }
+}
+
+static int cg_bpf_mask_supported(CGroupMask *ret) {
+        static const struct {
+                int (*probe)(void);
+                CGroupMask mask;
+        } checks[] = {
+                /* BPF-based firewall */
+                { bpf_firewall_supported,                CGROUP_MASK_BPF_FIREWALL                    },
+                /* BPF-based device access control */
+                { bpf_devices_supported,                 CGROUP_MASK_BPF_DEVICES                     },
+                /* BPF pinned prog */
+                { bpf_foreign_supported,                 CGROUP_MASK_BPF_FOREIGN                     },
+                /* BPF-based bind{4|6} hooks */
+                { bpf_socket_bind_supported,             CGROUP_MASK_BPF_SOCKET_BIND                 },
+                /* BPF-based cgroup_skb/{egress|ingress} hooks */
+                { restrict_network_interfaces_supported, CGROUP_MASK_BPF_RESTRICT_NETWORK_INTERFACES },
+        };
+        CGroupMask mask = 0;
+        int r;
+
+        /* Probes which BPF-based pseudo-controllers are available on this system and returns the combined
+         * mask. The first probe failure is propagated as-is. */
+
+        for (size_t i = 0; i < ELEMENTSOF(checks); i++) {
+                r = checks[i].probe();
+                if (r < 0)
+                        return r;
+                if (r > 0)
+                        mask |= checks[i].mask;
+        }
+
+        *ret = mask;
+        return 0;
+}
+
+int manager_setup_cgroup(Manager *m) {
+ _cleanup_free_ char *path = NULL;
+ const char *scope_path;
+ int r, all_unified;
+ CGroupMask mask;
+ char *e;
+
+ assert(m);
+
+ /* 1. Determine hierarchy */
+ m->cgroup_root = mfree(m->cgroup_root);
+ r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
+ if (r < 0)
+ return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
+
+ /* Chop off the init scope, if we are already located in it */
+ e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
+
+ /* LEGACY: Also chop off the system slice if we are in
+ * it. This is to support live upgrades from older systemd
+ * versions where PID 1 was moved there. Also see
+ * cg_get_root_path(). */
+ if (!e && MANAGER_IS_SYSTEM(m)) {
+ e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
+ if (!e)
+ e = endswith(m->cgroup_root, "/system"); /* even more legacy */
+ }
+ if (e)
+ *e = 0;
+
+ /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
+ * easily prepend it everywhere. */
+ delete_trailing_chars(m->cgroup_root, "/");
+
+ /* 2. Show data */
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
+ if (r < 0)
+ return log_error_errno(r, "Cannot find cgroup mount point: %m");
+
+ r = cg_unified();
+ if (r < 0)
+ return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
+
+ all_unified = cg_all_unified();
+ if (all_unified < 0)
+ return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
+ if (all_unified > 0)
+ log_debug("Unified cgroup hierarchy is located at %s.", path);
+ else {
+ r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
+ if (r > 0)
+ log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
+ else
+ log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
+ }
+
+ /* 3. Allocate cgroup empty defer event source */
+ m->cgroup_empty_event_source = sd_event_source_disable_unref(m->cgroup_empty_event_source);
+ r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create cgroup empty event source: %m");
+
+ /* Schedule cgroup empty checks early, but after having processed service notification messages or
+ * SIGCHLD signals, so that a cgroup running empty is always just the last safety net of
+ * notification, and we collected the metadata the notification and SIGCHLD stuff offers first. */
+ r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
+ if (r < 0)
+ return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
+
+ r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
+ if (r < 0)
+ return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
+
+ (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
+
+ /* 4. Install notifier inotify object, or agent */
+ if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
+
+ /* In the unified hierarchy we can get cgroup empty notifications via inotify. */
+
+ m->cgroup_inotify_event_source = sd_event_source_disable_unref(m->cgroup_inotify_event_source);
+ safe_close(m->cgroup_inotify_fd);
+
+ m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
+ if (m->cgroup_inotify_fd < 0)
+ return log_error_errno(errno, "Failed to create control group inotify object: %m");
+
+ r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
+ if (r < 0)
+ return log_error_errno(r, "Failed to watch control group inotify object: %m");
+
+ /* Process cgroup empty notifications early. Note that when this event is dispatched it'll
+ * just add the unit to a cgroup empty queue, hence let's run earlier than that. Also see
+ * handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
+ r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-9);
+ if (r < 0)
+ return log_error_errno(r, "Failed to set priority of inotify event source: %m");
+
+ (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
+
+ } else if (MANAGER_IS_SYSTEM(m) && manager_owns_host_root_cgroup(m) && !MANAGER_IS_TEST_RUN(m)) {
+
+                /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
+                 * since it does not generate events when control groups with children run empty.) */
+
+ r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUPS_AGENT_PATH);
+ if (r < 0)
+ log_warning_errno(r, "Failed to install release agent, ignoring: %m");
+ else if (r > 0)
+ log_debug("Installed release agent.");
+ else if (r == 0)
+ log_debug("Release agent already installed.");
+ }
+
+ /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
+ scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
+ r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
+ if (r >= 0) {
+ /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
+ r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
+ if (r < 0)
+ log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
+
+ /* 6. And pin it, so that it cannot be unmounted */
+ safe_close(m->pin_cgroupfs_fd);
+ m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
+ if (m->pin_cgroupfs_fd < 0)
+ return log_error_errno(errno, "Failed to open pin file: %m");
+
+ } else if (!MANAGER_IS_TEST_RUN(m))
+ return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
+
+ /* 7. Always enable hierarchical support if it exists... */
+ if (!all_unified && !MANAGER_IS_TEST_RUN(m))
+ (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
+
+ /* 8. Figure out which controllers are supported */
+ r = cg_mask_supported_subtree(m->cgroup_root, &m->cgroup_supported);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine supported controllers: %m");
+
+ /* 9. Figure out which bpf-based pseudo-controllers are supported */
+ r = cg_bpf_mask_supported(&mask);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine supported bpf-based pseudo-controllers: %m");
+ m->cgroup_supported |= mask;
+
+ /* 10. Log which controllers are supported */
+ for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
+ log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c),
+ yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
+
+ return 0;
+}
+
+/* Tear down the manager's cgroup machinery: optionally trim our own cgroup subtree, then release the
+ * event sources, inotify wd maps, the cgroupfs pin fd and the root path string. */
+void manager_shutdown_cgroup(Manager *m, bool delete) {
+        assert(m);
+
+        /* We can't really delete the group, since we are in it. But
+         * let's trim it. */
+        if (delete && m->cgroup_root && !FLAGS_SET(m->test_run_flags, MANAGER_TEST_RUN_MINIMAL))
+                (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
+
+        m->cgroup_empty_event_source = sd_event_source_disable_unref(m->cgroup_empty_event_source);
+
+        m->cgroup_control_inotify_wd_unit = hashmap_free(m->cgroup_control_inotify_wd_unit);
+        m->cgroup_memory_inotify_wd_unit = hashmap_free(m->cgroup_memory_inotify_wd_unit);
+
+        m->cgroup_inotify_event_source = sd_event_source_disable_unref(m->cgroup_inotify_event_source);
+        m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
+
+        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
+
+        m->cgroup_root = mfree(m->cgroup_root);
+}
+
+/* Map a cgroup path to the owning unit. If there's no exact match in the cgroup→unit hashmap, walk up the
+ * path one component at a time, so that processes placed in sub-cgroups are attributed to the closest
+ * ancestor unit; as a last resort look up the root slice. */
+Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
+        char *p;
+        Unit *u;
+
+        assert(m);
+        assert(cgroup);
+
+        u = hashmap_get(m->cgroup_unit, cgroup);
+        if (u)
+                return u;
+
+        /* Walk upwards on a stack-allocated, mutable copy of the path */
+        p = strdupa_safe(cgroup);
+        for (;;) {
+                char *e;
+
+                e = strrchr(p, '/');
+                if (!e || e == p)
+                        return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
+
+                *e = 0; /* chop off the last path component and retry */
+
+                u = hashmap_get(m->cgroup_unit, p);
+                if (u)
+                        return u;
+        }
+}
+
+/* Resolve a PidRef to a unit via the process' cgroup membership. Returns NULL if the cgroup path cannot
+ * be determined. */
+Unit *manager_get_unit_by_pidref_cgroup(Manager *m, PidRef *pid) {
+        _cleanup_free_ char *cgroup = NULL;
+
+        assert(m);
+
+        if (cg_pidref_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup) < 0)
+                return NULL;
+
+        return manager_get_unit_by_cgroup(m, cgroup);
+}
+
+/* Resolve a PidRef to a unit via the manager's PID-watching tables. If multiple units watch the same PID
+ * (watch_pids_more), the first entry of the array wins. Returns NULL if nobody watches the PID. */
+Unit *manager_get_unit_by_pidref_watching(Manager *m, PidRef *pid) {
+        Unit *u, **array;
+
+        assert(m);
+
+        if (!pidref_is_set(pid))
+                return NULL;
+
+        u = hashmap_get(m->watch_pids, pid);
+        if (u)
+                return u;
+
+        array = hashmap_get(m->watch_pids_more, pid);
+        if (array)
+                return array[0];
+
+        return NULL;
+}
+
+Unit *manager_get_unit_by_pidref(Manager *m, PidRef *pid) {
+        Unit *u;
+
+        assert(m);
+
+        /* Note that a process might be owned by multiple units, we return only one here, which is good
+         * enough for most cases, though not strictly correct. We prefer the one reported by cgroup
+         * membership, as that's the most relevant one as children of the process will be assigned to that
+         * one, too, before all else. */
+
+        if (!pidref_is_set(pid))
+                return NULL;
+
+        /* Our own process maps to init.scope; any other PID 1 (i.e. when we are not PID 1 ourselves) is
+         * not attributable to a unit */
+        if (pidref_is_self(pid))
+                return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
+        if (pid->pid == 1)
+                return NULL;
+
+        u = manager_get_unit_by_pidref_cgroup(m, pid);
+        if (u)
+                return u;
+
+        /* Fall back to the explicit PID-watching tables */
+        u = manager_get_unit_by_pidref_watching(m, pid);
+        if (u)
+                return u;
+
+        return NULL;
+}
+
+/* Convenience wrapper around manager_get_unit_by_pidref() for a plain pid_t. */
+Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
+        assert(m);
+
+        if (!pid_is_valid(pid))
+                return NULL;
+
+        return manager_get_unit_by_pidref(m, &PIDREF_MAKE_FROM_PID(pid));
+}
+
+/* Handle an explicit "cgroup is empty" notification. Returns > 0 if a unit owned the cgroup and was
+ * queued for empty-check processing, 0 if no unit matched. */
+int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
+        Unit *u;
+
+        assert(m);
+        assert(cgroup);
+
+        /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
+         * or from the --system instance */
+
+        log_debug("Got cgroup empty notification for: %s", cgroup);
+
+        u = manager_get_unit_by_cgroup(m, cgroup);
+        if (!u)
+                return 0;
+
+        unit_add_to_cgroup_empty_queue(u);
+        return 1;
+}
+
+int unit_get_memory_available(Unit *u, uint64_t *ret) {
+        uint64_t available = UINT64_MAX, current = 0;
+
+        assert(u);
+        assert(ret);
+
+        /* If data from cgroups can be accessed, try to find out how much more memory a unit can
+         * claim before hitting the configured cgroup limits (if any). Consider both MemoryHigh
+         * and MemoryMax, and also any slice the unit might be nested below. Returns UINT64_MAX in
+         * *ret if no limit applies anywhere in the slice chain. */
+
+        do {
+                uint64_t unit_available, unit_limit = UINT64_MAX;
+                CGroupContext *unit_context;
+
+                /* No point in continuing if we can't go any lower */
+                if (available == 0)
+                        break;
+
+                unit_context = unit_get_cgroup_context(u);
+                if (!unit_context)
+                        return -ENODATA;
+
+                /* NB: "continue" re-evaluates the while condition, i.e. moves on to the parent slice */
+                if (!u->cgroup_path)
+                        continue;
+
+                (void) unit_get_memory_current(u, &current);
+                /* in case of error, previous current propagates as lower bound */
+
+                if (unit_has_name(u, SPECIAL_ROOT_SLICE))
+                        unit_limit = physical_memory();
+                else if (unit_context->memory_max == UINT64_MAX && unit_context->memory_high == UINT64_MAX)
+                        continue; /* neither limit configured → this level imposes no bound */
+                unit_limit = MIN3(unit_limit, unit_context->memory_max, unit_context->memory_high);
+
+                unit_available = LESS_BY(unit_limit, current);
+                available = MIN(unit_available, available);
+        } while ((u = UNIT_GET_SLICE(u)));
+
+        *ret = available;
+
+        return 0;
+}
+
+/* Query the unit's current memory usage in bytes. Returns -ENODATA if memory accounting is off, the
+ * cgroup is gone, or the memory controller isn't realized for the unit. */
+int unit_get_memory_current(Unit *u, uint64_t *ret) {
+        int r;
+
+        // FIXME: Merge this into unit_get_memory_accounting after support for cgroup v1 is dropped
+
+        assert(u);
+        assert(ret);
+
+        if (!UNIT_CGROUP_BOOL(u, memory_accounting))
+                return -ENODATA;
+
+        if (!u->cgroup_path)
+                return -ENODATA;
+
+        /* The root cgroup doesn't expose this information, let's get it from /proc instead */
+        if (unit_has_host_root_cgroup(u))
+                return procfs_memory_get_used(ret);
+
+        if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
+                return -ENODATA;
+
+        r = cg_all_unified();
+        if (r < 0)
+                return r;
+
+        /* cgroup v2 exposes "memory.current", v1 "memory.usage_in_bytes" */
+        return cg_get_attribute_as_uint64("memory", u->cgroup_path, r > 0 ? "memory.current" : "memory.usage_in_bytes", ret);
+}
+
+/* Query one of the memory accounting metrics (peak, swap, zswap). For the metrics up to
+ * _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST a last-known value is cached in
+ * u->memory_accounting_last[] and served after the cgroup is gone. cgroup v2 only. */
+int unit_get_memory_accounting(Unit *u, CGroupMemoryAccountingMetric metric, uint64_t *ret) {
+
+        static const char* const attributes_table[_CGROUP_MEMORY_ACCOUNTING_METRIC_MAX] = {
+                [CGROUP_MEMORY_PEAK]          = "memory.peak",
+                [CGROUP_MEMORY_SWAP_CURRENT]  = "memory.swap.current",
+                [CGROUP_MEMORY_SWAP_PEAK]     = "memory.swap.peak",
+                [CGROUP_MEMORY_ZSWAP_CURRENT] = "memory.zswap.current",
+        };
+
+        uint64_t bytes;
+        bool updated = false;
+        int r;
+
+        assert(u);
+        assert(metric >= 0);
+        assert(metric < _CGROUP_MEMORY_ACCOUNTING_METRIC_MAX);
+
+        if (!UNIT_CGROUP_BOOL(u, memory_accounting))
+                return -ENODATA;
+
+        if (!u->cgroup_path)
+                /* If the cgroup is already gone, we try to find the last cached value. */
+                goto finish;
+
+        /* The root cgroup doesn't expose this information. */
+        if (unit_has_host_root_cgroup(u))
+                return -ENODATA;
+
+        if (!FLAGS_SET(u->cgroup_realized_mask, CGROUP_MASK_MEMORY))
+                return -ENODATA;
+
+        r = cg_all_unified();
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return -ENODATA;
+
+        r = cg_get_attribute_as_uint64("memory", u->cgroup_path, attributes_table[metric], &bytes);
+        if (r < 0 && r != -ENODATA)
+                return r;
+        updated = r >= 0;
+
+finish:
+        if (metric <= _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST) {
+                uint64_t *last = &u->memory_accounting_last[metric];
+
+                /* Refresh the cache on success, otherwise fall back to the cached value (UINT64_MAX
+                 * means "never seen") */
+                if (updated)
+                        *last = bytes;
+                else if (*last != UINT64_MAX)
+                        bytes = *last;
+                else
+                        return -ENODATA;
+
+        } else if (!updated)
+                return -ENODATA;
+
+        if (ret)
+                *ret = bytes;
+
+        return 0;
+}
+
+/* Query the current number of tasks in the unit's cgroup (pids.current), or via /proc for the host root
+ * cgroup. Returns -ENODATA if task accounting is off or the pids controller isn't realized. */
+int unit_get_tasks_current(Unit *u, uint64_t *ret) {
+        assert(u);
+        assert(ret);
+
+        if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
+                return -ENODATA;
+
+        if (!u->cgroup_path)
+                return -ENODATA;
+
+        /* The root cgroup doesn't expose this information, let's get it from /proc instead */
+        if (unit_has_host_root_cgroup(u))
+                return procfs_tasks_get_current(ret);
+
+        if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
+                return -ENODATA;
+
+        return cg_get_attribute_as_uint64("pids", u->cgroup_path, "pids.current", ret);
+}
+
+/* Read the unit's raw (absolute, not baseline-adjusted) CPU time in nanoseconds. */
+static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
+        uint64_t ns;
+        int r;
+
+        assert(u);
+        assert(ret);
+
+        if (!u->cgroup_path)
+                return -ENODATA;
+
+        /* The root cgroup doesn't expose this information, let's get it from /proc instead */
+        if (unit_has_host_root_cgroup(u))
+                return procfs_cpu_get_usage(ret);
+
+        /* Requisite controllers for CPU accounting are not enabled */
+        if ((get_cpu_accounting_mask() & ~u->cgroup_realized_mask) != 0)
+                return -ENODATA;
+
+        r = cg_all_unified();
+        if (r < 0)
+                return r;
+        if (r > 0) {
+                _cleanup_free_ char *val = NULL;
+                uint64_t us;
+
+                /* cgroup v2: parse the "usage_usec" field out of cpu.stat, convert µs → ns below */
+                r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", STRV_MAKE("usage_usec"), &val);
+                if (IN_SET(r, -ENOENT, -ENXIO))
+                        return -ENODATA;
+                if (r < 0)
+                        return r;
+
+                r = safe_atou64(val, &us);
+                if (r < 0)
+                        return r;
+
+                ns = us * NSEC_PER_USEC;
+        } else
+                /* cgroup v1: cpuacct.usage is in nanoseconds already */
+                return cg_get_attribute_as_uint64("cpuacct", u->cgroup_path, "cpuacct.usage", ret);
+
+        *ret = ns;
+        return 0;
+}
+
+int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
+        nsec_t ns;
+        int r;
+
+        assert(u);
+
+        /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
+         * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
+         * call this function with a NULL return value. */
+
+        if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
+                return -ENODATA;
+
+        r = unit_get_cpu_usage_raw(u, &ns);
+        if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
+                /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
+                 * cached value. */
+
+                if (ret)
+                        *ret = u->cpu_usage_last;
+                return 0;
+        }
+        if (r < 0)
+                return r;
+
+        /* Saturated subtraction of the baseline taken at unit start */
+        if (ns > u->cpu_usage_base)
+                ns -= u->cpu_usage_base;
+        else
+                ns = 0;
+
+        u->cpu_usage_last = ns;
+        if (ret)
+                *ret = ns;
+
+        return 0;
+}
+
+/* Query one IP accounting counter (bytes/packets, ingress/egress) from the unit's BPF firewall maps,
+ * adding in counters deserialized from a previous manager runtime. Returns 0 on success (r is
+ * necessarily >= 0 at the final return). */
+int unit_get_ip_accounting(
+                Unit *u,
+                CGroupIPAccountingMetric metric,
+                uint64_t *ret) {
+
+        uint64_t value;
+        int fd, r;
+
+        assert(u);
+        assert(metric >= 0);
+        assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
+        assert(ret);
+
+        if (!UNIT_CGROUP_BOOL(u, ip_accounting))
+                return -ENODATA;
+
+        /* Pick the BPF map fd matching the direction of the requested metric */
+        fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
+                u->ip_accounting_ingress_map_fd :
+                u->ip_accounting_egress_map_fd;
+        if (fd < 0)
+                return -ENODATA;
+
+        if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
+                r = bpf_firewall_read_accounting(fd, &value, NULL);
+        else
+                r = bpf_firewall_read_accounting(fd, NULL, &value);
+        if (r < 0)
+                return r;
+
+        /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
+         * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
+         * ip_accounting_extra[] field, and add them in here transparently. */
+
+        *ret = value + u->ip_accounting_extra[metric];
+
+        return r;
+}
+
+/* Read the unit's raw IO counters by parsing io.stat, summing the per-device rows. cgroup v2 only;
+ * fills all _CGROUP_IO_ACCOUNTING_METRIC_MAX slots of ret at once. */
+static int unit_get_io_accounting_raw(Unit *u, uint64_t ret[static _CGROUP_IO_ACCOUNTING_METRIC_MAX]) {
+        static const char *const field_names[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
+                [CGROUP_IO_READ_BYTES]       = "rbytes=",
+                [CGROUP_IO_WRITE_BYTES]      = "wbytes=",
+                [CGROUP_IO_READ_OPERATIONS]  = "rios=",
+                [CGROUP_IO_WRITE_OPERATIONS] = "wios=",
+        };
+        uint64_t acc[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {};
+        _cleanup_free_ char *path = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        int r;
+
+        assert(u);
+
+        if (!u->cgroup_path)
+                return -ENODATA;
+
+        if (unit_has_host_root_cgroup(u))
+                return -ENODATA; /* TODO: return useful data for the top-level cgroup */
+
+        r = cg_all_unified();
+        if (r < 0)
+                return r;
+        if (r == 0) /* TODO: support cgroupv1 */
+                return -ENODATA;
+
+        if (!FLAGS_SET(u->cgroup_realized_mask, CGROUP_MASK_IO))
+                return -ENODATA;
+
+        r = cg_get_path("io", u->cgroup_path, "io.stat", &path);
+        if (r < 0)
+                return r;
+
+        f = fopen(path, "re");
+        if (!f)
+                return -errno;
+
+        /* io.stat has one line per device: "MAJ:MIN rbytes=… wbytes=… rios=… wios=… …" */
+        for (;;) {
+                _cleanup_free_ char *line = NULL;
+                const char *p;
+
+                r = read_line(f, LONG_LINE_MAX, &line);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        break;
+
+                p = line;
+                p += strcspn(p, WHITESPACE); /* Skip over device major/minor */
+                p += strspn(p, WHITESPACE);  /* Skip over following whitespace */
+
+                for (;;) {
+                        _cleanup_free_ char *word = NULL;
+
+                        r = extract_first_word(&p, &word, NULL, EXTRACT_RETAIN_ESCAPE);
+                        if (r < 0)
+                                return r;
+                        if (r == 0)
+                                break;
+
+                        /* Match each "key=value" word against the fields we care about */
+                        for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) {
+                                const char *x;
+
+                                x = startswith(word, field_names[i]);
+                                if (x) {
+                                        uint64_t w;
+
+                                        r = safe_atou64(x, &w);
+                                        if (r < 0)
+                                                return r;
+
+                                        /* Sum up the stats of all devices */
+                                        acc[i] += w;
+                                        break;
+                                }
+                        }
+                }
+        }
+
+        memcpy(ret, acc, sizeof(acc));
+        return 0;
+}
+
+/* Retrieve one IO accounting metric, baseline-adjusted to the counters captured when the unit was
+ * started. With allow_cache, or when the cgroup is already gone, serves the last cached value. */
+int unit_get_io_accounting(
+                Unit *u,
+                CGroupIOAccountingMetric metric,
+                bool allow_cache,
+                uint64_t *ret) {
+
+        uint64_t raw[_CGROUP_IO_ACCOUNTING_METRIC_MAX];
+        int r;
+
+        /* Retrieve an IO account parameter. This will subtract the counter when the unit was started. */
+
+        if (!UNIT_CGROUP_BOOL(u, io_accounting))
+                return -ENODATA;
+
+        if (allow_cache && u->io_accounting_last[metric] != UINT64_MAX)
+                goto done;
+
+        r = unit_get_io_accounting_raw(u, raw);
+        if (r == -ENODATA && u->io_accounting_last[metric] != UINT64_MAX)
+                goto done;
+        if (r < 0)
+                return r;
+
+        /* Refresh the cache for all metrics at once */
+        for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) {
+                /* Saturated subtraction */
+                if (raw[i] > u->io_accounting_base[i])
+                        u->io_accounting_last[i] = raw[i] - u->io_accounting_base[i];
+                else
+                        u->io_accounting_last[i] = 0;
+        }
+
+done:
+        if (ret)
+                *ret = u->io_accounting_last[metric];
+
+        return 0;
+}
+
+/* Reset CPU accounting: drop the cached value and re-capture the current raw counter as the new
+ * baseline (zeroed if it cannot be read). */
+int unit_reset_cpu_accounting(Unit *u) {
+        int r;
+
+        assert(u);
+
+        u->cpu_usage_last = NSEC_INFINITY;
+
+        r = unit_get_cpu_usage_raw(u, &u->cpu_usage_base);
+        if (r < 0) {
+                u->cpu_usage_base = 0;
+                return r;
+        }
+
+        return 0;
+}
+
+/* Invalidate all cached memory accounting values (UINT64_MAX == "no cached value"). */
+void unit_reset_memory_accounting_last(Unit *u) {
+        assert(u);
+
+        FOREACH_ARRAY(i, u->memory_accounting_last, ELEMENTSOF(u->memory_accounting_last))
+                *i = UINT64_MAX;
+}
+
+/* Zero the BPF firewall counters (both directions, if present) and the deserialized extra counters.
+ * Returns the first error encountered, while still attempting all resets. */
+int unit_reset_ip_accounting(Unit *u) {
+        int r = 0;
+
+        assert(u);
+
+        if (u->ip_accounting_ingress_map_fd >= 0)
+                RET_GATHER(r, bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd));
+
+        if (u->ip_accounting_egress_map_fd >= 0)
+                RET_GATHER(r, bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd));
+
+        zero(u->ip_accounting_extra);
+
+        return r;
+}
+
+/* Invalidate all cached IO accounting values (UINT64_MAX == "no cached value"). */
+void unit_reset_io_accounting_last(Unit *u) {
+        assert(u);
+
+        FOREACH_ARRAY(i, u->io_accounting_last, _CGROUP_IO_ACCOUNTING_METRIC_MAX)
+                *i = UINT64_MAX;
+}
+
+/* Reset IO accounting: drop the cached values and re-capture the current raw counters as the new
+ * baseline (zeroed if they cannot be read). */
+int unit_reset_io_accounting(Unit *u) {
+        int r;
+
+        assert(u);
+
+        unit_reset_io_accounting_last(u);
+
+        r = unit_get_io_accounting_raw(u, u->io_accounting_base);
+        if (r < 0) {
+                zero(u->io_accounting_base);
+                return r;
+        }
+
+        return 0;
+}
+
+/* Reset all accounting (CPU, IO, IP, memory caches); attempts every reset and gathers the first error. */
+int unit_reset_accounting(Unit *u) {
+        int r = 0;
+
+        assert(u);
+
+        RET_GATHER(r, unit_reset_cpu_accounting(u));
+        RET_GATHER(r, unit_reset_io_accounting(u));
+        RET_GATHER(r, unit_reset_ip_accounting(u));
+        unit_reset_memory_accounting_last(u);
+
+        return r;
+}
+
+/* Mark the given controller mask as needing re-realization for this unit and queue it for the cgroup
+ * realize queue. No-op for units without a cgroup context or if the bits are already set. */
+void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
+        assert(u);
+
+        if (!UNIT_HAS_CGROUP_CONTEXT(u))
+                return;
+
+        if (m == 0)
+                return;
+
+        /* always invalidate compat pairs together */
+        if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
+                m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
+
+        if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
+                m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;
+
+        if (FLAGS_SET(u->cgroup_invalidated_mask, m)) /* NOP? */
+                return;
+
+        u->cgroup_invalidated_mask |= m;
+        unit_add_to_cgroup_realize_queue(u);
+}
+
+/* Mark the unit's BPF firewall program as needing recompilation, recursing into slice members since
+ * their programs embed the parent's IP access lists. */
+void unit_invalidate_cgroup_bpf(Unit *u) {
+        assert(u);
+
+        if (!UNIT_HAS_CGROUP_CONTEXT(u))
+                return;
+
+        if (u->cgroup_invalidated_mask & CGROUP_MASK_BPF_FIREWALL) /* NOP? */
+                return;
+
+        u->cgroup_invalidated_mask |= CGROUP_MASK_BPF_FIREWALL;
+        unit_add_to_cgroup_realize_queue(u);
+
+        /* If we are a slice unit, we also need to put compile a new BPF program for all our children, as the IP access
+         * list of our children includes our own. */
+        if (u->type == UNIT_SLICE) {
+                Unit *member;
+
+                UNIT_FOREACH_DEPENDENCY(member, u, UNIT_ATOM_SLICE_OF)
+                        unit_invalidate_cgroup_bpf(member);
+        }
+}
+
+/* Re-check cgroup state that may have changed while inotify watches were down across reexec/reload. */
+void unit_cgroup_catchup(Unit *u) {
+        assert(u);
+
+        if (!UNIT_HAS_CGROUP_CONTEXT(u))
+                return;
+
+        /* We dropped the inotify watch during reexec/reload, so we need to
+         * check these as they may have changed.
+         * Note that (currently) the kernel doesn't actually update cgroup
+         * file modification times, so we can't just serialize and then check
+         * the mtime for file(s) we are interested in. */
+        (void) unit_check_cgroup_events(u);
+        unit_add_to_cgroup_oom_queue(u);
+}
+
+/* Whether cgroup delegation is in effect for this unit: the unit type must support delegation and the
+ * cgroup context must have it enabled. */
+bool unit_cgroup_delegate(Unit *u) {
+        CGroupContext *c;
+
+        assert(u);
+
+        if (!UNIT_VTABLE(u)->can_delegate)
+                return false;
+
+        c = unit_get_cgroup_context(u);
+        if (!c)
+                return false;
+
+        return c->delegate;
+}
+
+/* Invalidate the controllers that have separate Startup*= settings for every unit in the startup set,
+ * so their cgroup attributes get re-applied after the startup phase ends. */
+void manager_invalidate_startup_units(Manager *m) {
+        Unit *u;
+
+        assert(m);
+
+        SET_FOREACH(u, m->startup_units)
+                unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO|CGROUP_MASK_CPUSET);
+}
+
+/* Freeze or thaw the unit via cgroup.freeze. Returns > 0 if a kernel state change was requested,
+ * 0 if the cgroup is already in the desired state (or the freezer is unsupported), negative on error. */
+int unit_cgroup_freezer_action(Unit *u, FreezerAction action) {
+        _cleanup_free_ char *path = NULL;
+        FreezerState target, kernel = _FREEZER_STATE_INVALID;
+        int r, ret;
+
+        assert(u);
+        assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW));
+
+        if (!cg_freezer_supported())
+                return 0;
+
+        /* Ignore all requests to thaw init.scope or -.slice and reject all requests to freeze them */
+        if (unit_has_name(u, SPECIAL_ROOT_SLICE) || unit_has_name(u, SPECIAL_INIT_SCOPE))
+                return action == FREEZER_FREEZE ? -EPERM : 0;
+
+        if (!u->cgroup_realized)
+                return -EBUSY;
+
+        /* Thawing propagates upwards: the parent slice must be thawed first for it to take effect */
+        if (action == FREEZER_THAW) {
+                Unit *slice = UNIT_GET_SLICE(u);
+
+                if (slice) {
+                        r = unit_cgroup_freezer_action(slice, FREEZER_THAW);
+                        if (r < 0)
+                                return log_unit_error_errno(u, r, "Failed to thaw slice %s of unit: %m", slice->id);
+                }
+        }
+
+        target = action == FREEZER_FREEZE ? FREEZER_FROZEN : FREEZER_RUNNING;
+
+        r = unit_freezer_state_kernel(u, &kernel);
+        if (r < 0)
+                log_unit_debug_errno(u, r, "Failed to obtain cgroup freezer state: %m");
+
+        if (target == kernel) {
+                /* Already there: freezing is a no-op; thawing still rewrites the attribute below */
+                u->freezer_state = target;
+                if (action == FREEZER_FREEZE)
+                        return 0;
+                ret = 0;
+        } else
+                ret = 1;
+
+        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.freeze", &path);
+        if (r < 0)
+                return r;
+
+        log_unit_debug(u, "%s unit.", action == FREEZER_FREEZE ? "Freezing" : "Thawing");
+
+        /* Enter the transitional state; completion is signalled by the kernel asynchronously */
+        if (target != kernel) {
+                if (action == FREEZER_FREEZE)
+                        u->freezer_state = FREEZER_FREEZING;
+                else
+                        u->freezer_state = FREEZER_THAWING;
+        }
+
+        r = write_string_file(path, one_zero(action == FREEZER_FREEZE), WRITE_STRING_FILE_DISABLE_BUFFER);
+        if (r < 0)
+                return r;
+
+        return ret;
+}
+
+/* Read and parse one cpuset attribute (e.g. "cpuset.cpus.effective", per caller-supplied name) of the
+ * unit's cgroup into a CPUSet. cgroup v2 only; -ENODATA if unavailable. */
+int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) {
+        _cleanup_free_ char *v = NULL;
+        int r;
+
+        assert(u);
+        assert(cpus);
+
+        if (!u->cgroup_path)
+                return -ENODATA;
+
+        if ((u->cgroup_realized_mask & CGROUP_MASK_CPUSET) == 0)
+                return -ENODATA;
+
+        r = cg_all_unified();
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return -ENODATA;
+
+        r = cg_get_attribute("cpuset", u->cgroup_path, name, &v);
+        if (r == -ENOENT)
+                return -ENODATA;
+        if (r < 0)
+                return r;
+
+        return parse_cpu_set_full(v, cpus, false, NULL, NULL, 0, NULL);
+}
+
+/* String tables for the enums declared in cgroup.h; keep them in sync with the enum definitions. */
+
+static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
+        [CGROUP_DEVICE_POLICY_AUTO]   = "auto",
+        [CGROUP_DEVICE_POLICY_CLOSED] = "closed",
+        [CGROUP_DEVICE_POLICY_STRICT] = "strict",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);
+
+static const char* const freezer_action_table[_FREEZER_ACTION_MAX] = {
+        [FREEZER_FREEZE] = "freeze",
+        [FREEZER_THAW]   = "thaw",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(freezer_action, FreezerAction);
+
+static const char* const cgroup_pressure_watch_table[_CGROUP_PRESSURE_WATCH_MAX] = {
+        [CGROUP_PRESSURE_WATCH_OFF]  = "off",
+        [CGROUP_PRESSURE_WATCH_AUTO] = "auto",
+        [CGROUP_PRESSURE_WATCH_ON]   = "on",
+        [CGROUP_PRESSURE_WATCH_SKIP] = "skip",
+};
+
+/* "on" additionally parses from boolean spellings (yes/true/1), per the _WITH_BOOLEAN variant */
+DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(cgroup_pressure_watch, CGroupPressureWatch, CGROUP_PRESSURE_WATCH_ON);
+
+static const char* const cgroup_ip_accounting_metric_table[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = {
+        [CGROUP_IP_INGRESS_BYTES]   = "IPIngressBytes",
+        [CGROUP_IP_EGRESS_BYTES]    = "IPEgressBytes",
+        [CGROUP_IP_INGRESS_PACKETS] = "IPIngressPackets",
+        [CGROUP_IP_EGRESS_PACKETS]  = "IPEgressPackets",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(cgroup_ip_accounting_metric, CGroupIPAccountingMetric);
+
+static const char* const cgroup_io_accounting_metric_table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
+        [CGROUP_IO_READ_BYTES]       = "IOReadBytes",
+        [CGROUP_IO_WRITE_BYTES]      = "IOWriteBytes",
+        [CGROUP_IO_READ_OPERATIONS]  = "IOReadOperations",
+        [CGROUP_IO_WRITE_OPERATIONS] = "IOWriteOperations",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(cgroup_io_accounting_metric, CGroupIOAccountingMetric);
+
+static const char* const cgroup_memory_accounting_metric_table[_CGROUP_MEMORY_ACCOUNTING_METRIC_MAX] = {
+        [CGROUP_MEMORY_PEAK]          = "MemoryPeak",
+        [CGROUP_MEMORY_SWAP_CURRENT]  = "MemorySwapCurrent",
+        [CGROUP_MEMORY_SWAP_PEAK]     = "MemorySwapPeak",
+        [CGROUP_MEMORY_ZSWAP_CURRENT] = "MemoryZSwapCurrent",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(cgroup_memory_accounting_metric, CGroupMemoryAccountingMetric);
diff --git a/src/core/cgroup.h b/src/core/cgroup.h
new file mode 100644
index 0000000..f1b674b
--- /dev/null
+++ b/src/core/cgroup.h
@@ -0,0 +1,429 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "bpf-lsm.h"
+#include "cgroup-util.h"
+#include "cpu-set-util.h"
+#include "firewall-util.h"
+#include "list.h"
+#include "pidref.h"
+#include "time-util.h"
+
+typedef struct CGroupTasksMax {
+        /* If scale == 0, just use value; otherwise, value / scale.
+         * See tasks_max_resolve(). */
+        uint64_t value;
+        uint64_t scale;
+} CGroupTasksMax;
+
+#define CGROUP_TASKS_MAX_UNSET ((CGroupTasksMax) { .value = UINT64_MAX, .scale = 0 })
+
+/* "Unset" is encoded as { UINT64_MAX, 0 }, see CGROUP_TASKS_MAX_UNSET above */
+static inline bool cgroup_tasks_max_isset(const CGroupTasksMax *tasks_max) {
+        return tasks_max->value != UINT64_MAX || tasks_max->scale != 0;
+}
+
+uint64_t cgroup_tasks_max_resolve(const CGroupTasksMax *tasks_max);
+
+typedef struct CGroupContext CGroupContext;
+typedef struct CGroupDeviceAllow CGroupDeviceAllow;
+typedef struct CGroupIODeviceWeight CGroupIODeviceWeight;
+typedef struct CGroupIODeviceLimit CGroupIODeviceLimit;
+typedef struct CGroupIODeviceLatency CGroupIODeviceLatency;
+typedef struct CGroupBlockIODeviceWeight CGroupBlockIODeviceWeight;
+typedef struct CGroupBlockIODeviceBandwidth CGroupBlockIODeviceBandwidth;
+typedef struct CGroupBPFForeignProgram CGroupBPFForeignProgram;
+typedef struct CGroupSocketBindItem CGroupSocketBindItem;
+
+typedef enum CGroupDevicePolicy {
+        /* When devices listed, will allow those, plus built-in ones, if none are listed will allow
+         * everything. */
+        CGROUP_DEVICE_POLICY_AUTO,
+
+        /* Everything forbidden, except built-in ones and listed ones. */
+        CGROUP_DEVICE_POLICY_CLOSED,
+
+        /* Everything forbidden, except for the listed devices */
+        CGROUP_DEVICE_POLICY_STRICT,
+
+        _CGROUP_DEVICE_POLICY_MAX,
+        _CGROUP_DEVICE_POLICY_INVALID = -EINVAL,
+} CGroupDevicePolicy;
+
+/* Requested freezer operation, see unit_cgroup_freezer_action() in cgroup.c */
+typedef enum FreezerAction {
+        FREEZER_FREEZE,
+        FREEZER_THAW,
+
+        _FREEZER_ACTION_MAX,
+        _FREEZER_ACTION_INVALID = -EINVAL,
+} FreezerAction;
+
+typedef enum CGroupDevicePermissions {
+        /* We reuse the same bit meanings the kernel's BPF_DEVCG_ACC_xyz definitions use */
+        CGROUP_DEVICE_MKNOD  = 1 << 0,
+        CGROUP_DEVICE_READ   = 1 << 1,
+        CGROUP_DEVICE_WRITE  = 1 << 2,
+        _CGROUP_DEVICE_PERMISSIONS_MAX = 1 << 3,
+        _CGROUP_DEVICE_PERMISSIONS_ALL = _CGROUP_DEVICE_PERMISSIONS_MAX - 1,
+        _CGROUP_DEVICE_PERMISSIONS_INVALID = -EINVAL,
+} CGroupDevicePermissions;
+
+/* Linked-list entry types below are owned by CGroupContext (see the LIST_HEAD fields there) and freed
+ * via the cgroup_context_free_*/remove_* helpers declared further down. */
+
+/* Per-device access grant (path + mknod/read/write permission bits) */
+struct CGroupDeviceAllow {
+        LIST_FIELDS(CGroupDeviceAllow, device_allow);
+        char *path;
+        CGroupDevicePermissions permissions;
+};
+
+/* Per-device IO weight (unified hierarchy) */
+struct CGroupIODeviceWeight {
+        LIST_FIELDS(CGroupIODeviceWeight, device_weights);
+        char *path;
+        uint64_t weight;
+};
+
+/* Per-device IO bandwidth/IOPS limits, one slot per CGroupIOLimitType */
+struct CGroupIODeviceLimit {
+        LIST_FIELDS(CGroupIODeviceLimit, device_limits);
+        char *path;
+        uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
+};
+
+/* Per-device IO latency target */
+struct CGroupIODeviceLatency {
+        LIST_FIELDS(CGroupIODeviceLatency, device_latencies);
+        char *path;
+        usec_t target_usec;
+};
+
+/* Legacy (cgroup v1 blkio) per-device weight */
+struct CGroupBlockIODeviceWeight {
+        LIST_FIELDS(CGroupBlockIODeviceWeight, device_weights);
+        char *path;
+        uint64_t weight;
+};
+
+/* Legacy (cgroup v1 blkio) per-device read/write bandwidth caps */
+struct CGroupBlockIODeviceBandwidth {
+        LIST_FIELDS(CGroupBlockIODeviceBandwidth, device_bandwidths);
+        char *path;
+        uint64_t rbps;
+        uint64_t wbps;
+};
+
+/* Foreign (pre-compiled, pinned in bpffs) BPF program to attach to the unit's cgroup */
+struct CGroupBPFForeignProgram {
+        LIST_FIELDS(CGroupBPFForeignProgram, programs);
+        uint32_t attach_type;
+        char *bpffs_path;
+};
+
+/* One allowed/denied socket-bind range: address family, protocol and a port range of nr_ports ports
+ * starting at port_min */
+struct CGroupSocketBindItem {
+        LIST_FIELDS(CGroupSocketBindItem, socket_bind_items);
+        int address_family;
+        int ip_protocol;
+        uint16_t nr_ports;
+        uint16_t port_min;
+};
+
+typedef enum CGroupPressureWatch {
+        CGROUP_PRESSURE_WATCH_OFF,      /* → tells the service payload explicitly not to watch for memory pressure */
+        CGROUP_PRESSURE_WATCH_AUTO,     /* → on if memory account is on anyway for the unit, otherwise off */
+        CGROUP_PRESSURE_WATCH_ON,
+        CGROUP_PRESSURE_WATCH_SKIP,     /* → doesn't set up memory pressure watch, but also doesn't explicitly tell payload to avoid it */
+        _CGROUP_PRESSURE_WATCH_MAX,
+        _CGROUP_PRESSURE_WATCH_INVALID = -EINVAL,
+} CGroupPressureWatch;
+
+/* Per-unit cgroup configuration, populated from unit file settings. Throughout, UINT64_MAX generally
+ * denotes "not configured" for the uint64_t limit fields. */
+struct CGroupContext {
+        bool cpu_accounting;
+        bool io_accounting;
+        bool blockio_accounting;
+        bool memory_accounting;
+        bool tasks_accounting;
+        bool ip_accounting;
+
+        /* Configures the memory.oom.group attribute (on unified) */
+        bool memory_oom_group;
+
+        bool delegate;
+        CGroupMask delegate_controllers;
+        CGroupMask disable_controllers;
+        char *delegate_subgroup;
+
+        /* For unified hierarchy */
+        uint64_t cpu_weight;
+        uint64_t startup_cpu_weight;
+        usec_t cpu_quota_per_sec_usec;
+        usec_t cpu_quota_period_usec;
+
+        CPUSet cpuset_cpus;
+        CPUSet startup_cpuset_cpus;
+        CPUSet cpuset_mems;
+        CPUSet startup_cpuset_mems;
+
+        uint64_t io_weight;
+        uint64_t startup_io_weight;
+        LIST_HEAD(CGroupIODeviceWeight, io_device_weights);
+        LIST_HEAD(CGroupIODeviceLimit, io_device_limits);
+        LIST_HEAD(CGroupIODeviceLatency, io_device_latencies);
+
+        /* default_* presumably act as defaults inherited by child units — NOTE(review): confirm against
+         * the Default*= handling in cgroup.c */
+        uint64_t default_memory_min;
+        uint64_t default_memory_low;
+        uint64_t default_startup_memory_low;
+        uint64_t memory_min;
+        uint64_t memory_low;
+        uint64_t startup_memory_low;
+        uint64_t memory_high;
+        uint64_t startup_memory_high;
+        uint64_t memory_max;
+        uint64_t startup_memory_max;
+        uint64_t memory_swap_max;
+        uint64_t startup_memory_swap_max;
+        uint64_t memory_zswap_max;
+        uint64_t startup_memory_zswap_max;
+
+        /* These bitfields presumably track whether the corresponding setting was explicitly assigned,
+         * to distinguish "unset" from an explicit maximal value — TODO confirm in cgroup.c */
+        bool default_memory_min_set:1;
+        bool default_memory_low_set:1;
+        bool default_startup_memory_low_set:1;
+        bool memory_min_set:1;
+        bool memory_low_set:1;
+        bool startup_memory_low_set:1;
+        bool startup_memory_high_set:1;
+        bool startup_memory_max_set:1;
+        bool startup_memory_swap_max_set:1;
+        bool startup_memory_zswap_max_set:1;
+
+        Set *ip_address_allow;
+        Set *ip_address_deny;
+        /* These two flags indicate that redundant entries have been removed from
+         * ip_address_allow/ip_address_deny, i.e. in_addr_prefixes_reduce() has already been called. */
+        bool ip_address_allow_reduced;
+        bool ip_address_deny_reduced;
+
+        char **ip_filters_ingress;
+        char **ip_filters_egress;
+        LIST_HEAD(CGroupBPFForeignProgram, bpf_foreign_programs);
+
+        Set *restrict_network_interfaces;
+        bool restrict_network_interfaces_is_allow_list;
+
+        /* For legacy hierarchies */
+        uint64_t cpu_shares;
+        uint64_t startup_cpu_shares;
+
+        uint64_t blockio_weight;
+        uint64_t startup_blockio_weight;
+        LIST_HEAD(CGroupBlockIODeviceWeight, blockio_device_weights);
+        LIST_HEAD(CGroupBlockIODeviceBandwidth, blockio_device_bandwidths);
+
+        uint64_t memory_limit;
+
+        CGroupDevicePolicy device_policy;
+        LIST_HEAD(CGroupDeviceAllow, device_allow);
+
+        LIST_HEAD(CGroupSocketBindItem, socket_bind_allow);
+        LIST_HEAD(CGroupSocketBindItem, socket_bind_deny);
+
+        /* Common */
+        CGroupTasksMax tasks_max;
+
+        /* Settings for systemd-oomd */
+        ManagedOOMMode moom_swap;
+        ManagedOOMMode moom_mem_pressure;
+        uint32_t moom_mem_pressure_limit; /* Normalized to 2^32-1 == 100% */
+        ManagedOOMPreference moom_preference;
+
+        /* Memory pressure logic */
+        CGroupPressureWatch memory_pressure_watch;
+        usec_t memory_pressure_threshold_usec;
+        /* NB: For now we don't make the period configurable, not the type, nor do we allow multiple
+         * triggers, nor triggers for non-memory pressure. We might add that later. */
+
+        NFTSetContext nft_set_context;
+
+        /* Forward coredumps for processes that crash within this cgroup.
+         * Requires 'delegate' to also be true. */
+        bool coredump_receive;
+};
+
+/* Used when querying IP accounting data. Keep in sync with the string tables in cgroup.c. */
+typedef enum CGroupIPAccountingMetric {
+        CGROUP_IP_INGRESS_BYTES,
+        CGROUP_IP_INGRESS_PACKETS,
+        CGROUP_IP_EGRESS_BYTES,
+        CGROUP_IP_EGRESS_PACKETS,
+        _CGROUP_IP_ACCOUNTING_METRIC_MAX,
+        _CGROUP_IP_ACCOUNTING_METRIC_INVALID = -EINVAL,
+} CGroupIPAccountingMetric;
+
+/* Used when querying IO accounting data. Keep in sync with the string tables in cgroup.c. */
+typedef enum CGroupIOAccountingMetric {
+        CGROUP_IO_READ_BYTES,
+        CGROUP_IO_WRITE_BYTES,
+        CGROUP_IO_READ_OPERATIONS,
+        CGROUP_IO_WRITE_OPERATIONS,
+        _CGROUP_IO_ACCOUNTING_METRIC_MAX,
+        _CGROUP_IO_ACCOUNTING_METRIC_INVALID = -EINVAL,
+} CGroupIOAccountingMetric;
+
+/* Keep in sync with the string table in cgroup.c and with the indexing of
+ * Unit.memory_accounting_last[] used by unit_get_memory_accounting(). */
+typedef enum CGroupMemoryAccountingMetric {
+        CGROUP_MEMORY_PEAK,
+        CGROUP_MEMORY_SWAP_PEAK,
+        /* We cache the above attributes, so that they can be fetched even after the cgroup is gone, e.g.
+         * when systemd-run exits. */
+        _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST = CGROUP_MEMORY_SWAP_PEAK,
+
+        /* These attributes are transient, so no need for caching. */
+        CGROUP_MEMORY_SWAP_CURRENT,
+        CGROUP_MEMORY_ZSWAP_CURRENT,
+
+        _CGROUP_MEMORY_ACCOUNTING_METRIC_MAX,
+        _CGROUP_MEMORY_ACCOUNTING_METRIC_INVALID = -EINVAL,
+} CGroupMemoryAccountingMetric;
+
+typedef struct Unit Unit;
+typedef struct Manager Manager;
+typedef enum ManagerState ManagerState;
+
+uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state);
+
+usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period);
+
+void cgroup_context_init(CGroupContext *c);
+void cgroup_context_done(CGroupContext *c);
+void cgroup_context_dump(Unit *u, FILE* f, const char *prefix);
+void cgroup_context_dump_socket_bind_item(const CGroupSocketBindItem *item, FILE *f);
+void cgroup_context_dump_socket_bind_items(const CGroupSocketBindItem *items, FILE *f);
+
+void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a);
+void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w);
+void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l);
+void cgroup_context_free_io_device_latency(CGroupContext *c, CGroupIODeviceLatency *l);
+void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w);
+void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b);
+void cgroup_context_remove_bpf_foreign_program(CGroupContext *c, CGroupBPFForeignProgram *p);
+void cgroup_context_remove_socket_bind(CGroupSocketBindItem **head);
+
+/* Returns true if memory pressure monitoring is wanted for this cgroup context:
+ * either explicitly switched on, or in "auto" mode with memory accounting enabled. */
+static inline bool cgroup_context_want_memory_pressure(const CGroupContext *c) {
+ assert(c);
+
+ return c->memory_pressure_watch == CGROUP_PRESSURE_WATCH_ON ||
+ (c->memory_pressure_watch == CGROUP_PRESSURE_WATCH_AUTO && c->memory_accounting);
+}
+
+int cgroup_context_add_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p);
+int cgroup_context_add_or_update_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p);
+int cgroup_context_add_bpf_foreign_program(CGroupContext *c, uint32_t attach_type, const char *path);
+
+void unit_modify_nft_set(Unit *u, bool add);
+
+CGroupMask unit_get_own_mask(Unit *u);
+CGroupMask unit_get_delegate_mask(Unit *u);
+CGroupMask unit_get_members_mask(Unit *u);
+CGroupMask unit_get_siblings_mask(Unit *u);
+CGroupMask unit_get_ancestor_disable_mask(Unit *u);
+
+CGroupMask unit_get_target_mask(Unit *u);
+CGroupMask unit_get_enable_mask(Unit *u);
+
+void unit_invalidate_cgroup_members_masks(Unit *u);
+
+void unit_add_family_to_cgroup_realize_queue(Unit *u);
+
+const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask);
+int unit_default_cgroup_path(const Unit *u, char **ret);
+int unit_set_cgroup_path(Unit *u, const char *path);
+int unit_pick_cgroup_path(Unit *u);
+
+int unit_realize_cgroup(Unit *u);
+void unit_prune_cgroup(Unit *u);
+int unit_watch_cgroup(Unit *u);
+int unit_watch_cgroup_memory(Unit *u);
+void unit_add_to_cgroup_realize_queue(Unit *u);
+
+void unit_release_cgroup(Unit *u);
+/* Releases the cgroup only if it is recursively empty.
+ * Returns true if the cgroup was released, false otherwise. */
+bool unit_maybe_release_cgroup(Unit *u);
+
+void unit_add_to_cgroup_empty_queue(Unit *u);
+int unit_check_oomd_kill(Unit *u);
+int unit_check_oom(Unit *u);
+
+int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path);
+
+int manager_setup_cgroup(Manager *m);
+void manager_shutdown_cgroup(Manager *m, bool delete);
+
+unsigned manager_dispatch_cgroup_realize_queue(Manager *m);
+
+Unit *manager_get_unit_by_cgroup(Manager *m, const char *cgroup);
+Unit *manager_get_unit_by_pidref_cgroup(Manager *m, PidRef *pid);
+Unit *manager_get_unit_by_pidref_watching(Manager *m, PidRef *pid);
+Unit* manager_get_unit_by_pidref(Manager *m, PidRef *pid);
+Unit* manager_get_unit_by_pid(Manager *m, pid_t pid);
+
+uint64_t unit_get_ancestor_memory_min(Unit *u);
+uint64_t unit_get_ancestor_memory_low(Unit *u);
+uint64_t unit_get_ancestor_startup_memory_low(Unit *u);
+
+int unit_search_main_pid(Unit *u, PidRef *ret);
+int unit_watch_all_pids(Unit *u);
+
+int unit_synthesize_cgroup_empty_event(Unit *u);
+
+int unit_get_memory_available(Unit *u, uint64_t *ret);
+int unit_get_memory_current(Unit *u, uint64_t *ret);
+int unit_get_memory_accounting(Unit *u, CGroupMemoryAccountingMetric metric, uint64_t *ret);
+int unit_get_tasks_current(Unit *u, uint64_t *ret);
+int unit_get_cpu_usage(Unit *u, nsec_t *ret);
+int unit_get_io_accounting(Unit *u, CGroupIOAccountingMetric metric, bool allow_cache, uint64_t *ret);
+int unit_get_ip_accounting(Unit *u, CGroupIPAccountingMetric metric, uint64_t *ret);
+
+int unit_reset_cpu_accounting(Unit *u);
+void unit_reset_memory_accounting_last(Unit *u);
+int unit_reset_ip_accounting(Unit *u);
+void unit_reset_io_accounting_last(Unit *u);
+int unit_reset_io_accounting(Unit *u);
+int unit_reset_accounting(Unit *u);
+
+/* Evaluates to the boolean cgroup-context field 'name' of unit 'u', or false
+ * when the unit has no cgroup context at all. */
+#define UNIT_CGROUP_BOOL(u, name) \
+ ({ \
+ CGroupContext *cc = unit_get_cgroup_context(u); \
+ cc ? cc->name : false; \
+ })
+
+bool manager_owns_host_root_cgroup(Manager *m);
+bool unit_has_host_root_cgroup(Unit *u);
+
+bool unit_has_startup_cgroup_constraints(Unit *u);
+
+int manager_notify_cgroup_empty(Manager *m, const char *group);
+
+void unit_invalidate_cgroup(Unit *u, CGroupMask m);
+void unit_invalidate_cgroup_bpf(Unit *u);
+
+void manager_invalidate_startup_units(Manager *m);
+
+const char* cgroup_device_policy_to_string(CGroupDevicePolicy i) _const_;
+CGroupDevicePolicy cgroup_device_policy_from_string(const char *s) _pure_;
+
+void unit_cgroup_catchup(Unit *u);
+
+bool unit_cgroup_delegate(Unit *u);
+
+int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name);
+int unit_cgroup_freezer_action(Unit *u, FreezerAction action);
+
+const char* freezer_action_to_string(FreezerAction a) _const_;
+FreezerAction freezer_action_from_string(const char *s) _pure_;
+
+const char* cgroup_pressure_watch_to_string(CGroupPressureWatch a) _const_;
+CGroupPressureWatch cgroup_pressure_watch_from_string(const char *s) _pure_;
+
+const char *cgroup_device_permissions_to_string(CGroupDevicePermissions p) _const_;
+CGroupDevicePermissions cgroup_device_permissions_from_string(const char *s) _pure_;
+
+const char* cgroup_ip_accounting_metric_to_string(CGroupIPAccountingMetric m) _const_;
+CGroupIPAccountingMetric cgroup_ip_accounting_metric_from_string(const char *s) _pure_;
+
+const char* cgroup_io_accounting_metric_to_string(CGroupIOAccountingMetric m) _const_;
+CGroupIOAccountingMetric cgroup_io_accounting_metric_from_string(const char *s) _pure_;
+
+const char* cgroup_memory_accounting_metric_to_string(CGroupMemoryAccountingMetric m) _const_;
+CGroupMemoryAccountingMetric cgroup_memory_accounting_metric_from_string(const char *s) _pure_;
diff --git a/src/core/core-varlink.c b/src/core/core-varlink.c
new file mode 100644
index 0000000..cd91381
--- /dev/null
+++ b/src/core/core-varlink.c
@@ -0,0 +1,652 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "core-varlink.h"
+#include "mkdir-label.h"
+#include "strv.h"
+#include "user-util.h"
+#include "varlink.h"
+#include "varlink-io.systemd.UserDatabase.h"
+#include "varlink-io.systemd.ManagedOOM.h"
+
+/* Parameters dispatched from io.systemd.UserDatabase varlink method calls:
+ * an optional name filter, an optional numeric ID filter, and the service
+ * the client is querying. */
+typedef struct LookupParameters {
+ const char *user_name;
+ const char *group_name;
+ /* Only one of uid/gid is used per lookup, hence the overlap is safe. */
+ union {
+ uid_t uid;
+ gid_t gid;
+ };
+ const char *service;
+} LookupParameters;
+
+/* The unit properties whose ManagedOOM* mode is reported to systemd-oomd. */
+static const char* const managed_oom_mode_properties[] = {
+ "ManagedOOMSwap",
+ "ManagedOOMMemoryPressure",
+};
+
+/* Serializes a dynamic user as a JSON user record in the io.systemd.UserDatabase
+ * format. Returns 0 on success, negative errno on (de)serialization failure. */
+static int build_user_json(const char *user_name, uid_t uid, JsonVariant **ret) {
+ assert(user_name);
+ assert(uid_is_valid(uid));
+ assert(ret);
+
+ return json_build(ret, JSON_BUILD_OBJECT(
+ JSON_BUILD_PAIR("record", JSON_BUILD_OBJECT(
+ JSON_BUILD_PAIR("userName", JSON_BUILD_STRING(user_name)),
+ JSON_BUILD_PAIR("uid", JSON_BUILD_UNSIGNED(uid)),
+ /* gid deliberately mirrors uid: dynamic users get a group
+ * with the same numeric ID */
+ JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(uid)),
+ JSON_BUILD_PAIR("realName", JSON_BUILD_CONST_STRING("Dynamic User")),
+ JSON_BUILD_PAIR("homeDirectory", JSON_BUILD_CONST_STRING("/")),
+ JSON_BUILD_PAIR("shell", JSON_BUILD_CONST_STRING(NOLOGIN)),
+ JSON_BUILD_PAIR("locked", JSON_BUILD_BOOLEAN(true)),
+ JSON_BUILD_PAIR("service", JSON_BUILD_CONST_STRING("io.systemd.DynamicUser")),
+ JSON_BUILD_PAIR("disposition", JSON_BUILD_CONST_STRING("dynamic"))))));
+}
+
+/* Checks whether a candidate user entry satisfies every filter the client
+ * supplied in @p; unset filters match anything. */
+static bool user_match_lookup_parameters(LookupParameters *p, const char *name, uid_t uid) {
+ assert(p);
+
+ if (uid_is_valid(p->uid) && uid != p->uid)
+ return false;
+
+ return !p->user_name || streq(name, p->user_name);
+}
+
+/* Serializes one ManagedOOM* property of @u as a JSON object carrying "mode",
+ * "path" (the unit's cgroup path), "property", and — for memory pressure only —
+ * the "limit" value. Returns -EOPNOTSUPP for unit types that cannot set
+ * ManagedOOM*, -EINVAL if there is no cgroup context or the property name is
+ * unrecognized. */
+static int build_managed_oom_json_array_element(Unit *u, const char *property, JsonVariant **ret_v) {
+ bool use_limit = false;
+ CGroupContext *c;
+ const char *mode;
+
+ assert(u);
+ assert(property);
+ assert(ret_v);
+
+ if (!UNIT_VTABLE(u)->can_set_managed_oom)
+ return -EOPNOTSUPP;
+
+ c = unit_get_cgroup_context(u);
+ if (!c)
+ return -EINVAL;
+
+ if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u)))
+ /* systemd-oomd should always treat inactive units as though they didn't enable any action since they
+ * should not have a valid cgroup */
+ mode = managed_oom_mode_to_string(MANAGED_OOM_AUTO);
+ else if (streq(property, "ManagedOOMSwap"))
+ mode = managed_oom_mode_to_string(c->moom_swap);
+ else if (streq(property, "ManagedOOMMemoryPressure")) {
+ mode = managed_oom_mode_to_string(c->moom_mem_pressure);
+ use_limit = true;
+ } else
+ return -EINVAL;
+
+ return json_build(ret_v, JSON_BUILD_OBJECT(
+ JSON_BUILD_PAIR("mode", JSON_BUILD_STRING(mode)),
+ JSON_BUILD_PAIR("path", JSON_BUILD_STRING(u->cgroup_path)),
+ JSON_BUILD_PAIR("property", JSON_BUILD_STRING(property)),
+ JSON_BUILD_PAIR_CONDITION(use_limit, "limit", JSON_BUILD_UNSIGNED(c->moom_mem_pressure_limit))));
+}
+
+/* Pushes the current ManagedOOM* settings of @u to systemd-oomd, if a varlink
+ * connection exists (system mode) or can be established (user mode). Returns 0
+ * when there is nothing to send, negative errno on failure. */
+int manager_varlink_send_managed_oom_update(Unit *u) {
+ _cleanup_(json_variant_unrefp) JsonVariant *arr = NULL, *v = NULL;
+ CGroupContext *c;
+ int r;
+
+ assert(u);
+
+ if (!UNIT_VTABLE(u)->can_set_managed_oom || !u->manager || !u->cgroup_path)
+ return 0;
+
+ if (MANAGER_IS_SYSTEM(u->manager)) {
+ /* In system mode we can't send any notifications unless oomd connected back to us. In this
+ * mode oomd must initiate communication, not us. */
+ if (!u->manager->managed_oom_varlink)
+ return 0;
+ } else {
+ /* If we are in user mode, let's connect to oomd if we aren't connected yet. In this mode we
+ * must initiate communication to oomd, not the other way round. */
+ r = manager_varlink_init(u->manager);
+ if (r <= 0)
+ return r;
+ }
+
+ c = unit_get_cgroup_context(u);
+ if (!c)
+ return 0;
+
+ r = json_build(&arr, JSON_BUILD_EMPTY_ARRAY);
+ if (r < 0)
+ return r;
+
+ /* Build one array element per ManagedOOM* property. */
+ for (size_t i = 0; i < ELEMENTSOF(managed_oom_mode_properties); i++) {
+ _cleanup_(json_variant_unrefp) JsonVariant *e = NULL;
+
+ r = build_managed_oom_json_array_element(u, managed_oom_mode_properties[i], &e);
+ if (r < 0)
+ return r;
+
+ r = json_variant_append_array(&arr, e);
+ if (r < 0)
+ return r;
+ }
+
+ r = json_build(&v, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("cgroups", JSON_BUILD_VARIANT(arr))));
+ if (r < 0)
+ return r;
+
+ if (MANAGER_IS_SYSTEM(u->manager))
+ /* in system mode, oomd is our client, thus send out notifications as replies to the
+ * initiating method call from them. */
+ r = varlink_notify(u->manager->managed_oom_varlink, v);
+ else
+ /* in user mode, we are oomd's client, thus send out notifications as method calls that do
+ * not expect a reply. */
+ r = varlink_send(u->manager->managed_oom_varlink, "io.systemd.oom.ReportManagedOOMCGroups", v);
+
+ return r;
+}
+
+/* Builds the initial "cgroups" array for systemd-oomd: iterates over all units
+ * of types that support ManagedOOM*, skipping inactive units and properties
+ * left at their default, i.e. not set to "kill". */
+static int build_managed_oom_cgroups_json(Manager *m, JsonVariant **ret) {
+ _cleanup_(json_variant_unrefp) JsonVariant *v = NULL, *arr = NULL;
+ int r;
+
+ assert(m);
+ assert(ret);
+
+ r = json_build(&arr, JSON_BUILD_EMPTY_ARRAY);
+ if (r < 0)
+ return r;
+
+ for (UnitType t = 0; t < _UNIT_TYPE_MAX; t++) {
+
+ if (!unit_vtable[t]->can_set_managed_oom)
+ continue;
+
+ LIST_FOREACH(units_by_type, u, m->units_by_type[t]) {
+ CGroupContext *c;
+
+ if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u)))
+ continue;
+
+ c = unit_get_cgroup_context(u);
+ if (!c)
+ continue;
+
+ for (size_t j = 0; j < ELEMENTSOF(managed_oom_mode_properties); j++) {
+ _cleanup_(json_variant_unrefp) JsonVariant *e = NULL;
+
+ /* For the initial varlink call we only care about units that enabled (i.e. mode is not
+ * set to "auto") oomd properties. */
+ if (!(streq(managed_oom_mode_properties[j], "ManagedOOMSwap") && c->moom_swap == MANAGED_OOM_KILL) &&
+ !(streq(managed_oom_mode_properties[j], "ManagedOOMMemoryPressure") && c->moom_mem_pressure == MANAGED_OOM_KILL))
+ continue;
+
+ r = build_managed_oom_json_array_element(u, managed_oom_mode_properties[j], &e);
+ if (r < 0)
+ return r;
+
+ r = json_variant_append_array(&arr, e);
+ if (r < 0)
+ return r;
+ }
+ }
+ }
+
+ r = json_build(&v, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("cgroups", JSON_BUILD_VARIANT(arr))));
+ if (r < 0)
+ return r;
+
+ *ret = TAKE_PTR(v);
+ return 0;
+}
+
+/* Varlink method io.systemd.ManagedOOM.SubscribeManagedOOMCGroups: serves the
+ * initial cgroup listing to systemd-oomd and, when called with the "more" flag,
+ * registers the connection as the single subscriber for future updates. Only
+ * the systemd-oomd.service unit is allowed to call this. */
+static int vl_method_subscribe_managed_oom_cgroups(
+ Varlink *link,
+ JsonVariant *parameters,
+ VarlinkMethodFlags flags,
+ void *userdata) {
+
+ _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ pid_t pid;
+ Unit *u;
+ int r;
+
+ assert(link);
+
+ r = varlink_get_peer_pid(link, &pid);
+ if (r < 0)
+ return r;
+
+ u = manager_get_unit_by_pid(m, pid);
+ if (!u)
+ return varlink_error(link, VARLINK_ERROR_PERMISSION_DENIED, NULL);
+
+ /* This is meant to be a deterrent and not actual security. The alternative is to check for the systemd-oom
+ * user that this unit runs as, but NSS lookups are blocking and not allowed from PID 1. */
+ if (!streq(u->id, "systemd-oomd.service"))
+ return varlink_error(link, VARLINK_ERROR_PERMISSION_DENIED, NULL);
+
+ /* The method takes no parameters. */
+ if (json_variant_elements(parameters) > 0)
+ return varlink_error_invalid_parameter(link, parameters);
+
+ /* We only take one subscriber for this method so return an error if there's already an existing one.
+ * This shouldn't happen since systemd-oomd is the only client of this method. */
+ if (FLAGS_SET(flags, VARLINK_METHOD_MORE) && m->managed_oom_varlink)
+ return varlink_error(link, "io.systemd.ManagedOOM.SubscriptionTaken", NULL);
+
+ r = build_managed_oom_cgroups_json(m, &v);
+ if (r < 0)
+ return r;
+
+ /* Without the "more" flag this is a plain one-shot query. */
+ if (!FLAGS_SET(flags, VARLINK_METHOD_MORE))
+ return varlink_reply(link, v);
+
+ assert(!m->managed_oom_varlink);
+ m->managed_oom_varlink = varlink_ref(link);
+ return varlink_notify(m->managed_oom_varlink, v);
+}
+
+/* Sends the full initial cgroup listing to oomd right after connecting. Only
+ * used in user mode: in system mode oomd subscribes to us instead and gets the
+ * listing as a method reply (see vl_method_subscribe_managed_oom_cgroups()). */
+static int manager_varlink_send_managed_oom_initial(Manager *m) {
+ _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+ int r;
+
+ assert(m);
+
+ if (MANAGER_IS_SYSTEM(m))
+ return 0;
+
+ assert(m->managed_oom_varlink);
+
+ r = build_managed_oom_cgroups_json(m, &v);
+ if (r < 0)
+ return r;
+
+ return varlink_send(m->managed_oom_varlink, "io.systemd.oom.ReportManagedOOMCGroups", v);
+}
+
+/* Varlink method io.systemd.UserDatabase.GetUserRecord: resolves dynamic users
+ * by uid, by name, or — when neither is given — enumerates all realized dynamic
+ * users, streaming all but the last record as "more" replies. */
+static int vl_method_get_user_record(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+
+ static const JsonDispatch dispatch_table[] = {
+ { "uid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(LookupParameters, uid), 0 },
+ { "userName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, user_name), JSON_SAFE },
+ { "service", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, service), 0 },
+ {}
+ };
+
+ _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+ LookupParameters p = {
+ .uid = UID_INVALID,
+ };
+ _cleanup_free_ char *found_name = NULL;
+ uid_t found_uid = UID_INVALID, uid;
+ Manager *m = ASSERT_PTR(userdata);
+ const char *un;
+ int r;
+
+ assert(parameters);
+
+ r = varlink_dispatch(link, parameters, dispatch_table, &p);
+ if (r != 0)
+ return r;
+
+ /* We only answer for our own service. */
+ if (!streq_ptr(p.service, "io.systemd.DynamicUser"))
+ return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL);
+
+ if (uid_is_valid(p.uid))
+ r = dynamic_user_lookup_uid(m, p.uid, &found_name);
+ else if (p.user_name)
+ r = dynamic_user_lookup_name(m, p.user_name, &found_uid);
+ else {
+ /* Neither filter given: enumerate all realized dynamic users. */
+ DynamicUser *d;
+
+ HASHMAP_FOREACH(d, m->dynamic_users) {
+ r = dynamic_user_current(d, &uid);
+ if (r == -EAGAIN) /* not realized yet? */
+ continue;
+ if (r < 0)
+ return r;
+
+ if (!user_match_lookup_parameters(&p, d->name, uid))
+ continue;
+
+ /* Flush out the previous record, keeping the newest one
+ * back so it can be sent as the final reply. */
+ if (v) {
+ r = varlink_notify(link, v);
+ if (r < 0)
+ return r;
+
+ v = json_variant_unref(v);
+ }
+
+ r = build_user_json(d->name, uid, &v);
+ if (r < 0)
+ return r;
+ }
+
+ if (!v)
+ return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
+
+ return varlink_reply(link, v);
+ }
+ if (r == -ESRCH)
+ return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
+ if (r < 0)
+ return r;
+
+ /* Combine the half we looked up with the half the client supplied. */
+ uid = uid_is_valid(found_uid) ? found_uid : p.uid;
+ un = found_name ?: p.user_name;
+
+ if (!user_match_lookup_parameters(&p, un, uid))
+ return varlink_error(link, "io.systemd.UserDatabase.ConflictingRecordFound", NULL);
+
+ r = build_user_json(un, uid, &v);
+ if (r < 0)
+ return r;
+
+ return varlink_reply(link, v);
+}
+
+/* Serializes a dynamic group as a JSON group record in the
+ * io.systemd.UserDatabase format, analogous to build_user_json() above. */
+static int build_group_json(const char *group_name, gid_t gid, JsonVariant **ret) {
+ assert(group_name);
+ assert(gid_is_valid(gid));
+ assert(ret);
+
+ return json_build(ret, JSON_BUILD_OBJECT(
+ JSON_BUILD_PAIR("record", JSON_BUILD_OBJECT(
+ JSON_BUILD_PAIR("groupName", JSON_BUILD_STRING(group_name)),
+ JSON_BUILD_PAIR("description", JSON_BUILD_CONST_STRING("Dynamic Group")),
+ JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(gid)),
+ JSON_BUILD_PAIR("service", JSON_BUILD_CONST_STRING("io.systemd.DynamicUser")),
+ JSON_BUILD_PAIR("disposition", JSON_BUILD_CONST_STRING("dynamic"))))));
+}
+
+/* Checks whether a candidate group entry satisfies every filter the client
+ * supplied in @p; unset filters match anything. */
+static bool group_match_lookup_parameters(LookupParameters *p, const char *name, gid_t gid) {
+ assert(p);
+
+ if (gid_is_valid(p->gid) && gid != p->gid)
+ return false;
+
+ return !p->group_name || streq(name, p->group_name);
+}
+
+/* Varlink method io.systemd.UserDatabase.GetGroupRecord: resolves dynamic
+ * groups by gid, by name, or enumerates all of them. Dynamic users register
+ * user and group under the same numeric ID, hence the uid lookup helpers are
+ * reused here with casts. */
+static int vl_method_get_group_record(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+
+ static const JsonDispatch dispatch_table[] = {
+ { "gid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(LookupParameters, gid), 0 },
+ { "groupName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, group_name), JSON_SAFE },
+ { "service", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, service), 0 },
+ {}
+ };
+
+ _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+ LookupParameters p = {
+ .gid = GID_INVALID,
+ };
+ _cleanup_free_ char *found_name = NULL;
+ gid_t found_gid = GID_INVALID, gid;
+ Manager *m = ASSERT_PTR(userdata);
+ const char *gn;
+ int r;
+
+ assert(parameters);
+
+ r = varlink_dispatch(link, parameters, dispatch_table, &p);
+ if (r != 0)
+ return r;
+
+ /* We only answer for our own service. */
+ if (!streq_ptr(p.service, "io.systemd.DynamicUser"))
+ return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL);
+
+ if (gid_is_valid(p.gid))
+ r = dynamic_user_lookup_uid(m, (uid_t) p.gid, &found_name);
+ else if (p.group_name)
+ r = dynamic_user_lookup_name(m, p.group_name, (uid_t*) &found_gid);
+ else {
+ /* Neither filter given: enumerate all realized dynamic users. */
+ DynamicUser *d;
+
+ HASHMAP_FOREACH(d, m->dynamic_users) {
+ uid_t uid;
+
+ r = dynamic_user_current(d, &uid);
+ if (r == -EAGAIN)
+ continue;
+ if (r < 0)
+ return r;
+
+ if (!group_match_lookup_parameters(&p, d->name, (gid_t) uid))
+ continue;
+
+ /* Flush out the previous record, keeping the newest one
+ * back so it can be sent as the final reply. */
+ if (v) {
+ r = varlink_notify(link, v);
+ if (r < 0)
+ return r;
+
+ v = json_variant_unref(v);
+ }
+
+ r = build_group_json(d->name, (gid_t) uid, &v);
+ if (r < 0)
+ return r;
+ }
+
+ if (!v)
+ return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
+
+ return varlink_reply(link, v);
+ }
+ if (r == -ESRCH)
+ return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
+ if (r < 0)
+ return r;
+
+ /* Combine the half we looked up with the half the client supplied. */
+ gid = gid_is_valid(found_gid) ? found_gid : p.gid;
+ gn = found_name ?: p.group_name;
+
+ if (!group_match_lookup_parameters(&p, gn, gid))
+ return varlink_error(link, "io.systemd.UserDatabase.ConflictingRecordFound", NULL);
+
+ r = build_group_json(gn, gid, &v);
+ if (r < 0)
+ return r;
+
+ return varlink_reply(link, v);
+}
+
+/* Varlink method io.systemd.UserDatabase.GetMemberships: validates the request,
+ * then always reports "no record", since dynamic users carry no auxiliary group
+ * memberships. */
+static int vl_method_get_memberships(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+
+ static const JsonDispatch dispatch_table[] = {
+ { "userName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, user_name), JSON_SAFE },
+ { "groupName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, group_name), JSON_SAFE },
+ { "service", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, service), 0 },
+ {}
+ };
+
+ LookupParameters p = {};
+ int r;
+
+ assert(parameters);
+
+ r = varlink_dispatch(link, parameters, dispatch_table, &p);
+ if (r != 0)
+ return r;
+
+ if (!streq_ptr(p.service, "io.systemd.DynamicUser"))
+ return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL);
+
+ /* We don't support auxiliary groups with dynamic users. */
+ return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
+}
+
+/* Server disconnect handler: if the departing connection is the oomd
+ * subscription, drop our reference to it. */
+static void vl_disconnect(VarlinkServer *s, Varlink *link, void *userdata) {
+ Manager *m = ASSERT_PTR(userdata);
+
+ assert(s);
+ assert(link);
+
+ if (link == m->managed_oom_varlink)
+ m->managed_oom_varlink = varlink_unref(link);
+}
+
+/* Sets up the varlink server for system mode: binds the UserDatabase and
+ * ManagedOOM sockets (unless reloading, where fds come back via
+ * deserialization) and attaches the server to the event loop. Returns 1 on
+ * success or if already set up, 0 when not applicable, negative errno on
+ * failure. */
+static int manager_varlink_init_system(Manager *m) {
+ _cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL;
+ int r;
+
+ assert(m);
+
+ if (m->varlink_server)
+ return 1;
+
+ if (!MANAGER_IS_SYSTEM(m))
+ return 0;
+
+ r = manager_setup_varlink_server(m, &s);
+ if (r < 0)
+ return log_error_errno(r, "Failed to set up varlink server: %m");
+
+ if (!MANAGER_IS_TEST_RUN(m)) {
+ (void) mkdir_p_label("/run/systemd/userdb", 0755);
+
+ FOREACH_STRING(address, "/run/systemd/userdb/io.systemd.DynamicUser", VARLINK_ADDR_PATH_MANAGED_OOM_SYSTEM) {
+ if (MANAGER_IS_RELOADING(m)) {
+ /* If manager is reloading, we skip listening on existing addresses, since
+ * the fd should be acquired later through deserialization. */
+ if (access(address, F_OK) >= 0)
+ continue;
+ if (errno != ENOENT)
+ return log_error_errno(errno,
+ "Failed to check if varlink socket '%s' exists: %m", address);
+ }
+
+ r = varlink_server_listen_address(s, address, 0666);
+ if (r < 0)
+ return log_error_errno(r, "Failed to bind to varlink socket '%s': %m", address);
+ }
+ }
+
+ r = varlink_server_attach_event(s, m->event, SD_EVENT_PRIORITY_NORMAL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to attach varlink connection to event loop: %m");
+
+ m->varlink_server = TAKE_PTR(s);
+ return 1;
+}
+
+/* Reply callback bound to our client connection to oomd (user mode): when the
+ * connection reports a local error (i.e. it was closed, e.g. because oomd
+ * restarted), drop it and try to reconnect, which also resends the initial
+ * ManagedOOM update. */
+static int vl_reply(Varlink *link, JsonVariant *parameters, const char *error_id, VarlinkReplyFlags flags, void *userdata) {
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ if (error_id)
+ log_debug("varlink systemd-oomd client error: %s", error_id);
+
+ if (FLAGS_SET(flags, VARLINK_REPLY_ERROR) && FLAGS_SET(flags, VARLINK_REPLY_LOCAL)) {
+ /* Varlink connection was closed, likely because of systemd-oomd restart. Let's try to
+ * reconnect and send the initial ManagedOOM update again. */
+
+ m->managed_oom_varlink = varlink_unref(link);
+
+ log_debug("Reconnecting to %s", VARLINK_ADDR_PATH_MANAGED_OOM_USER);
+
+ r = manager_varlink_init(m);
+ if (r <= 0)
+ return r;
+ }
+
+ return 0;
+}
+
+/* Connects to systemd-oomd in user mode (where we are the client) and queues
+ * the initial ManagedOOM update. Returns 1 on success or if already connected,
+ * 0 if oomd's socket is unavailable or in test runs, negative errno on other
+ * failures. */
+static int manager_varlink_init_user(Manager *m) {
+ _cleanup_(varlink_close_unrefp) Varlink *link = NULL;
+ int r;
+
+ assert(m);
+
+ if (m->managed_oom_varlink)
+ return 1;
+
+ if (MANAGER_IS_TEST_RUN(m))
+ return 0;
+
+ r = varlink_connect_address(&link, VARLINK_ADDR_PATH_MANAGED_OOM_USER);
+ if (r < 0) {
+ /* oomd not running is not an error in user mode, just skip. */
+ if (r == -ENOENT || ERRNO_IS_DISCONNECT(r)) {
+ log_debug("systemd-oomd varlink unix socket not found, skipping user manager varlink setup");
+ return 0;
+ }
+ return log_error_errno(r, "Failed to connect to %s: %m", VARLINK_ADDR_PATH_MANAGED_OOM_USER);
+ }
+
+ varlink_set_userdata(link, m);
+
+ /* vl_reply handles reconnection when the connection dies. */
+ r = varlink_bind_reply(link, vl_reply);
+ if (r < 0)
+ return r;
+
+ r = varlink_attach_event(link, m->event, SD_EVENT_PRIORITY_NORMAL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to attach varlink connection to event loop: %m");
+
+ m->managed_oom_varlink = TAKE_PTR(link);
+
+ /* Queue the initial ManagedOOM update. */
+ (void) manager_varlink_send_managed_oom_initial(m);
+
+ return 1;
+}
+
+/* Allocates a VarlinkServer and binds the UserDatabase and ManagedOOM
+ * interfaces, methods and disconnect handler. Does not bind sockets or attach
+ * to the event loop — that is the caller's job (also used during manager
+ * serialize/deserialize). All failures are logged at debug level, leaving
+ * user-visible logging to the caller. */
+int manager_setup_varlink_server(Manager *m, VarlinkServer **ret) {
+ _cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL;
+ int r;
+
+ assert(m);
+ assert(ret);
+
+ r = varlink_server_new(&s, VARLINK_SERVER_ACCOUNT_UID|VARLINK_SERVER_INHERIT_USERDATA);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to allocate varlink server object: %m");
+
+ varlink_server_set_userdata(s, m);
+
+ r = varlink_server_add_interface_many(
+ s,
+ &vl_interface_io_systemd_UserDatabase,
+ &vl_interface_io_systemd_ManagedOOM);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to add interfaces to varlink server: %m");
+
+ r = varlink_server_bind_method_many(
+ s,
+ "io.systemd.UserDatabase.GetUserRecord", vl_method_get_user_record,
+ "io.systemd.UserDatabase.GetGroupRecord", vl_method_get_group_record,
+ "io.systemd.UserDatabase.GetMemberships", vl_method_get_memberships,
+ "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups", vl_method_subscribe_managed_oom_cgroups);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to register varlink methods: %m");
+
+ r = varlink_server_bind_disconnect(s, vl_disconnect);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to register varlink disconnect handler: %m");
+
+ *ret = TAKE_PTR(s);
+ return 0;
+}
+
+/* Dispatches to the mode-specific setup: in system mode we run a varlink
+ * server, in user mode we connect to oomd as a client. */
+int manager_varlink_init(Manager *m) {
+ if (MANAGER_IS_SYSTEM(m))
+ return manager_varlink_init_system(m);
+
+ return manager_varlink_init_user(m);
+}
+
+/* Tears down all varlink state of the manager: the oomd connection and the
+ * varlink server. */
+void manager_varlink_done(Manager *m) {
+ assert(m);
+
+ /* Explicitly close the varlink connection to oomd. Note we first take the varlink connection out of
+ * the manager, and only then disconnect it — in two steps – so that we don't end up accidentally
+ * unreffing it twice. After all, closing the connection might cause the disconnect handler we
+ * installed (vl_disconnect() above) to be called, where we will unref it too. */
+ varlink_close_unref(TAKE_PTR(m->managed_oom_varlink));
+
+ m->varlink_server = varlink_server_unref(m->varlink_server);
+}
diff --git a/src/core/core-varlink.h b/src/core/core-varlink.h
new file mode 100644
index 0000000..7f810d1
--- /dev/null
+++ b/src/core/core-varlink.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "manager.h"
+
+int manager_varlink_init(Manager *m);
+void manager_varlink_done(Manager *m);
+
+/* Creates a new VarlinkServer and binds methods. Does not set up sockets or attach events.
+ * Used for manager serialize/deserialize. */
+int manager_setup_varlink_server(Manager *m, VarlinkServer **ret_s);
+
+/* The manager is expected to send an update to systemd-oomd if one of the following occurs:
+ * - The value of ManagedOOM*= properties change
+ * - A unit with ManagedOOM*= properties changes unit active state */
+int manager_varlink_send_managed_oom_update(Unit *u);
diff --git a/src/core/crash-handler.c b/src/core/crash-handler.c
new file mode 100644
index 0000000..f5c31b6
--- /dev/null
+++ b/src/core/crash-handler.c
@@ -0,0 +1,193 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/reboot.h>
+
+#include "sd-messages.h"
+
+#include "crash-handler.h"
+#include "exit-status.h"
+#include "macro.h"
+#include "main.h"
+#include "missing_syscall.h"
+#include "process-util.h"
+#include "raw-clone.h"
+#include "rlimit-util.h"
+#include "signal-util.h"
+#include "terminal-util.h"
+#include "virt.h"
+
+/* Final fallback after a crash: exits when running in a container (so the
+ * container manager sees EXIT_EXCEPTION), reboots if arg_crash_reboot is set,
+ * otherwise syncs disks and freezes forever. Never returns. */
+_noreturn_ void freeze_or_exit_or_reboot(void) {
+
+ /* If we are running in a container, let's prefer exiting, after all we can propagate an exit code to
+ * the container manager, and thus inform it that something went wrong. */
+ if (detect_container() > 0) {
+ log_struct(LOG_EMERG,
+ LOG_MESSAGE("Exiting PID 1..."),
+ "MESSAGE_ID=" SD_MESSAGE_CRASH_EXIT_STR);
+ _exit(EXIT_EXCEPTION);
+ }
+
+ if (arg_crash_reboot) {
+ log_notice("Rebooting in 10s...");
+ (void) sleep(10);
+
+ log_notice("Rebooting now...");
+ (void) reboot(RB_AUTOBOOT);
+ /* reboot() only returns on failure; fall through to freezing. */
+ log_struct_errno(LOG_EMERG, errno,
+ LOG_MESSAGE("Failed to reboot: %m"),
+ "MESSAGE_ID=" SD_MESSAGE_CRASH_FAILED_STR);
+ }
+
+ log_struct(LOG_EMERG,
+ LOG_MESSAGE("Freezing execution."),
+ "MESSAGE_ID=" SD_MESSAGE_CRASH_FREEZE_STR);
+ sync();
+ freeze();
+}
+
+/* Signal handler for crash signals (installed by install_crash_handler()):
+ * optionally forks a child to produce a core dump, logs the outcome, switches
+ * VT and spawns a crash shell if configured, then freezes/exits/reboots.
+ * Never returns. */
+_noreturn_ static void crash(int sig, siginfo_t *siginfo, void *context) {
+ struct sigaction sa;
+ pid_t pid;
+
+ /* NB: 💣 💣 💣 This is a signal handler, most likely executed in a situation where we have corrupted
+ * memory. Thus: please avoid any libc memory allocation here, or any functions that internally use
+ * memory allocation, as we cannot rely on memory allocation still working at this point! (Note that
+ * memory allocation is not async-signal-safe anyway — see signal-safety(7) for details —, and thus
+ * is not permissible in signal handlers.) */
+
+ if (getpid_cached() != 1)
+ /* Pass this on immediately, if this is not PID 1 */
+ propagate_signal(sig, siginfo);
+ else if (!arg_dump_core)
+ log_struct(LOG_EMERG,
+ LOG_MESSAGE("Caught <%s>, not dumping core.", signal_to_string(sig)),
+ "MESSAGE_ID=" SD_MESSAGE_CRASH_NO_COREDUMP_STR);
+ else {
+ sa = (struct sigaction) {
+ .sa_handler = nop_signal_handler,
+ .sa_flags = SA_NOCLDSTOP|SA_RESTART,
+ };
+
+ /* We want to wait for the core process, hence let's enable SIGCHLD */
+ (void) sigaction(SIGCHLD, &sa, NULL);
+
+ pid = raw_clone(SIGCHLD);
+ if (pid < 0)
+ log_struct_errno(LOG_EMERG, errno,
+ LOG_MESSAGE("Caught <%s>, cannot fork for core dump: %m", signal_to_string(sig)),
+ "MESSAGE_ID=" SD_MESSAGE_CRASH_NO_FORK_STR);
+ else if (pid == 0) {
+ /* Enable default signal handler for core dump */
+
+ sa = (struct sigaction) {
+ .sa_handler = SIG_DFL,
+ };
+ (void) sigaction(sig, &sa, NULL);
+
+ /* Don't limit the coredump size */
+ (void) setrlimit(RLIMIT_CORE, &RLIMIT_MAKE_CONST(RLIM_INFINITY));
+
+ /* Just to be sure... */
+ (void) chdir("/");
+
+ /* Raise the signal again */
+ propagate_signal(sig, siginfo);
+ assert_not_reached();
+ /* NOTE(review): with the default handler restored, re-raising the
+ * signal should kill the child; the _exit() below looks like
+ * unreachable belt-and-suspenders after assert_not_reached() —
+ * confirm intent. */
+ _exit(EXIT_EXCEPTION);
+ } else {
+ siginfo_t status;
+ int r;
+
+ /* Log where the crash signal came from, if known. */
+ if (siginfo) {
+ if (siginfo->si_pid == 0)
+ log_struct(LOG_EMERG,
+ LOG_MESSAGE("Caught <%s>, from unknown sender process.", signal_to_string(sig)),
+ "MESSAGE_ID=" SD_MESSAGE_CRASH_UNKNOWN_SIGNAL_STR);
+ else if (siginfo->si_pid == 1)
+ log_struct(LOG_EMERG,
+ LOG_MESSAGE("Caught <%s>, from our own process.", signal_to_string(sig)),
+ "MESSAGE_ID=" SD_MESSAGE_CRASH_SYSTEMD_SIGNAL_STR);
+ else
+ log_struct(LOG_EMERG,
+ LOG_MESSAGE("Caught <%s> from PID "PID_FMT".", signal_to_string(sig), siginfo->si_pid),
+ "MESSAGE_ID=" SD_MESSAGE_CRASH_PROCESS_SIGNAL_STR);
+ }
+
+ /* Order things nicely. */
+ r = wait_for_terminate(pid, &status);
+ if (r < 0)
+ log_struct_errno(LOG_EMERG, r,
+ LOG_MESSAGE("Caught <%s>, waitpid() failed: %m", signal_to_string(sig)),
+ "MESSAGE_ID=" SD_MESSAGE_CRASH_WAITPID_FAILED_STR);
+ else if (status.si_code != CLD_DUMPED) {
+ const char *s = status.si_code == CLD_EXITED ?
+ exit_status_to_string(status.si_status, EXIT_STATUS_LIBC) :
+ signal_to_string(status.si_status);
+
+ log_struct(LOG_EMERG,
+ LOG_MESSAGE("Caught <%s>, core dump failed (child "PID_FMT", code=%s, status=%i/%s).",
+ signal_to_string(sig),
+ pid,
+ sigchld_code_to_string(status.si_code),
+ status.si_status,
+ strna(s)),
+ "MESSAGE_ID=" SD_MESSAGE_CRASH_COREDUMP_FAILED_STR);
+ } else
+ log_struct(LOG_EMERG,
+ LOG_MESSAGE("Caught <%s>, dumped core as pid "PID_FMT".",
+ signal_to_string(sig), pid),
+ "MESSAGE_ID=" SD_MESSAGE_CRASH_COREDUMP_PID_STR);
+ }
+ }
+
+ if (arg_crash_chvt >= 0)
+ (void) chvt(arg_crash_chvt);
+
+ sa = (struct sigaction) {
+ .sa_handler = SIG_IGN,
+ .sa_flags = SA_NOCLDSTOP|SA_NOCLDWAIT|SA_RESTART,
+ };
+
+ /* Let the kernel reap children for us */
+ (void) sigaction(SIGCHLD, &sa, NULL);
+
+ if (arg_crash_shell) {
+ log_notice("Executing crash shell in 10s...");
+ (void) sleep(10);
+
+ pid = raw_clone(SIGCHLD);
+ if (pid < 0)
+ log_struct_errno(LOG_EMERG, errno,
+ LOG_MESSAGE("Failed to fork off crash shell: %m"),
+ "MESSAGE_ID=" SD_MESSAGE_CRASH_SHELL_FORK_FAILED_STR);
+ else if (pid == 0) {
+ (void) setsid();
+ (void) make_console_stdio();
+ (void) rlimit_nofile_safe();
+ (void) execle("/bin/sh", "/bin/sh", NULL, environ);
+
+ log_struct_errno(LOG_EMERG, errno,
+ LOG_MESSAGE("execle() failed: %m"),
+ "MESSAGE_ID=" SD_MESSAGE_CRASH_EXECLE_FAILED_STR);
+ _exit(EXIT_EXCEPTION);
+ } else {
+ log_info("Spawned crash shell as PID "PID_FMT".", pid);
+ (void) wait_for_terminate(pid, NULL);
+ }
+ }
+
+ freeze_or_exit_or_reboot();
+}
+
+/* Installs crash() as handler for the crash signal set. Failure is only logged
+ * at debug level — running without a crash handler is acceptable. */
+void install_crash_handler(void) {
+ static const struct sigaction sa = {
+ .sa_sigaction = crash,
+ .sa_flags = SA_NODEFER | SA_SIGINFO, /* So that we can raise the signal again from the signal handler */
+ };
+ int r;
+
+ /* We ignore the return value here, since, we don't mind if we cannot set up a crash handler */
+ r = sigaction_many(&sa, SIGNALS_CRASH_HANDLER);
+ if (r < 0)
+ log_debug_errno(r, "I had trouble setting up the crash handler, ignoring: %m");
+}
diff --git a/src/core/crash-handler.h b/src/core/crash-handler.h
new file mode 100644
index 0000000..dc14335
--- /dev/null
+++ b/src/core/crash-handler.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "macro.h"
+
+_noreturn_ void freeze_or_exit_or_reboot(void);
+void install_crash_handler(void);
diff --git a/src/core/dbus-automount.c b/src/core/dbus-automount.c
new file mode 100644
index 0000000..881bf50
--- /dev/null
+++ b/src/core/dbus-automount.c
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "automount.h"
+#include "bus-get-properties.h"
+#include "dbus-automount.h"
+#include "dbus-util.h"
+#include "string-util.h"
+
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, automount_result, AutomountResult);
+
+const sd_bus_vtable bus_automount_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ SD_BUS_PROPERTY("Where", "s", NULL, offsetof(Automount, where), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ExtraOptions", "s", NULL, offsetof(Automount, extra_options), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DirectoryMode", "u", bus_property_get_mode, offsetof(Automount, directory_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Automount, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("TimeoutIdleUSec", "t", bus_property_get_usec, offsetof(Automount, timeout_idle_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_VTABLE_END
+};
+
+static int bus_automount_set_transient_property(
+ Automount *a,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ Unit *u = UNIT(a);
+
+ assert(a);
+ assert(name);
+ assert(message);
+
+ flags |= UNIT_PRIVATE;
+
+ if (streq(name, "Where"))
+ return bus_set_transient_path(u, name, &a->where, message, flags, error);
+
+ if (streq(name, "ExtraOptions"))
+ return bus_set_transient_string(u, name, &a->extra_options, message, flags, error);
+
+ if (streq(name, "TimeoutIdleUSec"))
+ return bus_set_transient_usec_fix_0(u, name, &a->timeout_idle_usec, message, flags, error);
+
+ if (streq(name, "DirectoryMode"))
+ return bus_set_transient_mode_t(u, name, &a->directory_mode, message, flags, error);
+
+ return 0;
+}
+
+int bus_automount_set_property(
+ Unit *u,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ Automount *a = AUTOMOUNT(u);
+
+ assert(a);
+ assert(name);
+ assert(message);
+
+ if (u->transient && u->load_state == UNIT_STUB) /* This is a transient unit? let's load a little more */
+ return bus_automount_set_transient_property(a, name, message, flags, error);
+
+ return 0;
+}
diff --git a/src/core/dbus-automount.h b/src/core/dbus-automount.h
new file mode 100644
index 0000000..cfceaec
--- /dev/null
+++ b/src/core/dbus-automount.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "unit.h"
+
+extern const sd_bus_vtable bus_automount_vtable[];
+
+int bus_automount_set_property(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c
new file mode 100644
index 0000000..8a9570f
--- /dev/null
+++ b/src/core/dbus-cgroup.c
@@ -0,0 +1,2287 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <arpa/inet.h>
+
+#include "af-list.h"
+#include "alloc-util.h"
+#include "bpf-firewall.h"
+#include "bpf-foreign.h"
+#include "bus-get-properties.h"
+#include "bus-util.h"
+#include "cgroup-util.h"
+#include "cgroup.h"
+#include "core-varlink.h"
+#include "dbus-cgroup.h"
+#include "dbus-util.h"
+#include "errno-util.h"
+#include "escape.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "firewall-util.h"
+#include "in-addr-prefix-util.h"
+#include "ip-protocol-list.h"
+#include "limits-util.h"
+#include "memstream-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "percent-util.h"
+#include "socket-util.h"
+
+BUS_DEFINE_PROPERTY_GET(bus_property_get_tasks_max, "t", CGroupTasksMax, cgroup_tasks_max_resolve);
+BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_cgroup_pressure_watch, cgroup_pressure_watch, CGroupPressureWatch);
+
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_cgroup_device_policy, cgroup_device_policy, CGroupDevicePolicy);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_managed_oom_mode, managed_oom_mode, ManagedOOMMode);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_managed_oom_preference, managed_oom_preference, ManagedOOMPreference);
+
+static int property_get_cgroup_mask(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ CGroupMask *mask = userdata;
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "s");
+ if (r < 0)
+ return r;
+
+ for (CGroupController ctrl = 0; ctrl < _CGROUP_CONTROLLER_MAX; ctrl++) {
+ if ((*mask & CGROUP_CONTROLLER_TO_MASK(ctrl)) == 0)
+ continue;
+
+ r = sd_bus_message_append(reply, "s", cgroup_controller_to_string(ctrl));
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_delegate_controllers(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ CGroupContext *c = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ if (!c->delegate)
+ return sd_bus_message_append(reply, "as", 0);
+
+ return property_get_cgroup_mask(bus, path, interface, property, reply, &c->delegate_controllers, error);
+}
+
+static int property_get_cpuset(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ CPUSet *cpus = ASSERT_PTR(userdata);
+ _cleanup_free_ uint8_t *array = NULL;
+ size_t allocated;
+
+ assert(bus);
+ assert(reply);
+
+ (void) cpu_set_to_dbus(cpus, &array, &allocated);
+ return sd_bus_message_append_array(reply, 'y', array, allocated);
+}
+
+static int property_get_io_device_weight(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ CGroupContext *c = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(st)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(device_weights, w, c->io_device_weights) {
+ r = sd_bus_message_append(reply, "(st)", w->path, w->weight);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_io_device_limits(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ CGroupContext *c = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(st)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(device_limits, l, c->io_device_limits) {
+ CGroupIOLimitType type;
+
+ type = cgroup_io_limit_type_from_string(property);
+ if (type < 0 || l->limits[type] == cgroup_io_limit_defaults[type])
+ continue;
+
+ r = sd_bus_message_append(reply, "(st)", l->path, l->limits[type]);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_io_device_latency(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ CGroupContext *c = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(st)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(device_latencies, l, c->io_device_latencies) {
+ r = sd_bus_message_append(reply, "(st)", l->path, l->target_usec);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_blockio_device_weight(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ CGroupContext *c = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(st)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
+ r = sd_bus_message_append(reply, "(st)", w->path, w->weight);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_blockio_device_bandwidths(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ CGroupContext *c = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(st)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
+ uint64_t v;
+
+ if (streq(property, "BlockIOReadBandwidth"))
+ v = b->rbps;
+ else
+ v = b->wbps;
+
+ if (v == CGROUP_LIMIT_MAX)
+ continue;
+
+ r = sd_bus_message_append(reply, "(st)", b->path, v);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_device_allow(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ CGroupContext *c = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(device_allow, a, c->device_allow) {
+ r = sd_bus_message_append(reply, "(ss)", a->path, cgroup_device_permissions_to_string(a->permissions));
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_ip_address_access(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Set **prefixes = ASSERT_PTR(userdata);
+ struct in_addr_prefix *i;
+ int r;
+
+ r = sd_bus_message_open_container(reply, 'a', "(iayu)");
+ if (r < 0)
+ return r;
+
+ SET_FOREACH(i, *prefixes) {
+
+ r = sd_bus_message_open_container(reply, 'r', "iayu");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(reply, "i", i->family);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append_array(reply, 'y', &i->address, FAMILY_ADDRESS_SIZE(i->family));
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(reply, "u", (uint32_t) i->prefixlen);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_bpf_foreign_program(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+ CGroupContext *c = userdata;
+ int r;
+
+ r = sd_bus_message_open_container(reply, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(programs, p, c->bpf_foreign_programs) {
+ const char *attach_type = bpf_cgroup_attach_type_to_string(p->attach_type);
+
+ r = sd_bus_message_append(reply, "(ss)", attach_type, p->bpffs_path);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_socket_bind(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ CGroupSocketBindItem **items = ASSERT_PTR(userdata);
+ int r;
+
+ r = sd_bus_message_open_container(reply, 'a', "(iiqq)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(socket_bind_items, i, *items) {
+ r = sd_bus_message_append(reply, "(iiqq)", i->address_family, i->ip_protocol, i->nr_ports, i->port_min);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_restrict_network_interfaces(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ CGroupContext *c = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'r', "bas");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(reply, "b", c->restrict_network_interfaces_is_allow_list);
+ if (r < 0)
+ return r;
+
+ r = bus_message_append_string_set(reply, c->restrict_network_interfaces);
+ if (r < 0)
+ return r;
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_cgroup_nft_set(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+ int r;
+ CGroupContext *c = userdata;
+
+ assert(bus);
+ assert(reply);
+ assert(c);
+
+ r = sd_bus_message_open_container(reply, 'a', "(iiss)");
+ if (r < 0)
+ return r;
+
+ FOREACH_ARRAY(nft_set, c->nft_set_context.sets, c->nft_set_context.n_sets) {
+ r = sd_bus_message_append(reply, "(iiss)", nft_set->source, nft_set->nfproto, nft_set->table, nft_set->set);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+const sd_bus_vtable bus_cgroup_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ SD_BUS_PROPERTY("Delegate", "b", bus_property_get_bool, offsetof(CGroupContext, delegate), 0),
+ SD_BUS_PROPERTY("DelegateControllers", "as", property_get_delegate_controllers, 0, 0),
+ SD_BUS_PROPERTY("DelegateSubgroup", "s", NULL, offsetof(CGroupContext, delegate_subgroup), 0),
+ SD_BUS_PROPERTY("CPUAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, cpu_accounting), 0),
+ SD_BUS_PROPERTY("CPUWeight", "t", NULL, offsetof(CGroupContext, cpu_weight), 0),
+ SD_BUS_PROPERTY("StartupCPUWeight", "t", NULL, offsetof(CGroupContext, startup_cpu_weight), 0),
+ SD_BUS_PROPERTY("CPUShares", "t", NULL, offsetof(CGroupContext, cpu_shares), 0),
+ SD_BUS_PROPERTY("StartupCPUShares", "t", NULL, offsetof(CGroupContext, startup_cpu_shares), 0),
+ SD_BUS_PROPERTY("CPUQuotaPerSecUSec", "t", bus_property_get_usec, offsetof(CGroupContext, cpu_quota_per_sec_usec), 0),
+ SD_BUS_PROPERTY("CPUQuotaPeriodUSec", "t", bus_property_get_usec, offsetof(CGroupContext, cpu_quota_period_usec), 0),
+ SD_BUS_PROPERTY("AllowedCPUs", "ay", property_get_cpuset, offsetof(CGroupContext, cpuset_cpus), 0),
+ SD_BUS_PROPERTY("StartupAllowedCPUs", "ay", property_get_cpuset, offsetof(CGroupContext, startup_cpuset_cpus), 0),
+ SD_BUS_PROPERTY("AllowedMemoryNodes", "ay", property_get_cpuset, offsetof(CGroupContext, cpuset_mems), 0),
+ SD_BUS_PROPERTY("StartupAllowedMemoryNodes", "ay", property_get_cpuset, offsetof(CGroupContext, startup_cpuset_mems), 0),
+ SD_BUS_PROPERTY("IOAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, io_accounting), 0),
+ SD_BUS_PROPERTY("IOWeight", "t", NULL, offsetof(CGroupContext, io_weight), 0),
+ SD_BUS_PROPERTY("StartupIOWeight", "t", NULL, offsetof(CGroupContext, startup_io_weight), 0),
+ SD_BUS_PROPERTY("IODeviceWeight", "a(st)", property_get_io_device_weight, 0, 0),
+ SD_BUS_PROPERTY("IOReadBandwidthMax", "a(st)", property_get_io_device_limits, 0, 0),
+ SD_BUS_PROPERTY("IOWriteBandwidthMax", "a(st)", property_get_io_device_limits, 0, 0),
+ SD_BUS_PROPERTY("IOReadIOPSMax", "a(st)", property_get_io_device_limits, 0, 0),
+ SD_BUS_PROPERTY("IOWriteIOPSMax", "a(st)", property_get_io_device_limits, 0, 0),
+ SD_BUS_PROPERTY("IODeviceLatencyTargetUSec", "a(st)", property_get_io_device_latency, 0, 0),
+ SD_BUS_PROPERTY("BlockIOAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, blockio_accounting), 0),
+ SD_BUS_PROPERTY("BlockIOWeight", "t", NULL, offsetof(CGroupContext, blockio_weight), 0),
+ SD_BUS_PROPERTY("StartupBlockIOWeight", "t", NULL, offsetof(CGroupContext, startup_blockio_weight), 0),
+ SD_BUS_PROPERTY("BlockIODeviceWeight", "a(st)", property_get_blockio_device_weight, 0, 0),
+ SD_BUS_PROPERTY("BlockIOReadBandwidth", "a(st)", property_get_blockio_device_bandwidths, 0, 0),
+ SD_BUS_PROPERTY("BlockIOWriteBandwidth", "a(st)", property_get_blockio_device_bandwidths, 0, 0),
+ SD_BUS_PROPERTY("MemoryAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, memory_accounting), 0),
+ SD_BUS_PROPERTY("DefaultMemoryLow", "t", NULL, offsetof(CGroupContext, default_memory_low), 0),
+ SD_BUS_PROPERTY("DefaultStartupMemoryLow", "t", NULL, offsetof(CGroupContext, default_startup_memory_low), 0),
+ SD_BUS_PROPERTY("DefaultMemoryMin", "t", NULL, offsetof(CGroupContext, default_memory_min), 0),
+ SD_BUS_PROPERTY("MemoryMin", "t", NULL, offsetof(CGroupContext, memory_min), 0),
+ SD_BUS_PROPERTY("MemoryLow", "t", NULL, offsetof(CGroupContext, memory_low), 0),
+ SD_BUS_PROPERTY("StartupMemoryLow", "t", NULL, offsetof(CGroupContext, startup_memory_low), 0),
+ SD_BUS_PROPERTY("MemoryHigh", "t", NULL, offsetof(CGroupContext, memory_high), 0),
+ SD_BUS_PROPERTY("StartupMemoryHigh", "t", NULL, offsetof(CGroupContext, startup_memory_high), 0),
+ SD_BUS_PROPERTY("MemoryMax", "t", NULL, offsetof(CGroupContext, memory_max), 0),
+ SD_BUS_PROPERTY("StartupMemoryMax", "t", NULL, offsetof(CGroupContext, startup_memory_max), 0),
+ SD_BUS_PROPERTY("MemorySwapMax", "t", NULL, offsetof(CGroupContext, memory_swap_max), 0),
+ SD_BUS_PROPERTY("StartupMemorySwapMax", "t", NULL, offsetof(CGroupContext, startup_memory_swap_max), 0),
+ SD_BUS_PROPERTY("MemoryZSwapMax", "t", NULL, offsetof(CGroupContext, memory_zswap_max), 0),
+ SD_BUS_PROPERTY("StartupMemoryZSwapMax", "t", NULL, offsetof(CGroupContext, startup_memory_zswap_max), 0),
+ SD_BUS_PROPERTY("MemoryLimit", "t", NULL, offsetof(CGroupContext, memory_limit), 0),
+ SD_BUS_PROPERTY("DevicePolicy", "s", property_get_cgroup_device_policy, offsetof(CGroupContext, device_policy), 0),
+ SD_BUS_PROPERTY("DeviceAllow", "a(ss)", property_get_device_allow, 0, 0),
+ SD_BUS_PROPERTY("TasksAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, tasks_accounting), 0),
+ SD_BUS_PROPERTY("TasksMax", "t", bus_property_get_tasks_max, offsetof(CGroupContext, tasks_max), 0),
+ SD_BUS_PROPERTY("IPAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, ip_accounting), 0),
+ SD_BUS_PROPERTY("IPAddressAllow", "a(iayu)", property_get_ip_address_access, offsetof(CGroupContext, ip_address_allow), 0),
+ SD_BUS_PROPERTY("IPAddressDeny", "a(iayu)", property_get_ip_address_access, offsetof(CGroupContext, ip_address_deny), 0),
+ SD_BUS_PROPERTY("IPIngressFilterPath", "as", NULL, offsetof(CGroupContext, ip_filters_ingress), 0),
+ SD_BUS_PROPERTY("IPEgressFilterPath", "as", NULL, offsetof(CGroupContext, ip_filters_egress), 0),
+ SD_BUS_PROPERTY("DisableControllers", "as", property_get_cgroup_mask, offsetof(CGroupContext, disable_controllers), 0),
+ SD_BUS_PROPERTY("ManagedOOMSwap", "s", property_get_managed_oom_mode, offsetof(CGroupContext, moom_swap), 0),
+ SD_BUS_PROPERTY("ManagedOOMMemoryPressure", "s", property_get_managed_oom_mode, offsetof(CGroupContext, moom_mem_pressure), 0),
+ SD_BUS_PROPERTY("ManagedOOMMemoryPressureLimit", "u", NULL, offsetof(CGroupContext, moom_mem_pressure_limit), 0),
+ SD_BUS_PROPERTY("ManagedOOMPreference", "s", property_get_managed_oom_preference, offsetof(CGroupContext, moom_preference), 0),
+ SD_BUS_PROPERTY("BPFProgram", "a(ss)", property_get_bpf_foreign_program, 0, 0),
+ SD_BUS_PROPERTY("SocketBindAllow", "a(iiqq)", property_get_socket_bind, offsetof(CGroupContext, socket_bind_allow), 0),
+ SD_BUS_PROPERTY("SocketBindDeny", "a(iiqq)", property_get_socket_bind, offsetof(CGroupContext, socket_bind_deny), 0),
+ SD_BUS_PROPERTY("RestrictNetworkInterfaces", "(bas)", property_get_restrict_network_interfaces, 0, 0),
+ SD_BUS_PROPERTY("MemoryPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(CGroupContext, memory_pressure_watch), 0),
+ SD_BUS_PROPERTY("MemoryPressureThresholdUSec", "t", bus_property_get_usec, offsetof(CGroupContext, memory_pressure_threshold_usec), 0),
+ SD_BUS_PROPERTY("NFTSet", "a(iiss)", property_get_cgroup_nft_set, 0, 0),
+ SD_BUS_PROPERTY("CoredumpReceive", "b", bus_property_get_bool, offsetof(CGroupContext, coredump_receive), 0),
+ SD_BUS_VTABLE_END
+};
+
+static int bus_cgroup_set_transient_property(
+ Unit *u,
+ CGroupContext *c,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ int r;
+
+ assert(u);
+ assert(c);
+ assert(name);
+ assert(message);
+
+ flags |= UNIT_PRIVATE;
+
+ if (streq(name, "Delegate")) {
+ int b;
+
+ if (!UNIT_VTABLE(u)->can_delegate)
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Delegation not available for unit type");
+
+ r = sd_bus_message_read(message, "b", &b);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->delegate = b;
+ c->delegate_controllers = b ? CGROUP_MASK_DELEGATE : 0;
+
+ unit_write_settingf(u, flags, name, "Delegate=%s", yes_no(b));
+ }
+
+ return 1;
+
+ } else if (streq(name, "DelegateSubgroup")) {
+ const char *s;
+
+ if (!UNIT_VTABLE(u)->can_delegate)
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Delegation not available for unit type");
+
+ r = sd_bus_message_read(message, "s", &s);
+ if (r < 0)
+ return r;
+
+ if (!isempty(s) && cg_needs_escape(s))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid control group name: %s", s);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (isempty(s))
+ c->delegate_subgroup = mfree(c->delegate_subgroup);
+ else {
+ r = free_and_strdup_warn(&c->delegate_subgroup, s);
+ if (r < 0)
+ return r;
+ }
+
+ unit_write_settingf(u, flags, name, "DelegateSubgroup=%s", s);
+ }
+
+ return 1;
+
+ } else if (STR_IN_SET(name, "DelegateControllers", "DisableControllers")) {
+ CGroupMask mask = 0;
+
+ if (streq(name, "DelegateControllers") && !UNIT_VTABLE(u)->can_delegate)
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Delegation not available for unit type");
+
+ r = sd_bus_message_enter_container(message, 'a', "s");
+ if (r < 0)
+ return r;
+
+ for (;;) {
+ CGroupController cc;
+ const char *t;
+
+ r = sd_bus_message_read(message, "s", &t);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ cc = cgroup_controller_from_string(t);
+ if (cc < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown cgroup controller '%s'", t);
+
+ mask |= CGROUP_CONTROLLER_TO_MASK(cc);
+ }
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *t = NULL;
+
+ r = cg_mask_to_string(mask, &t);
+ if (r < 0)
+ return r;
+
+ if (streq(name, "DelegateControllers")) {
+
+ c->delegate = true;
+ if (mask == 0)
+ c->delegate_controllers = 0;
+ else
+ c->delegate_controllers |= mask;
+
+ unit_write_settingf(u, flags, name, "Delegate=%s", strempty(t));
+
+ } else if (streq(name, "DisableControllers")) {
+
+ if (mask == 0)
+ c->disable_controllers = 0;
+ else
+ c->disable_controllers |= mask;
+
+ unit_write_settingf(u, flags, name, "%s=%s", name, strempty(t));
+ }
+ }
+
+ return 1;
+ } else if (STR_IN_SET(name, "IPIngressFilterPath", "IPEgressFilterPath")) {
+ char ***filters;
+ size_t n = 0;
+
+ filters = streq(name, "IPIngressFilterPath") ? &c->ip_filters_ingress : &c->ip_filters_egress;
+ r = sd_bus_message_enter_container(message, 'a', "s");
+ if (r < 0)
+ return r;
+
+ for (;;) {
+ const char *path;
+
+ r = sd_bus_message_read(message, "s", &path);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ if (!path_is_normalized(path) || !path_is_absolute(path))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= expects a normalized absolute path.", name);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags) && !strv_contains(*filters, path)) {
+ r = strv_extend(filters, path);
+ if (r < 0)
+ return log_oom();
+ }
+ n++;
+ }
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_(memstream_done) MemStream m = {};
+ _cleanup_free_ char *buf = NULL;
+ FILE *f;
+
+ if (n == 0)
+ *filters = strv_free(*filters);
+
+ unit_invalidate_cgroup_bpf(u);
+
+ f = memstream_init(&m);
+ if (!f)
+ return -ENOMEM;
+
+ fputs(name, f);
+ fputs("=\n", f);
+
+ STRV_FOREACH(entry, *filters)
+ fprintf(f, "%s=%s\n", name, *entry);
+
+ r = memstream_finalize(&m, &buf, NULL);
+ if (r < 0)
+ return r;
+
+ unit_write_setting(u, flags, name, buf);
+
+ if (*filters) {
+ r = bpf_firewall_supported();
+ if (r < 0)
+ return r;
+ if (r != BPF_FIREWALL_SUPPORTED_WITH_MULTI) {
+ static bool warned = false;
+
+ log_full(warned ? LOG_DEBUG : LOG_WARNING,
+ "Transient unit %s configures an IP firewall with BPF, but the local system does not support BPF/cgroup firewalling with multiple filters.\n"
+ "Starting this unit will fail! (This warning is only shown for the first started transient unit using IP firewalling.)", u->id);
+ warned = true;
+ }
+ }
+ }
+
+ return 1;
+ } else if (streq(name, "BPFProgram")) {
+ const char *a, *p;
+ size_t n = 0;
+
+ r = sd_bus_message_enter_container(message, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(ss)", &a, &p)) > 0) {
+ int attach_type = bpf_cgroup_attach_type_from_string(a);
+ if (attach_type < 0)
+ return sd_bus_error_setf(
+ error,
+ SD_BUS_ERROR_INVALID_ARGS,
+ "%s expects a valid BPF attach type, got '%s'.",
+ name, a);
+
+ if (!path_is_normalized(p) || !path_is_absolute(p))
+ return sd_bus_error_setf(
+ error,
+ SD_BUS_ERROR_INVALID_ARGS,
+ "%s= expects a normalized absolute path.",
+ name);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ r = cgroup_context_add_bpf_foreign_program(c, attach_type, p);
+ if (r < 0)
+ return r;
+ }
+ n++;
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_(memstream_done) MemStream m = {};
+ _cleanup_free_ char *buf = NULL;
+ FILE *f;
+
+ if (n == 0)
+ while (c->bpf_foreign_programs)
+ cgroup_context_remove_bpf_foreign_program(c, c->bpf_foreign_programs);
+
+ f = memstream_init(&m);
+ if (!f)
+ return -ENOMEM;
+
+ fputs(name, f);
+ fputs("=\n", f);
+
+ LIST_FOREACH(programs, fp, c->bpf_foreign_programs)
+ fprintf(f, "%s=%s:%s\n", name,
+ bpf_cgroup_attach_type_to_string(fp->attach_type),
+ fp->bpffs_path);
+
+ r = memstream_finalize(&m, &buf, NULL);
+ if (r < 0)
+ return r;
+
+ unit_write_setting(u, flags, name, buf);
+
+ if (c->bpf_foreign_programs) {
+ r = bpf_foreign_supported();
+ if (r < 0)
+ return r;
+ if (r == 0)
+ log_full(LOG_DEBUG,
+ "Transient unit %s configures a BPF program pinned to BPF "
+ "filesystem, but the local system does not support that.\n"
+ "Starting this unit will fail!", u->id);
+ }
+ }
+
+ return 1;
+
+ } else if (streq(name, "MemoryPressureWatch")) {
+ CGroupPressureWatch p;
+ const char *t;
+
+ r = sd_bus_message_read(message, "s", &t);
+ if (r < 0)
+ return r;
+
+ if (isempty(t))
+ p = _CGROUP_PRESSURE_WATCH_INVALID;
+ else {
+ p = cgroup_pressure_watch_from_string(t);
+ if (p < 0)
+ return p;
+ }
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->memory_pressure_watch = p;
+ unit_write_settingf(u, flags, name, "MemoryPressureWatch=%s", strempty(cgroup_pressure_watch_to_string(p)));
+ }
+
+ return 1;
+
+ } else if (streq(name, "MemoryPressureThresholdUSec")) {
+ uint64_t t;
+
+ r = sd_bus_message_read(message, "t", &t);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->memory_pressure_threshold_usec = t;
+
+ if (t == UINT64_MAX)
+ unit_write_setting(u, flags, name, "MemoryPressureThresholdUSec=");
+ else
+ unit_write_settingf(u, flags, name, "MemoryPressureThresholdUSec=%" PRIu64, t);
+ }
+
+ return 1;
+ } else if (streq(name, "CoredumpReceive")) {
+ int b;
+
+ if (!UNIT_VTABLE(u)->can_delegate)
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Delegation not available for unit type");
+
+ r = sd_bus_message_read(message, "b", &b);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->coredump_receive = b;
+
+ unit_write_settingf(u, flags, name, "CoredumpReceive=%s", yes_no(b));
+ }
+
+ return 1;
+ }
+
+ return 0;
+}
+
+static int bus_cgroup_set_boolean(
+ Unit *u,
+ const char *name,
+ bool *p,
+ CGroupMask mask,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ int b, r;
+
+ assert(p);
+
+ r = sd_bus_message_read(message, "b", &b);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ *p = b;
+ unit_invalidate_cgroup(u, mask);
+ unit_write_settingf(u, flags, name, "%s=%s", name, yes_no(b));
+ }
+
+ return 1;
+}
+
+#define BUS_DEFINE_SET_CGROUP_WEIGHT(function, mask, check, val) \
+ static int bus_cgroup_set_##function( \
+ Unit *u, \
+ const char *name, \
+ uint64_t *p, \
+ sd_bus_message *message, \
+ UnitWriteFlags flags, \
+ sd_bus_error *error) { \
+ \
+ uint64_t v; \
+ int r; \
+ \
+ assert(p); \
+ \
+ r = sd_bus_message_read(message, "t", &v); \
+ if (r < 0) \
+ return r; \
+ \
+ if (!check(v)) \
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \
+ "Value specified in %s is out of range", name); \
+ \
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \
+ *p = v; \
+ unit_invalidate_cgroup(u, mask); \
+ \
+ if (v == (val)) \
+ unit_write_settingf(u, flags, name, \
+ "%s=", name); \
+ else \
+ unit_write_settingf(u, flags, name, \
+ "%s=%" PRIu64, name, v); \
+ } \
+ \
+ return 1; \
+ }
+
+#define BUS_DEFINE_SET_CGROUP_LIMIT(function, mask, scale, minimum) \
+ static int bus_cgroup_set_##function( \
+ Unit *u, \
+ const char *name, \
+ uint64_t *p, \
+ sd_bus_message *message, \
+ UnitWriteFlags flags, \
+ sd_bus_error *error) { \
+ \
+ uint64_t v; \
+ int r; \
+ \
+ assert(p); \
+ \
+ r = sd_bus_message_read(message, "t", &v); \
+ if (r < 0) \
+ return r; \
+ \
+ if (v < minimum) \
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \
+ "Value specified in %s is out of range", name); \
+ \
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \
+ *p = v; \
+ unit_invalidate_cgroup(u, mask); \
+ \
+ if (v == CGROUP_LIMIT_MAX) \
+ unit_write_settingf(u, flags, name, \
+ "%s=infinity", name); \
+ else \
+ unit_write_settingf(u, flags, name, \
+ "%s=%" PRIu64, name, v); \
+ } \
+ \
+ return 1; \
+ } \
+ static int bus_cgroup_set_##function##_scale( \
+ Unit *u, \
+ const char *name, \
+ uint64_t *p, \
+ sd_bus_message *message, \
+ UnitWriteFlags flags, \
+ sd_bus_error *error) { \
+ \
+ uint64_t v; \
+ uint32_t raw; \
+ int r; \
+ \
+ assert(p); \
+ \
+ r = sd_bus_message_read(message, "u", &raw); \
+ if (r < 0) \
+ return r; \
+ \
+ v = scale(raw, UINT32_MAX); \
+ if (v < minimum || v >= UINT64_MAX) \
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \
+ "Value specified in %s is out of range", name); \
+ \
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \
+ *p = v; \
+ unit_invalidate_cgroup(u, mask); \
+ \
+ /* Prepare to chop off suffix */ \
+ assert_se(endswith(name, "Scale")); \
+ \
+ int scaled = UINT32_SCALE_TO_PERMYRIAD(raw); \
+ unit_write_settingf(u, flags, name, "%.*s=" PERMYRIAD_AS_PERCENT_FORMAT_STR, \
+ (int)(strlen(name) - strlen("Scale")), name, \
+ PERMYRIAD_AS_PERCENT_FORMAT_VAL(scaled)); \
+ } \
+ \
+ return 1; \
+ }
+
+DISABLE_WARNING_TYPE_LIMITS;
+BUS_DEFINE_SET_CGROUP_WEIGHT(cpu_shares, CGROUP_MASK_CPU, CGROUP_CPU_SHARES_IS_OK, CGROUP_CPU_SHARES_INVALID);
+BUS_DEFINE_SET_CGROUP_WEIGHT(io_weight, CGROUP_MASK_IO, CGROUP_WEIGHT_IS_OK, CGROUP_WEIGHT_INVALID);
+BUS_DEFINE_SET_CGROUP_WEIGHT(blockio_weight, CGROUP_MASK_BLKIO, CGROUP_BLKIO_WEIGHT_IS_OK, CGROUP_BLKIO_WEIGHT_INVALID);
+BUS_DEFINE_SET_CGROUP_LIMIT(memory, CGROUP_MASK_MEMORY, physical_memory_scale, 1);
+BUS_DEFINE_SET_CGROUP_LIMIT(memory_protection, CGROUP_MASK_MEMORY, physical_memory_scale, 0);
+BUS_DEFINE_SET_CGROUP_LIMIT(swap, CGROUP_MASK_MEMORY, physical_memory_scale, 0);
+BUS_DEFINE_SET_CGROUP_LIMIT(zswap, CGROUP_MASK_MEMORY, physical_memory_scale, 0);
+REENABLE_WARNING;
+
+/* Sets CPUWeight= or StartupCPUWeight= from a "t" (uint64) D-Bus message.
+ * Accepts CGROUP_WEIGHT_INVALID (reset to default), CGROUP_WEIGHT_IDLE
+ * (serialized as "idle"), or a value passing CGROUP_WEIGHT_IS_OK().
+ * Returns 1 on success, < 0 on error. */
+static int bus_cgroup_set_cpu_weight(
+ Unit *u,
+ const char *name,
+ uint64_t *p,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+ uint64_t v;
+ int r;
+ assert(p);
+ r = sd_bus_message_read(message, "t", &v);
+ if (r < 0)
+ return r;
+ /* "idle" is valid here even though the generic weight check rejects it. */
+ if (!CGROUP_WEIGHT_IS_OK(v) && v != CGROUP_WEIGHT_IDLE)
+ return sd_bus_error_setf(
+ error, SD_BUS_ERROR_INVALID_ARGS, "Value specified in %s is out of range", name);
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ *p = v;
+ unit_invalidate_cgroup(u, CGROUP_MASK_CPU);
+ /* Empty assignment resets the setting to its default. */
+ if (v == CGROUP_WEIGHT_INVALID)
+ unit_write_settingf(u, flags, name, "%s=", name);
+ else if (v == CGROUP_WEIGHT_IDLE)
+ unit_write_settingf(u, flags, name, "%s=idle", name);
+ else
+ unit_write_settingf(u, flags, name, "%s=%" PRIu64, name, v);
+ }
+ return 1;
+}
+
+/* Sets the absolute TasksMax= limit from a "t" (uint64) D-Bus message.
+ * Zero is rejected; CGROUP_LIMIT_MAX is serialized as "infinity".
+ * Returns 1 on success, < 0 on error. */
+static int bus_cgroup_set_tasks_max(
+ Unit *u,
+ const char *name,
+ CGroupTasksMax *p,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ uint64_t v;
+ int r;
+
+ assert(p);
+
+ r = sd_bus_message_read(message, "t", &v);
+ if (r < 0)
+ return r;
+
+ /* A limit of zero tasks would make the unit unstartable, refuse it. */
+ if (v < 1)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Value specified in %s is out of range", name);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ *p = (CGroupTasksMax) { .value = v, .scale = 0 }; /* When .scale==0, .value is the absolute value */
+ unit_invalidate_cgroup(u, CGROUP_MASK_PIDS);
+
+ if (v == CGROUP_LIMIT_MAX)
+ unit_write_settingf(u, flags, name,
+ "%s=infinity", name);
+ else
+ unit_write_settingf(u, flags, name,
+ "%s=%" PRIu64, name, v);
+ }
+
+ return 1;
+}
+
+/* Sets TasksMax= as a fraction of the kernel maximum, received as a "u"
+ * (uint32) D-Bus message and interpreted as v/UINT32_MAX. Both 0 and
+ * UINT32_MAX (i.e. 0% and 100%-as-absolute) are rejected as out of range.
+ * Returns 1 on success, < 0 on error. */
+static int bus_cgroup_set_tasks_max_scale(
+ Unit *u,
+ const char *name,
+ CGroupTasksMax *p,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ uint32_t v;
+ int r;
+
+ assert(p);
+
+ r = sd_bus_message_read(message, "u", &v);
+ if (r < 0)
+ return r;
+
+ if (v < 1 || v >= UINT32_MAX)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Value specified in %s is out of range", name);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ *p = (CGroupTasksMax) { v, UINT32_MAX }; /* .scale is not 0, so this is interpreted as v/UINT32_MAX. */
+ unit_invalidate_cgroup(u, CGROUP_MASK_PIDS);
+
+ /* Serialize with one decimal of precision: convert to per-mille
+ * (factor 1000, not 100) so that scaled/10 is the integer percent
+ * part and scaled%10 the tenths digit. Scaling by 100 here would
+ * misreport the value by a factor of ten (e.g. 50% as "5.0%"). */
+ uint32_t scaled = DIV_ROUND_UP((uint64_t) v * 1000U, (uint64_t) UINT32_MAX);
+ unit_write_settingf(u, flags, name, "%s=%" PRIu32 ".%" PRIu32 "%%", "TasksMax",
+ scaled / 10, scaled % 10);
+ }
+
+ return 1;
+}
+
+int bus_cgroup_set_property(
+ Unit *u,
+ CGroupContext *c,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ CGroupIOLimitType iol_type;
+ int r;
+
+ assert(u);
+ assert(c);
+ assert(name);
+ assert(message);
+
+ flags |= UNIT_PRIVATE;
+
+ if (streq(name, "CPUAccounting"))
+ return bus_cgroup_set_boolean(u, name, &c->cpu_accounting, get_cpu_accounting_mask(), message, flags, error);
+
+ if (streq(name, "CPUWeight"))
+ return bus_cgroup_set_cpu_weight(u, name, &c->cpu_weight, message, flags, error);
+
+ if (streq(name, "StartupCPUWeight"))
+ return bus_cgroup_set_cpu_weight(u, name, &c->startup_cpu_weight, message, flags, error);
+
+ if (streq(name, "CPUShares"))
+ return bus_cgroup_set_cpu_shares(u, name, &c->cpu_shares, message, flags, error);
+
+ if (streq(name, "StartupCPUShares"))
+ return bus_cgroup_set_cpu_shares(u, name, &c->startup_cpu_shares, message, flags, error);
+
+ if (streq(name, "IOAccounting"))
+ return bus_cgroup_set_boolean(u, name, &c->io_accounting, CGROUP_MASK_IO, message, flags, error);
+
+ if (streq(name, "IOWeight"))
+ return bus_cgroup_set_io_weight(u, name, &c->io_weight, message, flags, error);
+
+ if (streq(name, "StartupIOWeight"))
+ return bus_cgroup_set_io_weight(u, name, &c->startup_io_weight, message, flags, error);
+
+ if (streq(name, "BlockIOAccounting"))
+ return bus_cgroup_set_boolean(u, name, &c->blockio_accounting, CGROUP_MASK_BLKIO, message, flags, error);
+
+ if (streq(name, "BlockIOWeight"))
+ return bus_cgroup_set_blockio_weight(u, name, &c->blockio_weight, message, flags, error);
+
+ if (streq(name, "StartupBlockIOWeight"))
+ return bus_cgroup_set_blockio_weight(u, name, &c->startup_blockio_weight, message, flags, error);
+
+ if (streq(name, "MemoryAccounting"))
+ return bus_cgroup_set_boolean(u, name, &c->memory_accounting, CGROUP_MASK_MEMORY, message, flags, error);
+
+ if (streq(name, "MemoryMin")) {
+ r = bus_cgroup_set_memory_protection(u, name, &c->memory_min, message, flags, error);
+ if (r > 0)
+ c->memory_min_set = true;
+ return r;
+ }
+
+ if (streq(name, "MemoryLow")) {
+ r = bus_cgroup_set_memory_protection(u, name, &c->memory_low, message, flags, error);
+ if (r > 0)
+ c->memory_low_set = true;
+ return r;
+ }
+
+ if (streq(name, "StartupMemoryLow")) {
+ r = bus_cgroup_set_memory_protection(u, name, &c->startup_memory_low, message, flags, error);
+ if (r > 0)
+ c->startup_memory_low_set = true;
+ return r;
+ }
+
+ if (streq(name, "DefaultMemoryMin")) {
+ r = bus_cgroup_set_memory_protection(u, name, &c->default_memory_min, message, flags, error);
+ if (r > 0)
+ c->default_memory_min_set = true;
+ return r;
+ }
+
+ if (streq(name, "DefaultMemoryLow")) {
+ r = bus_cgroup_set_memory_protection(u, name, &c->default_memory_low, message, flags, error);
+ if (r > 0)
+ c->default_memory_low_set = true;
+ return r;
+ }
+
+ if (streq(name, "DefaultStartupMemoryLow")) {
+ r = bus_cgroup_set_memory_protection(u, name, &c->default_startup_memory_low, message, flags, error);
+ if (r > 0)
+ c->default_startup_memory_low_set = true;
+ return r;
+ }
+
+ if (streq(name, "MemoryHigh"))
+ return bus_cgroup_set_memory(u, name, &c->memory_high, message, flags, error);
+
+ if (streq(name, "StartupMemoryHigh")) {
+ r = bus_cgroup_set_memory(u, name, &c->startup_memory_high, message, flags, error);
+ if (r > 0)
+ c->startup_memory_high_set = true;
+ return r;
+ }
+
+ if (streq(name, "MemorySwapMax"))
+ return bus_cgroup_set_swap(u, name, &c->memory_swap_max, message, flags, error);
+
+ if (streq(name, "StartupMemorySwapMax")) {
+ r = bus_cgroup_set_swap(u, name, &c->startup_memory_swap_max, message, flags, error);
+ if (r > 0)
+ c->startup_memory_swap_max_set = true;
+ return r;
+ }
+
+ if (streq(name, "MemoryZSwapMax"))
+ return bus_cgroup_set_zswap(u, name, &c->memory_zswap_max, message, flags, error);
+
+ if (streq(name, "StartupMemoryZSwapMax")) {
+ r = bus_cgroup_set_zswap(u, name, &c->startup_memory_zswap_max, message, flags, error);
+ if (r > 0)
+ c->startup_memory_zswap_max_set = true;
+ return r;
+ }
+
+ if (streq(name, "MemoryMax"))
+ return bus_cgroup_set_memory(u, name, &c->memory_max, message, flags, error);
+
+ if (streq(name, "StartupMemoryMax")) {
+ r = bus_cgroup_set_memory(u, name, &c->startup_memory_max, message, flags, error);
+ if (r > 0)
+ c->startup_memory_max_set = true;
+ return r;
+ }
+
+ if (streq(name, "MemoryLimit"))
+ return bus_cgroup_set_memory(u, name, &c->memory_limit, message, flags, error);
+
+ if (streq(name, "MemoryMinScale")) {
+ r = bus_cgroup_set_memory_protection_scale(u, name, &c->memory_min, message, flags, error);
+ if (r > 0)
+ c->memory_min_set = true;
+ return r;
+ }
+
+ if (streq(name, "MemoryLowScale")) {
+ r = bus_cgroup_set_memory_protection_scale(u, name, &c->memory_low, message, flags, error);
+ if (r > 0)
+ c->memory_low_set = true;
+ return r;
+ }
+
+ if (streq(name, "DefaultMemoryMinScale")) {
+ r = bus_cgroup_set_memory_protection_scale(u, name, &c->default_memory_min, message, flags, error);
+ if (r > 0)
+ c->default_memory_min_set = true;
+ return r;
+ }
+
+ if (streq(name, "DefaultMemoryLowScale")) {
+ r = bus_cgroup_set_memory_protection_scale(u, name, &c->default_memory_low, message, flags, error);
+ if (r > 0)
+ c->default_memory_low_set = true;
+ return r;
+ }
+
+ if (streq(name, "MemoryHighScale"))
+ return bus_cgroup_set_memory_scale(u, name, &c->memory_high, message, flags, error);
+
+ if (streq(name, "MemorySwapMaxScale"))
+ return bus_cgroup_set_swap_scale(u, name, &c->memory_swap_max, message, flags, error);
+
+ if (streq(name, "MemoryZSwapMaxScale"))
+ return bus_cgroup_set_zswap_scale(u, name, &c->memory_zswap_max, message, flags, error);
+
+ if (streq(name, "MemoryMaxScale"))
+ return bus_cgroup_set_memory_scale(u, name, &c->memory_max, message, flags, error);
+
+ if (streq(name, "MemoryLimitScale"))
+ return bus_cgroup_set_memory_scale(u, name, &c->memory_limit, message, flags, error);
+
+ if (streq(name, "TasksAccounting"))
+ return bus_cgroup_set_boolean(u, name, &c->tasks_accounting, CGROUP_MASK_PIDS, message, flags, error);
+
+ if (streq(name, "TasksMax"))
+ return bus_cgroup_set_tasks_max(u, name, &c->tasks_max, message, flags, error);
+
+ if (streq(name, "TasksMaxScale"))
+ return bus_cgroup_set_tasks_max_scale(u, name, &c->tasks_max, message, flags, error);
+
+ if (streq(name, "CPUQuotaPerSecUSec")) {
+ uint64_t u64;
+
+ r = sd_bus_message_read(message, "t", &u64);
+ if (r < 0)
+ return r;
+
+ if (u64 <= 0)
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "CPUQuotaPerSecUSec= value out of range");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->cpu_quota_per_sec_usec = u64;
+ u->warned_clamping_cpu_quota_period = false;
+ unit_invalidate_cgroup(u, CGROUP_MASK_CPU);
+
+ if (c->cpu_quota_per_sec_usec == USEC_INFINITY)
+ unit_write_setting(u, flags, "CPUQuota", "CPUQuota=");
+ else
+ /* config_parse_cpu_quota() requires an integer, so truncating division is used on
+ * purpose here. */
+ unit_write_settingf(u, flags, "CPUQuota",
+ "CPUQuota=%0.f%%",
+ (double) (c->cpu_quota_per_sec_usec / 10000));
+ }
+
+ return 1;
+
+ } else if (streq(name, "CPUQuotaPeriodUSec")) {
+ uint64_t u64;
+
+ r = sd_bus_message_read(message, "t", &u64);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->cpu_quota_period_usec = u64;
+ u->warned_clamping_cpu_quota_period = false;
+ unit_invalidate_cgroup(u, CGROUP_MASK_CPU);
+ if (c->cpu_quota_period_usec == USEC_INFINITY)
+ unit_write_setting(u, flags, "CPUQuotaPeriodSec", "CPUQuotaPeriodSec=");
+ else
+ unit_write_settingf(u, flags, "CPUQuotaPeriodSec",
+ "CPUQuotaPeriodSec=%s",
+ FORMAT_TIMESPAN(c->cpu_quota_period_usec, 1));
+ }
+
+ return 1;
+
+ } else if (STR_IN_SET(name, "AllowedCPUs", "StartupAllowedCPUs", "AllowedMemoryNodes", "StartupAllowedMemoryNodes")) {
+ const void *a;
+ size_t n;
+ _cleanup_(cpu_set_reset) CPUSet new_set = {};
+
+ r = sd_bus_message_read_array(message, 'y', &a, &n);
+ if (r < 0)
+ return r;
+
+ r = cpu_set_from_dbus(a, n, &new_set);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *setstr = NULL;
+ CPUSet *set = NULL;
+
+ setstr = cpu_set_to_range_string(&new_set);
+ if (!setstr)
+ return -ENOMEM;
+
+ if (streq(name, "AllowedCPUs"))
+ set = &c->cpuset_cpus;
+ else if (streq(name, "StartupAllowedCPUs"))
+ set = &c->startup_cpuset_cpus;
+ else if (streq(name, "AllowedMemoryNodes"))
+ set = &c->cpuset_mems;
+ else if (streq(name, "StartupAllowedMemoryNodes"))
+ set = &c->startup_cpuset_mems;
+
+ assert(set);
+
+ cpu_set_reset(set);
+ *set = new_set;
+ new_set = (CPUSet) {};
+
+ unit_invalidate_cgroup(u, CGROUP_MASK_CPUSET);
+ unit_write_settingf(u, flags, name, "%s=\n%s=%s", name, name, setstr);
+ }
+
+ return 1;
+
+ } else if ((iol_type = cgroup_io_limit_type_from_string(name)) >= 0) {
+ const char *path;
+ unsigned n = 0;
+ uint64_t u64;
+
+ r = sd_bus_message_enter_container(message, 'a', "(st)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(st)", &path, &u64)) > 0) {
+
+ if (!path_is_normalized(path))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' specified in %s= is not normalized.", path, name);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ CGroupIODeviceLimit *a = NULL;
+
+ LIST_FOREACH(device_limits, b, c->io_device_limits)
+ if (path_equal(path, b->path)) {
+ a = b;
+ break;
+ }
+
+ if (!a) {
+ CGroupIOLimitType type;
+
+ a = new0(CGroupIODeviceLimit, 1);
+ if (!a)
+ return -ENOMEM;
+
+ a->path = strdup(path);
+ if (!a->path) {
+ free(a);
+ return -ENOMEM;
+ }
+
+ for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
+ a->limits[type] = cgroup_io_limit_defaults[type];
+
+ LIST_PREPEND(device_limits, c->io_device_limits, a);
+ }
+
+ a->limits[iol_type] = u64;
+ }
+
+ n++;
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_(memstream_done) MemStream m = {};
+ _cleanup_free_ char *buf = NULL;
+ FILE *f;
+
+ if (n == 0)
+ LIST_FOREACH(device_limits, a, c->io_device_limits)
+ a->limits[iol_type] = cgroup_io_limit_defaults[iol_type];
+
+ unit_invalidate_cgroup(u, CGROUP_MASK_IO);
+
+ f = memstream_init(&m);
+ if (!f)
+ return -ENOMEM;
+
+ fprintf(f, "%s=\n", name);
+ LIST_FOREACH(device_limits, a, c->io_device_limits)
+ if (a->limits[iol_type] != cgroup_io_limit_defaults[iol_type])
+ fprintf(f, "%s=%s %" PRIu64 "\n", name, a->path, a->limits[iol_type]);
+
+ r = memstream_finalize(&m, &buf, NULL);
+ if (r < 0)
+ return r;
+
+ unit_write_setting(u, flags, name, buf);
+ }
+
+ return 1;
+
+ } else if (streq(name, "IODeviceWeight")) {
+ const char *path;
+ uint64_t weight;
+ unsigned n = 0;
+
+ r = sd_bus_message_enter_container(message, 'a', "(st)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(st)", &path, &weight)) > 0) {
+
+ if (!path_is_normalized(path))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' specified in %s= is not normalized.", path, name);
+
+ if (!CGROUP_WEIGHT_IS_OK(weight) || weight == CGROUP_WEIGHT_INVALID)
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "IODeviceWeight= value out of range");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ CGroupIODeviceWeight *a = NULL;
+
+ LIST_FOREACH(device_weights, b, c->io_device_weights)
+ if (path_equal(b->path, path)) {
+ a = b;
+ break;
+ }
+
+ if (!a) {
+ a = new0(CGroupIODeviceWeight, 1);
+ if (!a)
+ return -ENOMEM;
+
+ a->path = strdup(path);
+ if (!a->path) {
+ free(a);
+ return -ENOMEM;
+ }
+ LIST_PREPEND(device_weights, c->io_device_weights, a);
+ }
+
+ a->weight = weight;
+ }
+
+ n++;
+ }
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_(memstream_done) MemStream m = {};
+ _cleanup_free_ char *buf = NULL;
+ FILE *f;
+
+ if (n == 0)
+ while (c->io_device_weights)
+ cgroup_context_free_io_device_weight(c, c->io_device_weights);
+
+ unit_invalidate_cgroup(u, CGROUP_MASK_IO);
+
+ f = memstream_init(&m);
+ if (!f)
+ return -ENOMEM;
+
+ fputs("IODeviceWeight=\n", f);
+ LIST_FOREACH(device_weights, a, c->io_device_weights)
+ fprintf(f, "IODeviceWeight=%s %" PRIu64 "\n", a->path, a->weight);
+
+ r = memstream_finalize(&m, &buf, NULL);
+ if (r < 0)
+ return r;
+
+ unit_write_setting(u, flags, name, buf);
+ }
+
+ return 1;
+
+ } else if (streq(name, "IODeviceLatencyTargetUSec")) {
+ const char *path;
+ uint64_t target;
+ unsigned n = 0;
+
+ r = sd_bus_message_enter_container(message, 'a', "(st)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(st)", &path, &target)) > 0) {
+
+ if (!path_is_normalized(path))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' specified in %s= is not normalized.", path, name);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ CGroupIODeviceLatency *a = NULL;
+
+ LIST_FOREACH(device_latencies, b, c->io_device_latencies)
+ if (path_equal(b->path, path)) {
+ a = b;
+ break;
+ }
+
+ if (!a) {
+ a = new0(CGroupIODeviceLatency, 1);
+ if (!a)
+ return -ENOMEM;
+
+ a->path = strdup(path);
+ if (!a->path) {
+ free(a);
+ return -ENOMEM;
+ }
+ LIST_PREPEND(device_latencies, c->io_device_latencies, a);
+ }
+
+ a->target_usec = target;
+ }
+
+ n++;
+ }
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_(memstream_done) MemStream m = {};
+ _cleanup_free_ char *buf = NULL;
+ FILE *f;
+
+ if (n == 0)
+ while (c->io_device_latencies)
+ cgroup_context_free_io_device_latency(c, c->io_device_latencies);
+
+ unit_invalidate_cgroup(u, CGROUP_MASK_IO);
+
+ f = memstream_init(&m);
+ if (!f)
+ return -ENOMEM;
+
+ fputs("IODeviceLatencyTargetSec=\n", f);
+ LIST_FOREACH(device_latencies, a, c->io_device_latencies)
+ fprintf(f, "IODeviceLatencyTargetSec=%s %s\n",
+ a->path, FORMAT_TIMESPAN(a->target_usec, 1));
+
+ r = memstream_finalize(&m, &buf, NULL);
+ if (r < 0)
+ return r;
+
+ unit_write_setting(u, flags, name, buf);
+ }
+
+ return 1;
+
+ } else if (STR_IN_SET(name, "BlockIOReadBandwidth", "BlockIOWriteBandwidth")) {
+ const char *path;
+ unsigned n = 0;
+ uint64_t u64;
+ bool read;
+
+ read = streq(name, "BlockIOReadBandwidth");
+
+ r = sd_bus_message_enter_container(message, 'a', "(st)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(st)", &path, &u64)) > 0) {
+
+ if (!path_is_normalized(path))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' specified in %s= is not normalized.", path, name);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ CGroupBlockIODeviceBandwidth *a = NULL;
+
+ LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths)
+ if (path_equal(path, b->path)) {
+ a = b;
+ break;
+ }
+
+ if (!a) {
+ a = new0(CGroupBlockIODeviceBandwidth, 1);
+ if (!a)
+ return -ENOMEM;
+
+ a->rbps = CGROUP_LIMIT_MAX;
+ a->wbps = CGROUP_LIMIT_MAX;
+ a->path = strdup(path);
+ if (!a->path) {
+ free(a);
+ return -ENOMEM;
+ }
+
+ LIST_PREPEND(device_bandwidths, c->blockio_device_bandwidths, a);
+ }
+
+ if (read)
+ a->rbps = u64;
+ else
+ a->wbps = u64;
+ }
+
+ n++;
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_(memstream_done) MemStream m = {};
+ _cleanup_free_ char *buf = NULL;
+ FILE *f;
+
+ if (n == 0)
+ LIST_FOREACH(device_bandwidths, a, c->blockio_device_bandwidths) {
+ if (read)
+ a->rbps = CGROUP_LIMIT_MAX;
+ else
+ a->wbps = CGROUP_LIMIT_MAX;
+ }
+
+ unit_invalidate_cgroup(u, CGROUP_MASK_BLKIO);
+
+ f = memstream_init(&m);
+ if (!f)
+ return -ENOMEM;
+
+ if (read) {
+ fputs("BlockIOReadBandwidth=\n", f);
+ LIST_FOREACH(device_bandwidths, a, c->blockio_device_bandwidths)
+ if (a->rbps != CGROUP_LIMIT_MAX)
+ fprintf(f, "BlockIOReadBandwidth=%s %" PRIu64 "\n", a->path, a->rbps);
+ } else {
+ fputs("BlockIOWriteBandwidth=\n", f);
+ LIST_FOREACH(device_bandwidths, a, c->blockio_device_bandwidths)
+ if (a->wbps != CGROUP_LIMIT_MAX)
+ fprintf(f, "BlockIOWriteBandwidth=%s %" PRIu64 "\n", a->path, a->wbps);
+ }
+
+ r = memstream_finalize(&m, &buf, NULL);
+ if (r < 0)
+ return r;
+
+ unit_write_setting(u, flags, name, buf);
+ }
+
+ return 1;
+
+ } else if (streq(name, "BlockIODeviceWeight")) {
+ const char *path;
+ uint64_t weight;
+ unsigned n = 0;
+
+ r = sd_bus_message_enter_container(message, 'a', "(st)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(st)", &path, &weight)) > 0) {
+
+ if (!path_is_normalized(path))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' specified in %s= is not normalized.", path, name);
+
+ if (!CGROUP_BLKIO_WEIGHT_IS_OK(weight) || weight == CGROUP_BLKIO_WEIGHT_INVALID)
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "BlockIODeviceWeight= out of range");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ CGroupBlockIODeviceWeight *a = NULL;
+
+ LIST_FOREACH(device_weights, b, c->blockio_device_weights)
+ if (path_equal(b->path, path)) {
+ a = b;
+ break;
+ }
+
+ if (!a) {
+ a = new0(CGroupBlockIODeviceWeight, 1);
+ if (!a)
+ return -ENOMEM;
+
+ a->path = strdup(path);
+ if (!a->path) {
+ free(a);
+ return -ENOMEM;
+ }
+ LIST_PREPEND(device_weights, c->blockio_device_weights, a);
+ }
+
+ a->weight = weight;
+ }
+
+ n++;
+ }
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_(memstream_done) MemStream m = {};
+ _cleanup_free_ char *buf = NULL;
+ FILE *f;
+
+ if (n == 0)
+ while (c->blockio_device_weights)
+ cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
+
+ unit_invalidate_cgroup(u, CGROUP_MASK_BLKIO);
+
+ f = memstream_init(&m);
+ if (!f)
+ return -ENOMEM;
+
+ fputs("BlockIODeviceWeight=\n", f);
+ LIST_FOREACH(device_weights, a, c->blockio_device_weights)
+ fprintf(f, "BlockIODeviceWeight=%s %" PRIu64 "\n", a->path, a->weight);
+
+ r = memstream_finalize(&m, &buf, NULL);
+ if (r < 0)
+ return r;
+
+ unit_write_setting(u, flags, name, buf);
+ }
+
+ return 1;
+
+ } else if (streq(name, "DevicePolicy")) {
+ const char *policy;
+ CGroupDevicePolicy p;
+
+ r = sd_bus_message_read(message, "s", &policy);
+ if (r < 0)
+ return r;
+
+ p = cgroup_device_policy_from_string(policy);
+ if (p < 0)
+ return p;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->device_policy = p;
+ unit_invalidate_cgroup(u, CGROUP_MASK_DEVICES);
+ unit_write_settingf(u, flags, name, "DevicePolicy=%s", policy);
+ }
+
+ return 1;
+
+ } else if (streq(name, "DeviceAllow")) {
+ const char *path, *rwm;
+ unsigned n = 0;
+
+ r = sd_bus_message_enter_container(message, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(ss)", &path, &rwm)) > 0) {
+ CGroupDevicePermissions p;
+
+ if (!valid_device_allow_pattern(path) || strpbrk(path, WHITESPACE))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "DeviceAllow= requires device node or pattern");
+
+ if (isempty(rwm))
+ p = _CGROUP_DEVICE_PERMISSIONS_ALL;
+ else {
+ p = cgroup_device_permissions_from_string(rwm);
+ if (p < 0)
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "DeviceAllow= requires combination of rwm flags");
+ }
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ r = cgroup_context_add_or_update_device_allow(c, path, p);
+ if (r < 0)
+ return r;
+ }
+
+ n++;
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_(memstream_done) MemStream m = {};
+ _cleanup_free_ char *buf = NULL;
+ FILE *f;
+
+ if (n == 0)
+ while (c->device_allow)
+ cgroup_context_free_device_allow(c, c->device_allow);
+
+ unit_invalidate_cgroup(u, CGROUP_MASK_DEVICES);
+
+ f = memstream_init(&m);
+ if (!f)
+ return -ENOMEM;
+
+ fputs("DeviceAllow=\n", f);
+ LIST_FOREACH(device_allow, a, c->device_allow)
+ fprintf(f, "DeviceAllow=%s %s\n", a->path, cgroup_device_permissions_to_string(a->permissions));
+
+ r = memstream_finalize(&m, &buf, NULL);
+ if (r < 0)
+ return r;
+
+ unit_write_setting(u, flags, name, buf);
+ }
+
+ return 1;
+
+ } else if (streq(name, "IPAccounting")) {
+ int b;
+
+ r = sd_bus_message_read(message, "b", &b);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->ip_accounting = b;
+
+ unit_invalidate_cgroup_bpf(u);
+ unit_write_settingf(u, flags, name, "IPAccounting=%s", yes_no(b));
+ }
+
+ return 1;
+
+ } else if (STR_IN_SET(name, "IPAddressAllow", "IPAddressDeny")) {
+ _cleanup_set_free_ Set *new_prefixes = NULL;
+ size_t n = 0;
+
+ r = sd_bus_message_enter_container(message, 'a', "(iayu)");
+ if (r < 0)
+ return r;
+
+ for (;;) {
+ const void *ap;
+ int32_t family;
+ uint32_t prefixlen;
+ size_t an;
+
+ r = sd_bus_message_enter_container(message, 'r', "iayu");
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ r = sd_bus_message_read(message, "i", &family);
+ if (r < 0)
+ return r;
+
+ if (!IN_SET(family, AF_INET, AF_INET6))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= expects IPv4 or IPv6 addresses only.", name);
+
+ r = sd_bus_message_read_array(message, 'y', &ap, &an);
+ if (r < 0)
+ return r;
+
+ if (an != FAMILY_ADDRESS_SIZE(family))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "IP address has wrong size for family (%s, expected %zu, got %zu)",
+ af_to_name(family), FAMILY_ADDRESS_SIZE(family), an);
+
+ r = sd_bus_message_read(message, "u", &prefixlen);
+ if (r < 0)
+ return r;
+
+ if (prefixlen > FAMILY_ADDRESS_SIZE(family)*8)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Prefix length %" PRIu32 " too large for address family %s.", prefixlen, af_to_name(family));
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ struct in_addr_prefix prefix = {
+ .family = family,
+ .prefixlen = prefixlen,
+ };
+
+ memcpy(&prefix.address, ap, an);
+
+ r = in_addr_prefix_add(&new_prefixes, &prefix);
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ n++;
+ }
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_(memstream_done) MemStream m = {};
+ _cleanup_free_ char *buf = NULL;
+ Set **prefixes;
+ bool *reduced;
+ FILE *f;
+
+ unit_invalidate_cgroup_bpf(u);
+
+ f = memstream_init(&m);
+ if (!f)
+ return -ENOMEM;
+
+ prefixes = streq(name, "IPAddressAllow") ? &c->ip_address_allow : &c->ip_address_deny;
+ reduced = streq(name, "IPAddressAllow") ? &c->ip_address_allow_reduced : &c->ip_address_deny_reduced;
+
+ if (n == 0) {
+ *reduced = true;
+ *prefixes = set_free(*prefixes);
+ fputs(name, f);
+ fputs("=\n", f);
+ } else {
+ *reduced = false;
+
+ r = in_addr_prefixes_merge(prefixes, new_prefixes);
+ if (r < 0)
+ return r;
+
+ const struct in_addr_prefix *p;
+ SET_FOREACH(p, new_prefixes)
+ fprintf(f, "%s=%s\n", name,
+ IN_ADDR_PREFIX_TO_STRING(p->family, &p->address, p->prefixlen));
+ }
+
+ r = memstream_finalize(&m, &buf, NULL);
+ if (r < 0)
+ return r;
+
+ unit_write_setting(u, flags, name, buf);
+ }
+
+ return 1;
+ }
+
+ if (STR_IN_SET(name, "ManagedOOMSwap", "ManagedOOMMemoryPressure")) {
+ ManagedOOMMode *cgroup_mode = streq(name, "ManagedOOMSwap") ? &c->moom_swap : &c->moom_mem_pressure;
+ ManagedOOMMode m;
+ const char *mode;
+
+ if (!UNIT_VTABLE(u)->can_set_managed_oom)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot set %s for this unit type", name);
+
+ r = sd_bus_message_read(message, "s", &mode);
+ if (r < 0)
+ return r;
+
+ m = managed_oom_mode_from_string(mode);
+ if (m < 0)
+ return -EINVAL;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ *cgroup_mode = m;
+ unit_write_settingf(u, flags, name, "%s=%s", name, mode);
+ }
+
+ (void) manager_varlink_send_managed_oom_update(u);
+ return 1;
+ }
+
+ if (streq(name, "ManagedOOMMemoryPressureLimit")) {
+ uint32_t v;
+
+ if (!UNIT_VTABLE(u)->can_set_managed_oom)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot set %s for this unit type", name);
+
+ r = sd_bus_message_read(message, "u", &v);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->moom_mem_pressure_limit = v;
+ unit_write_settingf(u, flags, name,
+ "ManagedOOMMemoryPressureLimit=" PERMYRIAD_AS_PERCENT_FORMAT_STR,
+ PERMYRIAD_AS_PERCENT_FORMAT_VAL(UINT32_SCALE_TO_PERMYRIAD(v)));
+ }
+
+ if (c->moom_mem_pressure == MANAGED_OOM_KILL)
+ (void) manager_varlink_send_managed_oom_update(u);
+
+ return 1;
+ }
+
+ if (streq(name, "ManagedOOMPreference")) {
+ ManagedOOMPreference p;
+ const char *pref;
+
+ r = sd_bus_message_read(message, "s", &pref);
+ if (r < 0)
+ return r;
+
+ p = managed_oom_preference_from_string(pref);
+ if (p < 0)
+ return p;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->moom_preference = p;
+ unit_write_settingf(u, flags, name, "ManagedOOMPreference=%s", pref);
+ }
+
+ return 1;
+ }
+ if (STR_IN_SET(name, "SocketBindAllow", "SocketBindDeny")) {
+ CGroupSocketBindItem **list;
+ uint16_t nr_ports, port_min;
+ size_t n = 0;
+ int32_t family, ip_protocol;
+
+ list = streq(name, "SocketBindAllow") ? &c->socket_bind_allow : &c->socket_bind_deny;
+
+ r = sd_bus_message_enter_container(message, 'a', "(iiqq)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(iiqq)", &family, &ip_protocol, &nr_ports, &port_min)) > 0) {
+
+ if (!IN_SET(family, AF_UNSPEC, AF_INET, AF_INET6))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= expects INET or INET6 family, if specified.", name);
+
+ if (!IN_SET(ip_protocol, 0, IPPROTO_TCP, IPPROTO_UDP))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= expects TCP or UDP protocol, if specified.", name);
+
+ if (port_min + (uint32_t) nr_ports > (1 << 16))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= expects maximum port value lesser than 65536.", name);
+
+ if (port_min == 0 && nr_ports != 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= expects port range starting with positive value.", name);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ CGroupSocketBindItem *item = NULL;
+
+ item = new(CGroupSocketBindItem, 1);
+ if (!item)
+ return log_oom();
+
+ *item = (CGroupSocketBindItem) {
+ .address_family = family,
+ .ip_protocol = ip_protocol,
+ .nr_ports = nr_ports,
+ .port_min = port_min
+ };
+
+ LIST_PREPEND(socket_bind_items, *list, TAKE_PTR(item));
+ }
+ n++;
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_(memstream_done) MemStream m = {};
+ _cleanup_free_ char *buf = NULL;
+ FILE *f;
+
+ if (n == 0)
+ cgroup_context_remove_socket_bind(list);
+ else {
+ if ((u->manager->cgroup_supported & CGROUP_MASK_BPF_SOCKET_BIND) == 0)
+ log_full(LOG_DEBUG,
+ "Unit %s configures source compiled BPF programs "
+ "but the local system does not support that.\n"
+ "Starting this unit will fail!", u->id);
+ }
+
+ f = memstream_init(&m);
+ if (!f)
+ return -ENOMEM;
+
+ if (n == 0)
+ fprintf(f, "%s=\n", name);
+ else
+ LIST_FOREACH(socket_bind_items, item, *list) {
+ fprintf(f, "%s=", name);
+ cgroup_context_dump_socket_bind_item(item, f);
+ fputc('\n', f);
+ }
+
+ r = memstream_finalize(&m, &buf, NULL);
+ if (r < 0)
+ return r;
+
+ unit_write_setting(u, flags, name, buf);
+ }
+
+ return 1;
+ }
+ if (streq(name, "RestrictNetworkInterfaces")) {
+ int is_allow_list;
+ _cleanup_strv_free_ char **l = NULL;
+
+ r = sd_bus_message_enter_container(message, 'r', "bas");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "b", &is_allow_list);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *joined = NULL;
+
+ if (strv_isempty(l)) {
+ c->restrict_network_interfaces_is_allow_list = false;
+ c->restrict_network_interfaces = set_free_free(c->restrict_network_interfaces);
+
+ unit_write_settingf(u, flags, name, "%s=", name);
+ return 1;
+ }
+
+ if (set_isempty(c->restrict_network_interfaces))
+ c->restrict_network_interfaces_is_allow_list = is_allow_list;
+
+ STRV_FOREACH(s, l) {
+ if (!ifname_valid(*s)) {
+ log_full(LOG_WARNING, "Invalid interface name, ignoring: %s", *s);
+ continue;
+ }
+ if (c->restrict_network_interfaces_is_allow_list != (bool) is_allow_list)
+ free(set_remove(c->restrict_network_interfaces, *s));
+ else {
+ r = set_put_strdup(&c->restrict_network_interfaces, *s);
+ if (r < 0)
+ return log_oom();
+ }
+ }
+
+ joined = strv_join(l, " ");
+ if (!joined)
+ return -ENOMEM;
+
+ unit_write_settingf(u, flags, name, "%s=%s%s", name, is_allow_list ? "" : "~", joined);
+ }
+
+ return 1;
+ }
+
+ if (streq(name, "NFTSet")) {
+ int source, nfproto;
+ const char *table, *set;
+ bool empty = true;
+
+ r = sd_bus_message_enter_container(message, 'a', "(iiss)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(iiss)", &source, &nfproto, &table, &set)) > 0) {
+ const char *source_name, *nfproto_name;
+
+ if (!IN_SET(source, NFT_SET_SOURCE_CGROUP, NFT_SET_SOURCE_USER, NFT_SET_SOURCE_GROUP))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid source %d.", source);
+
+ source_name = nft_set_source_to_string(source);
+ assert(source_name);
+
+ nfproto_name = nfproto_to_string(nfproto);
+ if (!nfproto_name)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid protocol %d.", nfproto);
+
+ if (!nft_identifier_valid(table)) {
+ _cleanup_free_ char *esc = NULL;
+
+ esc = cescape(table);
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid NFT table name %s.", strna(esc));
+ }
+
+ if (!nft_identifier_valid(set)) {
+ _cleanup_free_ char *esc = NULL;
+
+ esc = cescape(set);
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid NFT set name %s.", strna(esc));
+ }
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ r = nft_set_add(&c->nft_set_context, source, nfproto, table, set);
+ if (r < 0)
+ return r;
+
+ unit_write_settingf(
+ u, flags|UNIT_ESCAPE_SPECIFIERS, name,
+ "%s=%s:%s:%s:%s",
+ name,
+ source_name,
+ nfproto_name,
+ table,
+ set);
+ }
+
+ empty = false;
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (empty && !UNIT_WRITE_FLAGS_NOOP(flags)) {
+ nft_set_context_clear(&c->nft_set_context);
+ unit_write_settingf(u, flags, name, "%s=", name);
+ }
+
+ return 1;
+ }
+
+ /* must be last */
+ if (streq(name, "DisableControllers") || (u->transient && u->load_state == UNIT_STUB))
+ return bus_cgroup_set_transient_property(u, c, name, message, flags, error);
+
+ return 0;
+}
diff --git a/src/core/dbus-cgroup.h b/src/core/dbus-cgroup.h
new file mode 100644
index 0000000..dd0d5da
--- /dev/null
+++ b/src/core/dbus-cgroup.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "unit.h"
+#include "cgroup.h"
+
+/* D-Bus property table exposing CGroupContext settings on unit objects. */
+extern const sd_bus_vtable bus_cgroup_vtable[];
+
+int bus_property_get_tasks_max(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error);
+int bus_property_get_cgroup_pressure_watch(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error);
+
+/* Applies a cgroup property write to unit 'u'. Returns 1 when the property
+ * was handled, 0 when the name is not recognized, negative errno on error. */
+int bus_cgroup_set_property(Unit *u, CGroupContext *c, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
diff --git a/src/core/dbus-device.c b/src/core/dbus-device.c
new file mode 100644
index 0000000..b5e18d8
--- /dev/null
+++ b/src/core/dbus-device.c
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "dbus-device.h"
+#include "device.h"
+#include "unit.h"
+
+/* D-Bus vtable for Device units: exposes only the backing sysfs path. */
+const sd_bus_vtable bus_device_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ /* EMITS_CHANGE: a PropertiesChanged signal is sent when the path is updated. */
+ SD_BUS_PROPERTY("SysFSPath", "s", NULL, offsetof(Device, sysfs), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_VTABLE_END
+};
diff --git a/src/core/dbus-device.h b/src/core/dbus-device.h
new file mode 100644
index 0000000..bfb5770
--- /dev/null
+++ b/src/core/dbus-device.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus-vtable.h"
+
+/* Defined in dbus-device.c; property table for Device unit objects. */
+extern const sd_bus_vtable bus_device_vtable[];
diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c
new file mode 100644
index 0000000..2d05ba7
--- /dev/null
+++ b/src/core/dbus-execute.c
@@ -0,0 +1,3758 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/mount.h>
+#include <sys/prctl.h>
+#include "af-list.h"
+#include "alloc-util.h"
+#include "bus-get-properties.h"
+#include "bus-util.h"
+#include "cap-list.h"
+#include "capability-util.h"
+#include "cpu-set-util.h"
+#include "creds-util.h"
+#include "dbus-execute.h"
+#include "dbus-util.h"
+#include "env-util.h"
+#include "errno-list.h"
+#include "escape.h"
+#include "exec-credential.h"
+#include "execute.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "hexdecoct.h"
+#include "iovec-util.h"
+#include "ioprio-util.h"
+#include "journal-file.h"
+#include "load-fragment.h"
+#include "memstream-util.h"
+#include "missing_ioprio.h"
+#include "mountpoint-util.h"
+#include "namespace.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "pcre2-util.h"
+#include "process-util.h"
+#include "rlimit-util.h"
+#include "seccomp-util.h"
+#include "securebits-util.h"
+#include "specifier.h"
+#include "stat-util.h"
+#include "strv.h"
+#include "syslog-util.h"
+#include "unit-printf.h"
+#include "user-util.h"
+#include "utf8.h"
+
+/* Trivial property getters generated by macro: each expands to an sd-bus
+ * getter that reads an ExecContext field, optionally through an accessor or
+ * to-string conversion, and appends it with the given D-Bus signature.
+ * The non-static ones are shared with other compilation units. */
+BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_exec_output, exec_output, ExecOutput);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_input, exec_input, ExecInput);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_utmp_mode, exec_utmp_mode, ExecUtmpMode);
+BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_exec_preserve_mode, exec_preserve_mode, ExecPreserveMode);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_proc, protect_proc, ProtectProc);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_proc_subset, proc_subset, ProcSubset);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_home, protect_home, ProtectHome);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_system, protect_system, ProtectSystem);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_personality, personality, unsigned long);
+static BUS_DEFINE_PROPERTY_GET(property_get_ioprio, "i", ExecContext, exec_context_get_effective_ioprio);
+static BUS_DEFINE_PROPERTY_GET(property_get_mount_apivfs, "b", ExecContext, exec_context_get_effective_mount_apivfs);
+static BUS_DEFINE_PROPERTY_GET2(property_get_ioprio_class, "i", ExecContext, exec_context_get_effective_ioprio, ioprio_prio_class);
+static BUS_DEFINE_PROPERTY_GET2(property_get_ioprio_priority, "i", ExecContext, exec_context_get_effective_ioprio, ioprio_prio_data);
+static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_empty_string, "s", NULL);
+static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_level, "i", int, LOG_PRI);
+static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_facility, "i", int, LOG_FAC);
+static BUS_DEFINE_PROPERTY_GET(property_get_cpu_affinity_from_numa, "b", ExecContext, exec_context_get_cpu_affinity_from_numa);
+static BUS_DEFINE_PROPERTY_GET(property_get_oom_score_adjust, "i", ExecContext, exec_context_get_oom_score_adjust);
+static BUS_DEFINE_PROPERTY_GET(property_get_nice, "i", ExecContext, exec_context_get_nice);
+static BUS_DEFINE_PROPERTY_GET(property_get_cpu_sched_policy, "i", ExecContext, exec_context_get_cpu_sched_policy);
+static BUS_DEFINE_PROPERTY_GET(property_get_cpu_sched_priority, "i", ExecContext, exec_context_get_cpu_sched_priority);
+static BUS_DEFINE_PROPERTY_GET(property_get_coredump_filter, "t", ExecContext, exec_context_get_coredump_filter);
+static BUS_DEFINE_PROPERTY_GET(property_get_timer_slack_nsec, "t", ExecContext, exec_context_get_timer_slack_nsec);
+
+/* D-Bus getter for EnvironmentFiles: serializes EnvironmentFile= entries as
+ * an array of (path, graceful) pairs. */
+static int property_get_environment_files(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(sb)");
+ if (r < 0)
+ return r;
+
+ STRV_FOREACH(j, c->environment_files) {
+ const char *fn = *j;
+
+ /* A leading '-' in the stored entry means "ignore if the file is
+ * missing"; report the path without the prefix plus that flag. */
+ r = sd_bus_message_append(reply, "(sb)", fn[0] == '-' ? fn + 1 : fn, fn[0] == '-');
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+/* D-Bus getter for CPUAffinity: reports the affinity mask as a byte array.
+ * When cpu_affinity_from_numa is set, the mask is derived from the NUMA node
+ * mask instead of the explicit CPU set. */
+static int property_get_cpu_affinity(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ _cleanup_(cpu_set_reset) CPUSet s = {};
+ _cleanup_free_ uint8_t *array = NULL;
+ size_t allocated = 0; /* initialized: the serialization below is best-effort */
+
+ assert(bus);
+ assert(reply);
+
+ if (c->cpu_affinity_from_numa) {
+ int r;
+
+ r = numa_to_cpu_set(&c->numa_policy, &s);
+ if (r < 0)
+ return r;
+ }
+
+ /* Failure here is deliberately non-fatal; with array == NULL and
+ * allocated == 0 we then report an empty mask instead of reading an
+ * uninitialized length. */
+ (void) cpu_set_to_dbus(c->cpu_affinity_from_numa ? &s : &c->cpu_set, &array, &allocated);
+
+ return sd_bus_message_append_array(reply, 'y', array, allocated);
+}
+
+/* D-Bus getter for NUMAMask: reports the configured NUMA node mask as a
+ * byte array. */
+static int property_get_numa_mask(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ _cleanup_free_ uint8_t *array = NULL;
+ size_t allocated = 0; /* initialized: the serialization below is best-effort */
+
+ assert(bus);
+ assert(reply);
+
+ /* Failure is non-fatal; an empty mask is reported in that case. */
+ (void) cpu_set_to_dbus(&c->numa_policy.nodes, &array, &allocated);
+
+ return sd_bus_message_append_array(reply, 'y', array, allocated);
+}
+
+/* D-Bus getter for NUMAPolicy: reports the policy type as a plain int32. */
+static int property_get_numa_policy(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+ ExecContext *c = ASSERT_PTR(userdata);
+ int32_t policy;
+
+ assert(bus);
+ assert(reply);
+
+ policy = numa_policy_get_type(&c->numa_policy);
+
+ return sd_bus_message_append_basic(reply, 'i', &policy);
+}
+
+/* D-Bus getter for SystemCallFilter: a (b, as) struct — the boolean reports
+ * whether the list is an allow-list, the strings are the syscall names. */
+static int property_get_syscall_filter(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ _cleanup_strv_free_ char **l = NULL;
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'r', "bas");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(reply, "b", c->syscall_allow_list);
+ if (r < 0)
+ return r;
+
+ /* NULL signals allocation failure in the helper, not an empty list. */
+ l = exec_context_get_syscall_filter(c);
+ if (!l)
+ return -ENOMEM;
+
+ r = sd_bus_message_append_strv(reply, l);
+ if (r < 0)
+ return r;
+
+ return sd_bus_message_close_container(reply);
+}
+
+/* D-Bus getter for SystemCallLog: same (b, as) shape as SystemCallFilter,
+ * but for the syscalls whose invocation is logged. */
+static int property_get_syscall_log(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ _cleanup_strv_free_ char **l = NULL;
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'r', "bas");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(reply, "b", c->syscall_log_allow_list);
+ if (r < 0)
+ return r;
+
+ /* NULL signals allocation failure in the helper, not an empty list. */
+ l = exec_context_get_syscall_log(c);
+ if (!l)
+ return -ENOMEM;
+
+ r = sd_bus_message_append_strv(reply, l);
+ if (r < 0)
+ return r;
+
+ return sd_bus_message_close_container(reply);
+}
+
+/* D-Bus getter for SystemCallArchitectures: appends the configured seccomp
+ * architectures directly as an "as" array (no allow-list flag, unlike the
+ * filter/log getters above). */
+static int property_get_syscall_archs(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ _cleanup_strv_free_ char **l = NULL;
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ l = exec_context_get_syscall_archs(c);
+ if (!l)
+ return -ENOMEM;
+
+ r = sd_bus_message_append_strv(reply, l);
+ if (r < 0)
+ return r;
+
+ /* No container was opened here, so success is plain 0. */
+ return 0;
+}
+
+/* D-Bus getter for SELinuxContext: (ignore-errors flag, context string). */
+static int property_get_selinux_context(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ return sd_bus_message_append(reply, "(bs)", c->selinux_context_ignore, c->selinux_context);
+}
+
+/* D-Bus getter for AppArmorProfile: (ignore-errors flag, profile name). */
+static int property_get_apparmor_profile(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ return sd_bus_message_append(reply, "(bs)", c->apparmor_profile_ignore, c->apparmor_profile);
+}
+
+/* D-Bus getter for SmackProcessLabel: (ignore-errors flag, label). */
+static int property_get_smack_process_label(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ return sd_bus_message_append(reply, "(bs)", c->smack_process_label_ignore, c->smack_process_label);
+}
+
+/* D-Bus getter for RestrictAddressFamilies: a (b, as) struct of the
+ * allow-list flag plus the address family names. */
+static int property_get_address_families(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ _cleanup_strv_free_ char **l = NULL;
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'r', "bas");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(reply, "b", c->address_families_allow_list);
+ if (r < 0)
+ return r;
+
+ /* NULL signals allocation failure in the helper, not an empty list. */
+ l = exec_context_get_address_families(c);
+ if (!l)
+ return -ENOMEM;
+
+ r = sd_bus_message_append_strv(reply, l);
+ if (r < 0)
+ return r;
+
+ return sd_bus_message_close_container(reply);
+}
+
+/* D-Bus getter for WorkingDirectory: re-encodes the unit-file syntax —
+ * "~" for the home directory, "!" prefix when a missing directory is
+ * tolerated. */
+static int property_get_working_directory(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ const char *wd;
+
+ assert(bus);
+ assert(reply);
+
+ if (c->working_directory_home)
+ wd = "~";
+ else
+ wd = c->working_directory;
+
+ if (c->working_directory_missing_ok)
+ /* strjoina() is stack-allocated; valid until this function returns. */
+ wd = strjoina("!", wd);
+
+ return sd_bus_message_append(reply, "s", wd);
+}
+
+/* Shared getter for the three Standard*FileDescriptorName properties:
+ * maps the property name to the matching fd number and reports its name. */
+static int property_get_stdio_fdname(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ int fileno;
+
+ assert(bus);
+ assert(property);
+ assert(reply);
+
+ if (streq(property, "StandardInputFileDescriptorName"))
+ fileno = STDIN_FILENO;
+ else if (streq(property, "StandardOutputFileDescriptorName"))
+ fileno = STDOUT_FILENO;
+ else {
+ /* The vtable routes only these three properties here. */
+ assert(streq(property, "StandardErrorFileDescriptorName"));
+ fileno = STDERR_FILENO;
+ }
+
+ return sd_bus_message_append(reply, "s", exec_context_fdname(c, fileno));
+}
+
+/* D-Bus getter for StandardInputData: the raw stdin payload as a byte array. */
+static int property_get_input_data(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(property);
+ assert(reply);
+
+ return sd_bus_message_append_array(reply, 'y', c->stdin_data, c->stdin_data_size);
+}
+
+/* D-Bus getter for RestrictFileSystems: a (b, as) struct of the allow-list
+ * flag plus the filesystem names. */
+static int property_get_restrict_filesystems(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ /* NOTE(review): _cleanup_free_ (not _cleanup_strv_free_) — presumably the
+ * returned vector borrows its strings from the ExecContext and only the
+ * array itself is owned here; confirm against
+ * exec_context_get_restrict_filesystems(). */
+ _cleanup_free_ char **l = NULL;
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'r', "bas");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(reply, "b", c->restrict_filesystems_allow_list);
+ if (r < 0)
+ return r;
+
+ l = exec_context_get_restrict_filesystems(c);
+ if (!l)
+ return -ENOMEM;
+
+ r = sd_bus_message_append_strv(reply, l);
+ if (r < 0)
+ return r;
+
+ return sd_bus_message_close_container(reply);
+}
+
+/* Shared getter for BindPaths and BindReadOnlyPaths: the property name
+ * selects which subset of the bind-mount list is reported. */
+static int property_get_bind_paths(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ bool ro;
+ int r;
+
+ assert(bus);
+ assert(property);
+ assert(reply);
+
+ /* Pointer-to-bool conversion: true iff the name contains "ReadOnly". */
+ ro = strstr(property, "ReadOnly");
+
+ r = sd_bus_message_open_container(reply, 'a', "(ssbt)");
+ if (r < 0)
+ return r;
+
+ for (size_t i = 0; i < c->n_bind_mounts; i++) {
+
+ if (ro != c->bind_mounts[i].read_only)
+ continue;
+
+ r = sd_bus_message_append(
+ reply, "(ssbt)",
+ c->bind_mounts[i].source,
+ c->bind_mounts[i].destination,
+ c->bind_mounts[i].ignore_enoent,
+ c->bind_mounts[i].recursive ? (uint64_t) MS_REC : UINT64_C(0));
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+/* D-Bus getter for TemporaryFileSystems: (mount path, mount options) pairs. */
+static int property_get_temporary_filesystems(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(property);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ for (unsigned i = 0; i < c->n_temporary_filesystems; i++) {
+ TemporaryFileSystem *t = c->temporary_filesystems + i;
+
+ r = sd_bus_message_append(
+ reply, "(ss)",
+ t->path,
+ t->options);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+/* D-Bus getter for LogExtraFields: each journal field is one byte array,
+ * serialized from the stored iovecs. */
+static int property_get_log_extra_fields(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(property);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "ay");
+ if (r < 0)
+ return r;
+
+ for (size_t i = 0; i < c->n_log_extra_fields; i++) {
+ r = sd_bus_message_append_array(reply, 'y', c->log_extra_fields[i].iov_base, c->log_extra_fields[i].iov_len);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+/* Helper for property_get_log_filter_patterns(): appends every pattern in
+ * 'patterns' as a (b, s) pair tagged with the given allow-list flag. */
+static int sd_bus_message_append_log_filter_patterns(sd_bus_message *reply, Set *patterns, bool is_allowlist) {
+ const char *pattern;
+ int r;
+
+ assert(reply);
+
+ SET_FOREACH(pattern, patterns) {
+ r = sd_bus_message_append(reply, "(bs)", is_allowlist, pattern);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+/* D-Bus getter for LogFilterPatterns: allowed patterns first (flag true),
+ * then denied patterns (flag false), as an array of (b, s). */
+static int property_get_log_filter_patterns(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ /* ASSERT_PTR() + assert(bus) for consistency with the sibling getters. */
+ ExecContext *c = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(bs)");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append_log_filter_patterns(reply, c->log_filter_allowed_patterns,
+ /* is_allowlist = */ true);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append_log_filter_patterns(reply, c->log_filter_denied_patterns,
+ /* is_allowlist = */ false);
+ if (r < 0)
+ return r;
+
+ return sd_bus_message_close_container(reply);
+}
+
+/* Shared getter for SetCredential and SetCredentialEncrypted: walks the
+ * set-credential hashmap and reports only entries whose 'encrypted' flag
+ * matches the property being queried. */
+static int property_get_set_credential(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ ExecSetCredential *sc;
+ int r;
+
+ assert(bus);
+ assert(property);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(say)");
+ if (r < 0)
+ return r;
+
+ HASHMAP_FOREACH(sc, c->set_credentials) {
+
+ if (sc->encrypted != streq(property, "SetCredentialEncrypted"))
+ continue;
+
+ /* Built member-wise because the payload is a raw byte array, which
+ * a single sd_bus_message_append() format cannot express. */
+ r = sd_bus_message_open_container(reply, 'r', "say");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(reply, "s", sc->id);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append_array(reply, 'y', sc->data, sc->size);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+/* Shared getter for LoadCredential and LoadCredentialEncrypted: reports
+ * (id, source path) pairs whose 'encrypted' flag matches the property. */
+static int property_get_load_credential(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ ExecLoadCredential *lc;
+ int r;
+
+ assert(bus);
+ assert(property);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ HASHMAP_FOREACH(lc, c->load_credentials) {
+
+ if (lc->encrypted != streq(property, "LoadCredentialEncrypted"))
+ continue;
+
+ r = sd_bus_message_append(reply, "(ss)", lc->id, lc->path);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+/* D-Bus getter for RootHash: the verity root hash as a byte array. */
+static int property_get_root_hash(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(property);
+ assert(reply);
+
+ return sd_bus_message_append_array(reply, 'y', c->root_hash, c->root_hash_size);
+}
+
+/* D-Bus getter for RootHashSignature: the root hash signature as a byte array. */
+static int property_get_root_hash_sig(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(property);
+ assert(reply);
+
+ return sd_bus_message_append_array(reply, 'y', c->root_hash_sig, c->root_hash_sig_size);
+}
+
+/* D-Bus getter for RootImageOptions: (partition designator, mount options)
+ * pairs from the root image's mount-options list. */
+static int property_get_root_image_options(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(property);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(mount_options, m, c->root_image_options) {
+ r = sd_bus_message_append(reply, "(ss)",
+ partition_designator_to_string(m->partition_designator),
+ m->options);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+/* D-Bus getter for MountImages: each entry is (source, destination,
+ * ignore-enoent, per-partition mount options); the nested array forces
+ * member-wise container construction. */
+static int property_get_mount_images(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(property);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(ssba(ss))");
+ if (r < 0)
+ return r;
+
+ for (size_t i = 0; i < c->n_mount_images; i++) {
+ r = sd_bus_message_open_container(reply, SD_BUS_TYPE_STRUCT, "ssba(ss)");
+ if (r < 0)
+ return r;
+ r = sd_bus_message_append(
+ reply, "ssb",
+ c->mount_images[i].source,
+ c->mount_images[i].destination,
+ c->mount_images[i].ignore_enoent);
+ if (r < 0)
+ return r;
+ r = sd_bus_message_open_container(reply, 'a', "(ss)");
+ if (r < 0)
+ return r;
+ LIST_FOREACH(mount_options, m, c->mount_images[i].mount_options) {
+ r = sd_bus_message_append(reply, "(ss)",
+ partition_designator_to_string(m->partition_designator),
+ m->options);
+ if (r < 0)
+ return r;
+ }
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+/* D-Bus getter for ExtensionImages: like MountImages but without a
+ * destination — each entry is (source, ignore-enoent, per-partition
+ * mount options). */
+static int property_get_extension_images(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecContext *c = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(property);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(sba(ss))");
+ if (r < 0)
+ return r;
+
+ for (size_t i = 0; i < c->n_extension_images; i++) {
+ r = sd_bus_message_open_container(reply, SD_BUS_TYPE_STRUCT, "sba(ss)");
+ if (r < 0)
+ return r;
+ r = sd_bus_message_append(
+ reply, "sb",
+ c->extension_images[i].source,
+ c->extension_images[i].ignore_enoent);
+ if (r < 0)
+ return r;
+ r = sd_bus_message_open_container(reply, 'a', "(ss)");
+ if (r < 0)
+ return r;
+ LIST_FOREACH(mount_options, m, c->extension_images[i].mount_options) {
+ r = sd_bus_message_append(reply, "(ss)",
+ partition_designator_to_string(m->partition_designator),
+ m->options);
+ if (r < 0)
+ return r;
+ }
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+/* Shared getter for the *Directory properties: reports the configured
+ * directory paths as a string array. */
+static int bus_property_get_exec_dir(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecDirectory *d = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(property);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "s");
+ if (r < 0)
+ return r;
+
+ for (size_t i = 0; i < d->n_items; i++) {
+ r = sd_bus_message_append_basic(reply, 's', d->items[i].path);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+/* Shared getter for the *DirectorySymlink properties: flattens each
+ * directory's symlink list into (source path, destination, flags) triples. */
+static int bus_property_get_exec_dir_symlink(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ExecDirectory *d = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(property);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(sst)");
+ if (r < 0)
+ return r;
+
+ for (size_t i = 0; i < d->n_items; i++)
+ STRV_FOREACH(dst, d->items[i].symlinks) {
+ r = sd_bus_message_append(reply, "(sst)", d->items[i].path, *dst, UINT64_C(0) /* flags, unused for now */);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+/* Shared getter for the image policy properties: reports the policy in its
+ * simplified string form, substituting the built-in service default when
+ * none is configured. */
+static int property_get_image_policy(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ImagePolicy **pp = ASSERT_PTR(userdata);
+ _cleanup_free_ char *s = NULL;
+
+ int r;
+
+ assert(bus);
+ assert(property);
+ assert(reply);
+
+ /* NULL policy means "use the default for services". */
+ r = image_policy_to_string(*pp ?: &image_policy_service, /* simplify= */ true, &s);
+ if (r < 0)
+ return r;
+
+ return sd_bus_message_append(reply, "s", s);
+}
+
+const sd_bus_vtable bus_exec_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ SD_BUS_PROPERTY("Environment", "as", NULL, offsetof(ExecContext, environment), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("EnvironmentFiles", "a(sb)", property_get_environment_files, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PassEnvironment", "as", NULL, offsetof(ExecContext, pass_environment), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("UnsetEnvironment", "as", NULL, offsetof(ExecContext, unset_environment), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("UMask", "u", bus_property_get_mode, offsetof(ExecContext, umask), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitCPU", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_CPU]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitCPUSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_CPU]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitFSIZE", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_FSIZE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitFSIZESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_FSIZE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitDATA", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_DATA]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitDATASoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_DATA]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitSTACK", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_STACK]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitSTACKSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_STACK]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitCORE", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_CORE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitCORESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_CORE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitRSS", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RSS]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitRSSSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RSS]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitNOFILE", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NOFILE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitNOFILESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NOFILE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitAS", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_AS]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitASSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_AS]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitNPROC", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NPROC]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitNPROCSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NPROC]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitMEMLOCK", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_MEMLOCK]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitMEMLOCKSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_MEMLOCK]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitLOCKS", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_LOCKS]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitLOCKSSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_LOCKS]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitSIGPENDING", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_SIGPENDING]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitSIGPENDINGSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_SIGPENDING]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitMSGQUEUE", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_MSGQUEUE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitMSGQUEUESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_MSGQUEUE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitNICE", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NICE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitNICESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_NICE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitRTPRIO", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RTPRIO]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitRTPRIOSoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RTPRIO]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitRTTIME", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LimitRTTIMESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("WorkingDirectory", "s", property_get_working_directory, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RootDirectory", "s", NULL, offsetof(ExecContext, root_directory), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RootImage", "s", NULL, offsetof(ExecContext, root_image), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RootImageOptions", "a(ss)", property_get_root_image_options, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RootHash", "ay", property_get_root_hash, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RootHashPath", "s", NULL, offsetof(ExecContext, root_hash_path), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RootHashSignature", "ay", property_get_root_hash_sig, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RootHashSignaturePath", "s", NULL, offsetof(ExecContext, root_hash_sig_path), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RootVerity", "s", NULL, offsetof(ExecContext, root_verity), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RootEphemeral", "b", bus_property_get_bool, offsetof(ExecContext, root_ephemeral), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ExtensionDirectories", "as", NULL, offsetof(ExecContext, extension_directories), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ExtensionImages", "a(sba(ss))", property_get_extension_images, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("MountImages", "a(ssba(ss))", property_get_mount_images, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("OOMScoreAdjust", "i", property_get_oom_score_adjust, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CoredumpFilter", "t", property_get_coredump_filter, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Nice", "i", property_get_nice, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("IOSchedulingClass", "i", property_get_ioprio_class, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("IOSchedulingPriority", "i", property_get_ioprio_priority, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CPUSchedulingPolicy", "i", property_get_cpu_sched_policy, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CPUSchedulingPriority", "i", property_get_cpu_sched_priority, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CPUAffinity", "ay", property_get_cpu_affinity, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CPUAffinityFromNUMA", "b", property_get_cpu_affinity_from_numa, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("NUMAPolicy", "i", property_get_numa_policy, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("NUMAMask", "ay", property_get_numa_mask, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TimerSlackNSec", "t", property_get_timer_slack_nsec, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CPUSchedulingResetOnFork", "b", bus_property_get_bool, offsetof(ExecContext, cpu_sched_reset_on_fork), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("NonBlocking", "b", bus_property_get_bool, offsetof(ExecContext, non_blocking), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("StandardInput", "s", property_get_exec_input, offsetof(ExecContext, std_input), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("StandardInputFileDescriptorName", "s", property_get_stdio_fdname, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("StandardInputData", "ay", property_get_input_data, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("StandardOutput", "s", bus_property_get_exec_output, offsetof(ExecContext, std_output), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("StandardOutputFileDescriptorName", "s", property_get_stdio_fdname, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("StandardError", "s", bus_property_get_exec_output, offsetof(ExecContext, std_error), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("StandardErrorFileDescriptorName", "s", property_get_stdio_fdname, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TTYPath", "s", NULL, offsetof(ExecContext, tty_path), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TTYReset", "b", bus_property_get_bool, offsetof(ExecContext, tty_reset), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TTYVHangup", "b", bus_property_get_bool, offsetof(ExecContext, tty_vhangup), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TTYVTDisallocate", "b", bus_property_get_bool, offsetof(ExecContext, tty_vt_disallocate), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TTYRows", "q", bus_property_get_unsigned, offsetof(ExecContext, tty_rows), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TTYColumns", "q", bus_property_get_unsigned, offsetof(ExecContext, tty_cols), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SyslogPriority", "i", bus_property_get_int, offsetof(ExecContext, syslog_priority), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SyslogIdentifier", "s", NULL, offsetof(ExecContext, syslog_identifier), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SyslogLevelPrefix", "b", bus_property_get_bool, offsetof(ExecContext, syslog_level_prefix), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SyslogLevel", "i", property_get_syslog_level, offsetof(ExecContext, syslog_priority), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SyslogFacility", "i", property_get_syslog_facility, offsetof(ExecContext, syslog_priority), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LogLevelMax", "i", bus_property_get_int, offsetof(ExecContext, log_level_max), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LogRateLimitIntervalUSec", "t", bus_property_get_usec, offsetof(ExecContext, log_ratelimit_interval_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LogRateLimitBurst", "u", bus_property_get_unsigned, offsetof(ExecContext, log_ratelimit_burst), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LogExtraFields", "aay", property_get_log_extra_fields, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LogFilterPatterns", "a(bs)", property_get_log_filter_patterns, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LogNamespace", "s", NULL, offsetof(ExecContext, log_namespace), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SecureBits", "i", bus_property_get_int, offsetof(ExecContext, secure_bits), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CapabilityBoundingSet", "t", NULL, offsetof(ExecContext, capability_bounding_set), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("AmbientCapabilities", "t", NULL, offsetof(ExecContext, capability_ambient_set), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("User", "s", NULL, offsetof(ExecContext, user), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Group", "s", NULL, offsetof(ExecContext, group), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DynamicUser", "b", bus_property_get_bool, offsetof(ExecContext, dynamic_user), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SetLoginEnvironment", "b", bus_property_get_tristate, offsetof(ExecContext, set_login_environment), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RemoveIPC", "b", bus_property_get_bool, offsetof(ExecContext, remove_ipc), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SetCredential", "a(say)", property_get_set_credential, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SetCredentialEncrypted", "a(say)", property_get_set_credential, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LoadCredential", "a(ss)", property_get_load_credential, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LoadCredentialEncrypted", "a(ss)", property_get_load_credential, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ImportCredential", "as", bus_property_get_string_set, offsetof(ExecContext, import_credentials), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SupplementaryGroups", "as", NULL, offsetof(ExecContext, supplementary_groups), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PAMName", "s", NULL, offsetof(ExecContext, pam_name), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ReadWritePaths", "as", NULL, offsetof(ExecContext, read_write_paths), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ReadOnlyPaths", "as", NULL, offsetof(ExecContext, read_only_paths), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("InaccessiblePaths", "as", NULL, offsetof(ExecContext, inaccessible_paths), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ExecPaths", "as", NULL, offsetof(ExecContext, exec_paths), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("NoExecPaths", "as", NULL, offsetof(ExecContext, no_exec_paths), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ExecSearchPath", "as", NULL, offsetof(ExecContext, exec_search_path), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("MountFlags", "t", bus_property_get_ulong, offsetof(ExecContext, mount_propagation_flag), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_bool, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PrivateDevices", "b", bus_property_get_bool, offsetof(ExecContext, private_devices), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ProtectClock", "b", bus_property_get_bool, offsetof(ExecContext, protect_clock), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ProtectKernelTunables", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_tunables), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ProtectKernelModules", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_modules), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ProtectKernelLogs", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_logs), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ProtectControlGroups", "b", bus_property_get_bool, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PrivateUsers", "b", bus_property_get_bool, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PrivateMounts", "b", bus_property_get_tristate, offsetof(ExecContext, private_mounts), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PrivateIPC", "b", bus_property_get_bool, offsetof(ExecContext, private_ipc), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ProtectHome", "s", property_get_protect_home, offsetof(ExecContext, protect_home), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ProtectSystem", "s", property_get_protect_system, offsetof(ExecContext, protect_system), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SameProcessGroup", "b", bus_property_get_bool, offsetof(ExecContext, same_pgrp), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("UtmpIdentifier", "s", NULL, offsetof(ExecContext, utmp_id), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("UtmpMode", "s", property_get_exec_utmp_mode, offsetof(ExecContext, utmp_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SELinuxContext", "(bs)", property_get_selinux_context, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("AppArmorProfile", "(bs)", property_get_apparmor_profile, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SmackProcessLabel", "(bs)", property_get_smack_process_label, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("IgnoreSIGPIPE", "b", bus_property_get_bool, offsetof(ExecContext, ignore_sigpipe), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("NoNewPrivileges", "b", bus_property_get_bool, offsetof(ExecContext, no_new_privileges), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SystemCallFilter", "(bas)", property_get_syscall_filter, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SystemCallArchitectures", "as", property_get_syscall_archs, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SystemCallErrorNumber", "i", bus_property_get_int, offsetof(ExecContext, syscall_errno), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SystemCallLog", "(bas)", property_get_syscall_log, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Personality", "s", property_get_personality, offsetof(ExecContext, personality), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LockPersonality", "b", bus_property_get_bool, offsetof(ExecContext, lock_personality), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RestrictAddressFamilies", "(bas)", property_get_address_families, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RuntimeDirectorySymlink", "a(sst)", bus_property_get_exec_dir_symlink, offsetof(ExecContext, directories[EXEC_DIRECTORY_RUNTIME]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RuntimeDirectoryPreserve", "s", bus_property_get_exec_preserve_mode, offsetof(ExecContext, runtime_directory_preserve_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RuntimeDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_RUNTIME].mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RuntimeDirectory", "as", bus_property_get_exec_dir, offsetof(ExecContext, directories[EXEC_DIRECTORY_RUNTIME]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("StateDirectorySymlink", "a(sst)", bus_property_get_exec_dir_symlink, offsetof(ExecContext, directories[EXEC_DIRECTORY_STATE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("StateDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_STATE].mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("StateDirectory", "as", bus_property_get_exec_dir, offsetof(ExecContext, directories[EXEC_DIRECTORY_STATE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CacheDirectorySymlink", "a(sst)", bus_property_get_exec_dir_symlink, offsetof(ExecContext, directories[EXEC_DIRECTORY_CACHE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CacheDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_CACHE].mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CacheDirectory", "as", bus_property_get_exec_dir, offsetof(ExecContext, directories[EXEC_DIRECTORY_CACHE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LogsDirectorySymlink", "a(sst)", bus_property_get_exec_dir_symlink, offsetof(ExecContext, directories[EXEC_DIRECTORY_LOGS]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LogsDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_LOGS].mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LogsDirectory", "as", bus_property_get_exec_dir, offsetof(ExecContext, directories[EXEC_DIRECTORY_LOGS]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ConfigurationDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_CONFIGURATION].mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ConfigurationDirectory", "as", bus_property_get_exec_dir, offsetof(ExecContext, directories[EXEC_DIRECTORY_CONFIGURATION]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TimeoutCleanUSec", "t", bus_property_get_usec, offsetof(ExecContext, timeout_clean_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("MemoryDenyWriteExecute", "b", bus_property_get_bool, offsetof(ExecContext, memory_deny_write_execute), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RestrictRealtime", "b", bus_property_get_bool, offsetof(ExecContext, restrict_realtime), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RestrictSUIDSGID", "b", bus_property_get_bool, offsetof(ExecContext, restrict_suid_sgid), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RestrictNamespaces", "t", bus_property_get_ulong, offsetof(ExecContext, restrict_namespaces), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RestrictFileSystems", "(bas)", property_get_restrict_filesystems, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("BindPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("BindReadOnlyPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TemporaryFileSystem", "a(ss)", property_get_temporary_filesystems, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("MountAPIVFS", "b", property_get_mount_apivfs, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("KeyringMode", "s", property_get_exec_keyring_mode, offsetof(ExecContext, keyring_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ProtectProc", "s", property_get_protect_proc, offsetof(ExecContext, protect_proc), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ProcSubset", "s", property_get_proc_subset, offsetof(ExecContext, proc_subset), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ProtectHostname", "b", bus_property_get_bool, offsetof(ExecContext, protect_hostname), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("MemoryKSM", "b", bus_property_get_tristate, offsetof(ExecContext, memory_ksm), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("NetworkNamespacePath", "s", NULL, offsetof(ExecContext, network_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("IPCNamespacePath", "s", NULL, offsetof(ExecContext, ipc_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RootImagePolicy", "s", property_get_image_policy, offsetof(ExecContext, root_image_policy), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("MountImagePolicy", "s", property_get_image_policy, offsetof(ExecContext, mount_image_policy), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ExtensionImagePolicy", "s", property_get_image_policy, offsetof(ExecContext, extension_image_policy), SD_BUS_VTABLE_PROPERTY_CONST),
+
+ /* Obsolete/redundant properties: */
+ SD_BUS_PROPERTY("Capabilities", "s", property_get_empty_string, 0, SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_PROPERTY("ReadWriteDirectories", "as", NULL, offsetof(ExecContext, read_write_paths), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_PROPERTY("ReadOnlyDirectories", "as", NULL, offsetof(ExecContext, read_only_paths), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_PROPERTY("InaccessibleDirectories", "as", NULL, offsetof(ExecContext, inaccessible_paths), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_PROPERTY("IOScheduling", "i", property_get_ioprio, 0, SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+
+ SD_BUS_VTABLE_END
+};
+
+static int append_exec_command(sd_bus_message *reply, ExecCommand *c) {
+ int r;
+
+ assert(reply);
+ assert(c);
+
+ if (!c->path)
+ return 0;
+
+ r = sd_bus_message_open_container(reply, 'r', "sasbttttuii");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(reply, "s", c->path);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append_strv(reply, c->argv);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(reply, "bttttuii",
+ !!(c->flags & EXEC_COMMAND_IGNORE_FAILURE),
+ c->exec_status.start_timestamp.realtime,
+ c->exec_status.start_timestamp.monotonic,
+ c->exec_status.exit_timestamp.realtime,
+ c->exec_status.exit_timestamp.monotonic,
+ (uint32_t) c->exec_status.pid,
+ (int32_t) c->exec_status.code,
+ (int32_t) c->exec_status.status);
+ if (r < 0)
+ return r;
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int append_exec_ex_command(sd_bus_message *reply, ExecCommand *c) {
+ _cleanup_strv_free_ char **ex_opts = NULL;
+ int r;
+
+ assert(reply);
+ assert(c);
+
+ if (!c->path)
+ return 0;
+
+ r = sd_bus_message_open_container(reply, 'r', "sasasttttuii");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(reply, "s", c->path);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append_strv(reply, c->argv);
+ if (r < 0)
+ return r;
+
+ r = exec_command_flags_to_strv(c->flags, &ex_opts);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append_strv(reply, ex_opts);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(reply, "ttttuii",
+ c->exec_status.start_timestamp.realtime,
+ c->exec_status.start_timestamp.monotonic,
+ c->exec_status.exit_timestamp.realtime,
+ c->exec_status.exit_timestamp.monotonic,
+ (uint32_t) c->exec_status.pid,
+ (int32_t) c->exec_status.code,
+ (int32_t) c->exec_status.status);
+ if (r < 0)
+ return r;
+
+ return sd_bus_message_close_container(reply);
+}
+
+int bus_property_get_exec_command(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *ret_error) {
+
+ ExecCommand *c = (ExecCommand*) userdata;
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(sasbttttuii)");
+ if (r < 0)
+ return r;
+
+ r = append_exec_command(reply, c);
+ if (r < 0)
+ return r;
+
+ return sd_bus_message_close_container(reply);
+}
+
+int bus_property_get_exec_command_list(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *ret_error) {
+
+ ExecCommand *exec_command = *(ExecCommand**) userdata;
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(sasbttttuii)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(command, c, exec_command) {
+ r = append_exec_command(reply, c);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+int bus_property_get_exec_ex_command_list(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *ret_error) {
+
+ ExecCommand *exec_command = *(ExecCommand**) userdata;
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(sasasttttuii)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(command, c, exec_command) {
+ r = append_exec_ex_command(reply, c);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static char *exec_command_flags_to_exec_chars(ExecCommandFlags flags) {
+ return strjoin(FLAGS_SET(flags, EXEC_COMMAND_IGNORE_FAILURE) ? "-" : "",
+ FLAGS_SET(flags, EXEC_COMMAND_NO_ENV_EXPAND) ? ":" : "",
+ FLAGS_SET(flags, EXEC_COMMAND_FULLY_PRIVILEGED) ? "+" : "",
+ FLAGS_SET(flags, EXEC_COMMAND_NO_SETUID) ? "!" : "",
+ FLAGS_SET(flags, EXEC_COMMAND_AMBIENT_MAGIC) ? "!!" : "");
+}
+
+int bus_set_transient_exec_command(
+ Unit *u,
+ const char *name,
+ ExecCommand **exec_command,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+ bool is_ex_prop = endswith(name, "Ex");
+ unsigned n = 0;
+ int r;
+
+ /* Drop Ex from the written setting. E.g. ExecStart=, not ExecStartEx=. */
+ const char *written_name = is_ex_prop ? strndupa(name, strlen(name) - 2) : name;
+
+ r = sd_bus_message_enter_container(message, 'a', is_ex_prop ? "(sasas)" : "(sasb)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_enter_container(message, 'r', is_ex_prop ? "sasas" : "sasb")) > 0) {
+ _cleanup_strv_free_ char **argv = NULL, **ex_opts = NULL;
+ const char *path;
+ int b;
+
+ r = sd_bus_message_read(message, "s", &path);
+ if (r < 0)
+ return r;
+
+ if (!path_is_absolute(path) && !filename_is_valid(path))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "\"%s\" is neither a valid executable name nor an absolute path",
+ path);
+
+ r = sd_bus_message_read_strv(message, &argv);
+ if (r < 0)
+ return r;
+
+ if (strv_isempty(argv))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "\"%s\" argv cannot be empty", name);
+
+ r = is_ex_prop ? sd_bus_message_read_strv(message, &ex_opts) : sd_bus_message_read(message, "b", &b);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ ExecCommand *c;
+
+ c = new0(ExecCommand, 1);
+ if (!c)
+ return -ENOMEM;
+
+ c->path = strdup(path);
+ if (!c->path) {
+ free(c);
+ return -ENOMEM;
+ }
+
+ c->argv = TAKE_PTR(argv);
+
+ if (is_ex_prop) {
+ r = exec_command_flags_from_strv(ex_opts, &c->flags);
+ if (r < 0)
+ return r;
+ } else
+ c->flags = b ? EXEC_COMMAND_IGNORE_FAILURE : 0;
+
+ path_simplify(c->path);
+ exec_command_append_list(exec_command, c);
+ }
+
+ n++;
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_(memstream_done) MemStream m = {};
+ _cleanup_free_ char *buf = NULL;
+ FILE *f;
+
+ if (n == 0)
+ *exec_command = exec_command_free_list(*exec_command);
+
+ f = memstream_init(&m);
+ if (!f)
+ return -ENOMEM;
+
+ fprintf(f, "%s=\n", written_name);
+
+ LIST_FOREACH(command, c, *exec_command) {
+ _cleanup_free_ char *a = NULL, *exec_chars = NULL;
+ UnitWriteFlags esc_flags = UNIT_ESCAPE_SPECIFIERS |
+ (FLAGS_SET(c->flags, EXEC_COMMAND_NO_ENV_EXPAND) ? UNIT_ESCAPE_EXEC_SYNTAX : UNIT_ESCAPE_EXEC_SYNTAX_ENV);
+
+ exec_chars = exec_command_flags_to_exec_chars(c->flags);
+ if (!exec_chars)
+ return -ENOMEM;
+
+ a = unit_concat_strv(c->argv, esc_flags);
+ if (!a)
+ return -ENOMEM;
+
+ if (streq_ptr(c->path, c->argv ? c->argv[0] : NULL))
+ fprintf(f, "%s=%s%s\n", written_name, exec_chars, a);
+ else {
+ _cleanup_free_ char *t = NULL;
+ const char *p;
+
+ p = unit_escape_setting(c->path, esc_flags, &t);
+ if (!p)
+ return -ENOMEM;
+
+ fprintf(f, "%s=%s@%s %s\n", written_name, exec_chars, p, a);
+ }
+ }
+
+ r = memstream_finalize(&m, &buf, NULL);
+ if (r < 0)
+ return r;
+
+ unit_write_setting(u, flags, written_name, buf);
+ }
+
+ return 1;
+}
+
+static int parse_personality(const char *s, unsigned long *p) {
+ unsigned long v;
+
+ assert(p);
+
+ v = personality_from_string(s);
+ if (v == PERSONALITY_INVALID)
+ return -EINVAL;
+
+ *p = v;
+ return 0;
+}
+
+static const char* mount_propagation_flag_to_string_with_check(unsigned long n) {
+ if (!mount_propagation_flag_is_valid(n))
+ return NULL;
+
+ return mount_propagation_flag_to_string(n);
+}
+
+/* Macro-generated setters for simple transient exec-context properties
+ * (bus_set_transient_<name>): each validates/parses the wire value and writes
+ * the corresponding unit-file assignment. NOTE(review): the macro family is
+ * declared outside this chunk — presumably in bus-util.h; confirm there. */
+static BUS_DEFINE_SET_TRANSIENT(nsec, "t", uint64_t, nsec_t, NSEC_FMT);
+static BUS_DEFINE_SET_TRANSIENT_IS_VALID(log_level, "i", int32_t, int, "%" PRIi32, log_level_is_valid);
+#if HAVE_SECCOMP
+static BUS_DEFINE_SET_TRANSIENT_IS_VALID(errno, "i", int32_t, int, "%" PRIi32, seccomp_errno_or_action_is_valid);
+#endif
+static BUS_DEFINE_SET_TRANSIENT_PARSE(std_input, ExecInput, exec_input_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(std_output, ExecOutput, exec_output_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(utmp_mode, ExecUtmpMode, exec_utmp_mode_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_system, ProtectSystem, protect_system_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_home, ProtectHome, protect_home_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(keyring_mode, ExecKeyringMode, exec_keyring_mode_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_proc, ProtectProc, protect_proc_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(proc_subset, ProcSubset, proc_subset_from_string);
+/* Non-static: also used from other compilation units. */
+BUS_DEFINE_SET_TRANSIENT_PARSE(exec_preserve_mode, ExecPreserveMode, exec_preserve_mode_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE_PTR(personality, unsigned long, parse_personality);
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(secure_bits, "i", int32_t, int, "%" PRIi32, secure_bits_to_string_alloc_with_check);
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(capability, "t", uint64_t, uint64_t, "%" PRIu64, capability_set_to_string);
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(namespace_flag, "t", uint64_t, unsigned long, "%" PRIu64, namespace_flags_to_string);
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING(mount_propagation_flag, "t", uint64_t, unsigned long, "%" PRIu64, mount_propagation_flag_to_string_with_check);
+
+int bus_exec_context_set_transient_property(
+ Unit *u,
+ ExecContext *c,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ const char *suffix;
+ int r;
+
+ assert(u);
+ assert(c);
+ assert(name);
+ assert(message);
+
+ flags |= UNIT_PRIVATE;
+
+ if (streq(name, "User"))
+ return bus_set_transient_user_relaxed(u, name, &c->user, message, flags, error);
+
+ if (streq(name, "Group"))
+ return bus_set_transient_user_relaxed(u, name, &c->group, message, flags, error);
+
+ if (streq(name, "SetLoginEnvironment"))
+ return bus_set_transient_tristate(u, name, &c->set_login_environment, message, flags, error);
+
+ if (streq(name, "TTYPath"))
+ return bus_set_transient_path(u, name, &c->tty_path, message, flags, error);
+
+ if (streq(name, "RootImage"))
+ return bus_set_transient_path(u, name, &c->root_image, message, flags, error);
+
+ if (streq(name, "RootImageOptions")) {
+ _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
+ _cleanup_free_ char *format_str = NULL;
+
+ r = bus_read_mount_options(message, error, &options, &format_str, " ");
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (options) {
+ LIST_JOIN(mount_options, c->root_image_options, options);
+ unit_write_settingf(
+ u, flags|UNIT_ESCAPE_SPECIFIERS, name,
+ "%s=%s",
+ name,
+ format_str);
+ } else {
+ c->root_image_options = mount_options_free_all(c->root_image_options);
+ unit_write_settingf(u, flags, name, "%s=", name);
+ }
+ }
+
+ return 1;
+ }
+
+ if (streq(name, "RootHash")) {
+ const void *roothash_decoded;
+ size_t roothash_decoded_size;
+
+ r = sd_bus_message_read_array(message, 'y', &roothash_decoded, &roothash_decoded_size);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *encoded = NULL;
+
+ if (roothash_decoded_size == 0) {
+ c->root_hash_path = mfree(c->root_hash_path);
+ c->root_hash = mfree(c->root_hash);
+ c->root_hash_size = 0;
+
+ unit_write_settingf(u, flags, name, "RootHash=");
+ } else {
+ _cleanup_free_ void *p = NULL;
+
+ encoded = hexmem(roothash_decoded, roothash_decoded_size);
+ if (!encoded)
+ return -ENOMEM;
+
+ p = memdup(roothash_decoded, roothash_decoded_size);
+ if (!p)
+ return -ENOMEM;
+
+ free_and_replace(c->root_hash, p);
+ c->root_hash_size = roothash_decoded_size;
+ c->root_hash_path = mfree(c->root_hash_path);
+
+ unit_write_settingf(u, flags, name, "RootHash=%s", encoded);
+ }
+ }
+
+ return 1;
+ }
+
+ if (streq(name, "RootHashPath")) {
+ c->root_hash_size = 0;
+ c->root_hash = mfree(c->root_hash);
+
+ return bus_set_transient_path(u, "RootHash", &c->root_hash_path, message, flags, error);
+ }
+
+ if (streq(name, "RootHashSignature")) {
+ const void *roothash_sig_decoded;
+ size_t roothash_sig_decoded_size;
+
+ r = sd_bus_message_read_array(message, 'y', &roothash_sig_decoded, &roothash_sig_decoded_size);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *encoded = NULL;
+
+ if (roothash_sig_decoded_size == 0) {
+ c->root_hash_sig_path = mfree(c->root_hash_sig_path);
+ c->root_hash_sig = mfree(c->root_hash_sig);
+ c->root_hash_sig_size = 0;
+
+ unit_write_settingf(u, flags, name, "RootHashSignature=");
+ } else {
+ _cleanup_free_ void *p = NULL;
+ ssize_t len;
+
+ len = base64mem(roothash_sig_decoded, roothash_sig_decoded_size, &encoded);
+ if (len < 0)
+ return -ENOMEM;
+
+ p = memdup(roothash_sig_decoded, roothash_sig_decoded_size);
+ if (!p)
+ return -ENOMEM;
+
+ free_and_replace(c->root_hash_sig, p);
+ c->root_hash_sig_size = roothash_sig_decoded_size;
+ c->root_hash_sig_path = mfree(c->root_hash_sig_path);
+
+ unit_write_settingf(u, flags, name, "RootHashSignature=base64:%s", encoded);
+ }
+ }
+
+ return 1;
+ }
+
+ if (streq(name, "RootHashSignaturePath")) {
+ c->root_hash_sig_size = 0;
+ c->root_hash_sig = mfree(c->root_hash_sig);
+
+ return bus_set_transient_path(u, "RootHashSignature", &c->root_hash_sig_path, message, flags, error);
+ }
+
+ if (streq(name, "RootVerity"))
+ return bus_set_transient_path(u, name, &c->root_verity, message, flags, error);
+
+ if (streq(name, "RootDirectory"))
+ return bus_set_transient_path(u, name, &c->root_directory, message, flags, error);
+
+ if (streq(name, "RootEphemeral"))
+ return bus_set_transient_bool(u, name, &c->root_ephemeral, message, flags, error);
+
+ if (streq(name, "SyslogIdentifier"))
+ return bus_set_transient_string(u, name, &c->syslog_identifier, message, flags, error);
+
+ if (streq(name, "LogLevelMax"))
+ return bus_set_transient_log_level(u, name, &c->log_level_max, message, flags, error);
+
+ if (streq(name, "LogRateLimitIntervalUSec"))
+ return bus_set_transient_usec(u, name, &c->log_ratelimit_interval_usec, message, flags, error);
+
+ if (streq(name, "LogRateLimitBurst"))
+ return bus_set_transient_unsigned(u, name, &c->log_ratelimit_burst, message, flags, error);
+
+ if (streq(name, "LogFilterPatterns")) {
+ /* Use _cleanup_free_, not _cleanup_strv_free_, as we don't want the content of the strv
+ * to be freed. */
+ _cleanup_free_ char **allow_list = NULL, **deny_list = NULL;
+ const char *pattern;
+ int is_allowlist;
+
+ r = sd_bus_message_enter_container(message, 'a', "(bs)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(bs)", &is_allowlist, &pattern)) > 0) {
+ _cleanup_(pattern_freep) pcre2_code *compiled_pattern = NULL;
+
+ if (isempty(pattern))
+ continue;
+
+ r = pattern_compile_and_log(pattern, 0, &compiled_pattern);
+ if (r < 0)
+ return r;
+
+ r = strv_push(is_allowlist ? &allow_list : &deny_list, (char *)pattern);
+ if (r < 0)
+ return r;
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (strv_isempty(allow_list) && strv_isempty(deny_list)) {
+ c->log_filter_allowed_patterns = set_free_free(c->log_filter_allowed_patterns);
+ c->log_filter_denied_patterns = set_free_free(c->log_filter_denied_patterns);
+ unit_write_settingf(u, flags, name, "%s=", name);
+ } else {
+ r = set_put_strdupv(&c->log_filter_allowed_patterns, allow_list);
+ if (r < 0)
+ return r;
+ r = set_put_strdupv(&c->log_filter_denied_patterns, deny_list);
+ if (r < 0)
+ return r;
+
+ STRV_FOREACH(unit_pattern, allow_list)
+ unit_write_settingf(u, flags, name, "%s=%s", name, *unit_pattern);
+ STRV_FOREACH(unit_pattern, deny_list)
+ unit_write_settingf(u, flags, name, "%s=~%s", name, *unit_pattern);
+ }
+ }
+
+ return 1;
+ }
+
+ if (streq(name, "Personality"))
+ return bus_set_transient_personality(u, name, &c->personality, message, flags, error);
+
+ if (streq(name, "StandardInput"))
+ return bus_set_transient_std_input(u, name, &c->std_input, message, flags, error);
+
+ if (streq(name, "StandardOutput"))
+ return bus_set_transient_std_output(u, name, &c->std_output, message, flags, error);
+
+ if (streq(name, "StandardError"))
+ return bus_set_transient_std_output(u, name, &c->std_error, message, flags, error);
+
+ if (streq(name, "IgnoreSIGPIPE"))
+ return bus_set_transient_bool(u, name, &c->ignore_sigpipe, message, flags, error);
+
+ if (streq(name, "TTYVHangup"))
+ return bus_set_transient_bool(u, name, &c->tty_vhangup, message, flags, error);
+
+ if (streq(name, "TTYReset"))
+ return bus_set_transient_bool(u, name, &c->tty_reset, message, flags, error);
+
+ if (streq(name, "TTYVTDisallocate"))
+ return bus_set_transient_bool(u, name, &c->tty_vt_disallocate, message, flags, error);
+
+ if (streq(name, "TTYRows"))
+ return bus_set_transient_unsigned(u, name, &c->tty_rows, message, flags, error);
+
+ if (streq(name, "TTYColumns"))
+ return bus_set_transient_unsigned(u, name, &c->tty_cols, message, flags, error);
+
+ if (streq(name, "PrivateTmp"))
+ return bus_set_transient_bool(u, name, &c->private_tmp, message, flags, error);
+
+ if (streq(name, "PrivateDevices"))
+ return bus_set_transient_bool(u, name, &c->private_devices, message, flags, error);
+
+ if (streq(name, "PrivateMounts"))
+ return bus_set_transient_tristate(u, name, &c->private_mounts, message, flags, error);
+
+ if (streq(name, "PrivateNetwork"))
+ return bus_set_transient_bool(u, name, &c->private_network, message, flags, error);
+
+ if (streq(name, "PrivateIPC"))
+ return bus_set_transient_bool(u, name, &c->private_ipc, message, flags, error);
+
+ if (streq(name, "PrivateUsers"))
+ return bus_set_transient_bool(u, name, &c->private_users, message, flags, error);
+
+ if (streq(name, "NoNewPrivileges"))
+ return bus_set_transient_bool(u, name, &c->no_new_privileges, message, flags, error);
+
+ if (streq(name, "SyslogLevelPrefix"))
+ return bus_set_transient_bool(u, name, &c->syslog_level_prefix, message, flags, error);
+
+ if (streq(name, "MemoryDenyWriteExecute"))
+ return bus_set_transient_bool(u, name, &c->memory_deny_write_execute, message, flags, error);
+
+ if (streq(name, "RestrictRealtime"))
+ return bus_set_transient_bool(u, name, &c->restrict_realtime, message, flags, error);
+
+ if (streq(name, "RestrictSUIDSGID"))
+ return bus_set_transient_bool(u, name, &c->restrict_suid_sgid, message, flags, error);
+
+ if (streq(name, "DynamicUser"))
+ return bus_set_transient_bool(u, name, &c->dynamic_user, message, flags, error);
+
+ if (streq(name, "RemoveIPC"))
+ return bus_set_transient_bool(u, name, &c->remove_ipc, message, flags, error);
+
+ if (streq(name, "ProtectKernelTunables"))
+ return bus_set_transient_bool(u, name, &c->protect_kernel_tunables, message, flags, error);
+
+ if (streq(name, "ProtectKernelModules"))
+ return bus_set_transient_bool(u, name, &c->protect_kernel_modules, message, flags, error);
+
+ if (streq(name, "ProtectKernelLogs"))
+ return bus_set_transient_bool(u, name, &c->protect_kernel_logs, message, flags, error);
+
+ if (streq(name, "ProtectClock"))
+ return bus_set_transient_bool(u, name, &c->protect_clock, message, flags, error);
+
+ if (streq(name, "ProtectControlGroups"))
+ return bus_set_transient_bool(u, name, &c->protect_control_groups, message, flags, error);
+
+ if (streq(name, "CPUSchedulingResetOnFork"))
+ return bus_set_transient_bool(u, name, &c->cpu_sched_reset_on_fork, message, flags, error);
+
+ if (streq(name, "NonBlocking"))
+ return bus_set_transient_bool(u, name, &c->non_blocking, message, flags, error);
+
+ if (streq(name, "LockPersonality"))
+ return bus_set_transient_bool(u, name, &c->lock_personality, message, flags, error);
+
+ if (streq(name, "ProtectHostname"))
+ return bus_set_transient_bool(u, name, &c->protect_hostname, message, flags, error);
+
+ if (streq(name, "MemoryKSM"))
+ return bus_set_transient_tristate(u, name, &c->memory_ksm, message, flags, error);
+
+ if (streq(name, "UtmpIdentifier"))
+ return bus_set_transient_string(u, name, &c->utmp_id, message, flags, error);
+
+ if (streq(name, "UtmpMode"))
+ return bus_set_transient_utmp_mode(u, name, &c->utmp_mode, message, flags, error);
+
+ if (streq(name, "PAMName"))
+ return bus_set_transient_string(u, name, &c->pam_name, message, flags, error);
+
+ if (streq(name, "TimerSlackNSec"))
+ return bus_set_transient_nsec(u, name, &c->timer_slack_nsec, message, flags, error);
+
+ if (streq(name, "ProtectSystem"))
+ return bus_set_transient_protect_system(u, name, &c->protect_system, message, flags, error);
+
+ if (streq(name, "ProtectHome"))
+ return bus_set_transient_protect_home(u, name, &c->protect_home, message, flags, error);
+
+ if (streq(name, "KeyringMode"))
+ return bus_set_transient_keyring_mode(u, name, &c->keyring_mode, message, flags, error);
+
+ if (streq(name, "ProtectProc"))
+ return bus_set_transient_protect_proc(u, name, &c->protect_proc, message, flags, error);
+
+ if (streq(name, "ProcSubset"))
+ return bus_set_transient_proc_subset(u, name, &c->proc_subset, message, flags, error);
+
+ if (streq(name, "RuntimeDirectoryPreserve"))
+ return bus_set_transient_exec_preserve_mode(u, name, &c->runtime_directory_preserve_mode, message, flags, error);
+
+ if (streq(name, "UMask"))
+ return bus_set_transient_mode_t(u, name, &c->umask, message, flags, error);
+
+ if (streq(name, "RuntimeDirectoryMode"))
+ return bus_set_transient_mode_t(u, name, &c->directories[EXEC_DIRECTORY_RUNTIME].mode, message, flags, error);
+
+ if (streq(name, "StateDirectoryMode"))
+ return bus_set_transient_mode_t(u, name, &c->directories[EXEC_DIRECTORY_STATE].mode, message, flags, error);
+
+ if (streq(name, "CacheDirectoryMode"))
+ return bus_set_transient_mode_t(u, name, &c->directories[EXEC_DIRECTORY_CACHE].mode, message, flags, error);
+
+ if (streq(name, "LogsDirectoryMode"))
+ return bus_set_transient_mode_t(u, name, &c->directories[EXEC_DIRECTORY_LOGS].mode, message, flags, error);
+
+ if (streq(name, "ConfigurationDirectoryMode"))
+ return bus_set_transient_mode_t(u, name, &c->directories[EXEC_DIRECTORY_CONFIGURATION].mode, message, flags, error);
+
+ if (streq(name, "SELinuxContext"))
+ return bus_set_transient_string(u, name, &c->selinux_context, message, flags, error);
+
+ if (streq(name, "SecureBits"))
+ return bus_set_transient_secure_bits(u, name, &c->secure_bits, message, flags, error);
+
+ if (streq(name, "CapabilityBoundingSet"))
+ return bus_set_transient_capability(u, name, &c->capability_bounding_set, message, flags, error);
+
+ if (streq(name, "AmbientCapabilities"))
+ return bus_set_transient_capability(u, name, &c->capability_ambient_set, message, flags, error);
+
+ if (streq(name, "RestrictNamespaces"))
+ return bus_set_transient_namespace_flag(u, name, &c->restrict_namespaces, message, flags, error);
+
+ if (streq(name, "RestrictFileSystems")) {
+ int allow_list;
+ _cleanup_strv_free_ char **l = NULL;
+
+ r = sd_bus_message_enter_container(message, 'r', "bas");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "b", &allow_list);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *joined = NULL;
+ FilesystemParseFlags invert_flag = allow_list ? 0 : FILESYSTEM_PARSE_INVERT;
+
+ if (strv_isempty(l)) {
+ c->restrict_filesystems_allow_list = false;
+ c->restrict_filesystems = set_free_free(c->restrict_filesystems);
+
+ unit_write_setting(u, flags, name, "RestrictFileSystems=");
+ return 1;
+ }
+
+ if (!c->restrict_filesystems)
+ c->restrict_filesystems_allow_list = allow_list;
+
+ STRV_FOREACH(s, l) {
+ r = lsm_bpf_parse_filesystem(
+ *s,
+ &c->restrict_filesystems,
+ FILESYSTEM_PARSE_LOG|
+ (invert_flag ? FILESYSTEM_PARSE_INVERT : 0)|
+ (c->restrict_filesystems_allow_list ? FILESYSTEM_PARSE_ALLOW_LIST : 0),
+ u->id, NULL, 0);
+ if (r < 0)
+ return r;
+ }
+
+ joined = strv_join(l, " ");
+ if (!joined)
+ return -ENOMEM;
+
+ unit_write_settingf(u, flags, name, "%s=%s%s", name, allow_list ? "" : "~", joined);
+ }
+
+ return 1;
+ }
+
+ if (streq(name, "MountFlags"))
+ return bus_set_transient_mount_propagation_flag(u, name, &c->mount_propagation_flag, message, flags, error);
+
+ if (streq(name, "NetworkNamespacePath"))
+ return bus_set_transient_path(u, name, &c->network_namespace_path, message, flags, error);
+
+ if (streq(name, "IPCNamespacePath"))
+ return bus_set_transient_path(u, name, &c->ipc_namespace_path, message, flags, error);
+
+ if (streq(name, "SupplementaryGroups")) {
+ _cleanup_strv_free_ char **l = NULL;
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ STRV_FOREACH(p, l)
+ if (!isempty(*p) && !valid_user_group_name(*p, VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX|VALID_USER_WARN))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Invalid supplementary group names");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (strv_isempty(l)) {
+ c->supplementary_groups = strv_free(c->supplementary_groups);
+ unit_write_settingf(u, flags, name, "%s=", name);
+ } else {
+ _cleanup_free_ char *joined = NULL;
+
+ r = strv_extend_strv(&c->supplementary_groups, l, true);
+ if (r < 0)
+ return -ENOMEM;
+
+ joined = strv_join(c->supplementary_groups, " ");
+ if (!joined)
+ return -ENOMEM;
+
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s", name, joined);
+ }
+ }
+
+ return 1;
+
+ } else if (STR_IN_SET(name, "SetCredential", "SetCredentialEncrypted")) {
+ bool isempty = true;
+
+ r = sd_bus_message_enter_container(message, 'a', "(say)");
+ if (r < 0)
+ return r;
+
+ for (;;) {
+ const char *id;
+ const void *p;
+ size_t sz;
+
+ r = sd_bus_message_enter_container(message, 'r', "say");
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ r = sd_bus_message_read(message, "s", &id);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read_array(message, 'y', &p, &sz);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!credential_name_valid(id))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Credential ID is invalid: %s", id);
+
+ isempty = false;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *a = NULL, *b = NULL;
+ _cleanup_free_ void *copy = NULL;
+ ExecSetCredential *old;
+
+ copy = memdup(p, sz);
+ if (!copy)
+ return -ENOMEM;
+
+ old = hashmap_get(c->set_credentials, id);
+ if (old) {
+ free_and_replace(old->data, copy);
+ old->size = sz;
+ old->encrypted = streq(name, "SetCredentialEncrypted");
+ } else {
+ _cleanup_(exec_set_credential_freep) ExecSetCredential *sc = NULL;
+
+ sc = new(ExecSetCredential, 1);
+ if (!sc)
+ return -ENOMEM;
+
+ *sc = (ExecSetCredential) {
+ .id = strdup(id),
+ .data = TAKE_PTR(copy),
+ .size = sz,
+ .encrypted = streq(name, "SetCredentialEncrypted"),
+ };
+
+ if (!sc->id)
+ return -ENOMEM;
+
+ r = hashmap_ensure_put(&c->set_credentials, &exec_set_credential_hash_ops, sc->id, sc);
+ if (r < 0)
+ return r;
+
+ TAKE_PTR(sc);
+ }
+
+ a = specifier_escape(id);
+ if (!a)
+ return -ENOMEM;
+
+ b = cescape_length(p, sz);
+ if (!b)
+ return -ENOMEM;
+
+ (void) unit_write_settingf(u, flags, name, "%s=%s:%s", name, a, b);
+ }
+ }
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags) && isempty) {
+ c->set_credentials = hashmap_free(c->set_credentials);
+ (void) unit_write_settingf(u, flags, name, "%s=", name);
+ }
+
+ return 1;
+
+ } else if (STR_IN_SET(name, "LoadCredential", "LoadCredentialEncrypted")) {
+ bool isempty = true;
+
+ r = sd_bus_message_enter_container(message, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ for (;;) {
+ const char *id, *source;
+
+ r = sd_bus_message_read(message, "(ss)", &id, &source);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ if (!credential_name_valid(id))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Credential ID is invalid: %s", id);
+
+ if (!(path_is_absolute(source) ? path_is_normalized(source) : credential_name_valid(source)))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Credential source is invalid: %s", source);
+
+ isempty = false;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ bool encrypted = streq(name, "LoadCredentialEncrypted");
+
+ r = hashmap_put_credential(&c->load_credentials, id, source, encrypted);
+ if (r < 0)
+ return r;
+
+ (void) unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s:%s", name, id, source);
+ }
+ }
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags) && isempty) {
+ c->load_credentials = hashmap_free(c->load_credentials);
+ (void) unit_write_settingf(u, flags, name, "%s=", name);
+ }
+
+ return 1;
+
+ } else if (streq(name, "ImportCredential")) {
+ bool isempty = true;
+
+ r = sd_bus_message_enter_container(message, 'a', "s");
+ if (r < 0)
+ return r;
+
+ for (;;) {
+ const char *s;
+
+ r = sd_bus_message_read(message, "s", &s);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ if (!credential_glob_valid(s))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Credential name or glob is invalid: %s", s);
+
+ isempty = false;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ r = set_put_strdup(&c->import_credentials, s);
+ if (r < 0)
+ return r;
+
+ (void) unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s", name, s);
+ }
+ }
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags) && isempty) {
+ c->import_credentials = set_free_free(c->import_credentials);
+ (void) unit_write_settingf(u, flags, name, "%s=", name);
+ }
+
+ return 1;
+
+ } else if (streq(name, "SyslogLevel")) {
+ int32_t level;
+
+ r = sd_bus_message_read(message, "i", &level);
+ if (r < 0)
+ return r;
+
+ if (!log_level_is_valid(level))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Log level value out of range");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->syslog_priority = (c->syslog_priority & LOG_FACMASK) | level;
+ unit_write_settingf(u, flags, name, "SyslogLevel=%i", level);
+ }
+
+ return 1;
+
+ } else if (streq(name, "SyslogFacility")) {
+ int32_t facility;
+
+ r = sd_bus_message_read(message, "i", &facility);
+ if (r < 0)
+ return r;
+
+ if (!log_facility_unshifted_is_valid(facility))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Log facility value out of range");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->syslog_priority = (facility << 3) | LOG_PRI(c->syslog_priority);
+ unit_write_settingf(u, flags, name, "SyslogFacility=%i", facility);
+ }
+
+ return 1;
+
+ } else if (streq(name, "LogNamespace")) {
+ const char *n;
+
+ r = sd_bus_message_read(message, "s", &n);
+ if (r < 0)
+ return r;
+
+ if (!isempty(n) && !log_namespace_name_valid(n))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Log namespace name not valid");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+
+ if (isempty(n)) {
+ c->log_namespace = mfree(c->log_namespace);
+ unit_write_settingf(u, flags, name, "%s=", name);
+ } else {
+ r = free_and_strdup(&c->log_namespace, n);
+ if (r < 0)
+ return r;
+
+ unit_write_settingf(u, flags, name, "%s=%s", name, n);
+ }
+ }
+
+ return 1;
+
+ } else if (streq(name, "LogExtraFields")) {
+ size_t n = 0;
+
+ r = sd_bus_message_enter_container(message, 'a', "ay");
+ if (r < 0)
+ return r;
+
+ for (;;) {
+ _cleanup_free_ void *copy = NULL;
+ struct iovec *t;
+ const char *eq;
+ const void *p;
+ size_t sz;
+
+ /* Note that we expect a byte array for each field, instead of a string. That's because on the
+ * lower-level journal fields can actually contain binary data and are not restricted to text,
+ * and we should not "lose precision" in our types on the way. That said, I am pretty sure
+ * actually encoding binary data as unit metadata is not a good idea. Hence we actually refuse
+ * any actual binary data, and only accept UTF-8. This allows us to eventually lift this
+ * limitation, should a good, valid use case arise. */
+
+ r = sd_bus_message_read_array(message, 'y', &p, &sz);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ if (memchr(p, 0, sz))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Journal field contains zero byte");
+
+ eq = memchr(p, '=', sz);
+ if (!eq)
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Journal field contains no '=' character");
+ if (!journal_field_valid(p, eq - (const char*) p, false))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Journal field invalid");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ t = reallocarray(c->log_extra_fields, c->n_log_extra_fields+1, sizeof(struct iovec));
+ if (!t)
+ return -ENOMEM;
+ c->log_extra_fields = t;
+ }
+
+ copy = malloc(sz + 1);
+ if (!copy)
+ return -ENOMEM;
+
+ memcpy(copy, p, sz);
+ ((uint8_t*) copy)[sz] = 0;
+
+ if (!utf8_is_valid(copy))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Journal field is not valid UTF-8");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->log_extra_fields[c->n_log_extra_fields++] = IOVEC_MAKE(copy, sz);
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS|UNIT_ESCAPE_C, name, "LogExtraFields=%s", (char*) copy);
+
+ copy = NULL;
+ }
+
+ n++;
+ }
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags) && n == 0) {
+ exec_context_free_log_extra_fields(c);
+ unit_write_setting(u, flags, name, "LogExtraFields=");
+ }
+
+ return 1;
+ }
+
+#if HAVE_SECCOMP
+
+ if (streq(name, "SystemCallErrorNumber"))
+ return bus_set_transient_errno(u, name, &c->syscall_errno, message, flags, error);
+
+ if (streq(name, "SystemCallFilter")) {
+ int allow_list;
+ _cleanup_strv_free_ char **l = NULL;
+
+ r = sd_bus_message_enter_container(message, 'r', "bas");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "b", &allow_list);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *joined = NULL;
+ SeccompParseFlags invert_flag = allow_list ? 0 : SECCOMP_PARSE_INVERT;
+
+ if (strv_isempty(l)) {
+ c->syscall_allow_list = false;
+ c->syscall_filter = hashmap_free(c->syscall_filter);
+
+ unit_write_settingf(u, flags, name, "SystemCallFilter=");
+ return 1;
+ }
+
+ if (!c->syscall_filter) {
+ c->syscall_filter = hashmap_new(NULL);
+ if (!c->syscall_filter)
+ return log_oom();
+
+ c->syscall_allow_list = allow_list;
+
+ if (c->syscall_allow_list) {
+ r = seccomp_parse_syscall_filter("@default",
+ -1,
+ c->syscall_filter,
+ SECCOMP_PARSE_PERMISSIVE |
+ SECCOMP_PARSE_ALLOW_LIST,
+ u->id,
+ NULL, 0);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ STRV_FOREACH(s, l) {
+ _cleanup_free_ char *n = NULL;
+ int e;
+
+ r = parse_syscall_and_errno(*s, &n, &e);
+ if (r < 0)
+ return r;
+
+ if (allow_list && e >= 0)
+ return -EINVAL;
+
+ r = seccomp_parse_syscall_filter(n,
+ e,
+ c->syscall_filter,
+ SECCOMP_PARSE_LOG | SECCOMP_PARSE_PERMISSIVE |
+ invert_flag |
+ (c->syscall_allow_list ? SECCOMP_PARSE_ALLOW_LIST : 0),
+ u->id,
+ NULL, 0);
+ if (r < 0)
+ return r;
+ }
+
+ joined = strv_join(l, " ");
+ if (!joined)
+ return -ENOMEM;
+
+ unit_write_settingf(u, flags, name, "SystemCallFilter=%s%s", allow_list ? "" : "~", joined);
+ }
+
+ return 1;
+
+ } else if (streq(name, "SystemCallLog")) {
+ int allow_list;
+ _cleanup_strv_free_ char **l = NULL;
+
+ r = sd_bus_message_enter_container(message, 'r', "bas");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "b", &allow_list);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *joined = NULL;
+ SeccompParseFlags invert_flag = allow_list ? 0 : SECCOMP_PARSE_INVERT;
+
+ if (strv_isempty(l)) {
+ c->syscall_log_allow_list = false;
+ c->syscall_log = hashmap_free(c->syscall_log);
+
+ unit_write_settingf(u, flags, name, "SystemCallLog=");
+ return 1;
+ }
+
+ if (!c->syscall_log) {
+ c->syscall_log = hashmap_new(NULL);
+ if (!c->syscall_log)
+ return log_oom();
+
+ c->syscall_log_allow_list = allow_list;
+ }
+
+ STRV_FOREACH(s, l) {
+ r = seccomp_parse_syscall_filter(*s,
+ -1, /* errno not used */
+ c->syscall_log,
+ SECCOMP_PARSE_LOG | SECCOMP_PARSE_PERMISSIVE |
+ invert_flag |
+ (c->syscall_log_allow_list ? SECCOMP_PARSE_ALLOW_LIST : 0),
+ u->id,
+ NULL, 0);
+ if (r < 0)
+ return r;
+ }
+
+ joined = strv_join(l, " ");
+ if (!joined)
+ return -ENOMEM;
+
+ unit_write_settingf(u, flags, name, "SystemCallLog=%s%s", allow_list ? "" : "~", joined);
+ }
+
+ return 1;
+
+ } else if (streq(name, "SystemCallArchitectures")) {
+ _cleanup_strv_free_ char **l = NULL;
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *joined = NULL;
+
+ if (strv_isempty(l))
+ c->syscall_archs = set_free(c->syscall_archs);
+ else
+ STRV_FOREACH(s, l) {
+ uint32_t a;
+
+ r = seccomp_arch_from_string(*s, &a);
+ if (r < 0)
+ return r;
+
+ r = set_ensure_put(&c->syscall_archs, NULL, UINT32_TO_PTR(a + 1));
+ if (r < 0)
+ return r;
+ }
+
+ joined = strv_join(l, " ");
+ if (!joined)
+ return -ENOMEM;
+
+ unit_write_settingf(u, flags, name, "%s=%s", name, joined);
+ }
+
+ return 1;
+
+ } else if (streq(name, "RestrictAddressFamilies")) {
+ _cleanup_strv_free_ char **l = NULL;
+ int allow_list;
+
+ r = sd_bus_message_enter_container(message, 'r', "bas");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "b", &allow_list);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *joined = NULL;
+
+ if (strv_isempty(l)) {
+ c->address_families_allow_list = allow_list;
+ c->address_families = set_free(c->address_families);
+
+ unit_write_settingf(u, flags, name, "RestrictAddressFamilies=%s",
+ allow_list ? "none" : "");
+ return 1;
+ }
+
+ if (!c->address_families) {
+ c->address_families = set_new(NULL);
+ if (!c->address_families)
+ return log_oom();
+
+ c->address_families_allow_list = allow_list;
+ }
+
+ STRV_FOREACH(s, l) {
+ int af;
+
+ af = af_from_name(*s);
+ if (af < 0)
+ return af;
+
+ if (allow_list == c->address_families_allow_list) {
+ r = set_put(c->address_families, INT_TO_PTR(af));
+ if (r < 0)
+ return r;
+ } else
+ set_remove(c->address_families, INT_TO_PTR(af));
+ }
+
+ joined = strv_join(l, " ");
+ if (!joined)
+ return -ENOMEM;
+
+ unit_write_settingf(u, flags, name, "RestrictAddressFamilies=%s%s", allow_list ? "" : "~", joined);
+ }
+
+ return 1;
+ }
+#endif
+ if (STR_IN_SET(name, "CPUAffinity", "NUMAMask")) {
+ const void *a;
+ size_t n;
+ bool affinity = streq(name, "CPUAffinity");
+ _cleanup_(cpu_set_reset) CPUSet set = {};
+
+ r = sd_bus_message_read_array(message, 'y', &a, &n);
+ if (r < 0)
+ return r;
+
+ r = cpu_set_from_dbus(a, n, &set);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (n == 0) {
+ cpu_set_reset(affinity ? &c->cpu_set : &c->numa_policy.nodes);
+ unit_write_settingf(u, flags, name, "%s=", name);
+ } else {
+ _cleanup_free_ char *str = NULL;
+
+ str = cpu_set_to_string(&set);
+ if (!str)
+ return -ENOMEM;
+
+ /* We forego any optimizations here, and always create the structure using
+ * cpu_set_add_all(), because we don't want to care if the existing size we
+ * got over dbus is appropriate. */
+ r = cpu_set_add_all(affinity ? &c->cpu_set : &c->numa_policy.nodes, &set);
+ if (r < 0)
+ return r;
+
+ unit_write_settingf(u, flags, name, "%s=%s", name, str);
+ }
+ }
+
+ return 1;
+
+ } else if (streq(name, "CPUAffinityFromNUMA")) {
+ int q;
+
+ r = sd_bus_message_read_basic(message, 'b', &q);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->cpu_affinity_from_numa = q;
+ unit_write_settingf(u, flags, name, "%s=%s", "CPUAffinity", "numa");
+ }
+
+ return 1;
+
+ } else if (streq(name, "NUMAPolicy")) {
+ int32_t type;
+
+ r = sd_bus_message_read(message, "i", &type);
+ if (r < 0)
+ return r;
+
+ if (!mpol_is_valid(type))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid NUMAPolicy value: %i", type);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags))
+ c->numa_policy.type = type;
+
+ return 1;
+
+ } else if (streq(name, "Nice")) {
+ int32_t q;
+
+ r = sd_bus_message_read(message, "i", &q);
+ if (r < 0)
+ return r;
+
+ if (!nice_is_valid(q))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid Nice value: %i", q);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->nice = q;
+ c->nice_set = true;
+
+ unit_write_settingf(u, flags, name, "Nice=%i", q);
+ }
+
+ return 1;
+
+ } else if (streq(name, "CPUSchedulingPolicy")) {
+ int32_t q;
+
+ r = sd_bus_message_read(message, "i", &q);
+ if (r < 0)
+ return r;
+
+ if (!sched_policy_is_valid(q))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid CPU scheduling policy: %i", q);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *s = NULL;
+
+ r = sched_policy_to_string_alloc(q, &s);
+ if (r < 0)
+ return r;
+
+ c->cpu_sched_policy = q;
+ c->cpu_sched_priority = CLAMP(c->cpu_sched_priority, sched_get_priority_min(q), sched_get_priority_max(q));
+ c->cpu_sched_set = true;
+
+ unit_write_settingf(u, flags, name, "CPUSchedulingPolicy=%s", s);
+ }
+
+ return 1;
+
+ } else if (streq(name, "CPUSchedulingPriority")) {
+ int32_t p;
+
+ r = sd_bus_message_read(message, "i", &p);
+ if (r < 0)
+ return r;
+
+ /* On Linux RR/FIFO range from 1 to 99 and OTHER/BATCH may only be 0. Policy might be set
+ * later so we do not check the precise range, but only the generic outer bounds. */
+ if (p < 0 || p > 99)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid CPU scheduling priority: %i", p);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->cpu_sched_priority = p;
+ c->cpu_sched_set = true;
+
+ unit_write_settingf(u, flags, name, "CPUSchedulingPriority=%i", p);
+ }
+
+ return 1;
+
+ } else if (streq(name, "IOSchedulingClass")) {
+ int32_t q;
+
+ r = sd_bus_message_read(message, "i", &q);
+ if (r < 0)
+ return r;
+
+ if (!ioprio_class_is_valid(q))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid IO scheduling class: %i", q);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *s = NULL;
+
+ r = ioprio_class_to_string_alloc(q, &s);
+ if (r < 0)
+ return r;
+
+ c->ioprio = ioprio_normalize(ioprio_prio_value(q, ioprio_prio_data(c->ioprio)));
+ c->ioprio_set = true;
+
+ unit_write_settingf(u, flags, name, "IOSchedulingClass=%s", s);
+ }
+
+ return 1;
+
+ } else if (streq(name, "IOSchedulingPriority")) {
+ int32_t p;
+
+ r = sd_bus_message_read(message, "i", &p);
+ if (r < 0)
+ return r;
+
+ if (!ioprio_priority_is_valid(p))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid IO scheduling priority: %i", p);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->ioprio = ioprio_normalize(ioprio_prio_value(ioprio_prio_class(c->ioprio), p));
+ c->ioprio_set = true;
+
+ unit_write_settingf(u, flags, name, "IOSchedulingPriority=%i", p);
+ }
+
+ return 1;
+
+ } else if (streq(name, "MountAPIVFS")) {
+ bool b;
+
+ r = bus_set_transient_bool(u, name, &b, message, flags, error);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->mount_apivfs = b;
+ c->mount_apivfs_set = true;
+ }
+
+ return 1;
+
+ } else if (streq(name, "WorkingDirectory")) {
+ const char *s;
+ bool missing_ok;
+
+ r = sd_bus_message_read(message, "s", &s);
+ if (r < 0)
+ return r;
+
+ if (s[0] == '-') {
+ missing_ok = true;
+ s++;
+ } else
+ missing_ok = false;
+
+ if (!isempty(s) && !streq(s, "~") && !path_is_absolute(s))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "WorkingDirectory= expects an absolute path or '~'");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (streq(s, "~")) {
+ c->working_directory = mfree(c->working_directory);
+ c->working_directory_home = true;
+ } else {
+ r = free_and_strdup(&c->working_directory, empty_to_null(s));
+ if (r < 0)
+ return r;
+
+ c->working_directory_home = false;
+ }
+
+ c->working_directory_missing_ok = missing_ok;
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "WorkingDirectory=%s%s", missing_ok ? "-" : "", s);
+ }
+
+ return 1;
+
+ } else if (STR_IN_SET(name,
+ "StandardInputFileDescriptorName", "StandardOutputFileDescriptorName", "StandardErrorFileDescriptorName")) {
+ const char *s;
+
+ r = sd_bus_message_read(message, "s", &s);
+ if (r < 0)
+ return r;
+
+ if (!isempty(s) && !fdname_is_valid(s))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid file descriptor name");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+
+ if (streq(name, "StandardInputFileDescriptorName")) {
+ r = free_and_strdup(c->stdio_fdname + STDIN_FILENO, empty_to_null(s));
+ if (r < 0)
+ return r;
+
+ c->std_input = EXEC_INPUT_NAMED_FD;
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardInput=fd:%s", exec_context_fdname(c, STDIN_FILENO));
+
+ } else if (streq(name, "StandardOutputFileDescriptorName")) {
+ r = free_and_strdup(c->stdio_fdname + STDOUT_FILENO, empty_to_null(s));
+ if (r < 0)
+ return r;
+
+ c->std_output = EXEC_OUTPUT_NAMED_FD;
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardOutput=fd:%s", exec_context_fdname(c, STDOUT_FILENO));
+
+ } else {
+ assert(streq(name, "StandardErrorFileDescriptorName"));
+
+ r = free_and_strdup(&c->stdio_fdname[STDERR_FILENO], empty_to_null(s));
+ if (r < 0)
+ return r;
+
+ c->std_error = EXEC_OUTPUT_NAMED_FD;
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardError=fd:%s", exec_context_fdname(c, STDERR_FILENO));
+ }
+ }
+
+ return 1;
+
+ } else if (STR_IN_SET(name,
+ "StandardInputFile",
+ "StandardOutputFile", "StandardOutputFileToAppend", "StandardOutputFileToTruncate",
+ "StandardErrorFile", "StandardErrorFileToAppend", "StandardErrorFileToTruncate")) {
+ const char *s;
+
+ r = sd_bus_message_read(message, "s", &s);
+ if (r < 0)
+ return r;
+
+ if (!isempty(s)) {
+ if (!path_is_absolute(s))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path %s is not absolute", s);
+ if (!path_is_normalized(s))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path %s is not normalized", s);
+ }
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+
+ if (streq(name, "StandardInputFile")) {
+ r = free_and_strdup(&c->stdio_file[STDIN_FILENO], empty_to_null(s));
+ if (r < 0)
+ return r;
+
+ c->std_input = EXEC_INPUT_FILE;
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardInput=file:%s", s);
+
+ } else if (STR_IN_SET(name, "StandardOutputFile", "StandardOutputFileToAppend", "StandardOutputFileToTruncate")) {
+ r = free_and_strdup(&c->stdio_file[STDOUT_FILENO], empty_to_null(s));
+ if (r < 0)
+ return r;
+
+ if (streq(name, "StandardOutputFile")) {
+ c->std_output = EXEC_OUTPUT_FILE;
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardOutput=file:%s", s);
+ } else if (streq(name, "StandardOutputFileToAppend")) {
+ c->std_output = EXEC_OUTPUT_FILE_APPEND;
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardOutput=append:%s", s);
+ } else {
+ assert(streq(name, "StandardOutputFileToTruncate"));
+ c->std_output = EXEC_OUTPUT_FILE_TRUNCATE;
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardOutput=truncate:%s", s);
+ }
+ } else {
+ assert(STR_IN_SET(name, "StandardErrorFile", "StandardErrorFileToAppend", "StandardErrorFileToTruncate"));
+
+ r = free_and_strdup(&c->stdio_file[STDERR_FILENO], empty_to_null(s));
+ if (r < 0)
+ return r;
+
+ if (streq(name, "StandardErrorFile")) {
+ c->std_error = EXEC_OUTPUT_FILE;
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardError=file:%s", s);
+ } else if (streq(name, "StandardErrorFileToAppend")) {
+ c->std_error = EXEC_OUTPUT_FILE_APPEND;
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardError=append:%s", s);
+ } else {
+ assert(streq(name, "StandardErrorFileToTruncate"));
+ c->std_error = EXEC_OUTPUT_FILE_TRUNCATE;
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "StandardError=truncate:%s", s);
+ }
+ }
+ }
+
+ return 1;
+
+ } else if (streq(name, "StandardInputData")) {
+ const void *p;
+ size_t sz;
+
+ r = sd_bus_message_read_array(message, 'y', &p, &sz);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *encoded = NULL;
+
+ if (sz == 0) {
+ c->stdin_data = mfree(c->stdin_data);
+ c->stdin_data_size = 0;
+
+ unit_write_settingf(u, flags, name, "StandardInputData=");
+ } else {
+ void *q;
+ ssize_t n;
+
+ if (c->stdin_data_size + sz < c->stdin_data_size || /* check for overflow */
+ c->stdin_data_size + sz > EXEC_STDIN_DATA_MAX)
+ return -E2BIG;
+
+ n = base64mem(p, sz, &encoded);
+ if (n < 0)
+ return (int) n;
+
+ q = realloc(c->stdin_data, c->stdin_data_size + sz);
+ if (!q)
+ return -ENOMEM;
+
+ memcpy((uint8_t*) q + c->stdin_data_size, p, sz);
+
+ c->stdin_data = q;
+ c->stdin_data_size += sz;
+
+ unit_write_settingf(u, flags, name, "StandardInputData=%s", encoded);
+ }
+ }
+
+ return 1;
+
+ } else if (streq(name, "Environment")) {
+
+ _cleanup_strv_free_ char **l = NULL;
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ if (!strv_env_is_valid(l))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid environment block.");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (strv_isempty(l)) {
+ c->environment = strv_free(c->environment);
+ unit_write_setting(u, flags, name, "Environment=");
+ } else {
+ _cleanup_free_ char *joined = NULL;
+ char **e;
+
+ joined = unit_concat_strv(l, UNIT_ESCAPE_SPECIFIERS|UNIT_ESCAPE_C);
+ if (!joined)
+ return -ENOMEM;
+
+ e = strv_env_merge(c->environment, l);
+ if (!e)
+ return -ENOMEM;
+
+ strv_free_and_replace(c->environment, e);
+ unit_write_settingf(u, flags, name, "Environment=%s", joined);
+ }
+ }
+
+ return 1;
+
+ } else if (streq(name, "UnsetEnvironment")) {
+
+ _cleanup_strv_free_ char **l = NULL;
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ if (!strv_env_name_or_assignment_is_valid(l))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid UnsetEnvironment= list.");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (strv_isempty(l)) {
+ c->unset_environment = strv_free(c->unset_environment);
+ unit_write_setting(u, flags, name, "UnsetEnvironment=");
+ } else {
+ _cleanup_free_ char *joined = NULL;
+ char **e;
+
+ joined = unit_concat_strv(l, UNIT_ESCAPE_SPECIFIERS|UNIT_ESCAPE_C);
+ if (!joined)
+ return -ENOMEM;
+
+ e = strv_env_merge(c->unset_environment, l);
+ if (!e)
+ return -ENOMEM;
+
+ strv_free_and_replace(c->unset_environment, e);
+ unit_write_settingf(u, flags, name, "UnsetEnvironment=%s", joined);
+ }
+ }
+
+ return 1;
+
+ } else if (streq(name, "OOMScoreAdjust")) {
+ int oa;
+
+ r = sd_bus_message_read(message, "i", &oa);
+ if (r < 0)
+ return r;
+
+ if (!oom_score_adjust_is_valid(oa))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "OOM score adjust value out of range");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->oom_score_adjust = oa;
+ c->oom_score_adjust_set = true;
+ unit_write_settingf(u, flags, name, "OOMScoreAdjust=%i", oa);
+ }
+
+ return 1;
+
+ } else if (streq(name, "CoredumpFilter")) {
+ uint64_t f;
+
+ r = sd_bus_message_read(message, "t", &f);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->coredump_filter = f;
+ c->coredump_filter_set = true;
+ unit_write_settingf(u, flags, name, "CoredumpFilter=0x%"PRIx64, f);
+ }
+
+ return 1;
+
+ } else if (streq(name, "EnvironmentFiles")) {
+ _cleanup_(memstream_done) MemStream m = {};
+ _cleanup_free_ char *joined = NULL;
+ _cleanup_strv_free_ char **l = NULL;
+ FILE *f;
+
+ r = sd_bus_message_enter_container(message, 'a', "(sb)");
+ if (r < 0)
+ return r;
+
+ f = memstream_init(&m);
+ if (!f)
+ return -ENOMEM;
+
+ fputs("EnvironmentFile=\n", f);
+
+ STRV_FOREACH(i, c->environment_files) {
+ _cleanup_free_ char *q = NULL;
+
+ q = specifier_escape(*i);
+ if (!q)
+ return -ENOMEM;
+
+ fprintf(f, "EnvironmentFile=%s\n", q);
+ }
+
+ while ((r = sd_bus_message_enter_container(message, 'r', "sb")) > 0) {
+ const char *path;
+ int b;
+
+ r = sd_bus_message_read(message, "sb", &path, &b);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!path_is_absolute(path))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path %s is not absolute.", path);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *q = NULL, *buf = NULL;
+
+ buf = strjoin(b ? "-" : "", path);
+ if (!buf)
+ return -ENOMEM;
+
+ q = specifier_escape(buf);
+ if (!q)
+ return -ENOMEM;
+
+ fprintf(f, "EnvironmentFile=%s\n", q);
+
+ r = strv_consume(&l, TAKE_PTR(buf));
+ if (r < 0)
+ return r;
+ }
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ r = memstream_finalize(&m, &joined, NULL);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (strv_isempty(l)) {
+ c->environment_files = strv_free(c->environment_files);
+ unit_write_setting(u, flags, name, "EnvironmentFile=");
+ } else {
+ r = strv_extend_strv(&c->environment_files, l, true);
+ if (r < 0)
+ return r;
+
+ unit_write_setting(u, flags, name, joined);
+ }
+ }
+
+ return 1;
+
+ } else if (streq(name, "PassEnvironment")) {
+
+ _cleanup_strv_free_ char **l = NULL;
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ if (!strv_env_name_is_valid(l))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid PassEnvironment= block.");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (strv_isempty(l)) {
+ c->pass_environment = strv_free(c->pass_environment);
+ unit_write_setting(u, flags, name, "PassEnvironment=");
+ } else {
+ _cleanup_free_ char *joined = NULL;
+
+ r = strv_extend_strv(&c->pass_environment, l, true);
+ if (r < 0)
+ return r;
+
+ /* We write just the new settings out to file, with unresolved specifiers. */
+ joined = unit_concat_strv(l, UNIT_ESCAPE_SPECIFIERS);
+ if (!joined)
+ return -ENOMEM;
+
+ unit_write_settingf(u, flags, name, "PassEnvironment=%s", joined);
+ }
+ }
+
+ return 1;
+
+ } else if (STR_IN_SET(name, "ReadWriteDirectories", "ReadOnlyDirectories", "InaccessibleDirectories",
+ "ReadWritePaths", "ReadOnlyPaths", "InaccessiblePaths", "ExecPaths", "NoExecPaths",
+ "ExtensionDirectories")) {
+ _cleanup_strv_free_ char **l = NULL;
+ char ***dirs;
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ STRV_FOREACH(p, l) {
+ char *i = *p;
+ size_t offset;
+
+ offset = i[0] == '-';
+ offset += i[offset] == '+';
+ if (!path_is_absolute(i + offset))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid %s", name);
+
+ path_simplify(i + offset);
+ }
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (STR_IN_SET(name, "ReadWriteDirectories", "ReadWritePaths"))
+ dirs = &c->read_write_paths;
+ else if (STR_IN_SET(name, "ReadOnlyDirectories", "ReadOnlyPaths"))
+ dirs = &c->read_only_paths;
+ else if (streq(name, "ExecPaths"))
+ dirs = &c->exec_paths;
+ else if (streq(name, "NoExecPaths"))
+ dirs = &c->no_exec_paths;
+ else if (streq(name, "ExtensionDirectories"))
+ dirs = &c->extension_directories;
+ else /* "InaccessiblePaths" */
+ dirs = &c->inaccessible_paths;
+
+ if (strv_isempty(l)) {
+ *dirs = strv_free(*dirs);
+ unit_write_settingf(u, flags, name, "%s=", name);
+ } else {
+ _cleanup_free_ char *joined = NULL;
+
+ joined = unit_concat_strv(l, UNIT_ESCAPE_SPECIFIERS);
+ if (!joined)
+ return -ENOMEM;
+
+ r = strv_extend_strv(dirs, l, true);
+ if (r < 0)
+                                        return r;
+
+ unit_write_settingf(u, flags, name, "%s=%s", name, joined);
+ }
+ }
+
+ return 1;
+
+ } else if (streq(name, "ExecSearchPath")) {
+ _cleanup_strv_free_ char **l = NULL;
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ STRV_FOREACH(p, l)
+ if (!path_is_absolute(*p) || !path_is_normalized(*p) || strchr(*p, ':'))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid %s", name);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (strv_isempty(l)) {
+ c->exec_search_path = strv_free(c->exec_search_path);
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "ExecSearchPath=");
+ } else {
+ _cleanup_free_ char *joined = NULL;
+ r = strv_extend_strv(&c->exec_search_path, l, true);
+ if (r < 0)
+                                        return r;
+ joined = strv_join(c->exec_search_path, ":");
+ if (!joined)
+                                        return -ENOMEM;
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "ExecSearchPath=%s", joined);
+ }
+ }
+
+ return 1;
+
+ } else if (STR_IN_SET(name, "RuntimeDirectory", "StateDirectory", "CacheDirectory", "LogsDirectory", "ConfigurationDirectory")) {
+ _cleanup_strv_free_ char **l = NULL;
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ STRV_FOREACH(p, l) {
+ if (!path_is_normalized(*p))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= path is not normalized: %s", name, *p);
+
+ if (path_is_absolute(*p))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= path is absolute: %s", name, *p);
+
+ if (path_startswith(*p, "private"))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= path can't be 'private': %s", name, *p);
+ }
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ ExecDirectoryType i;
+ ExecDirectory *d;
+
+ assert_se((i = exec_directory_type_from_string(name)) >= 0);
+ d = c->directories + i;
+
+ if (strv_isempty(l)) {
+ exec_directory_done(d);
+ unit_write_settingf(u, flags, name, "%s=", name);
+ } else {
+ _cleanup_free_ char *joined = NULL;
+
+ STRV_FOREACH(source, l) {
+ r = exec_directory_add(d, *source, NULL);
+ if (r < 0)
+                                                return r;
+ }
+ exec_directory_sort(d);
+
+ joined = unit_concat_strv(l, UNIT_ESCAPE_SPECIFIERS);
+ if (!joined)
+ return -ENOMEM;
+
+ unit_write_settingf(u, flags, name, "%s=%s", name, joined);
+ }
+ }
+
+ return 1;
+
+ } else if (STR_IN_SET(name, "AppArmorProfile", "SmackProcessLabel")) {
+ int ignore;
+ const char *s;
+
+ r = sd_bus_message_read(message, "(bs)", &ignore, &s);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ char **p;
+ bool *b;
+
+ if (streq(name, "AppArmorProfile")) {
+ p = &c->apparmor_profile;
+ b = &c->apparmor_profile_ignore;
+ } else { /* "SmackProcessLabel" */
+ p = &c->smack_process_label;
+ b = &c->smack_process_label_ignore;
+ }
+
+ if (isempty(s)) {
+ *p = mfree(*p);
+ *b = false;
+ } else {
+ if (free_and_strdup(p, s) < 0)
+ return -ENOMEM;
+ *b = ignore;
+ }
+
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s%s", name, ignore ? "-" : "", strempty(s));
+ }
+
+ return 1;
+
+ } else if (STR_IN_SET(name, "BindPaths", "BindReadOnlyPaths")) {
+ char *source, *destination;
+ int ignore_enoent;
+ uint64_t mount_flags;
+ bool empty = true;
+
+ r = sd_bus_message_enter_container(message, 'a', "(ssbt)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(ssbt)", &source, &destination, &ignore_enoent, &mount_flags)) > 0) {
+
+ if (!path_is_absolute(source))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not absolute.", source);
+ if (!path_is_absolute(destination))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path %s is not absolute.", destination);
+ if (!IN_SET(mount_flags, 0, MS_REC))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown mount flags.");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ r = bind_mount_add(&c->bind_mounts, &c->n_bind_mounts,
+ &(BindMount) {
+ .source = source,
+ .destination = destination,
+ .read_only = !!strstr(name, "ReadOnly"),
+ .recursive = !!(mount_flags & MS_REC),
+ .ignore_enoent = ignore_enoent,
+ });
+ if (r < 0)
+ return r;
+
+ unit_write_settingf(
+ u, flags|UNIT_ESCAPE_SPECIFIERS, name,
+ "%s=%s%s:%s:%s",
+ name,
+ ignore_enoent ? "-" : "",
+ source,
+ destination,
+ (mount_flags & MS_REC) ? "rbind" : "norbind");
+ }
+
+ empty = false;
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (empty) {
+ bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
+ c->bind_mounts = NULL;
+ c->n_bind_mounts = 0;
+
+ unit_write_settingf(u, flags, name, "%s=", name);
+ }
+
+ return 1;
+
+ } else if (streq(name, "TemporaryFileSystem")) {
+ const char *path, *options;
+ bool empty = true;
+
+ r = sd_bus_message_enter_container(message, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(ss)", &path, &options)) > 0) {
+
+ if (!path_is_absolute(path))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Mount point %s is not absolute.", path);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ r = temporary_filesystem_add(&c->temporary_filesystems, &c->n_temporary_filesystems, path, options);
+ if (r < 0)
+ return r;
+
+ unit_write_settingf(
+ u, flags|UNIT_ESCAPE_SPECIFIERS, name,
+ "%s=%s:%s",
+ name,
+ path,
+ options);
+ }
+
+ empty = false;
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (empty) {
+ temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
+ c->temporary_filesystems = NULL;
+ c->n_temporary_filesystems = 0;
+
+ unit_write_settingf(u, flags, name, "%s=", name);
+ }
+
+ return 1;
+
+ } else if ((suffix = startswith(name, "Limit"))) {
+ const char *soft = NULL;
+ int ri;
+
+ ri = rlimit_from_string(suffix);
+ if (ri < 0) {
+ soft = endswith(suffix, "Soft");
+ if (soft) {
+ const char *n;
+
+ n = strndupa_safe(suffix, soft - suffix);
+ ri = rlimit_from_string(n);
+ if (ri >= 0)
+ name = strjoina("Limit", n);
+ }
+ }
+
+ if (ri >= 0) {
+ uint64_t rl;
+ rlim_t x;
+
+ r = sd_bus_message_read(message, "t", &rl);
+ if (r < 0)
+ return r;
+
+ if (rl == UINT64_MAX)
+ x = RLIM_INFINITY;
+ else {
+ x = (rlim_t) rl;
+
+ if ((uint64_t) x != rl)
+ return -ERANGE;
+ }
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *f = NULL;
+ struct rlimit nl;
+
+ if (c->rlimit[ri]) {
+ nl = *c->rlimit[ri];
+
+ if (soft)
+ nl.rlim_cur = x;
+ else
+ nl.rlim_max = x;
+ } else
+ /* When the resource limit is not initialized yet, then assign the value to both fields */
+ nl = (struct rlimit) {
+ .rlim_cur = x,
+ .rlim_max = x,
+ };
+
+ r = rlimit_format(&nl, &f);
+ if (r < 0)
+ return r;
+
+ if (c->rlimit[ri])
+ *c->rlimit[ri] = nl;
+ else {
+ c->rlimit[ri] = newdup(struct rlimit, &nl, 1);
+ if (!c->rlimit[ri])
+ return -ENOMEM;
+ }
+
+ unit_write_settingf(u, flags, name, "%s=%s", name, f);
+ }
+
+ return 1;
+ }
+
+ } else if (streq(name, "MountImages")) {
+ _cleanup_free_ char *format_str = NULL;
+ MountImage *mount_images = NULL;
+ size_t n_mount_images = 0;
+ char *source, *destination;
+ int permissive;
+
+ r = sd_bus_message_enter_container(message, 'a', "(ssba(ss))");
+ if (r < 0)
+ return r;
+
+ for (;;) {
+ _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
+ _cleanup_free_ char *source_escaped = NULL, *destination_escaped = NULL;
+ char *tuple;
+
+ r = sd_bus_message_enter_container(message, 'r', "ssba(ss)");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "ssb", &source, &destination, &permissive);
+ if (r <= 0)
+ break;
+
+ if (!path_is_absolute(source))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not absolute.", source);
+ if (!path_is_normalized(source))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not normalized.", source);
+ if (!path_is_absolute(destination))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path %s is not absolute.", destination);
+ if (!path_is_normalized(destination))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path %s is not normalized.", destination);
+
+ /* Need to store them in the unit with the escapes, so that they can be parsed again */
+ source_escaped = shell_escape(source, ":");
+ if (!source_escaped)
+ return -ENOMEM;
+ destination_escaped = shell_escape(destination, ":");
+ if (!destination_escaped)
+ return -ENOMEM;
+
+ tuple = strjoin(format_str,
+ format_str ? " " : "",
+ permissive ? "-" : "",
+ source_escaped,
+ ":",
+ destination_escaped);
+ if (!tuple)
+ return -ENOMEM;
+ free_and_replace(format_str, tuple);
+
+ r = bus_read_mount_options(message, error, &options, &format_str, ":");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ r = mount_image_add(&mount_images, &n_mount_images,
+ &(MountImage) {
+ .source = source,
+ .destination = destination,
+ .mount_options = options,
+ .ignore_enoent = permissive,
+ .type = MOUNT_IMAGE_DISCRETE,
+ });
+ if (r < 0)
+ return r;
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (n_mount_images == 0) {
+ c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
+
+ unit_write_settingf(u, flags, name, "%s=", name);
+ } else {
+ for (size_t i = 0; i < n_mount_images; ++i) {
+ r = mount_image_add(&c->mount_images, &c->n_mount_images, &mount_images[i]);
+ if (r < 0)
+ return r;
+ }
+
+ unit_write_settingf(u, flags|UNIT_ESCAPE_C|UNIT_ESCAPE_SPECIFIERS,
+ name,
+ "%s=%s",
+ name,
+ format_str);
+ }
+ }
+
+ mount_images = mount_image_free_many(mount_images, &n_mount_images);
+
+ return 1;
+ } else if (streq(name, "ExtensionImages")) {
+ _cleanup_free_ char *format_str = NULL;
+ MountImage *extension_images = NULL;
+ size_t n_extension_images = 0;
+
+ r = sd_bus_message_enter_container(message, 'a', "(sba(ss))");
+ if (r < 0)
+ return r;
+
+ for (;;) {
+ _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
+ _cleanup_free_ char *source_escaped = NULL;
+ char *source, *tuple;
+ int permissive;
+
+ r = sd_bus_message_enter_container(message, 'r', "sba(ss)");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "sb", &source, &permissive);
+ if (r <= 0)
+ break;
+
+ if (!path_is_absolute(source))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not absolute.", source);
+ if (!path_is_normalized(source))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not normalized.", source);
+
+ /* Need to store them in the unit with the escapes, so that they can be parsed again */
+ source_escaped = shell_escape(source, ":");
+ if (!source_escaped)
+ return -ENOMEM;
+
+ tuple = strjoin(format_str,
+ format_str ? " " : "",
+ permissive ? "-" : "",
+ source_escaped);
+ if (!tuple)
+ return -ENOMEM;
+ free_and_replace(format_str, tuple);
+
+ r = bus_read_mount_options(message, error, &options, &format_str, ":");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ r = mount_image_add(&extension_images, &n_extension_images,
+ &(MountImage) {
+ .source = source,
+ .mount_options = options,
+ .ignore_enoent = permissive,
+ .type = MOUNT_IMAGE_EXTENSION,
+ });
+ if (r < 0)
+ return r;
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (n_extension_images == 0) {
+ c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
+
+ unit_write_settingf(u, flags, name, "%s=", name);
+ } else {
+ for (size_t i = 0; i < n_extension_images; ++i) {
+ r = mount_image_add(&c->extension_images, &c->n_extension_images, &extension_images[i]);
+ if (r < 0)
+ return r;
+ }
+
+ unit_write_settingf(u, flags|UNIT_ESCAPE_C|UNIT_ESCAPE_SPECIFIERS,
+ name,
+ "%s=%s",
+ name,
+ format_str);
+ }
+ }
+
+ extension_images = mount_image_free_many(extension_images, &n_extension_images);
+
+ return 1;
+
+ } else if (STR_IN_SET(name, "StateDirectorySymlink", "RuntimeDirectorySymlink", "CacheDirectorySymlink", "LogsDirectorySymlink")) {
+ char *source, *destination;
+ ExecDirectory *directory;
+ uint64_t symlink_flags; /* No flags for now, reserved for future uses. */
+ ExecDirectoryType i;
+
+ assert_se((i = exec_directory_type_symlink_from_string(name)) >= 0);
+ directory = c->directories + i;
+
+ r = sd_bus_message_enter_container(message, 'a', "(sst)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(sst)", &source, &destination, &symlink_flags)) > 0) {
+ if (!path_is_valid(source))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not valid.", source);
+ if (path_is_absolute(source))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is absolute.", source);
+ if (!path_is_normalized(source))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Source path %s is not normalized.", source);
+ if (!path_is_valid(destination))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path %s is not valid.", destination);
+ if (path_is_absolute(destination))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path %s is absolute.", destination);
+ if (!path_is_normalized(destination))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path %s is not normalized.", destination);
+ if (symlink_flags != 0)
+                                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Flags must be zero.");
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *destination_escaped = NULL, *source_escaped = NULL;
+
+ r = exec_directory_add(directory, source, destination);
+ if (r < 0)
+ return r;
+
+ /* Need to store them in the unit with the escapes, so that they can be parsed again */
+ source_escaped = xescape(source, ":");
+ destination_escaped = xescape(destination, ":");
+ if (!source_escaped || !destination_escaped)
+ return -ENOMEM;
+
+ unit_write_settingf(
+ u, flags|UNIT_ESCAPE_SPECIFIERS, exec_directory_type_to_string(i),
+ "%s=%s:%s",
+ exec_directory_type_to_string(i),
+ source_escaped,
+ destination_escaped);
+ }
+ }
+ if (r < 0)
+ return r;
+
+ exec_directory_sort(directory);
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ return 1;
+
+ } else if (STR_IN_SET(name, "RootImagePolicy", "MountImagePolicy", "ExtensionImagePolicy")) {
+ _cleanup_(image_policy_freep) ImagePolicy *p = NULL;
+ const char *s;
+
+ r = sd_bus_message_read(message, "s", &s);
+ if (r < 0)
+ return r;
+
+ r = image_policy_from_string(s, &p);
+ if (r < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Failed to parse image policy string: %s", s);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ _cleanup_free_ char *t = NULL;
+ ImagePolicy **pp =
+ streq(name, "RootImagePolicy") ? &c->root_image_policy :
+ streq(name, "MountImagePolicy") ? &c->mount_image_policy :
+ &c->extension_image_policy;
+
+ r = image_policy_to_string(p, /* simplify= */ true, &t);
+ if (r < 0)
+ return r;
+
+ image_policy_free(*pp);
+ *pp = TAKE_PTR(p);
+
+ unit_write_settingf(
+ u, flags, name,
+ "%s=%s",
+ name,
+ t); /* no escaping necessary */
+ }
+
+ return 1;
+ }
+
+ return 0;
+}
diff --git a/src/core/dbus-execute.h b/src/core/dbus-execute.h
new file mode 100644
index 0000000..5926bdb
--- /dev/null
+++ b/src/core/dbus-execute.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "execute.h"
+
+#define BUS_EXEC_STATUS_VTABLE(prefix, offset, flags) \
+ BUS_PROPERTY_DUAL_TIMESTAMP(prefix "StartTimestamp", (offset) + offsetof(ExecStatus, start_timestamp), flags), \
+ BUS_PROPERTY_DUAL_TIMESTAMP(prefix "ExitTimestamp", (offset) + offsetof(ExecStatus, exit_timestamp), flags), \
+ SD_BUS_PROPERTY(prefix "PID", "u", bus_property_get_pid, (offset) + offsetof(ExecStatus, pid), flags), \
+ SD_BUS_PROPERTY(prefix "Code", "i", bus_property_get_int, (offset) + offsetof(ExecStatus, code), flags), \
+ SD_BUS_PROPERTY(prefix "Status", "i", bus_property_get_int, (offset) + offsetof(ExecStatus, status), flags)
+
+#define BUS_EXEC_COMMAND_VTABLE(name, offset, flags) \
+ SD_BUS_PROPERTY(name, "a(sasbttttuii)", bus_property_get_exec_command, offset, flags)
+
+#define BUS_EXEC_COMMAND_LIST_VTABLE(name, offset, flags) \
+ SD_BUS_PROPERTY(name, "a(sasbttttuii)", bus_property_get_exec_command_list, offset, flags)
+
+#define BUS_EXEC_EX_COMMAND_LIST_VTABLE(name, offset, flags) \
+ SD_BUS_PROPERTY(name, "a(sasasttttuii)", bus_property_get_exec_ex_command_list, offset, flags)
+
+extern const sd_bus_vtable bus_exec_vtable[];
+
+int bus_property_get_exec_output(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error);
+int bus_property_get_exec_command(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error);
+int bus_property_get_exec_command_list(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error);
+int bus_property_get_exec_ex_command_list(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error);
+int bus_property_get_exec_preserve_mode(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error);
+
+int bus_exec_context_set_transient_property(Unit *u, ExecContext *c, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_set_transient_exec_command(Unit *u, const char *name, ExecCommand **exec_command, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_set_transient_exec_preserve_mode(Unit *u, const char *name, ExecPreserveMode *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
diff --git a/src/core/dbus-job.c b/src/core/dbus-job.c
new file mode 100644
index 0000000..c88d8c2
--- /dev/null
+++ b/src/core/dbus-job.c
@@ -0,0 +1,374 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "sd-bus.h"
+
+#include "alloc-util.h"
+#include "bus-get-properties.h"
+#include "bus-util.h"
+#include "dbus-job.h"
+#include "dbus-unit.h"
+#include "dbus-util.h"
+#include "dbus.h"
+#include "job.h"
+#include "log.h"
+#include "selinux-access.h"
+#include "string-util.h"
+#include "strv.h"
+
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_type, job_type, JobType);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_state, job_state, JobState);
+
+static int property_get_unit(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ _cleanup_free_ char *p = NULL;
+ Job *j = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ p = unit_dbus_path(j->unit);
+ if (!p)
+ return -ENOMEM;
+
+ return sd_bus_message_append(reply, "(so)", j->unit->id, p);
+}
+
+int bus_job_method_cancel(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Job *j = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_unit_access_check(j->unit, message, "stop", error);
+ if (r < 0)
+ return r;
+
+ /* Access is granted to the job owner */
+ if (!sd_bus_track_contains(j->bus_track, sd_bus_message_get_sender(message))) {
+
+ /* And for everybody else consult polkit */
+ r = bus_verify_manage_units_async(j->unit->manager, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+ }
+
+ job_finish_and_invalidate(j, JOB_CANCELED, true, false);
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+int bus_job_method_get_waiting_jobs(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+ _cleanup_free_ Job **list = NULL;
+        Job *j = ASSERT_PTR(userdata);
+ int r, n;
+
+ if (strstr(sd_bus_message_get_member(message), "After"))
+ n = job_get_after(j, &list);
+ else
+ n = job_get_before(j, &list);
+ if (n < 0)
+ return n;
+
+ r = sd_bus_message_new_method_return(message, &reply);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_open_container(reply, 'a', "(usssoo)");
+ if (r < 0)
+ return r;
+
+        for (int i = 0; i < n; i++) {
+ _cleanup_free_ char *unit_path = NULL, *job_path = NULL;
+
+ job_path = job_dbus_path(list[i]);
+ if (!job_path)
+ return -ENOMEM;
+
+ unit_path = unit_dbus_path(list[i]->unit);
+ if (!unit_path)
+ return -ENOMEM;
+
+ r = sd_bus_message_append(reply, "(usssoo)",
+ list[i]->id,
+ list[i]->unit->id,
+ job_type_to_string(list[i]->type),
+ job_state_to_string(list[i]->state),
+ job_path,
+ unit_path);
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(NULL, reply, NULL);
+}
+
+const sd_bus_vtable bus_job_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+
+ SD_BUS_METHOD("Cancel", NULL, NULL, bus_job_method_cancel, SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("GetAfter",
+ SD_BUS_NO_ARGS,
+ SD_BUS_RESULT("a(usssoo)", jobs),
+ bus_job_method_get_waiting_jobs,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("GetBefore",
+ SD_BUS_NO_ARGS,
+ SD_BUS_RESULT("a(usssoo)", jobs),
+ bus_job_method_get_waiting_jobs,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+
+ SD_BUS_PROPERTY("Id", "u", NULL, offsetof(Job, id), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Unit", "(so)", property_get_unit, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("JobType", "s", property_get_type, offsetof(Job, type), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("State", "s", property_get_state, offsetof(Job, state), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("ActivationDetails", "a(ss)", bus_property_get_activation_details, offsetof(Job, activation_details), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_VTABLE_END
+};
+
+static int bus_job_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ Job *j;
+ int r;
+
+ assert(bus);
+ assert(path);
+ assert(interface);
+ assert(found);
+
+ r = manager_get_job_from_dbus_path(m, path, &j);
+ if (r < 0)
+ return 0;
+
+ *found = j;
+ return 1;
+}
+
+static int bus_job_enumerate(sd_bus *bus, const char *path, void *userdata, char ***nodes, sd_bus_error *error) {
+ _cleanup_strv_free_ char **l = NULL;
+        Manager *m = ASSERT_PTR(userdata);
+ unsigned k = 0;
+ Job *j;
+
+ l = new0(char*, hashmap_size(m->jobs)+1);
+ if (!l)
+ return -ENOMEM;
+
+ HASHMAP_FOREACH(j, m->jobs) {
+ l[k] = job_dbus_path(j);
+ if (!l[k])
+ return -ENOMEM;
+
+ k++;
+ }
+
+ assert(hashmap_size(m->jobs) == k);
+
+ *nodes = TAKE_PTR(l);
+
+ return k;
+}
+
+const BusObjectImplementation job_object = {
+ "/org/freedesktop/systemd1/job",
+ "org.freedesktop.systemd1.Job",
+ .fallback_vtables = BUS_FALLBACK_VTABLES({bus_job_vtable, bus_job_find}),
+ .node_enumerator = bus_job_enumerate,
+};
+
+static int send_new_signal(sd_bus *bus, void *userdata) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
+ _cleanup_free_ char *p = NULL;
+ Job *j = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+
+ p = job_dbus_path(j);
+ if (!p)
+ return -ENOMEM;
+
+ r = sd_bus_message_new_signal(
+ bus,
+ &m,
+ "/org/freedesktop/systemd1",
+ "org.freedesktop.systemd1.Manager",
+ "JobNew");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(m, "uos", j->id, p, j->unit->id);
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(bus, m, NULL);
+}
+
+static int send_changed_signal(sd_bus *bus, void *userdata) {
+ _cleanup_free_ char *p = NULL;
+ Job *j = ASSERT_PTR(userdata);
+
+ assert(bus);
+
+ p = job_dbus_path(j);
+ if (!p)
+ return -ENOMEM;
+
+ return sd_bus_emit_properties_changed(bus, p, "org.freedesktop.systemd1.Job", "State", NULL);
+}
+
+void bus_job_send_change_signal(Job *j) {
+ int r;
+
+ assert(j);
+
+ /* Make sure that any change signal on the unit is reflected before we send out the change signal on the job */
+ bus_unit_send_pending_change_signal(j->unit, true);
+
+ if (j->in_dbus_queue) {
+ LIST_REMOVE(dbus_queue, j->manager->dbus_job_queue, j);
+ j->in_dbus_queue = false;
+
+ /* The job might be good to be GC once its pending signals have been sent */
+ job_add_to_gc_queue(j);
+ }
+
+ r = bus_foreach_bus(j->manager, j->bus_track, j->sent_dbus_new_signal ? send_changed_signal : send_new_signal, j);
+ if (r < 0)
+ log_debug_errno(r, "Failed to send job change signal for %u: %m", j->id);
+
+ j->sent_dbus_new_signal = true;
+}
+
+void bus_job_send_pending_change_signal(Job *j, bool including_new) {
+ assert(j);
+
+ if (!j->in_dbus_queue)
+ return;
+
+ if (!j->sent_dbus_new_signal && !including_new)
+ return;
+
+ if (MANAGER_IS_RELOADING(j->unit->manager))
+ return;
+
+ bus_job_send_change_signal(j);
+}
+
+static int send_removed_signal(sd_bus *bus, void *userdata) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
+ _cleanup_free_ char *p = NULL;
+ Job *j = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+
+ p = job_dbus_path(j);
+ if (!p)
+ return -ENOMEM;
+
+ r = sd_bus_message_new_signal(
+ bus,
+ &m,
+ "/org/freedesktop/systemd1",
+ "org.freedesktop.systemd1.Manager",
+ "JobRemoved");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(m, "uoss", j->id, p, j->unit->id, job_result_to_string(j->result));
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(bus, m, NULL);
+}
+
+void bus_job_send_removed_signal(Job *j) {
+ int r;
+
+ assert(j);
+
+ if (!j->sent_dbus_new_signal)
+ bus_job_send_change_signal(j);
+
+ /* Make sure that any change signal on the unit is reflected before we send out the change signal on the job */
+ bus_unit_send_pending_change_signal(j->unit, true);
+
+ r = bus_foreach_bus(j->manager, j->bus_track, send_removed_signal, j);
+ if (r < 0)
+ log_debug_errno(r, "Failed to send job remove signal for %u: %m", j->id);
+}
+
+static int bus_job_track_handler(sd_bus_track *t, void *userdata) {
+ Job *j = ASSERT_PTR(userdata);
+
+ assert(t);
+
+ j->bus_track = sd_bus_track_unref(j->bus_track); /* make sure we aren't called again */
+
+ /* Last client dropped off the bus, maybe we should GC this now? */
+ job_add_to_gc_queue(j);
+ return 0;
+}
+
+static int bus_job_allocate_bus_track(Job *j) {
+
+ assert(j);
+
+ if (j->bus_track)
+ return 0;
+
+ return sd_bus_track_new(j->unit->manager->api_bus, &j->bus_track, bus_job_track_handler, j);
+}
+
+int bus_job_coldplug_bus_track(Job *j) {
+ int r;
+ _cleanup_strv_free_ char **deserialized_clients = NULL;
+
+ assert(j);
+
+ deserialized_clients = TAKE_PTR(j->deserialized_clients);
+
+ if (strv_isempty(deserialized_clients))
+ return 0;
+
+ if (!j->manager->api_bus)
+ return 0;
+
+ r = bus_job_allocate_bus_track(j);
+ if (r < 0)
+ return r;
+
+ return bus_track_add_name_many(j->bus_track, deserialized_clients);
+}
+
+int bus_job_track_sender(Job *j, sd_bus_message *m) {
+ int r;
+
+ assert(j);
+ assert(m);
+
+ if (sd_bus_message_get_bus(m) != j->unit->manager->api_bus) {
+ j->ref_by_private_bus = true;
+ return 0;
+ }
+
+ r = bus_job_allocate_bus_track(j);
+ if (r < 0)
+ return r;
+
+ return sd_bus_track_add_sender(j->bus_track, m);
+}
diff --git a/src/core/dbus-job.h b/src/core/dbus-job.h
new file mode 100644
index 0000000..6f00581
--- /dev/null
+++ b/src/core/dbus-job.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+
+#include "unit.h"
+#include "bus-object.h"
+
+/* D-Bus vtable and object implementation for org.freedesktop.systemd1.Job. */
+extern const sd_bus_vtable bus_job_vtable[];
+extern const BusObjectImplementation job_object;
+
+/* Method handlers installed in the job vtable. */
+int bus_job_method_cancel(sd_bus_message *message, void *job, sd_bus_error *error);
+int bus_job_method_get_waiting_jobs(sd_bus_message *message, void *userdata, sd_bus_error *error);
+
+/* Signal emission helpers, invoked as jobs are created, changed and removed. */
+void bus_job_send_change_signal(Job *j);
+void bus_job_send_pending_change_signal(Job *j, bool including_new);
+void bus_job_send_removed_signal(Job *j);
+
+/* Bus client tracking: restore tracked peers after deserialization / track a request's sender. */
+int bus_job_coldplug_bus_track(Job *j);
+int bus_job_track_sender(Job *j, sd_bus_message *m);
diff --git a/src/core/dbus-kill.c b/src/core/dbus-kill.c
new file mode 100644
index 0000000..19e439f
--- /dev/null
+++ b/src/core/dbus-kill.c
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bus-get-properties.h"
+#include "dbus-kill.h"
+#include "dbus-util.h"
+#include "kill.h"
+#include "signal-util.h"
+
+/* Generated getter mapping the KillMode enum value to its string form for the "KillMode" property. */
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_kill_mode, kill_mode, KillMode);
+
+/* Getter for "RestartKillSignal": reports the effective signal as computed by restart_kill_signal()
+ * (presumably falling back to KillSignal when not explicitly configured — see kill.h for the contract). */
+static int property_get_restart_kill_signal(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+ KillContext *c = ASSERT_PTR(userdata);
+ int s;
+
+ s = restart_kill_signal(c);
+ return sd_bus_message_append_basic(reply, 'i', &s);
+}
+
+/* D-Bus property table exposing a unit's KillContext. All entries are read-only and marked CONST, i.e.
+ * they never emit PropertiesChanged. Most map directly onto KillContext fields via offsetof(); the
+ * RestartKillSignal entry uses a custom getter since its value is derived. */
+const sd_bus_vtable bus_kill_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ SD_BUS_PROPERTY("KillMode", "s", property_get_kill_mode, offsetof(KillContext, kill_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("KillSignal", "i", bus_property_get_int, offsetof(KillContext, kill_signal), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RestartKillSignal", "i", property_get_restart_kill_signal, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("FinalKillSignal", "i", bus_property_get_int, offsetof(KillContext, final_kill_signal), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SendSIGKILL", "b", bus_property_get_bool, offsetof(KillContext, send_sigkill), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SendSIGHUP", "b", bus_property_get_bool, offsetof(KillContext, send_sighup), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("WatchdogSignal", "i", bus_property_get_int, offsetof(KillContext, watchdog_signal), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_VTABLE_END
+};
+
+/* Generated transient-property setters used below: KillMode is parsed from its string form, the signal
+ * properties are int32 values validated/stringified via signal_to_string_with_check(). */
+static BUS_DEFINE_SET_TRANSIENT_PARSE(kill_mode, KillMode, kill_mode_from_string);
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING(kill_signal, "i", int32_t, int, "%" PRIi32, signal_to_string_with_check);
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING(restart_kill_signal, "i", int32_t, int, "%" PRIi32, signal_to_string_with_check);
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING(final_kill_signal, "i", int32_t, int, "%" PRIi32, signal_to_string_with_check);
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING(watchdog_signal, "i", int32_t, int, "%" PRIi32, signal_to_string_with_check);
+
+/* Dispatch a single transient-unit property write into the unit's KillContext 'c'. Returns 0 when 'name'
+ * is not a kill-context property, so the caller can try other property tables; otherwise forwards to the
+ * matching generated setter above. */
+int bus_kill_context_set_transient_property(
+ Unit *u,
+ KillContext *c,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ assert(u);
+ assert(c);
+ assert(name);
+ assert(message);
+
+ /* All writes below carry the UNIT_PRIVATE flag in addition to whatever the caller passed in. */
+ flags |= UNIT_PRIVATE;
+
+ if (streq(name, "KillMode"))
+ return bus_set_transient_kill_mode(u, name, &c->kill_mode, message, flags, error);
+
+ if (streq(name, "SendSIGHUP"))
+ return bus_set_transient_bool(u, name, &c->send_sighup, message, flags, error);
+
+ if (streq(name, "SendSIGKILL"))
+ return bus_set_transient_bool(u, name, &c->send_sigkill, message, flags, error);
+
+ if (streq(name, "KillSignal"))
+ return bus_set_transient_kill_signal(u, name, &c->kill_signal, message, flags, error);
+
+ if (streq(name, "RestartKillSignal"))
+ return bus_set_transient_restart_kill_signal(u, name, &c->restart_kill_signal, message, flags, error);
+
+ if (streq(name, "FinalKillSignal"))
+ return bus_set_transient_final_kill_signal(u, name, &c->final_kill_signal, message, flags, error);
+
+ if (streq(name, "WatchdogSignal"))
+ return bus_set_transient_watchdog_signal(u, name, &c->watchdog_signal, message, flags, error);
+
+ return 0;
+}
diff --git a/src/core/dbus-kill.h b/src/core/dbus-kill.h
new file mode 100644
index 0000000..5a90287
--- /dev/null
+++ b/src/core/dbus-kill.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "kill.h"
+#include "unit.h"
+
+/* Read-only D-Bus property table for a unit's KillContext (KillMode, KillSignal, ...). */
+extern const sd_bus_vtable bus_kill_vtable[];
+
+/* Writes one transient-unit property into 'c'; returns 0 if 'name' is not a kill-context property. */
+int bus_kill_context_set_transient_property(Unit *u, KillContext *c, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
diff --git a/src/core/dbus-manager.c b/src/core/dbus-manager.c
new file mode 100644
index 0000000..745f5cc
--- /dev/null
+++ b/src/core/dbus-manager.c
@@ -0,0 +1,3628 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <sys/prctl.h>
+#include <sys/statvfs.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "architecture.h"
+#include "build.h"
+#include "bus-common-errors.h"
+#include "bus-get-properties.h"
+#include "bus-log-control-api.h"
+#include "chase.h"
+#include "confidential-virt.h"
+#include "data-fd-util.h"
+#include "dbus-cgroup.h"
+#include "dbus-execute.h"
+#include "dbus-job.h"
+#include "dbus-manager.h"
+#include "dbus-scope.h"
+#include "dbus-service.h"
+#include "dbus-unit.h"
+#include "dbus.h"
+#include "env-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "initrd-util.h"
+#include "install.h"
+#include "log.h"
+#include "manager-dump.h"
+#include "os-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "selinux-access.h"
+#include "stat-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "syslog-util.h"
+#include "user-util.h"
+#include "version.h"
+#include "virt.h"
+#include "watchdog.h"
+
+/* Require 16MiB free in /run/systemd for reloading/reexecing. After all we need to serialize our state
+ * there, and if we can't we'll fail badly. */
+#define RELOAD_DISK_SPACE_MIN (UINT64_C(16) * UINT64_C(1024) * UINT64_C(1024))
+
+/* Translate the (runtime, force) boolean pair taken by many unit-file D-Bus methods into UnitFileFlags. */
+static UnitFileFlags unit_file_bools_to_flags(bool runtime, bool force) {
+ return (runtime ? UNIT_FILE_RUNTIME : 0) |
+ (force ? UNIT_FILE_FORCE : 0);
+}
+
+/* Enum-to-string property getters; non-static because they are shared with other dbus-*.c files. */
+BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_oom_policy, oom_policy, OOMPolicy);
+BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_emergency_action, emergency_action, EmergencyAction);
+
+/* Generated getters for constant or trivially-derived Manager properties. */
+static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_version, "s", GIT_VERSION);
+static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_features, "s", systemd_features);
+static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_architecture, "s", architecture_to_string(uname_architecture()));
+static BUS_DEFINE_PROPERTY_GET2(property_get_system_state, "s", Manager, manager_state, manager_state_to_string);
+static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_timer_slack_nsec, "t", (uint64_t) prctl(PR_GET_TIMERSLACK));
+static BUS_DEFINE_PROPERTY_GET_REF(property_get_hashmap_size, "u", Hashmap *, hashmap_size);
+static BUS_DEFINE_PROPERTY_GET_REF(property_get_set_size, "u", Set *, set_size);
+static BUS_DEFINE_PROPERTY_GET(property_get_default_timeout_abort_usec, "t", Manager, manager_default_timeout_abort_usec);
+static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_watchdog_device, "s", watchdog_get_device());
+static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_watchdog_last_ping_realtime, "t", watchdog_get_last_ping(CLOCK_REALTIME));
+static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_watchdog_last_ping_monotonic, "t", watchdog_get_last_ping(CLOCK_MONOTONIC));
+static BUS_DEFINE_PROPERTY_GET(property_get_progress, "d", Manager, manager_get_progress);
+
+/* Getter for "Virtualization": detects the virtualization environment on each read. */
+static int property_get_virtualization(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Virtualization v;
+
+ assert(bus);
+ assert(reply);
+
+ v = detect_virtualization();
+
+ /* Make sure to return the empty string when we detect no virtualization, as that is the API.
+ * (Appending NULL as an "s" value yields the empty string on the wire.)
+ *
+ * https://github.com/systemd/systemd/issues/1423
+ */
+
+ return sd_bus_message_append(
+ reply, "s",
+ v == VIRTUALIZATION_NONE ? NULL : virtualization_to_string(v));
+}
+
+/* Getter for "ConfidentialVirtualization": empty string when none is detected (or detection failed,
+ * since negative detection results are folded into the v <= 0 check). */
+static int property_get_confidential_virtualization(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ConfidentialVirtualization v;
+
+ assert(bus);
+ assert(reply);
+
+ v = detect_confidential_virtualization();
+
+ return sd_bus_message_append(
+ reply, "s",
+ v <= 0 ? NULL : confidential_virtualization_to_string(v));
+}
+
+/* Getter for "Tainted": a freshly computed, colon-separated-style taint description string. */
+static int property_get_tainted(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ _cleanup_free_ char *s = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ s = manager_taint_string(m);
+ if (!s)
+ return log_oom(); /* log_oom() returns a negative errno, propagated to the caller */
+
+ return sd_bus_message_append(reply, "s", s);
+}
+
+/* Writable "LogTarget" property: an empty string restores the original target from before any override,
+ * anything else must parse as a valid LogTarget and becomes the override. */
+static int property_set_log_target(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *value,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Manager *m = userdata;
+ const char *t;
+ int r;
+
+ assert(bus);
+ assert(value);
+
+ r = sd_bus_message_read(value, "s", &t);
+ if (r < 0)
+ return r;
+
+ if (isempty(t))
+ manager_restore_original_log_target(m);
+ else {
+ LogTarget target;
+
+ target = log_target_from_string(t);
+ if (target < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid log target '%s'", t);
+
+ manager_override_log_target(m, target);
+ }
+
+ return 0;
+}
+
+/* Writable "LogLevel" property: same pattern as LogTarget — empty restores, non-empty must parse. */
+static int property_set_log_level(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *value,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Manager *m = userdata;
+ const char *t;
+ int r;
+
+ assert(bus);
+ assert(value);
+
+ r = sd_bus_message_read(value, "s", &t);
+ if (r < 0)
+ return r;
+
+ if (isempty(t))
+ manager_restore_original_log_level(m);
+ else {
+ int level;
+
+ level = log_level_from_string(t);
+ if (level < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid log level '%s'", t);
+
+ manager_override_log_level(m, level);
+ }
+
+ return 0;
+}
+
+/* Getter for "Environment": the manager's effective environment block as a string array. The strv is
+ * built fresh on each read and freed on return via the cleanup attribute. */
+static int property_get_environment(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ _cleanup_strv_free_ char **l = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = manager_get_effective_environment(m, &l);
+ if (r < 0)
+ return r;
+
+ return sd_bus_message_append_strv(reply, l);
+}
+
+/* Getter for "ShowStatus": reduces the manager's show-status setting to a plain boolean. */
+static int property_get_show_status(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Manager *m = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ return sd_bus_message_append(reply, "b", manager_get_show_status_on(m));
+}
+
+/* Getters for the watchdog timeout properties ("RuntimeWatchdogUSec" and friends). Each reads the
+ * effective timeout for one WatchdogType via manager_get_watchdog(); the pretimeout governor is the only
+ * one that is a plain string field on the Manager. */
+static int property_get_runtime_watchdog(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Manager *m = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ return sd_bus_message_append(reply, "t", manager_get_watchdog(m, WATCHDOG_RUNTIME));
+}
+
+static int property_get_pretimeout_watchdog(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Manager *m = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ return sd_bus_message_append(reply, "t", manager_get_watchdog(m, WATCHDOG_PRETIMEOUT));
+}
+
+static int property_get_pretimeout_watchdog_governor(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Manager *m = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ return sd_bus_message_append(reply, "s", m->watchdog_pretimeout_governor);
+}
+
+static int property_get_reboot_watchdog(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Manager *m = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ return sd_bus_message_append(reply, "t", manager_get_watchdog(m, WATCHDOG_REBOOT));
+}
+
+static int property_get_kexec_watchdog(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Manager *m = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ return sd_bus_message_append(reply, "t", manager_get_watchdog(m, WATCHDOG_KEXEC));
+}
+
+/* Shared helper for the writable watchdog timeout properties: read a "t" (uint64 usec) value and install
+ * it as the override for the given watchdog type. The assert_cc() documents that usec_t and uint64_t must
+ * have identical size so reading directly into a usec_t is safe. */
+static int property_set_watchdog(Manager *m, WatchdogType type, sd_bus_message *value) {
+ usec_t timeout;
+ int r;
+
+ assert(m);
+ assert(value);
+
+ assert_cc(sizeof(usec_t) == sizeof(uint64_t));
+
+ r = sd_bus_message_read(value, "t", &timeout);
+ if (r < 0)
+ return r;
+
+ manager_override_watchdog(m, type, timeout);
+ return 0;
+}
+
+/* Per-type thin wrappers around property_set_watchdog(). */
+static int property_set_runtime_watchdog(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *value,
+ void *userdata,
+ sd_bus_error *error) {
+
+ return property_set_watchdog(userdata, WATCHDOG_RUNTIME, value);
+}
+
+static int property_set_pretimeout_watchdog(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *value,
+ void *userdata,
+ sd_bus_error *error) {
+
+ return property_set_watchdog(userdata, WATCHDOG_PRETIMEOUT, value);
+}
+
+/* Setter for "RuntimeWatchdogPreGovernor": rejects strings with unsafe characters before handing the
+ * governor name to the manager. */
+static int property_set_pretimeout_watchdog_governor(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *value,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Manager *m = ASSERT_PTR(userdata);
+ char *governor;
+ int r;
+
+ r = sd_bus_message_read(value, "s", &governor);
+ if (r < 0)
+ return r;
+ if (!string_is_safe(governor))
+ return -EINVAL;
+
+ return manager_override_watchdog_pretimeout_governor(m, governor);
+}
+
+static int property_set_reboot_watchdog(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *value,
+ void *userdata,
+ sd_bus_error *error) {
+
+ return property_set_watchdog(userdata, WATCHDOG_REBOOT, value);
+}
+
+/* NOTE(review): unlike its siblings this wrapper declares an _unused_ local and extra asserts — harmless,
+ * but inconsistent with the other four wrappers above. */
+static int property_set_kexec_watchdog(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *value,
+ void *userdata,
+ sd_bus_error *error) {
+
+ _unused_ Manager *m = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(value);
+
+ return property_set_watchdog(userdata, WATCHDOG_KEXEC, value);
+}
+
+/* Getter for "DefaultOOMScoreAdjust": reports the configured default if one was set, otherwise falls back
+ * to reading the current process's own OOM score adjustment (best-effort — on failure 0 is reported and
+ * the error only logged at debug level). */
+static int property_get_oom_score_adjust(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Manager *m = ASSERT_PTR(userdata);
+ int r, n;
+
+ assert(bus);
+ assert(reply);
+
+ if (m->defaults.oom_score_adjust_set)
+ n = m->defaults.oom_score_adjust;
+ else {
+ n = 0;
+ r = get_oom_score_adjust(&n);
+ if (r < 0)
+ log_debug_errno(r, "Failed to read current OOM score adjustment value, ignoring: %m");
+ }
+
+ return sd_bus_message_append(reply, "i", n);
+}
+
+/* Resolve 'name' to a loaded Unit, with nice bus errors. An empty name means "the unit the calling
+ * client itself belongs to", resolved via the sender's PID credentials. Does not load units. */
+static int bus_get_unit_by_name(Manager *m, sd_bus_message *message, const char *name, Unit **ret_unit, sd_bus_error *error) {
+ Unit *u;
+ int r;
+
+ assert(m);
+ assert(message);
+ assert(ret_unit);
+
+ /* More or less a wrapper around manager_get_unit() that generates nice errors and has one trick up
+ * its sleeve: if the name is specified empty we use the client's unit. */
+
+ if (isempty(name)) {
+ _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+ pid_t pid;
+
+ r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_creds_get_pid(creds, &pid);
+ if (r < 0)
+ return r;
+
+ u = manager_get_unit_by_pid(m, pid);
+ if (!u)
+ return sd_bus_error_set(error, BUS_ERROR_NO_SUCH_UNIT, "Client not member of any unit.");
+ } else {
+ u = manager_get_unit(m, name);
+ if (!u)
+ return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Unit %s not loaded.", name);
+ }
+
+ *ret_unit = u;
+ return 0;
+}
+
+/* Like bus_get_unit_by_name(), but loads the unit if it is not loaded yet. The empty-name case still goes
+ * through the lookup-only path, since the client's own unit is necessarily loaded already. */
+static int bus_load_unit_by_name(Manager *m, sd_bus_message *message, const char *name, Unit **ret_unit, sd_bus_error *error) {
+ assert(m);
+ assert(message);
+ assert(ret_unit);
+
+ /* Pretty much the same as bus_get_unit_by_name(), but we also load the unit if necessary. */
+
+ if (isempty(name))
+ return bus_get_unit_by_name(m, message, name, ret_unit, error);
+
+ return manager_load_unit(m, name, NULL, error, ret_unit);
+}
+
+/* Common tail for the various GetUnit* methods: perform a SELinux "status" access check on the unit, then
+ * reply with the unit's D-Bus object path. */
+static int reply_unit_path(Unit *u, sd_bus_message *message, sd_bus_error *error) {
+ _cleanup_free_ char *path = NULL;
+ int r;
+
+ assert(u);
+ assert(message);
+
+ r = mac_selinux_unit_access_check(u, message, "status", error);
+ if (r < 0)
+ return r;
+
+ path = unit_dbus_path(u);
+ if (!path)
+ return log_oom();
+
+ return sd_bus_reply_method_return(message, "o", path);
+}
+
+/* org.freedesktop.systemd1.Manager.GetUnit(s name) -> (o path). Does not load the unit; an empty name
+ * resolves to the calling client's own unit (see bus_get_unit_by_name()). */
+static int method_get_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ const char *name;
+ Unit *u;
+ int r;
+
+ assert(message);
+
+ /* Anyone can call this method */
+
+ r = sd_bus_message_read(message, "s", &name);
+ if (r < 0)
+ return r;
+
+ r = bus_get_unit_by_name(m, message, name, &u, error);
+ if (r < 0)
+ return r;
+
+ return reply_unit_path(u, message, error);
+}
+
+/* GetUnitByPID(u pid) -> (o path). PID 0 means "the calling client's own PID", resolved from the sender's
+ * credentials. The assert_cc() documents that the wire's uint32 and pid_t must match in size so reading
+ * "u" directly into a pid_t is safe; a negative value after that reinterpretation is rejected. */
+static int method_get_unit_by_pid(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ pid_t pid;
+ Unit *u;
+ int r;
+
+ assert(message);
+
+ assert_cc(sizeof(pid_t) == sizeof(uint32_t));
+
+ /* Anyone can call this method */
+
+ r = sd_bus_message_read(message, "u", &pid);
+ if (r < 0)
+ return r;
+ if (pid < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid PID " PID_FMT, pid);
+
+ if (pid == 0) {
+ _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+
+ r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_creds_get_pid(creds, &pid);
+ if (r < 0)
+ return r;
+ }
+
+ u = manager_get_unit_by_pid(m, pid);
+ if (!u)
+ return sd_bus_error_setf(error, BUS_ERROR_NO_UNIT_FOR_PID, "PID "PID_FMT" does not belong to any loaded unit.", pid);
+
+ return reply_unit_path(u, message, error);
+}
+
+/* GetUnitByInvocationID(ay id) -> (o path). Accepts a 16-byte invocation ID or an empty array; an empty
+ * (or all-zero) ID means "the calling client's own unit", resolved via the sender's PID. Any other array
+ * length is rejected. */
+static int method_get_unit_by_invocation_id(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_free_ char *path = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ sd_id128_t id;
+ const void *a;
+ Unit *u;
+ size_t sz;
+ int r;
+
+ assert(message);
+
+ /* Anyone can call this method */
+
+ r = sd_bus_message_read_array(message, 'y', &a, &sz);
+ if (r < 0)
+ return r;
+ if (sz == 0)
+ id = SD_ID128_NULL;
+ else if (sz == 16)
+ memcpy(&id, a, sz);
+ else
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid invocation ID");
+
+ if (sd_id128_is_null(id)) {
+ _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+ pid_t pid;
+
+ r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_creds_get_pid(creds, &pid);
+ if (r < 0)
+ return r;
+
+ u = manager_get_unit_by_pid(m, pid);
+ if (!u)
+ return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT,
+ "Client " PID_FMT " not member of any unit.", pid);
+ } else {
+ u = hashmap_get(m->units_by_invocation_id, &id);
+ if (!u)
+ return sd_bus_error_setf(error, BUS_ERROR_NO_UNIT_FOR_INVOCATION_ID, "No unit with the specified invocation ID " SD_ID128_FORMAT_STR " known.", SD_ID128_FORMAT_VAL(id));
+ }
+
+ r = mac_selinux_unit_access_check(u, message, "status", error);
+ if (r < 0)
+ return r;
+
+ /* So here's a special trick: the bus path we return actually references the unit by its invocation
+ * ID instead of the unit name. This means it stays valid only as long as the invocation ID stays the
+ * same. */
+ path = unit_dbus_path_invocation_id(u);
+ if (!path)
+ return -ENOMEM;
+
+ return sd_bus_reply_method_return(message, "o", path);
+}
+
+/* GetUnitByControlGroup(s cgroup) -> (o path). Maps a control group path to the unit owning it; does not
+ * load units. */
+static int method_get_unit_by_control_group(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = userdata;
+ const char *cgroup;
+ Unit *u;
+ int r;
+
+ r = sd_bus_message_read(message, "s", &cgroup);
+ if (r < 0)
+ return r;
+
+ u = manager_get_unit_by_cgroup(m, cgroup);
+ if (!u)
+ return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT,
+ "Control group '%s' is not valid or not managed by this instance",
+ cgroup);
+
+ return reply_unit_path(u, message, error);
+}
+
+/* GetUnitByPIDFD(h pidfd) -> (o path, s unit_id, ay invocation_id). Resolves the pidfd to a PID, looks up
+ * the owning unit, and — crucially — re-verifies the pidfd after building the reply so a PID that was
+ * recycled during the lookup cannot be mis-attributed (the race the pidfd API exists to close). */
+static int method_get_unit_by_pidfd(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+ _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ _cleanup_free_ char *path = NULL;
+ int r, pidfd;
+ Unit *u;
+
+ assert(message);
+
+ r = sd_bus_message_read(message, "h", &pidfd);
+ if (r < 0)
+ return r;
+
+ r = pidref_set_pidfd(&pidref, pidfd);
+ if (r < 0)
+ return sd_bus_error_set_errnof(error, r, "Failed to get PID from PIDFD: %m");
+
+ u = manager_get_unit_by_pidref(m, &pidref);
+ if (!u)
+ return sd_bus_error_setf(error, BUS_ERROR_NO_UNIT_FOR_PID, "PID "PID_FMT" does not belong to any loaded unit.", pidref.pid);
+
+ r = mac_selinux_unit_access_check(u, message, "status", error);
+ if (r < 0)
+ return r;
+
+ path = unit_dbus_path(u);
+ if (!path)
+ return log_oom();
+
+ r = sd_bus_message_new_method_return(message, &reply);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(reply, "os", path, u->id);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append_array(reply, 'y', u->invocation_id.bytes, sizeof(u->invocation_id.bytes));
+ if (r < 0)
+ return r;
+
+ /* Double-check that the process is still alive and that the PID did not change before returning the
+ * answer. */
+ r = pidref_verify(&pidref);
+ if (r == -ESRCH)
+ return sd_bus_error_setf(error,
+ BUS_ERROR_NO_SUCH_PROCESS,
+ "The PIDFD's PID "PID_FMT" changed during the lookup operation.",
+ pidref.pid);
+ if (r < 0)
+ return sd_bus_error_set_errnof(error, r, "Failed to get PID from PIDFD: %m");
+
+ return sd_bus_send(NULL, reply, NULL);
+}
+
+/* LoadUnit(s name) -> (o path). Same as GetUnit but loads the unit from disk if necessary. */
+static int method_load_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ const char *name;
+ Unit *u;
+ int r;
+
+ assert(message);
+
+ /* Anyone can call this method */
+
+ r = sd_bus_message_read(message, "s", &name);
+ if (r < 0)
+ return r;
+
+ r = bus_load_unit_by_name(m, message, name, &u, error);
+ if (r < 0)
+ return r;
+
+ return reply_unit_path(u, message, error);
+}
+
+/* Shared implementation of StartUnit/StopUnit/RestartUnit/...: load the named unit, then delegate job
+ * enqueueing (and access checks) to the per-unit start method. */
+static int method_start_unit_generic(sd_bus_message *message, Manager *m, JobType job_type, bool reload_if_possible, sd_bus_error *error) {
+ const char *name;
+ Unit *u;
+ int r;
+
+ assert(message);
+ assert(m);
+
+ r = sd_bus_message_read(message, "s", &name);
+ if (r < 0)
+ return r;
+
+ r = manager_load_unit(m, name, NULL, error, &u);
+ if (r < 0)
+ return r;
+
+ return bus_unit_method_start_generic(message, u, job_type, reload_if_possible, error);
+}
+
+/* Thin per-method wrappers: each Manager method maps to a (JobType, reload_if_possible) pair passed to
+ * method_start_unit_generic(). ReloadOrRestart/ReloadOrTryRestart reuse the restart job types with
+ * reload_if_possible=true. */
+static int method_start_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_start_unit_generic(message, userdata, JOB_START, /* reload_if_possible = */ false, error);
+}
+
+static int method_stop_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_start_unit_generic(message, userdata, JOB_STOP, /* reload_if_possible = */ false, error);
+}
+
+static int method_reload_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_start_unit_generic(message, userdata, JOB_RELOAD, /* reload_if_possible = */ false, error);
+}
+
+static int method_restart_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_start_unit_generic(message, userdata, JOB_RESTART, /* reload_if_possible = */ false, error);
+}
+
+static int method_try_restart_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_start_unit_generic(message, userdata, JOB_TRY_RESTART, /* reload_if_possible = */ false, error);
+}
+
+static int method_reload_or_restart_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_start_unit_generic(message, userdata, JOB_RESTART, /* reload_if_possible = */ true, error);
+}
+
+static int method_reload_or_try_restart_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_start_unit_generic(message, userdata, JOB_TRY_RESTART, /* reload_if_possible = */ true, error);
+}
+
+/* Flags controlling how method_generic_unit_operation() resolves its target unit. */
+typedef enum GenericUnitOperationFlags {
+ GENERIC_UNIT_LOAD = 1 << 0, /* Load if the unit is not loaded yet */
+ GENERIC_UNIT_VALIDATE_LOADED = 1 << 1, /* Verify unit is properly loaded before forwarding call */
+} GenericUnitOperationFlags;
+
+/* Shared dispatcher for Manager methods whose first argument is a unit name: resolve (and optionally
+ * load/validate) the unit, then forward the still-partially-read message to the per-unit handler. Note
+ * that GENERIC_UNIT_LOAD only takes effect for non-empty names; an empty name always means "the caller's
+ * own unit" and goes through lookup only. */
+static int method_generic_unit_operation(
+ sd_bus_message *message,
+ Manager *m,
+ sd_bus_error *error,
+ sd_bus_message_handler_t handler,
+ GenericUnitOperationFlags flags) {
+
+ const char *name;
+ Unit *u;
+ int r;
+
+ assert(message);
+ assert(m);
+
+ /* Read the first argument from the command and pass the operation to the specified per-unit
+ * method. */
+
+ r = sd_bus_message_read(message, "s", &name);
+ if (r < 0)
+ return r;
+
+ if (!isempty(name) && FLAGS_SET(flags, GENERIC_UNIT_LOAD))
+ r = manager_load_unit(m, name, NULL, error, &u);
+ else
+ r = bus_get_unit_by_name(m, message, name, &u, error);
+ if (r < 0)
+ return r;
+
+ if (FLAGS_SET(flags, GENERIC_UNIT_VALIDATE_LOADED)) {
+ r = bus_unit_validate_load_state(u, error);
+ if (r < 0)
+ return r;
+ }
+
+ return handler(message, u, error);
+}
+
+/* EnqueueUnitJob: generic job enqueueing with explicit job mode. */
+static int method_enqueue_unit_job(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ /* We don't bother with GENERIC_UNIT_VALIDATE_LOADED here, as the job logic validates that anyway */
+ return method_generic_unit_operation(message, userdata, error, bus_unit_method_enqueue_job, GENERIC_UNIT_LOAD);
+}
+
+/* StartUnitReplace(s old_name, s new_name, s mode): requires a queued start job on 'old_name' (which the
+ * enqueued replacement job will supersede), then behaves like StartUnit on the remaining arguments. */
+static int method_start_unit_replace(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ const char *old_name;
+ Unit *u;
+ int r;
+
+ assert(message);
+
+ r = sd_bus_message_read(message, "s", &old_name);
+ if (r < 0)
+ return r;
+
+ r = bus_get_unit_by_name(m, message, old_name, &u, error);
+ if (r < 0)
+ return r;
+ if (!u->job || u->job->type != JOB_START)
+ return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_JOB, "No job queued for unit %s", old_name);
+
+ return method_start_unit_generic(message, m, JOB_START, /* reload_if_possible = */ false, error);
+}
+
+/* One-liner wrappers delegating to method_generic_unit_operation(); the flags on each encode whether the
+ * target unit must/should be loaded for the operation to make sense. */
+static int method_kill_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ /* We don't bother with GENERIC_UNIT_LOAD nor GENERIC_UNIT_VALIDATE_LOADED here, as it shouldn't
+ * matter whether a unit is loaded for killing any processes possibly in the unit's cgroup. */
+ return method_generic_unit_operation(message, userdata, error, bus_unit_method_kill, 0);
+}
+
+static int method_clean_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ /* Load the unit if necessary, and insist on the unit being properly loaded before it may be
+ * cleaned */
+ return method_generic_unit_operation(message, userdata, error, bus_unit_method_clean, GENERIC_UNIT_LOAD|GENERIC_UNIT_VALIDATE_LOADED);
+}
+
+static int method_freeze_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_generic_unit_operation(message, userdata, error, bus_unit_method_freeze, 0);
+}
+
+static int method_thaw_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_generic_unit_operation(message, userdata, error, bus_unit_method_thaw, 0);
+}
+
+static int method_reset_failed_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ /* Don't load the unit (because unloaded units can't be in failed state), and don't insist on the
+ * unit to be loaded properly (since a failed unit might have its unit file disappeared) */
+ return method_generic_unit_operation(message, userdata, error, bus_unit_method_reset_failed, 0);
+}
+
+static int method_set_unit_properties(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ /* Only change properties on fully loaded units, and load them in order to set properties */
+ return method_generic_unit_operation(message, userdata, error, bus_unit_method_set_properties, GENERIC_UNIT_LOAD|GENERIC_UNIT_VALIDATE_LOADED);
+}
+
+static int method_bind_mount_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ /* Only add mounts on fully loaded units */
+ return method_generic_unit_operation(message, userdata, error, bus_service_method_bind_mount, GENERIC_UNIT_VALIDATE_LOADED);
+}
+
+static int method_mount_image_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ /* Only add mounts on fully loaded units */
+ return method_generic_unit_operation(message, userdata, error, bus_service_method_mount_image, GENERIC_UNIT_VALIDATE_LOADED);
+}
+
+static int method_ref_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ /* Only allow reffing of fully loaded units, and make sure reffing a unit loads it. */
+ return method_generic_unit_operation(message, userdata, error, bus_unit_method_ref, GENERIC_UNIT_LOAD|GENERIC_UNIT_VALIDATE_LOADED);
+}
+
+static int method_unref_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ /* Dropping a ref OTOH should not require the unit to still be loaded. And since a reffed unit is a
+ * loaded unit there's no need to load the unit for unreffing it. */
+ return method_generic_unit_operation(message, userdata, error, bus_unit_method_unref, 0);
+}
+
+/* Append one (ssssssouso) unit-info record for 'u' to 'reply', the tuple format used by ListUnits and
+ * ListUnitsByNames. If the unit has no job, the job id is 0, the job type the empty string, and the job
+ * path stays NULL — empty_to_root() presumably maps that to "/" on the wire (TODO confirm in path-util). */
+static int reply_unit_info(sd_bus_message *reply, Unit *u) {
+ _cleanup_free_ char *unit_path = NULL, *job_path = NULL;
+ Unit *following;
+
+ following = unit_following(u);
+
+ unit_path = unit_dbus_path(u);
+ if (!unit_path)
+ return -ENOMEM;
+
+ if (u->job) {
+ job_path = job_dbus_path(u->job);
+ if (!job_path)
+ return -ENOMEM;
+ }
+
+ return sd_bus_message_append(
+ reply, "(ssssssouso)",
+ u->id,
+ unit_description(u),
+ unit_load_state_to_string(u->load_state),
+ unit_active_state_to_string(unit_active_state(u)),
+ unit_sub_state_to_string(u),
+ following ? following->id : "",
+ unit_path,
+ u->job ? u->job->id : 0,
+ u->job ? job_type_to_string(u->job->type) : "",
+ empty_to_root(job_path));
+}
+
+/* ListUnitsByNames(as names) -> a(ssssssouso). Loads each named unit and returns its info record.
+ * Syntactically invalid unit names are silently skipped; a unit that fails to resolve aborts the whole
+ * call with that unit's error. */
+static int method_list_units_by_names(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+ _cleanup_strv_free_ char **units = NULL;
+
+ assert(message);
+
+ r = sd_bus_message_read_strv(message, &units);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_new_method_return(message, &reply);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_open_container(reply, 'a', "(ssssssouso)");
+ if (r < 0)
+ return r;
+
+ STRV_FOREACH(unit, units) {
+ Unit *u;
+
+ if (!unit_name_is_valid(*unit, UNIT_NAME_ANY))
+ continue;
+
+ r = bus_load_unit_by_name(m, message, *unit, &u, error);
+ if (r < 0)
+ return r;
+
+ r = reply_unit_info(reply, u);
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(NULL, reply, NULL);
+}
+
+/* D-Bus handler: GetUnitProcesses(), delegated to the generic per-unit dispatcher. */
+static int method_get_unit_processes(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ /* Don't load a unit (since it won't have any processes if it's not loaded), but don't insist on the
+ * unit being loaded (because even improperly loaded units might still have processes around). */
+ return method_generic_unit_operation(message, userdata, error, bus_unit_method_get_processes, 0);
+}
+
+/* D-Bus handler: AttachProcessesToUnit(), delegated to the generic per-unit dispatcher
+ * with GENERIC_UNIT_VALIDATE_LOADED so only properly loaded units are eligible. */
+static int method_attach_processes_to_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ /* Don't allow attaching new processes to units that aren't loaded. Don't bother with loading a unit
+ * for this purpose though, as an unloaded unit is a stopped unit, and we don't allow attaching
+ * processes to stopped units anyway. */
+ return method_generic_unit_operation(message, userdata, error, bus_unit_method_attach_processes, GENERIC_UNIT_VALIDATE_LOADED);
+}
+
+/* Creates a transient unit called 'name' and fills it in from the property list carried
+ * in 'message'. Validates that the name maps to a unit type that supports transient
+ * units, loads the (necessarily pristine) unit object, marks it transient, applies the
+ * supplied properties at UNIT_RUNTIME level, optionally pins a bus-track reference for
+ * the sender, and finally dispatches the load queue. On success stores the unit in
+ * *unit and returns 0; on failure returns a negative errno/sets 'error'. */
+static int transient_unit_from_message(
+ Manager *m,
+ sd_bus_message *message,
+ const char *name,
+ Unit **unit,
+ sd_bus_error *error) {
+
+ UnitType t;
+ Unit *u;
+ int r;
+
+ assert(m);
+ assert(message);
+ assert(name);
+
+ t = unit_name_to_type(name);
+ if (t < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Invalid unit name or type.");
+
+ if (!unit_vtable[t]->can_transient)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Unit type %s does not support transient units.",
+ unit_type_to_string(t));
+
+ r = manager_load_unit(m, name, NULL, error, &u);
+ if (r < 0)
+ return r;
+
+ /* Refuse to clobber an already existing/configured unit of the same name. */
+ if (!unit_is_pristine(u))
+ return sd_bus_error_setf(error, BUS_ERROR_UNIT_EXISTS,
+ "Unit %s was already loaded or has a fragment file.", name);
+
+ /* OK, the unit failed to load and is unreferenced, now let's
+ * fill in the transient data instead */
+ r = unit_make_transient(u);
+ if (r < 0)
+ return r;
+
+ /* Set our properties */
+ r = bus_unit_set_properties(u, message, UNIT_RUNTIME, false, error);
+ if (r < 0)
+ return r;
+
+ /* If the client asked for it, automatically add a reference to this unit. */
+ if (u->bus_track_add) {
+ r = bus_unit_track_add_sender(u, message);
+ if (r < 0)
+ return log_error_errno(r, "Failed to watch sender: %m");
+ }
+
+ /* Now load the missing bits of the unit we just created */
+ unit_add_to_load_queue(u);
+ manager_dispatch_load_queue(m);
+
+ *unit = u;
+
+ return 0;
+}
+
+/* Reads the trailing a(sa(sv)) array of auxiliary transient units from 'message' and
+ * creates each one via transient_unit_from_message(). Stops at the first failure. */
+static int transient_aux_units_from_message(
+ Manager *m,
+ sd_bus_message *message,
+ sd_bus_error *error) {
+
+ int r;
+
+ assert(m);
+ assert(message);
+
+ r = sd_bus_message_enter_container(message, 'a', "(sa(sv))");
+ if (r < 0)
+ return r;
+
+ /* One struct per auxiliary unit: its name plus its property list. */
+ while ((r = sd_bus_message_enter_container(message, 'r', "sa(sv)")) > 0) {
+ const char *name = NULL;
+ Unit *u;
+
+ r = sd_bus_message_read(message, "s", &name);
+ if (r < 0)
+ return r;
+
+ r = transient_unit_from_message(m, message, name, &u, error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+/* D-Bus handler: StartTransientUnit(ssa(sv)a(sa(sv))) -> o. After SELinux and polkit
+ * checks, builds the main transient unit and any auxiliary ones from the message, then
+ * enqueues a start job for the main unit in the requested job mode. */
+static int method_start_transient_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ const char *name, *smode;
+ Manager *m = ASSERT_PTR(userdata);
+ JobMode mode;
+ Unit *u;
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "start", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "ss", &name, &smode);
+ if (r < 0)
+ return r;
+
+ mode = job_mode_from_string(smode);
+ if (mode < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Job mode %s is invalid.", smode);
+
+ r = bus_verify_manage_units_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = transient_unit_from_message(m, message, name, &u, error);
+ if (r < 0)
+ return r;
+
+ r = transient_aux_units_from_message(m, message, error);
+ if (r < 0)
+ return r;
+
+ /* Finally, start it */
+ return bus_unit_queue_job(message, u, JOB_START, mode, 0, error);
+}
+
+/* D-Bus handler: GetJob(u) -> o. Looks up the job by numeric id and replies with its
+ * object path; fails with BUS_ERROR_NO_SUCH_JOB if the id is unknown. */
+static int method_get_job(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_free_ char *path = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ uint32_t id;
+ Job *j;
+ int r;
+
+ assert(message);
+
+ /* Anyone can call this method */
+
+ r = sd_bus_message_read(message, "u", &id);
+ if (r < 0)
+ return r;
+
+ j = manager_get_job(m, id);
+ if (!j)
+ return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_JOB, "Job %u does not exist.", (unsigned) id);
+
+ /* SELinux check is done against the unit the job belongs to. */
+ r = mac_selinux_unit_access_check(j->unit, message, "status", error);
+ if (r < 0)
+ return r;
+
+ path = job_dbus_path(j);
+ if (!path)
+ return -ENOMEM;
+
+ return sd_bus_reply_method_return(message, "o", path);
+}
+
+/* D-Bus handler: CancelJob(u). Resolves the job id and delegates the actual
+ * cancellation (including its access checks) to bus_job_method_cancel(). */
+static int method_cancel_job(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ uint32_t id;
+ Job *j;
+ int r;
+
+ assert(message);
+
+ r = sd_bus_message_read(message, "u", &id);
+ if (r < 0)
+ return r;
+
+ j = manager_get_job(m, id);
+ if (!j)
+ return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_JOB, "Job %u does not exist.", (unsigned) id);
+
+ return bus_job_method_cancel(message, j, error);
+}
+
+/* D-Bus handler: ClearJobs(). Drops the entire job queue after SELinux ("reload")
+ * and polkit manage-units authorization. */
+static int method_clear_jobs(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "reload", error);
+ if (r < 0)
+ return r;
+
+ r = bus_verify_manage_units_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ manager_clear_jobs(m);
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+/* D-Bus handler: ResetFailed(). Resets the failed state of all units after SELinux
+ * ("reload") and polkit manage-units authorization. */
+static int method_reset_failed(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "reload", error);
+ if (r < 0)
+ return r;
+
+ r = bus_verify_manage_units_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ manager_reset_failed(m);
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+/* Shared implementation behind ListUnits*(): replies with an a(ssssssouso) array of
+ * all loaded units, optionally filtered by state names ('states' matches load, active
+ * or sub state) and by fnmatch() patterns against the unit id. NULL/empty filters
+ * match everything. */
+static int list_units_filtered(sd_bus_message *message, void *userdata, sd_bus_error *error, char **states, char **patterns) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ const char *k;
+ Unit *u;
+ int r;
+
+ assert(message);
+
+ /* Anyone can call this method */
+
+ r = mac_selinux_access_check(message, "status", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_new_method_return(message, &reply);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_open_container(reply, 'a', "(ssssssouso)");
+ if (r < 0)
+ return r;
+
+ HASHMAP_FOREACH_KEY(u, k, m->units) {
+ /* Skip alias entries: only report a unit under its primary id. */
+ if (k != u->id)
+ continue;
+
+ if (!strv_isempty(states) &&
+ !strv_contains(states, unit_load_state_to_string(u->load_state)) &&
+ !strv_contains(states, unit_active_state_to_string(unit_active_state(u))) &&
+ !strv_contains(states, unit_sub_state_to_string(u)))
+ continue;
+
+ if (!strv_isempty(patterns) &&
+ !strv_fnmatch_or_empty(patterns, u->id, FNM_NOESCAPE))
+ continue;
+
+ r = reply_unit_info(reply, u);
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(NULL, reply, NULL);
+}
+
+/* D-Bus handler: ListUnits() — no filtering at all. */
+static int method_list_units(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return list_units_filtered(message, userdata, error, NULL, NULL);
+}
+
+/* D-Bus handler: ListUnitsFiltered(as) — filter by state names only. */
+static int method_list_units_filtered(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_strv_free_ char **states = NULL;
+ int r;
+
+ r = sd_bus_message_read_strv(message, &states);
+ if (r < 0)
+ return r;
+
+ return list_units_filtered(message, userdata, error, states, NULL);
+}
+
+/* D-Bus handler: ListUnitsByPatterns(asas) — filter by state names and id patterns. */
+static int method_list_units_by_patterns(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_strv_free_ char **states = NULL;
+ _cleanup_strv_free_ char **patterns = NULL;
+ int r;
+
+ r = sd_bus_message_read_strv(message, &states);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read_strv(message, &patterns);
+ if (r < 0)
+ return r;
+
+ return list_units_filtered(message, userdata, error, states, patterns);
+}
+
+/* D-Bus handler: ListJobs() -> a(usssoo). Replies with one record per queued job:
+ * job id, owning unit id, job type, job state, job path and unit path. */
+static int method_list_jobs(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ Job *j;
+ int r;
+
+ assert(message);
+
+ /* Anyone can call this method */
+
+ r = mac_selinux_access_check(message, "status", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_new_method_return(message, &reply);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_open_container(reply, 'a', "(usssoo)");
+ if (r < 0)
+ return r;
+
+ HASHMAP_FOREACH(j, m->jobs) {
+ _cleanup_free_ char *unit_path = NULL, *job_path = NULL;
+
+ job_path = job_dbus_path(j);
+ if (!job_path)
+ return -ENOMEM;
+
+ unit_path = unit_dbus_path(j->unit);
+ if (!unit_path)
+ return -ENOMEM;
+
+ r = sd_bus_message_append(
+ reply, "(usssoo)",
+ j->id,
+ j->unit->id,
+ job_type_to_string(j->type),
+ job_state_to_string(j->state),
+ job_path,
+ unit_path);
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(NULL, reply, NULL);
+}
+
+/* D-Bus handler: Subscribe(). Registers the caller in m->subscribed (lazily created)
+ * so it receives signals; only peers on the API bus are tracked. Fails with
+ * BUS_ERROR_ALREADY_SUBSCRIBED if the sender is already registered. */
+static int method_subscribe(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ /* Anyone can call this method */
+
+ r = mac_selinux_access_check(message, "status", error);
+ if (r < 0)
+ return r;
+
+ if (sd_bus_message_get_bus(message) == m->api_bus) {
+
+ /* Note that direct bus connections subscribe by
+ * default; we only track peers on the API bus here. */
+
+ if (!m->subscribed) {
+ r = sd_bus_track_new(sd_bus_message_get_bus(message), &m->subscribed, NULL, NULL);
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_bus_track_add_sender(m->subscribed, message);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return sd_bus_error_set(error, BUS_ERROR_ALREADY_SUBSCRIBED, "Client is already subscribed.");
+ }
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+/* D-Bus handler: Unsubscribe(). Removes the caller from the API-bus subscriber set;
+ * fails with BUS_ERROR_NOT_SUBSCRIBED if it was not registered. */
+static int method_unsubscribe(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ /* Anyone can call this method */
+
+ r = mac_selinux_access_check(message, "status", error);
+ if (r < 0)
+ return r;
+
+ if (sd_bus_message_get_bus(message) == m->api_bus) {
+ r = sd_bus_track_remove_sender(m->subscribed, message);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return sd_bus_error_set(error, BUS_ERROR_NOT_SUBSCRIBED, "Client is not subscribed.");
+ }
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+/* Shared implementation behind the Dump*() methods: produces the manager's debug dump
+ * (optionally restricted to units matching 'patterns') and hands the string to the
+ * supplied 'reply' callback. Unprivileged callers are rate-limited; privileged callers
+ * may bypass the limit via polkit after an additional SELinux "reload" check. */
+static int dump_impl(
+ sd_bus_message *message,
+ void *userdata,
+ sd_bus_error *error,
+ char **patterns,
+ int (*reply)(sd_bus_message *, char *)) {
+
+ _cleanup_free_ char *dump = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ /* 'status' access is the bare minimum always needed for this, as the policy might straight out
+ * forbid a client from querying any information from systemd, regardless of any rate limiting. */
+ r = mac_selinux_access_check(message, "status", error);
+ if (r < 0)
+ return r;
+
+ /* Rate limit reached? Check if the caller is privileged/allowed by policy to bypass this. We
+ * check the rate limit first to avoid the expensive roundtrip to polkit when not needed. */
+ if (!ratelimit_below(&m->dump_ratelimit)) {
+ /* We need a way for SELinux to constrain the operation when the rate limit is active, even
+ * if polkit would allow it, but we cannot easily add new named permissions, so we need to
+ * use an existing one. Reload/reexec are also slow but non-destructive/modifying
+ * operations, and can cause PID1 to stall. So it seems similar enough in terms of security
+ * considerations and impact, and thus use the same access check for dumps which, given the
+ * large amount of data to fetch, can stall PID1 for quite some time. */
+ r = mac_selinux_access_check(message, "reload", error);
+ if (r < 0)
+ goto ratelimited;
+
+ r = bus_verify_bypass_dump_ratelimit_async(m, message, error);
+ if (r < 0)
+ goto ratelimited;
+ if (r == 0)
+ /* No authorization for now, but the async polkit stuff will call us again when it
+ * has it */
+ return 1;
+ }
+
+ r = manager_get_dump_string(m, patterns, &dump);
+ if (r < 0)
+ return r;
+
+ return reply(message, dump);
+
+ratelimited:
+ log_warning("Dump request rejected due to rate limit on unprivileged callers, blocked for %s.",
+ FORMAT_TIMESPAN(ratelimit_left(&m->dump_ratelimit), USEC_PER_SEC));
+ return sd_bus_error_setf(error,
+ SD_BUS_ERROR_LIMITS_EXCEEDED,
+ "Dump request rejected due to rate limit on unprivileged callers, blocked for %s.",
+ FORMAT_TIMESPAN(ratelimit_left(&m->dump_ratelimit), USEC_PER_SEC));
+}
+
+/* dump_impl() reply callback: sends the dump inline as a string argument. */
+static int reply_dump(sd_bus_message *message, char *dump) {
+ return sd_bus_reply_method_return(message, "s", dump);
+}
+
+/* D-Bus handler: Dump() -> s. */
+static int method_dump(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return dump_impl(message, userdata, error, NULL, reply_dump);
+}
+
+/* dump_impl() reply callback: wraps the dump in a memfd/data fd and passes it as a
+ * file-descriptor argument, avoiding D-Bus message size limits for large dumps. */
+static int reply_dump_by_fd(sd_bus_message *message, char *dump) {
+ _cleanup_close_ int fd = -EBADF;
+
+ fd = acquire_data_fd(dump, strlen(dump), 0);
+ if (fd < 0)
+ return fd;
+
+ return sd_bus_reply_method_return(message, "h", fd);
+}
+
+/* D-Bus handler: DumpByFileDescriptor() -> h. */
+static int method_dump_by_fd(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return dump_impl(message, userdata, error, NULL, reply_dump_by_fd);
+}
+
+/* Shared helper for the DumpUnitsMatchingPatterns*() methods: reads the pattern strv
+ * from the message and forwards to dump_impl() with the given reply callback. */
+static int dump_units_matching_patterns(
+ sd_bus_message *message,
+ void *userdata,
+ sd_bus_error *error,
+ int (*reply)(sd_bus_message *, char *)) {
+ _cleanup_strv_free_ char **patterns = NULL;
+ int r;
+
+ r = sd_bus_message_read_strv(message, &patterns);
+ if (r < 0)
+ return r;
+
+ return dump_impl(message, userdata, error, patterns, reply);
+}
+
+/* D-Bus handler: DumpUnitsMatchingPatterns(as) -> s. */
+static int method_dump_units_matching_patterns(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return dump_units_matching_patterns(message, userdata, error, reply_dump);
+}
+
+/* D-Bus handler: DumpUnitsMatchingPatternsByFileDescriptor(as) -> h. */
+static int method_dump_units_matching_patterns_by_fd(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return dump_units_matching_patterns(message, userdata, error, reply_dump_by_fd);
+}
+
+/* Stub for the removed snapshot methods: always fails with NOT_SUPPORTED. */
+static int method_refuse_snapshot(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Support for snapshots has been removed.");
+}
+
+/* Stores the number of free bytes on the filesystem backing /run/systemd in *ret
+ * (f_bfree * f_bsize). On statvfs() failure sets 'error' and returns negative errno. */
+static int get_run_space(uint64_t *ret, sd_bus_error *error) {
+ struct statvfs svfs;
+
+ assert(ret);
+
+ if (statvfs("/run/systemd", &svfs) < 0)
+ return sd_bus_error_set_errnof(error, errno, "Failed to statvfs(/run/systemd): %m");
+
+ *ret = (uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize;
+ return 0;
+}
+
+/* Hard check: fails with BUS_ERROR_DISK_FULL (prefixed with the caller-supplied
+ * 'message') when free space on /run/systemd/ is below RELOAD_DISK_SPACE_MIN. */
+static int verify_run_space(const char *message, sd_bus_error *error) {
+ uint64_t available = 0; /* unnecessary, but used to trick out gcc's incorrect maybe-uninitialized warning */
+ int r;
+
+ assert(message);
+
+ r = get_run_space(&available, error);
+ if (r < 0)
+ return r;
+
+ if (available < RELOAD_DISK_SPACE_MIN)
+ return sd_bus_error_setf(error,
+ BUS_ERROR_DISK_FULL,
+ "%s, not enough space available on /run/systemd/. "
+ "Currently, %s are free, but a safety buffer of %s is enforced.",
+ message,
+ FORMAT_BYTES(available),
+ FORMAT_BYTES(RELOAD_DISK_SPACE_MIN));
+
+ return 0;
+}
+
+/* Non-bus variant of verify_run_space(): logs the bus error message instead of
+ * propagating it to a D-Bus caller. Returns 0 or the negative check result. */
+int verify_run_space_and_log(const char *message) {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ int r;
+
+ assert(message);
+
+ r = verify_run_space(message, &error);
+ if (r < 0)
+ return log_error_errno(r, "%s", bus_error_message(&error, r));
+
+ return 0;
+}
+
+/* Soft check: like verify_run_space() but only logs a warning when space is low and
+ * always returns 0 unless reading the space itself failed. */
+static int verify_run_space_permissive(const char *message, sd_bus_error *error) {
+ uint64_t available = 0; /* unnecessary, but used to trick out gcc's incorrect maybe-uninitialized warning */
+ int r;
+
+ assert(message);
+
+ r = get_run_space(&available, error);
+ if (r < 0)
+ return r;
+
+ if (available < RELOAD_DISK_SPACE_MIN)
+ log_warning("Dangerously low amount of free space on /run/systemd/, %s.\n"
+ "Currently, %s are free, but %s are suggested. Proceeding anyway.",
+ message,
+ FORMAT_BYTES(available),
+ FORMAT_BYTES(RELOAD_DISK_SPACE_MIN));
+
+ return 0;
+}
+
+static void log_caller(sd_bus_message *message, Manager *manager, const char *method) {
+ _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+ const char *comm = NULL;
+ Unit *caller;
+ pid_t pid;
+
+ assert(message);
+ assert(manager);
+ assert(method);
+
+ if (sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID|SD_BUS_CREDS_AUGMENT|SD_BUS_CREDS_COMM, &creds) < 0)
+ return;
+
+ /* We need at least the PID, otherwise there's nothing to log, the rest is optional */
+ if (sd_bus_creds_get_pid(creds, &pid) < 0)
+ return;
+
+ (void) sd_bus_creds_get_comm(creds, &comm);
+ caller = manager_get_unit_by_pid(manager, pid);
+
+ log_info("%s requested from client PID " PID_FMT "%s%s%s%s%s%s...",
+ method, pid,
+ comm ? " ('" : "", strempty(comm), comm ? "')" : "",
+ caller ? " (unit " : "", caller ? caller->id : "", caller ? ")" : "");
+}
+
+/* D-Bus handler: Reload(). Checks /run space, SELinux ("reload"), polkit and the
+ * reload rate limit, then sets m->objective = MANAGER_RELOAD. The reply is stashed in
+ * m->pending_reload_message and sent only once the reload has completed. */
+static int method_reload(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = verify_run_space("Refusing to reload", error);
+ if (r < 0)
+ return r;
+
+ r = mac_selinux_access_check(message, "reload", error);
+ if (r < 0)
+ return r;
+
+ r = bus_verify_reload_daemon_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ /* Write a log message noting the unit or process who requested the Reload() */
+ log_caller(message, m, "Reloading");
+
+ /* Check the rate limit after the authorization succeeds, to avoid denial-of-service issues. */
+ if (!ratelimit_below(&m->reload_ratelimit)) {
+ log_warning("Reloading request rejected due to rate limit.");
+ return sd_bus_error_setf(error,
+ SD_BUS_ERROR_LIMITS_EXCEEDED,
+ "Reload() request rejected due to rate limit.");
+ }
+
+ /* Instead of sending the reply back right away, we just
+ * remember that we need to and then send it after the reload
+ * is finished. That way the caller knows when the reload
+ * finished. */
+
+ assert(!m->pending_reload_message);
+ r = sd_bus_message_new_method_return(message, &m->pending_reload_message);
+ if (r < 0)
+ return r;
+
+ m->objective = MANAGER_RELOAD;
+
+ return 1;
+}
+
+/* D-Bus handler: Reexecute(). Same gatekeeping as Reload(), then sets
+ * m->objective = MANAGER_REEXECUTE. No reply is sent; the client observes the
+ * manager disconnecting instead. */
+static int method_reexecute(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = verify_run_space("Refusing to reexecute", error);
+ if (r < 0)
+ return r;
+
+ r = mac_selinux_access_check(message, "reload", error);
+ if (r < 0)
+ return r;
+
+ r = bus_verify_reload_daemon_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ /* Write a log message noting the unit or process who requested the Reexecute() */
+ log_caller(message, m, "Reexecuting");
+
+ /* We don't send a reply back here, the client should
+ * just wait for us disconnecting. */
+
+ m->objective = MANAGER_REEXECUTE;
+ return 1;
+}
+
+/* D-Bus handler: Exit(). Sets m->objective = MANAGER_EXIT after an SELinux "halt"
+ * check. Unlike SetExitCode() this is permitted for the system instance too. */
+static int method_exit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "halt", error);
+ if (r < 0)
+ return r;
+
+ /* Exit() (in contrast to SetExitCode()) is actually allowed even if
+ * we are running on the host. It will fall back on reboot() in
+ * systemd-shutdown if it cannot do the exit() because it isn't a
+ * container. */
+
+ m->objective = MANAGER_EXIT;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+/* D-Bus handler: Reboot(). System instance only; sets m->objective = MANAGER_REBOOT. */
+static int method_reboot(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "reboot", error);
+ if (r < 0)
+ return r;
+
+ if (!MANAGER_IS_SYSTEM(m))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED,
+ "Reboot is only supported for system managers.");
+
+ m->objective = MANAGER_REBOOT;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+/* D-Bus handler: SoftReboot(s). Validates the optional new root path (must be a valid
+ * absolute path when non-empty), stores it in m->switch_root and sets
+ * m->objective = MANAGER_SOFT_REBOOT. Low /run space only produces a warning. */
+static int method_soft_reboot(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_free_ char *rt = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ const char *root;
+ int r;
+
+ assert(message);
+
+ r = verify_run_space_permissive("soft reboot may fail", error);
+ if (r < 0)
+ return r;
+
+ r = mac_selinux_access_check(message, "reboot", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "s", &root);
+ if (r < 0)
+ return r;
+
+ if (!isempty(root)) {
+ if (!path_is_valid(root))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "New root directory '%s' must be a valid path.", root);
+ if (!path_is_absolute(root))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "New root directory path '%s' is not absolute.", root);
+
+ rt = strdup(root);
+ if (!rt)
+ return -ENOMEM;
+ }
+
+ /* rt may be NULL here: an empty argument clears any previously set root. */
+ free_and_replace(m->switch_root, rt);
+ m->objective = MANAGER_SOFT_REBOOT;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+/* D-Bus handler: PowerOff(). System instance only; sets m->objective = MANAGER_POWEROFF. */
+static int method_poweroff(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "halt", error);
+ if (r < 0)
+ return r;
+
+ if (!MANAGER_IS_SYSTEM(m))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED,
+ "Powering off is only supported for system managers.");
+
+ m->objective = MANAGER_POWEROFF;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+/* D-Bus handler: Halt(). System instance only; sets m->objective = MANAGER_HALT. */
+static int method_halt(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "halt", error);
+ if (r < 0)
+ return r;
+
+ if (!MANAGER_IS_SYSTEM(m))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED,
+ "Halt is only supported for system managers.");
+
+ m->objective = MANAGER_HALT;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+/* D-Bus handler: KExecute(). System instance only; sets m->objective = MANAGER_KEXEC. */
+static int method_kexec(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "reboot", error);
+ if (r < 0)
+ return r;
+
+ if (!MANAGER_IS_SYSTEM(m))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED,
+ "KExec is only supported for system managers.");
+
+ m->objective = MANAGER_KEXEC;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+/* D-Bus handler: SwitchRoot(ss). Validates the new root (absolute, not the current
+ * root, contains an os-release, caller is in the initrd) and the optional init binary
+ * (absolute and executable under the new root), stores both in m->switch_root /
+ * m->switch_root_init, and sets m->objective = MANAGER_SWITCH_ROOT. An empty root
+ * argument defaults to "/sysroot". System instance only. */
+static int method_switch_root(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_free_ char *ri = NULL, *rt = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ const char *root, *init;
+ int r;
+
+ assert(message);
+
+ r = verify_run_space_permissive("root switching may fail", error);
+ if (r < 0)
+ return r;
+
+ r = mac_selinux_access_check(message, "reboot", error);
+ if (r < 0)
+ return r;
+
+ if (!MANAGER_IS_SYSTEM(m))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED,
+ "Root switching is only supported by system manager.");
+
+ r = sd_bus_message_read(message, "ss", &root, &init);
+ if (r < 0)
+ return r;
+
+ if (isempty(root))
+ /* If path is not specified, default to "/sysroot" which is what we generally expect initrds
+ * to use */
+ root = "/sysroot";
+ else {
+ if (!path_is_valid(root))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "New root directory must be a valid path.");
+
+ if (!path_is_absolute(root))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "New root path '%s' is not absolute.", root);
+
+ r = path_is_root(root);
+ if (r < 0)
+ return sd_bus_error_set_errnof(error, r,
+ "Failed to check if new root directory '%s' is the same as old root: %m",
+ root);
+ if (r > 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "New root directory cannot be the old root directory.");
+ }
+
+ /* Safety check */
+ if (!in_initrd())
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Not in initrd, refusing switch-root operation.");
+
+ r = path_is_os_tree(root);
+ if (r < 0)
+ return sd_bus_error_set_errnof(error, r,
+ "Failed to determine whether root path '%s' contains an OS tree: %m",
+ root);
+ if (r == 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Specified switch root path '%s' does not seem to be an OS tree. os-release file is missing.",
+ root);
+
+ if (!isempty(init)) {
+ if (!path_is_valid(init))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Path to init binary '%s' is not a valid path.", init);
+
+ if (!path_is_absolute(init))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Path to init binary '%s' not absolute.", init);
+
+ /* Resolve 'init' relative to the new root and check it is executable. */
+ r = chase_and_access(init, root, CHASE_PREFIX_ROOT, X_OK, NULL);
+ if (r == -EACCES)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Init binary %s is not executable.", init);
+ if (r < 0)
+ return sd_bus_error_set_errnof(error, r,
+ "Could not resolve init executable %s: %m", init);
+ }
+
+ rt = strdup(root);
+ if (!rt)
+ return -ENOMEM;
+
+ if (!isempty(init)) {
+ ri = strdup(init);
+ if (!ri)
+ return -ENOMEM;
+ }
+
+ free_and_replace(m->switch_root, rt);
+ free_and_replace(m->switch_root_init, ri);
+
+ m->objective = MANAGER_SWITCH_ROOT;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+/* D-Bus handler: SetEnvironment(as). Validates the assignments and merges them into
+ * the manager's client environment after SELinux and polkit checks. */
+static int method_set_environment(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_strv_free_ char **plus = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "reload", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read_strv(message, &plus);
+ if (r < 0)
+ return r;
+ if (!strv_env_is_valid(plus))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid environment assignments");
+
+ r = bus_verify_set_environment_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = manager_client_environment_modify(m, NULL, plus);
+ if (r < 0)
+ return r;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+/* D-Bus handler: UnsetEnvironment(as). Validates the variable names/assignments and
+ * removes them from the manager's client environment after SELinux and polkit checks. */
+static int method_unset_environment(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_strv_free_ char **minus = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "reload", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read_strv(message, &minus);
+ if (r < 0)
+ return r;
+
+ if (!strv_env_name_or_assignment_is_valid(minus))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Invalid environment variable names or assignments");
+
+ r = bus_verify_set_environment_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = manager_client_environment_modify(m, minus, NULL);
+ if (r < 0)
+ return r;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+/* D-Bus handler: UnsetAndSetEnvironment(asas). Atomically removes 'minus' and adds
+ * 'plus' to the manager's client environment after validation, SELinux and polkit. */
+static int method_unset_and_set_environment(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_strv_free_ char **minus = NULL, **plus = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "reload", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read_strv(message, &minus);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read_strv(message, &plus);
+ if (r < 0)
+ return r;
+
+ if (!strv_env_name_or_assignment_is_valid(minus))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Invalid environment variable names or assignments");
+ if (!strv_env_is_valid(plus))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Invalid environment assignments");
+
+ r = bus_verify_set_environment_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = manager_client_environment_modify(m, minus, plus);
+ if (r < 0)
+ return r;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+/* D-Bus handler: SetExitCode(y). Records the byte as the manager's return value
+ * (reported when the manager eventually exits) after an SELinux "exit" check. */
+static int method_set_exit_code(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ uint8_t code;
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "exit", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read_basic(message, 'y', &code);
+ if (r < 0)
+ return r;
+
+ m->return_value = code;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+/* D-Bus handler: LookupDynamicUserByName(s) -> u. System instance only. Maps a
+ * dynamic user name to its UID; -ESRCH becomes BUS_ERROR_NO_SUCH_DYNAMIC_USER. */
+static int method_lookup_dynamic_user_by_name(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ const char *name;
+ uid_t uid;
+ int r;
+
+ assert(message);
+
+ r = sd_bus_message_read_basic(message, 's', &name);
+ if (r < 0)
+ return r;
+
+ if (!MANAGER_IS_SYSTEM(m))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED,
+ "Dynamic users are only supported in the system instance.");
+ if (!valid_user_group_name(name, VALID_USER_RELAX))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "User name invalid: %s", name);
+
+ r = dynamic_user_lookup_name(m, name, &uid);
+ if (r == -ESRCH)
+ return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_DYNAMIC_USER,
+ "Dynamic user %s does not exist.", name);
+ if (r < 0)
+ return r;
+
+ return sd_bus_reply_method_return(message, "u", (uint32_t) uid);
+}
+
+/* D-Bus handler: LookupDynamicUserByUID(u) -> s. System instance only. Maps a
+ * dynamic UID back to its name; -ESRCH becomes BUS_ERROR_NO_SUCH_DYNAMIC_USER. */
+static int method_lookup_dynamic_user_by_uid(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_free_ char *name = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ uid_t uid;
+ int r;
+
+ assert(message);
+
+ /* Reading the "u" wire type directly into a uid_t relies on them having the same size. */
+ assert_cc(sizeof(uid_t) == sizeof(uint32_t));
+ r = sd_bus_message_read_basic(message, 'u', &uid);
+ if (r < 0)
+ return r;
+
+ if (!MANAGER_IS_SYSTEM(m))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED,
+ "Dynamic users are only supported in the system instance.");
+ if (!uid_is_valid(uid))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "User ID invalid: " UID_FMT, uid);
+
+ r = dynamic_user_lookup_uid(m, uid, &name);
+ if (r == -ESRCH)
+ return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_DYNAMIC_USER,
+ "Dynamic user ID " UID_FMT " does not exist.", uid);
+ if (r < 0)
+ return r;
+
+ return sd_bus_reply_method_return(message, "s", name);
+}
+
+/* D-Bus handler: GetDynamicUsers() -> a(us). System instance only. Lists all realized
+ * dynamic users as (uid, name) pairs; entries not yet realized are skipped. */
+static int method_get_dynamic_users(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ DynamicUser *d;
+ int r;
+
+ assert(message);
+
+ assert_cc(sizeof(uid_t) == sizeof(uint32_t));
+
+ if (!MANAGER_IS_SYSTEM(m))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED,
+ "Dynamic users are only supported in the system instance.");
+
+ r = sd_bus_message_new_method_return(message, &reply);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_open_container(reply, 'a', "(us)");
+ if (r < 0)
+ return r;
+
+ HASHMAP_FOREACH(d, m->dynamic_users) {
+ uid_t uid;
+
+ r = dynamic_user_current(d, &uid);
+ if (r == -EAGAIN) /* not realized yet? */
+ continue;
+ if (r < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_FAILED,
+ "Failed to look up a dynamic user.");
+
+ r = sd_bus_message_append(reply, "(us)", uid, d->name);
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(NULL, reply, NULL);
+}
+
+/* D-Bus handler: EnqueueMarkedJobs() -> ao. Walks all units (skipping aliases) and
+ * enqueues a try-restart job for those marked needs-restart, or a reload-if-possible
+ * job for those marked needs-reload. Per-unit failures are collected and reported at
+ * the end, except resource errors which abort immediately; the reply lists the paths
+ * of the jobs that were enqueued. */
+static int method_enqueue_marked_jobs(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "start", error);
+ if (r < 0)
+ return r;
+
+ r = bus_verify_manage_units_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ log_info("Queuing reload/restart jobs for marked units%s", special_glyph(SPECIAL_GLYPH_ELLIPSIS));
+
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+ r = sd_bus_message_new_method_return(message, &reply);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_open_container(reply, 'a', "o");
+ if (r < 0)
+ return r;
+
+ Unit *u;
+ char *k;
+ int ret = 0; /* first non-resource failure, reported after the loop */
+ HASHMAP_FOREACH_KEY(u, k, m->units) {
+ /* ignore aliases */
+ if (u->id != k)
+ continue;
+
+ /* needs-restart wins over needs-reload when both markers are set. */
+ BusUnitQueueFlags flags;
+ if (FLAGS_SET(u->markers, 1u << UNIT_MARKER_NEEDS_RESTART))
+ flags = 0;
+ else if (FLAGS_SET(u->markers, 1u << UNIT_MARKER_NEEDS_RELOAD))
+ flags = BUS_UNIT_QUEUE_RELOAD_IF_POSSIBLE;
+ else
+ continue;
+
+ r = mac_selinux_unit_access_check(u, message, "start", error);
+ if (r >= 0)
+ r = bus_unit_queue_job_one(message, u,
+ JOB_TRY_RESTART, JOB_FAIL, flags,
+ reply, error);
+ if (ERRNO_IS_NEG_RESOURCE(r))
+ return r;
+ if (r < 0) {
+ /* Remember the first failure but keep processing the remaining units. */
+ if (ret >= 0)
+ ret = r;
+ sd_bus_error_free(error);
+ }
+ }
+
+ if (ret < 0)
+ return sd_bus_error_set_errnof(error, ret,
+ "Failed to enqueue some jobs, see logs for details: %m");
+
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(NULL, reply, NULL);
+}
+
+/* Shared worker for ListUnitFiles and ListUnitFilesByPatterns: replies with an
+ * a(ss) array of (unit file path, enablement state) filtered by the optional
+ * state and name-pattern lists (NULL means no filtering). */
+static int list_unit_files_by_patterns(sd_bus_message *message, void *userdata, sd_bus_error *error, char **states, char **patterns) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ UnitFileList *item;
+ _cleanup_hashmap_free_ Hashmap *h = NULL;
+ int r;
+
+ assert(message);
+
+ /* Anyone can call this method */
+
+ r = mac_selinux_access_check(message, "status", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_new_method_return(message, &reply);
+ if (r < 0)
+ return r;
+
+ h = hashmap_new(&unit_file_list_hash_ops_free);
+ if (!h)
+ return -ENOMEM;
+
+ r = unit_file_get_list(m->runtime_scope, NULL, h, states, patterns);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_open_container(reply, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ HASHMAP_FOREACH(item, h) {
+
+ r = sd_bus_message_append(reply, "(ss)", item->path, unit_file_state_to_string(item->state));
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(NULL, reply, NULL);
+}
+
+/* D-Bus: ListUnitFiles() — unfiltered variant of ListUnitFilesByPatterns. */
+static int method_list_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return list_unit_files_by_patterns(message, userdata, error, NULL, NULL);
+}
+
+/* D-Bus: ListUnitFilesByPatterns(asas) — reads the state and pattern string
+ * arrays from the message and delegates to the shared worker. */
+static int method_list_unit_files_by_patterns(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_strv_free_ char **states = NULL;
+ _cleanup_strv_free_ char **patterns = NULL;
+ int r;
+
+ r = sd_bus_message_read_strv(message, &states);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read_strv(message, &patterns);
+ if (r < 0)
+ return r;
+
+ return list_unit_files_by_patterns(message, userdata, error, states, patterns);
+}
+
+/* D-Bus: GetUnitFileState(s) -> s. Returns the enablement state (enabled,
+ * disabled, masked, ...) of the named unit file as a string. */
+static int method_get_unit_file_state(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ const char *name;
+ UnitFileState state;
+ int r;
+
+ assert(message);
+
+ /* Anyone can call this method */
+
+ r = mac_selinux_access_check(message, "status", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "s", &name);
+ if (r < 0)
+ return r;
+
+ r = unit_file_get_state(m->runtime_scope, NULL, name, &state);
+ if (r < 0)
+ return r;
+
+ return sd_bus_reply_method_return(message, "s", unit_file_state_to_string(state));
+}
+
+/* D-Bus: GetDefaultTarget() -> s. Returns the name of the default boot target
+ * (default.target symlink resolution). -ERFKILL is translated into the
+ * "unit masked" bus error before the generic negative-return path. */
+static int method_get_default_target(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_free_ char *default_target = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ /* Anyone can call this method */
+
+ r = mac_selinux_access_check(message, "status", error);
+ if (r < 0)
+ return r;
+
+ r = unit_file_get_default(m->runtime_scope, NULL, &default_target);
+ if (r == -ERFKILL)
+ /* Set a descriptive bus error; the negative r below still carries it back. */
+ sd_bus_error_setf(error, BUS_ERROR_UNIT_MASKED, "Unit file is masked.");
+ if (r < 0)
+ return r;
+
+ return sd_bus_reply_method_return(message, "s", default_target);
+}
+
+/* Broadcasts the UnitFilesChanged signal on the given bus; used as a
+ * bus_foreach_bus() callback after unit file modifications. */
+static int send_unit_files_changed(sd_bus *bus, void *userdata) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *message = NULL;
+ int r;
+
+ assert(bus);
+
+ r = sd_bus_message_new_signal(bus, &message,
+ "/org/freedesktop/systemd1",
+ "org.freedesktop.systemd1.Manager",
+ "UnitFilesChanged");
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(bus, message, NULL);
+}
+
+/* Create an error reply, using the error information from changes[]
+ * if possible, and fall back to generating an error from error code c.
+ * The error message only describes the first error.
+ */
+static int install_error(
+ sd_bus_error *error,
+ int c,
+ InstallChange *changes,
+ size_t n_changes) {
+
+ /* Takes ownership of the changes array: it is freed on every return path. */
+ CLEANUP_ARRAY(changes, n_changes, install_changes_free);
+
+ for (size_t i = 0; i < n_changes; i++)
+
+ /* When making changes here, make sure to also change install_changes_dump() in install.c. */
+
+ /* change.type doubles as a negative errno when the change records a failure. */
+ switch (changes[i].type) {
+ case 0 ... _INSTALL_CHANGE_TYPE_MAX: /* not errors */
+ break;
+
+ case -EEXIST:
+ if (changes[i].source)
+ return sd_bus_error_setf(error, BUS_ERROR_UNIT_EXISTS,
+ "File %s already exists and is a symlink to %s.",
+ changes[i].path, changes[i].source);
+ return sd_bus_error_setf(error, BUS_ERROR_UNIT_EXISTS,
+ "File %s already exists.",
+ changes[i].path);
+
+ case -ERFKILL:
+ return sd_bus_error_setf(error, BUS_ERROR_UNIT_MASKED,
+ "Unit file %s is masked.", changes[i].path);
+
+ case -EADDRNOTAVAIL:
+ return sd_bus_error_setf(error, BUS_ERROR_UNIT_GENERATED,
+ "Unit %s is transient or generated.", changes[i].path);
+
+ case -ETXTBSY:
+ return sd_bus_error_setf(error, BUS_ERROR_UNIT_BAD_PATH,
+ "File %s is under the systemd unit hierarchy already.", changes[i].path);
+
+ case -EBADSLT:
+ return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING,
+ "Invalid specifier in %s.", changes[i].path);
+
+ case -EIDRM:
+ return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING,
+ "Destination unit %s is a non-template unit.", changes[i].path);
+
+ case -EUCLEAN:
+ return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING,
+ "\"%s\" is not a valid unit name.",
+ changes[i].path);
+
+ case -ELOOP:
+ return sd_bus_error_setf(error, BUS_ERROR_UNIT_LINKED,
+ "Refusing to operate on alias name or linked unit file: %s",
+ changes[i].path);
+
+ case -EXDEV:
+ if (changes[i].source)
+ return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING,
+ "Cannot alias %s as %s.",
+ changes[i].source, changes[i].path);
+ return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING,
+ "Invalid unit reference %s.", changes[i].path);
+
+ case -ENOENT:
+ return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT,
+ "Unit file %s does not exist.", changes[i].path);
+
+ case -EUNATCH:
+ return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING,
+ "Cannot resolve specifiers in %s.", changes[i].path);
+
+ default:
+ assert(changes[i].type < 0); /* other errors */
+ return sd_bus_error_set_errnof(error, changes[i].type, "File %s: %m", changes[i].path);
+ }
+
+ /* No change carried an error of its own: fall back to the caller-supplied code. */
+ return c < 0 ? c : -EINVAL;
+}
+
+/* Builds the standard install-operation reply ([b] carries_install_info +
+ * a(sss) of change type/path/source) and frees the changes array. Emits the
+ * UnitFilesChanged signal when the changes include an actual modification.
+ * If every change failed, the first failure is returned as the method error
+ * instead. carries_install_info < 0 suppresses the leading boolean. */
+static int reply_install_changes_and_free(
+ Manager *m,
+ sd_bus_message *message,
+ int carries_install_info,
+ InstallChange *changes,
+ size_t n_changes,
+ sd_bus_error *error) {
+
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+ bool bad = false, good = false;
+ int r;
+
+ /* Takes ownership of the changes array (released via TAKE_PTR below when handing off). */
+ CLEANUP_ARRAY(changes, n_changes, install_changes_free);
+
+ if (install_changes_have_modification(changes, n_changes)) {
+ r = bus_foreach_bus(m, NULL, send_unit_files_changed, NULL);
+ if (r < 0)
+ log_debug_errno(r, "Failed to send UnitFilesChanged signal: %m");
+ }
+
+ r = sd_bus_message_new_method_return(message, &reply);
+ if (r < 0)
+ return r;
+
+ if (carries_install_info >= 0) {
+ r = sd_bus_message_append(reply, "b", carries_install_info);
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_bus_message_open_container(reply, 'a', "(sss)");
+ if (r < 0)
+ return r;
+
+ for (size_t i = 0; i < n_changes; i++) {
+
+ /* Failed changes are not serialized; just remember we saw one. */
+ if (changes[i].type < 0) {
+ bad = true;
+ continue;
+ }
+
+ r = sd_bus_message_append(
+ reply, "(sss)",
+ install_change_type_to_string(changes[i].type),
+ changes[i].path,
+ changes[i].source);
+ if (r < 0)
+ return r;
+
+ good = true;
+ }
+
+ /* If there was a failed change, and no successful change, then return the first failure as proper
+ * method call error. */
+ if (bad && !good)
+ return install_error(error, 0, TAKE_PTR(changes), n_changes);
+
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(NULL, reply, NULL);
+}
+
+/* Shared implementation behind Enable/Reenable/Link/Preset/Mask unit file
+ * methods: reads the file list and flags from the message, performs async
+ * polkit authorization, invokes the given install call and replies with the
+ * resulting change list. */
+static int method_enable_unit_files_generic(
+ sd_bus_message *message,
+ Manager *m,
+ int (*call)(RuntimeScope scope, UnitFileFlags flags, const char *root_dir, char *files[], InstallChange **changes, size_t *n_changes),
+ bool carries_install_info,
+ sd_bus_error *error) {
+
+ _cleanup_strv_free_ char **l = NULL;
+ InstallChange *changes = NULL;
+ size_t n_changes = 0;
+ UnitFileFlags flags;
+ int r;
+
+ assert(message);
+ assert(m);
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ /* The WithFlags variant carries a raw 64-bit flags field; the classic
+ * variants carry two booleans (runtime, force) instead. */
+ if (sd_bus_message_is_method_call(message, NULL, "EnableUnitFilesWithFlags")) {
+ uint64_t raw_flags;
+
+ r = sd_bus_message_read(message, "t", &raw_flags);
+ if (r < 0)
+ return r;
+ if ((raw_flags & ~_UNIT_FILE_FLAGS_MASK_PUBLIC) != 0)
+ return -EINVAL;
+ flags = raw_flags;
+ } else {
+ int runtime, force;
+
+ r = sd_bus_message_read(message, "bb", &runtime, &force);
+ if (r < 0)
+ return r;
+ flags = unit_file_bools_to_flags(runtime, force);
+ }
+
+ r = bus_verify_manage_unit_files_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = call(m->runtime_scope, flags, NULL, l, &changes, &n_changes);
+ m->unit_file_state_outdated = m->unit_file_state_outdated || n_changes > 0; /* See comments for this variable in manager.h */
+ if (r < 0)
+ return install_error(error, r, changes, n_changes);
+
+ return reply_install_changes_and_free(m, message, carries_install_info ? r : -1, changes, n_changes, error);
+}
+
+/* D-Bus: EnableUnitFilesWithFlags(as t) — enable with raw UnitFileFlags. */
+static int method_enable_unit_files_with_flags(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_enable_unit_files_generic(message, userdata, unit_file_enable, /* carries_install_info = */ true, error);
+}
+
+/* D-Bus: EnableUnitFiles(as bb) — classic enable with runtime/force booleans. */
+static int method_enable_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_enable_unit_files_generic(message, userdata, unit_file_enable, /* carries_install_info = */ true, error);
+}
+
+/* D-Bus: ReenableUnitFiles(as bb) — disable+enable in one operation. */
+static int method_reenable_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_enable_unit_files_generic(message, userdata, unit_file_reenable, /* carries_install_info = */ true, error);
+}
+
+/* D-Bus: LinkUnitFiles(as bb) — link external unit files into the search path. */
+static int method_link_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_enable_unit_files_generic(message, userdata, unit_file_link, /* carries_install_info = */ false, error);
+}
+
+/* Adapter matching the generic install-call signature: preset with the default
+ * UNIT_FILE_PRESET_FULL mode. */
+static int unit_file_preset_without_mode(RuntimeScope scope, UnitFileFlags flags, const char *root_dir, char **files, InstallChange **changes, size_t *n_changes) {
+ return unit_file_preset(scope, flags, root_dir, files, UNIT_FILE_PRESET_FULL, changes, n_changes);
+}
+
+/* D-Bus: PresetUnitFiles(as bb) — apply presets (full mode) to the given files. */
+static int method_preset_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_enable_unit_files_generic(message, userdata, unit_file_preset_without_mode, /* carries_install_info = */ true, error);
+}
+
+/* D-Bus: MaskUnitFiles(as bb) — mask the given unit files. */
+static int method_mask_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_enable_unit_files_generic(message, userdata, unit_file_mask, /* carries_install_info = */ false, error);
+}
+
+/* D-Bus: PresetUnitFilesWithMode(as s bb) — like PresetUnitFiles but with an
+ * explicit preset mode string (empty string means full). */
+static int method_preset_unit_files_with_mode(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+
+ _cleanup_strv_free_ char **l = NULL;
+ InstallChange *changes = NULL;
+ size_t n_changes = 0;
+ Manager *m = ASSERT_PTR(userdata);
+ UnitFilePresetMode preset_mode;
+ int runtime, force, r;
+ UnitFileFlags flags;
+ const char *mode;
+
+ assert(message);
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "sbb", &mode, &runtime, &force);
+ if (r < 0)
+ return r;
+
+ flags = unit_file_bools_to_flags(runtime, force);
+
+ /* An empty mode string selects the default (full) preset mode. */
+ if (isempty(mode))
+ preset_mode = UNIT_FILE_PRESET_FULL;
+ else {
+ preset_mode = unit_file_preset_mode_from_string(mode);
+ if (preset_mode < 0)
+ return -EINVAL;
+ }
+
+ r = bus_verify_manage_unit_files_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = unit_file_preset(m->runtime_scope, flags, NULL, l, preset_mode, &changes, &n_changes);
+ m->unit_file_state_outdated = m->unit_file_state_outdated || n_changes > 0; /* See comments for this variable in manager.h */
+ if (r < 0)
+ return install_error(error, r, changes, n_changes);
+
+ return reply_install_changes_and_free(m, message, r, changes, n_changes, error);
+}
+
+/* Shared implementation behind Disable/Unmask unit file methods; mirrors
+ * method_enable_unit_files_generic() but additionally rejects UNIT_FILE_FORCE
+ * in the WithFlags variants. */
+static int method_disable_unit_files_generic(
+ sd_bus_message *message,
+ Manager *m,
+ int (*call)(RuntimeScope scope, UnitFileFlags flags, const char *root_dir, char *files[], InstallChange **changes, size_t *n_changes),
+ bool carries_install_info,
+ sd_bus_error *error) {
+
+ _cleanup_strv_free_ char **l = NULL;
+ InstallChange *changes = NULL;
+ UnitFileFlags flags;
+ size_t n_changes = 0;
+ int r;
+
+ assert(message);
+ assert(m);
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ if (sd_bus_message_is_method_call(message, NULL, "DisableUnitFilesWithFlags") ||
+ sd_bus_message_is_method_call(message, NULL, "DisableUnitFilesWithFlagsAndInstallInfo")) {
+ uint64_t raw_flags;
+
+ r = sd_bus_message_read(message, "t", &raw_flags);
+ if (r < 0)
+ return r;
+ /* FORCE makes no sense for disabling, hence refuse it here. */
+ if ((raw_flags & ~_UNIT_FILE_FLAGS_MASK_PUBLIC) != 0 ||
+ FLAGS_SET(raw_flags, UNIT_FILE_FORCE))
+ return -EINVAL;
+ flags = raw_flags;
+ } else {
+ int runtime;
+
+ r = sd_bus_message_read(message, "b", &runtime);
+ if (r < 0)
+ return r;
+ flags = unit_file_bools_to_flags(runtime, false);
+ }
+
+ r = bus_verify_manage_unit_files_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = call(m->runtime_scope, flags, NULL, l, &changes, &n_changes);
+ m->unit_file_state_outdated = m->unit_file_state_outdated || n_changes > 0; /* See comments for this variable in manager.h */
+ if (r < 0)
+ return install_error(error, r, changes, n_changes);
+
+ return reply_install_changes_and_free(m, message, carries_install_info ? r : -1, changes, n_changes, error);
+}
+
+/* D-Bus: DisableUnitFilesWithFlags(as t) — disable with raw UnitFileFlags. */
+static int method_disable_unit_files_with_flags(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_disable_unit_files_generic(message, userdata, unit_file_disable, /* carries_install_info = */ false, error);
+}
+
+/* D-Bus: DisableUnitFilesWithFlagsAndInstallInfo(as t) — as above, but the
+ * reply additionally carries the install-info boolean. */
+static int method_disable_unit_files_with_flags_and_install_info(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_disable_unit_files_generic(message, userdata, unit_file_disable, /* carries_install_info = */ true, error);
+}
+
+/* D-Bus: DisableUnitFiles(as b) — classic disable with runtime boolean. */
+static int method_disable_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_disable_unit_files_generic(message, userdata, unit_file_disable, /* carries_install_info = */ false, error);
+}
+
+/* D-Bus: UnmaskUnitFiles(as b) — remove masking of the given unit files. */
+static int method_unmask_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_disable_unit_files_generic(message, userdata, unit_file_unmask, /* carries_install_info = */ false, error);
+}
+
+/* D-Bus: RevertUnitFiles(as) — drop local modifications/overrides of the given
+ * unit files, restoring the vendor versions. */
+static int method_revert_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_strv_free_ char **l = NULL;
+ InstallChange *changes = NULL;
+ size_t n_changes = 0;
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ r = bus_verify_manage_unit_files_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = unit_file_revert(m->runtime_scope, NULL, l, &changes, &n_changes);
+ m->unit_file_state_outdated = m->unit_file_state_outdated || n_changes > 0; /* See comments for this variable in manager.h */
+ if (r < 0)
+ return install_error(error, r, changes, n_changes);
+
+ return reply_install_changes_and_free(m, message, -1, changes, n_changes, error);
+}
+
+/* D-Bus: SetDefaultTarget(sb) — point the default.target symlink at the named
+ * unit; force replaces an existing symlink. */
+static int method_set_default_target(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ InstallChange *changes = NULL;
+ size_t n_changes = 0;
+ Manager *m = ASSERT_PTR(userdata);
+ const char *name;
+ int force, r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "enable", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "sb", &name, &force);
+ if (r < 0)
+ return r;
+
+ r = bus_verify_manage_unit_files_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ /* NOTE(review): unlike the other install methods, this one does not update
+ * m->unit_file_state_outdated — confirm whether that is intentional. */
+ r = unit_file_set_default(m->runtime_scope, force ? UNIT_FILE_FORCE : 0, NULL, name, &changes, &n_changes);
+ if (r < 0)
+ return install_error(error, r, changes, n_changes);
+
+ return reply_install_changes_and_free(m, message, -1, changes, n_changes, error);
+}
+
+/* D-Bus: PresetAllUnitFiles(sbb) — apply the preset policy to every installed
+ * unit file; empty mode string selects the full preset mode. */
+static int method_preset_all_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ InstallChange *changes = NULL;
+ size_t n_changes = 0;
+ Manager *m = ASSERT_PTR(userdata);
+ UnitFilePresetMode preset_mode;
+ const char *mode;
+ UnitFileFlags flags;
+ int force, runtime, r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "enable", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "sbb", &mode, &runtime, &force);
+ if (r < 0)
+ return r;
+
+ flags = unit_file_bools_to_flags(runtime, force);
+
+ if (isempty(mode))
+ preset_mode = UNIT_FILE_PRESET_FULL;
+ else {
+ preset_mode = unit_file_preset_mode_from_string(mode);
+ if (preset_mode < 0)
+ return -EINVAL;
+ }
+
+ r = bus_verify_manage_unit_files_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = unit_file_preset_all(m->runtime_scope, flags, NULL, preset_mode, &changes, &n_changes);
+ m->unit_file_state_outdated = m->unit_file_state_outdated || n_changes > 0; /* See comments for this variable in manager.h */
+ if (r < 0)
+ return install_error(error, r, changes, n_changes);
+
+ return reply_install_changes_and_free(m, message, -1, changes, n_changes, error);
+}
+
+/* D-Bus: AddDependencyUnitFiles(as ssbb) — create drop-in symlinks adding the
+ * given dependency type (e.g. Wants/Requires) from the target unit to each of
+ * the listed unit files. */
+static int method_add_dependency_unit_files(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_strv_free_ char **l = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ InstallChange *changes = NULL;
+ size_t n_changes = 0;
+ int runtime, force, r;
+ char *target, *type;
+ UnitDependency dep;
+ UnitFileFlags flags;
+
+ assert(message);
+
+ /* NOTE(review): here the polkit check precedes the message reads, unlike the
+ * sibling methods which read first — presumably harmless since the async
+ * re-dispatch replays the message from the start; confirm. */
+ r = bus_verify_manage_unit_files_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "ssbb", &target, &type, &runtime, &force);
+ if (r < 0)
+ return r;
+
+ flags = unit_file_bools_to_flags(runtime, force);
+
+ dep = unit_dependency_from_string(type);
+ if (dep < 0)
+ return -EINVAL;
+
+ r = unit_file_add_dependency(m->runtime_scope, flags, NULL, l, target, dep, &changes, &n_changes);
+ m->unit_file_state_outdated = m->unit_file_state_outdated || n_changes > 0; /* See comments for this variable in manager.h */
+ if (r < 0)
+ return install_error(error, r, changes, n_changes);
+
+ return reply_install_changes_and_free(m, message, -1, changes, n_changes, error);
+}
+
+/* D-Bus: GetUnitFileLinks(sb) -> as. Returns the symlink paths that enabling
+ * created for the named unit, computed by running a dry-run disable and
+ * collecting the would-be-unlinked paths. */
+static int method_get_unit_file_links(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ InstallChange *changes = NULL;
+ size_t n_changes = 0, i;
+ const char *name;
+ int runtime, r;
+
+ CLEANUP_ARRAY(changes, n_changes, install_changes_free);
+
+ r = sd_bus_message_read(message, "sb", &name, &runtime);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_new_method_return(message, &reply);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_open_container(reply, SD_BUS_TYPE_ARRAY, "s");
+ if (r < 0)
+ return r;
+
+ /* Dry run only: nothing is actually removed, we merely enumerate the links. */
+ r = unit_file_disable(m->runtime_scope,
+ UNIT_FILE_DRY_RUN | (runtime ? UNIT_FILE_RUNTIME : 0),
+ NULL, STRV_MAKE(name), &changes, &n_changes);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get file links for %s: %m", name);
+
+ for (i = 0; i < n_changes; i++)
+ if (changes[i].type == INSTALL_CHANGE_UNLINK) {
+ r = sd_bus_message_append(reply, "s", changes[i].path);
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(NULL, reply, NULL);
+}
+
+/* D-Bus: GetJobAfter/GetJobBefore dispatcher — looks up the job by numeric id
+ * and delegates to the job object's waiting-jobs handler. */
+static int method_get_job_waiting(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ uint32_t id;
+ Job *j;
+ int r;
+
+ assert(message);
+
+ r = sd_bus_message_read(message, "u", &id);
+ if (r < 0)
+ return r;
+
+ j = manager_get_job(m, id);
+ if (!j)
+ return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_JOB, "Job %u does not exist.", (unsigned) id);
+
+ return bus_job_method_get_waiting_jobs(message, j, error);
+}
+
+/* D-Bus: AbandonScope(s) — resolves the named unit, verifies it is a scope
+ * unit, and delegates to the scope's abandon handler. */
+static int method_abandon_scope(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ const char *name;
+ Unit *u;
+ int r;
+
+ assert(message);
+
+ r = sd_bus_message_read(message, "s", &name);
+ if (r < 0)
+ return r;
+
+ r = bus_get_unit_by_name(m, message, name, &u, error);
+ if (r < 0)
+ return r;
+
+ if (u->type != UNIT_SCOPE)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Unit '%s' is not a scope unit, refusing.", name);
+
+ return bus_scope_method_abandon(message, u, error);
+}
+
+/* D-Bus: SetShowStatus(s) — overrides the manager's boot status output mode;
+ * an empty string clears the override (mode stays _SHOW_STATUS_INVALID).
+ * Gated by the set-environment polkit check. */
+static int method_set_show_status(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ ShowStatus mode = _SHOW_STATUS_INVALID;
+ const char *t;
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_access_check(message, "reload", error);
+ if (r < 0)
+ return r;
+
+ r = bus_verify_set_environment_async(m, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = sd_bus_message_read(message, "s", &t);
+ if (r < 0)
+ return r;
+
+ if (!isempty(t)) {
+ mode = show_status_from_string(t);
+ if (mode < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Invalid show status '%s'", t);
+ }
+
+ manager_override_show_status(m, mode, "bus");
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+/* D-Bus: DumpUnitFileDescriptorStore(s) — per-unit dispatcher for dumping a
+ * service's file descriptor store. */
+static int method_dump_unit_descriptor_store(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return method_generic_unit_operation(message, userdata, error, bus_service_method_dump_file_descriptor_store, 0);
+}
+
+const sd_bus_vtable bus_manager_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+
+ SD_BUS_PROPERTY("Version", "s", property_get_version, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Features", "s", property_get_features, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Virtualization", "s", property_get_virtualization, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ConfidentialVirtualization", "s", property_get_confidential_virtualization, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Architecture", "s", property_get_architecture, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Tainted", "s", property_get_tainted, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("FirmwareTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_FIRMWARE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("LoaderTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_LOADER]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("KernelTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_KERNEL]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("InitRDTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("UserspaceTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_USERSPACE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("FinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("SecurityStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_SECURITY_START]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("SecurityFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_SECURITY_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("GeneratorsStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_GENERATORS_START]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("GeneratorsFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_GENERATORS_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("UnitsLoadStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_UNITS_LOAD_START]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("UnitsLoadFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_UNITS_LOAD_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("UnitsLoadTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_UNITS_LOAD]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("InitRDSecurityStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_SECURITY_START]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("InitRDSecurityFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_SECURITY_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("InitRDGeneratorsStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_GENERATORS_START]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("InitRDGeneratorsFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_GENERATORS_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("InitRDUnitsLoadStartTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_START]), SD_BUS_VTABLE_PROPERTY_CONST),
+ BUS_PROPERTY_DUAL_TIMESTAMP("InitRDUnitsLoadFinishTimestamp", offsetof(Manager, timestamps[MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_FINISH]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_WRITABLE_PROPERTY("LogLevel", "s", bus_property_get_log_level, property_set_log_level, 0, 0),
+ SD_BUS_WRITABLE_PROPERTY("LogTarget", "s", bus_property_get_log_target, property_set_log_target, 0, 0),
+ SD_BUS_PROPERTY("NNames", "u", property_get_hashmap_size, offsetof(Manager, units), 0),
+ SD_BUS_PROPERTY("NFailedUnits", "u", property_get_set_size, offsetof(Manager, failed_units), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("NJobs", "u", property_get_hashmap_size, offsetof(Manager, jobs), 0),
+ SD_BUS_PROPERTY("NInstalledJobs", "u", bus_property_get_unsigned, offsetof(Manager, n_installed_jobs), 0),
+ SD_BUS_PROPERTY("NFailedJobs", "u", bus_property_get_unsigned, offsetof(Manager, n_failed_jobs), 0),
+ SD_BUS_PROPERTY("Progress", "d", property_get_progress, 0, 0),
+ SD_BUS_PROPERTY("Environment", "as", property_get_environment, 0, 0),
+ SD_BUS_PROPERTY("ConfirmSpawn", "b", bus_property_get_bool, offsetof(Manager, confirm_spawn), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ShowStatus", "b", property_get_show_status, 0, 0),
+ SD_BUS_PROPERTY("UnitPath", "as", NULL, offsetof(Manager, lookup_paths.search_path), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultStandardOutput", "s", bus_property_get_exec_output, offsetof(Manager, defaults.std_output), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultStandardError", "s", bus_property_get_exec_output, offsetof(Manager, defaults.std_error), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("WatchdogDevice", "s", property_get_watchdog_device, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("WatchdogLastPingTimestamp", "t", property_get_watchdog_last_ping_realtime, 0, 0),
+ SD_BUS_PROPERTY("WatchdogLastPingTimestampMonotonic", "t", property_get_watchdog_last_ping_monotonic, 0, 0),
+ SD_BUS_WRITABLE_PROPERTY("RuntimeWatchdogUSec", "t", property_get_runtime_watchdog, property_set_runtime_watchdog, 0, 0),
+ SD_BUS_WRITABLE_PROPERTY("RuntimeWatchdogPreUSec", "t", property_get_pretimeout_watchdog, property_set_pretimeout_watchdog, 0, 0),
+ SD_BUS_WRITABLE_PROPERTY("RuntimeWatchdogPreGovernor", "s", property_get_pretimeout_watchdog_governor, property_set_pretimeout_watchdog_governor, 0, 0),
+ SD_BUS_WRITABLE_PROPERTY("RebootWatchdogUSec", "t", property_get_reboot_watchdog, property_set_reboot_watchdog, 0, 0),
+ /* The following item is an obsolete alias */
+ SD_BUS_WRITABLE_PROPERTY("ShutdownWatchdogUSec", "t", property_get_reboot_watchdog, property_set_reboot_watchdog, 0, SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_WRITABLE_PROPERTY("KExecWatchdogUSec", "t", property_get_kexec_watchdog, property_set_kexec_watchdog, 0, 0),
+ SD_BUS_WRITABLE_PROPERTY("ServiceWatchdogs", "b", bus_property_get_bool, bus_property_set_bool, offsetof(Manager, service_watchdogs), 0),
+ SD_BUS_PROPERTY("ControlGroup", "s", NULL, offsetof(Manager, cgroup_root), 0),
+ SD_BUS_PROPERTY("SystemState", "s", property_get_system_state, 0, 0),
+ SD_BUS_PROPERTY("ExitCode", "y", bus_property_get_unsigned, offsetof(Manager, return_value), 0),
+ SD_BUS_PROPERTY("DefaultTimerAccuracyUSec", "t", bus_property_get_usec, offsetof(Manager, defaults.timer_accuracy_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultTimeoutStartUSec", "t", bus_property_get_usec, offsetof(Manager, defaults.timeout_start_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultTimeoutStopUSec", "t", bus_property_get_usec, offsetof(Manager, defaults.timeout_stop_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultTimeoutAbortUSec", "t", property_get_default_timeout_abort_usec, 0, 0),
+ SD_BUS_PROPERTY("DefaultDeviceTimeoutUSec", "t", bus_property_get_usec, offsetof(Manager, defaults.device_timeout_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultRestartUSec", "t", bus_property_get_usec, offsetof(Manager, defaults.restart_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultStartLimitIntervalUSec", "t", bus_property_get_usec, offsetof(Manager, defaults.start_limit_interval), SD_BUS_VTABLE_PROPERTY_CONST),
+ /* The following two items are obsolete alias */
+ SD_BUS_PROPERTY("DefaultStartLimitIntervalSec", "t", bus_property_get_usec, offsetof(Manager, defaults.start_limit_interval), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_PROPERTY("DefaultStartLimitInterval", "t", bus_property_get_usec, offsetof(Manager, defaults.start_limit_interval), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_PROPERTY("DefaultStartLimitBurst", "u", bus_property_get_unsigned, offsetof(Manager, defaults.start_limit_burst), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultCPUAccounting", "b", bus_property_get_bool, offsetof(Manager, defaults.cpu_accounting), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultBlockIOAccounting", "b", bus_property_get_bool, offsetof(Manager, defaults.blockio_accounting), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultIOAccounting", "b", bus_property_get_bool, offsetof(Manager, defaults.io_accounting), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultIPAccounting", "b", bus_property_get_bool, offsetof(Manager, defaults.ip_accounting), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultMemoryAccounting", "b", bus_property_get_bool, offsetof(Manager, defaults.memory_accounting), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultTasksAccounting", "b", bus_property_get_bool, offsetof(Manager, defaults.tasks_accounting), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitCPU", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_CPU]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitCPUSoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_CPU]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitFSIZE", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_FSIZE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitFSIZESoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_FSIZE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitDATA", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_DATA]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitDATASoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_DATA]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitSTACK", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_STACK]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitSTACKSoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_STACK]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitCORE", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_CORE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitCORESoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_CORE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitRSS", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_RSS]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitRSSSoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_RSS]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitNOFILE", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_NOFILE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitNOFILESoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_NOFILE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitAS", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_AS]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitASSoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_AS]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitNPROC", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_NPROC]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitNPROCSoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_NPROC]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitMEMLOCK", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_MEMLOCK]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitMEMLOCKSoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_MEMLOCK]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitLOCKS", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_LOCKS]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitLOCKSSoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_LOCKS]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitSIGPENDING", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_SIGPENDING]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitSIGPENDINGSoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_SIGPENDING]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitMSGQUEUE", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_MSGQUEUE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitMSGQUEUESoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_MSGQUEUE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitNICE", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_NICE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitNICESoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_NICE]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitRTPRIO", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_RTPRIO]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitRTPRIOSoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_RTPRIO]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitRTTIME", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultLimitRTTIMESoft", "t", bus_property_get_rlimit, offsetof(Manager, defaults.rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultTasksMax", "t", bus_property_get_tasks_max, offsetof(Manager, defaults.tasks_max), 0),
+ SD_BUS_PROPERTY("DefaultMemoryPressureThresholdUSec", "t", bus_property_get_usec, offsetof(Manager, defaults.memory_pressure_threshold_usec), 0),
+ SD_BUS_PROPERTY("DefaultMemoryPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(Manager, defaults.memory_pressure_watch), 0),
+ SD_BUS_PROPERTY("TimerSlackNSec", "t", property_get_timer_slack_nsec, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultOOMPolicy", "s", bus_property_get_oom_policy, offsetof(Manager, defaults.oom_policy), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultOOMScoreAdjust", "i", property_get_oom_score_adjust, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CtrlAltDelBurstAction", "s", bus_property_get_emergency_action, offsetof(Manager, cad_burst_action), SD_BUS_VTABLE_PROPERTY_CONST),
+
+ SD_BUS_METHOD_WITH_ARGS("GetUnit",
+ SD_BUS_ARGS("s", name),
+ SD_BUS_RESULT("o", unit),
+ method_get_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("GetUnitByPID",
+ SD_BUS_ARGS("u", pid),
+ SD_BUS_RESULT("o", unit),
+ method_get_unit_by_pid,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("GetUnitByInvocationID",
+ SD_BUS_ARGS("ay", invocation_id),
+ SD_BUS_RESULT("o", unit),
+ method_get_unit_by_invocation_id,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("GetUnitByControlGroup",
+ SD_BUS_ARGS("s", cgroup),
+ SD_BUS_RESULT("o", unit),
+ method_get_unit_by_control_group,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("GetUnitByPIDFD",
+ SD_BUS_ARGS("h", pidfd),
+ SD_BUS_RESULT("o", unit, "s", unit_id, "ay", invocation_id),
+ method_get_unit_by_pidfd,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("LoadUnit",
+ SD_BUS_ARGS("s", name),
+ SD_BUS_RESULT("o", unit),
+ method_load_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("StartUnit",
+ SD_BUS_ARGS("s", name, "s", mode),
+ SD_BUS_RESULT("o", job),
+ method_start_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("StartUnitWithFlags",
+ SD_BUS_ARGS("s", name, "s", mode, "t", flags),
+ SD_BUS_RESULT("o", job),
+ method_start_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("StartUnitReplace",
+ SD_BUS_ARGS("s", old_unit, "s", new_unit, "s", mode),
+ SD_BUS_RESULT("o", job),
+ method_start_unit_replace,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("StopUnit",
+ SD_BUS_ARGS("s", name, "s", mode),
+ SD_BUS_RESULT("o", job),
+ method_stop_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("ReloadUnit",
+ SD_BUS_ARGS("s", name, "s", mode),
+ SD_BUS_RESULT("o", job),
+ method_reload_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("RestartUnit",
+ SD_BUS_ARGS("s", name, "s", mode),
+ SD_BUS_RESULT("o", job),
+ method_restart_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("TryRestartUnit",
+ SD_BUS_ARGS("s", name, "s", mode),
+ SD_BUS_RESULT("o", job),
+ method_try_restart_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("ReloadOrRestartUnit",
+ SD_BUS_ARGS("s", name, "s", mode),
+ SD_BUS_RESULT("o", job),
+ method_reload_or_restart_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("ReloadOrTryRestartUnit",
+ SD_BUS_ARGS("s", name, "s", mode),
+ SD_BUS_RESULT("o", job),
+ method_reload_or_try_restart_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("EnqueueUnitJob",
+ SD_BUS_ARGS("s", name, "s", job_type, "s", job_mode),
+ SD_BUS_RESULT("u", job_id, "o", job_path, "s", unit_id, "o", unit_path, "s", job_type, "a(uosos)", affected_jobs),
+ method_enqueue_unit_job,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("KillUnit",
+ SD_BUS_ARGS("s", name, "s", whom, "i", signal),
+ SD_BUS_NO_RESULT,
+ method_kill_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("QueueSignalUnit",
+ SD_BUS_ARGS("s", name, "s", whom, "i", signal, "i", value),
+ SD_BUS_NO_RESULT,
+ method_kill_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("CleanUnit",
+ SD_BUS_ARGS("s", name, "as", mask),
+ SD_BUS_NO_RESULT,
+ method_clean_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("FreezeUnit",
+ SD_BUS_ARGS("s", name),
+ SD_BUS_NO_RESULT,
+ method_freeze_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("ThawUnit",
+ SD_BUS_ARGS("s", name),
+ SD_BUS_NO_RESULT,
+ method_thaw_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("ResetFailedUnit",
+ SD_BUS_ARGS("s", name),
+ SD_BUS_NO_RESULT,
+ method_reset_failed_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("SetUnitProperties",
+ SD_BUS_ARGS("s", name, "b", runtime, "a(sv)", properties),
+ SD_BUS_NO_RESULT,
+ method_set_unit_properties,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("BindMountUnit",
+ SD_BUS_ARGS("s", name, "s", source, "s", destination, "b", read_only, "b", mkdir),
+ SD_BUS_NO_RESULT,
+ method_bind_mount_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("MountImageUnit",
+ SD_BUS_ARGS("s", name, "s", source, "s", destination, "b", read_only, "b", mkdir, "a(ss)", options),
+ SD_BUS_NO_RESULT,
+ method_mount_image_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("RefUnit",
+ SD_BUS_ARGS("s", name),
+ SD_BUS_NO_RESULT,
+ method_ref_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("UnrefUnit",
+ SD_BUS_ARGS("s", name),
+ SD_BUS_NO_RESULT,
+ method_unref_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("StartTransientUnit",
+ SD_BUS_ARGS("s", name, "s", mode, "a(sv)", properties, "a(sa(sv))", aux),
+ SD_BUS_RESULT("o", job),
+ method_start_transient_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("GetUnitProcesses",
+ SD_BUS_ARGS("s", name),
+ SD_BUS_RESULT("a(sus)", processes),
+ method_get_unit_processes,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("AttachProcessesToUnit",
+ SD_BUS_ARGS("s", unit_name, "s", subcgroup, "au", pids),
+ SD_BUS_NO_RESULT,
+ method_attach_processes_to_unit,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("AbandonScope",
+ SD_BUS_ARGS("s", name),
+ SD_BUS_NO_RESULT,
+ method_abandon_scope,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("GetJob",
+ SD_BUS_ARGS("u", id),
+ SD_BUS_RESULT("o", job),
+ method_get_job,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("GetJobAfter",
+ SD_BUS_ARGS("u", id),
+ SD_BUS_RESULT("a(usssoo)", jobs),
+ method_get_job_waiting,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("GetJobBefore",
+ SD_BUS_ARGS("u", id),
+ SD_BUS_RESULT("a(usssoo)", jobs),
+ method_get_job_waiting,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("CancelJob",
+ SD_BUS_ARGS("u", id),
+ SD_BUS_NO_RESULT,
+ method_cancel_job,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD("ClearJobs",
+ NULL,
+ NULL,
+ method_clear_jobs,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD("ResetFailed",
+ NULL,
+ NULL,
+ method_reset_failed,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("SetShowStatus",
+ SD_BUS_ARGS("s", mode),
+ SD_BUS_NO_RESULT,
+ method_set_show_status,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("ListUnits",
+ SD_BUS_NO_ARGS,
+ SD_BUS_RESULT("a(ssssssouso)", units),
+ method_list_units,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("ListUnitsFiltered",
+ SD_BUS_ARGS("as", states),
+ SD_BUS_RESULT("a(ssssssouso)", units),
+ method_list_units_filtered,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("ListUnitsByPatterns",
+ SD_BUS_ARGS("as", states, "as", patterns),
+ SD_BUS_RESULT("a(ssssssouso)", units),
+ method_list_units_by_patterns,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("ListUnitsByNames",
+ SD_BUS_ARGS("as", names),
+ SD_BUS_RESULT("a(ssssssouso)", units),
+ method_list_units_by_names,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("ListJobs",
+ SD_BUS_NO_ARGS,
+ SD_BUS_RESULT("a(usssoo)", jobs),
+ method_list_jobs,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD("Subscribe",
+ NULL,
+ NULL,
+ method_subscribe,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD("Unsubscribe",
+ NULL,
+ NULL,
+ method_unsubscribe,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("Dump",
+ SD_BUS_NO_ARGS,
+ SD_BUS_RESULT("s", output),
+ method_dump,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("DumpUnitsMatchingPatterns",
+ SD_BUS_ARGS("as", patterns),
+ SD_BUS_RESULT("s", output),
+ method_dump_units_matching_patterns,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("DumpByFileDescriptor",
+ SD_BUS_NO_ARGS,
+ SD_BUS_RESULT("h", fd),
+ method_dump_by_fd,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("DumpUnitsMatchingPatternsByFileDescriptor",
+ SD_BUS_ARGS("as", patterns),
+ SD_BUS_RESULT("h", fd),
+ method_dump_units_matching_patterns_by_fd,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("CreateSnapshot",
+ SD_BUS_ARGS("s", name, "b", cleanup),
+ SD_BUS_RESULT("o", unit),
+ method_refuse_snapshot,
+ SD_BUS_VTABLE_UNPRIVILEGED|SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_METHOD_WITH_ARGS("RemoveSnapshot",
+ SD_BUS_ARGS("s", name),
+ SD_BUS_NO_RESULT,
+ method_refuse_snapshot,
+ SD_BUS_VTABLE_UNPRIVILEGED|SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_METHOD("Reload",
+ NULL,
+ NULL,
+ method_reload,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD("Reexecute",
+ NULL,
+ NULL,
+ method_reexecute,
+ SD_BUS_VTABLE_UNPRIVILEGED|SD_BUS_VTABLE_METHOD_NO_REPLY),
+ SD_BUS_METHOD("Exit",
+ NULL,
+ NULL,
+ method_exit,
+ 0),
+ SD_BUS_METHOD("Reboot",
+ NULL,
+ NULL,
+ method_reboot,
+ SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)),
+ SD_BUS_METHOD_WITH_ARGS("SoftReboot",
+ SD_BUS_ARGS("s", new_root),
+ SD_BUS_NO_RESULT,
+ method_soft_reboot,
+ SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)),
+ SD_BUS_METHOD("PowerOff",
+ NULL,
+ NULL,
+ method_poweroff,
+ SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)),
+ SD_BUS_METHOD("Halt",
+ NULL,
+ NULL,
+ method_halt,
+ SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)),
+ SD_BUS_METHOD("KExec",
+ NULL,
+ NULL,
+ method_kexec,
+ SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)),
+ SD_BUS_METHOD_WITH_ARGS("SwitchRoot",
+ SD_BUS_ARGS("s", new_root, "s", init),
+ SD_BUS_NO_RESULT,
+ method_switch_root,
+ SD_BUS_VTABLE_CAPABILITY(CAP_SYS_BOOT)),
+ SD_BUS_METHOD_WITH_ARGS("SetEnvironment",
+ SD_BUS_ARGS("as", assignments),
+ SD_BUS_NO_RESULT,
+ method_set_environment,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("UnsetEnvironment",
+ SD_BUS_ARGS("as", names),
+ SD_BUS_NO_RESULT,
+ method_unset_environment,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("UnsetAndSetEnvironment",
+ SD_BUS_ARGS("as", names, "as", assignments),
+ SD_BUS_NO_RESULT,
+ method_unset_and_set_environment,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("EnqueueMarkedJobs",
+ SD_BUS_NO_ARGS,
+ SD_BUS_RESULT("ao", jobs),
+ method_enqueue_marked_jobs,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("ListUnitFiles",
+ SD_BUS_NO_ARGS,
+ SD_BUS_RESULT("a(ss)", unit_files),
+ method_list_unit_files,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("ListUnitFilesByPatterns",
+ SD_BUS_ARGS("as", states, "as", patterns),
+ SD_BUS_RESULT("a(ss)", unit_files),
+ method_list_unit_files_by_patterns,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("GetUnitFileState",
+ SD_BUS_ARGS("s", file),
+ SD_BUS_RESULT("s", state),
+ method_get_unit_file_state,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("EnableUnitFiles",
+ SD_BUS_ARGS("as", files, "b", runtime, "b", force),
+ SD_BUS_RESULT("b", carries_install_info, "a(sss)", changes),
+ method_enable_unit_files,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("DisableUnitFiles",
+ SD_BUS_ARGS("as", files, "b", runtime),
+ SD_BUS_RESULT("a(sss)", changes),
+ method_disable_unit_files,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("EnableUnitFilesWithFlags",
+ SD_BUS_ARGS("as", files, "t", flags),
+ SD_BUS_RESULT("b", carries_install_info, "a(sss)", changes),
+ method_enable_unit_files_with_flags,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("DisableUnitFilesWithFlags",
+ SD_BUS_ARGS("as", files, "t", flags),
+ SD_BUS_RESULT("a(sss)", changes),
+ method_disable_unit_files_with_flags,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("DisableUnitFilesWithFlagsAndInstallInfo",
+ SD_BUS_ARGS("as", files, "t", flags),
+ SD_BUS_RESULT("b", carries_install_info, "a(sss)", changes),
+ method_disable_unit_files_with_flags_and_install_info,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("ReenableUnitFiles",
+ SD_BUS_ARGS("as", files, "b", runtime, "b", force),
+ SD_BUS_RESULT("b", carries_install_info, "a(sss)", changes),
+ method_reenable_unit_files,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("LinkUnitFiles",
+ SD_BUS_ARGS("as", files, "b", runtime, "b", force),
+ SD_BUS_RESULT("a(sss)", changes),
+ method_link_unit_files,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("PresetUnitFiles",
+ SD_BUS_ARGS("as", files, "b", runtime, "b", force),
+ SD_BUS_RESULT("b", carries_install_info, "a(sss)", changes),
+ method_preset_unit_files,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("PresetUnitFilesWithMode",
+ SD_BUS_ARGS("as", files, "s", mode, "b", runtime, "b", force),
+ SD_BUS_RESULT("b", carries_install_info, "a(sss)", changes),
+ method_preset_unit_files_with_mode,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("MaskUnitFiles",
+ SD_BUS_ARGS("as", files, "b", runtime, "b", force),
+ SD_BUS_RESULT("a(sss)", changes),
+ method_mask_unit_files,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("UnmaskUnitFiles",
+ SD_BUS_ARGS("as", files, "b", runtime),
+ SD_BUS_RESULT("a(sss)", changes),
+ method_unmask_unit_files,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("RevertUnitFiles",
+ SD_BUS_ARGS("as", files),
+ SD_BUS_RESULT("a(sss)", changes),
+ method_revert_unit_files,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("SetDefaultTarget",
+ SD_BUS_ARGS("s", name, "b", force),
+ SD_BUS_RESULT("a(sss)", changes),
+ method_set_default_target,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("GetDefaultTarget",
+ SD_BUS_NO_ARGS,
+ SD_BUS_RESULT("s", name),
+ method_get_default_target,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("PresetAllUnitFiles",
+ SD_BUS_ARGS("s", mode, "b", runtime, "b", force),
+ SD_BUS_RESULT("a(sss)", changes),
+ method_preset_all_unit_files,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("AddDependencyUnitFiles",
+ SD_BUS_ARGS("as", files, "s", target, "s", type, "b", runtime, "b", force),
+ SD_BUS_RESULT("a(sss)", changes),
+ method_add_dependency_unit_files,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("GetUnitFileLinks",
+ SD_BUS_ARGS("s", name, "b", runtime),
+ SD_BUS_RESULT("as", links),
+ method_get_unit_file_links,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("SetExitCode",
+ SD_BUS_ARGS("y", number),
+ SD_BUS_NO_RESULT,
+ method_set_exit_code,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("LookupDynamicUserByName",
+ SD_BUS_ARGS("s", name),
+ SD_BUS_RESULT("u", uid),
+ method_lookup_dynamic_user_by_name,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("LookupDynamicUserByUID",
+ SD_BUS_ARGS("u", uid),
+ SD_BUS_RESULT("s", name),
+ method_lookup_dynamic_user_by_uid,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("GetDynamicUsers",
+ SD_BUS_NO_ARGS,
+ SD_BUS_RESULT("a(us)", users),
+ method_get_dynamic_users,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("DumpUnitFileDescriptorStore",
+ SD_BUS_ARGS("s", name),
+ SD_BUS_RESULT("a(suuutuusu)", entries),
+ method_dump_unit_descriptor_store,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+
+ SD_BUS_SIGNAL_WITH_ARGS("UnitNew",
+ SD_BUS_ARGS("s", id, "o", unit),
+ 0),
+ SD_BUS_SIGNAL_WITH_ARGS("UnitRemoved",
+ SD_BUS_ARGS("s", id, "o", unit),
+ 0),
+ SD_BUS_SIGNAL_WITH_ARGS("JobNew",
+ SD_BUS_ARGS("u", id, "o", job, "s", unit),
+ 0),
+ SD_BUS_SIGNAL_WITH_ARGS("JobRemoved",
+ SD_BUS_ARGS("u", id, "o", job, "s", unit, "s", result),
+ 0),
+ SD_BUS_SIGNAL_WITH_ARGS("StartupFinished",
+ SD_BUS_ARGS("t", firmware, "t", loader, "t", kernel, "t", initrd, "t", userspace, "t", total),
+ 0),
+ SD_BUS_SIGNAL("UnitFilesChanged", NULL, 0),
+ SD_BUS_SIGNAL_WITH_ARGS("Reloading",
+ SD_BUS_ARGS("b", active),
+ 0),
+
+ SD_BUS_VTABLE_END
+};
+
+const sd_bus_vtable bus_manager_log_control_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+
+ /* We define a private version of this interface here, since we want slightly different
+ * implementations for the setters. We'll still use the generic getters however, and we share the
+ * setters with the implementations for the Manager interface above (which pre-dates the generic
+ * service API interface). */
+
+ SD_BUS_WRITABLE_PROPERTY("LogLevel", "s", bus_property_get_log_level, property_set_log_level, 0, 0),
+ SD_BUS_WRITABLE_PROPERTY("LogTarget", "s", bus_property_get_log_target, property_set_log_target, 0, 0),
+ SD_BUS_PROPERTY("SyslogIdentifier", "s", bus_property_get_syslog_identifier, 0, 0),
+
+ SD_BUS_VTABLE_END,
+};
+
+static int send_finished(sd_bus *bus, void *userdata) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *message = NULL;
+ usec_t *times = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+
+ r = sd_bus_message_new_signal(bus,
+ &message,
+ "/org/freedesktop/systemd1",
+ "org.freedesktop.systemd1.Manager",
+ "StartupFinished");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(message, "tttttt", times[0], times[1], times[2], times[3], times[4], times[5]);
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(bus, message, NULL);
+}
+
+void bus_manager_send_finished(
+ Manager *m,
+ usec_t firmware_usec,
+ usec_t loader_usec,
+ usec_t kernel_usec,
+ usec_t initrd_usec,
+ usec_t userspace_usec,
+ usec_t total_usec) {
+
+ int r;
+
+ assert(m);
+
+ r = bus_foreach_bus(
+ m,
+ NULL,
+ send_finished,
+ (usec_t[6]) {
+ firmware_usec,
+ loader_usec,
+ kernel_usec,
+ initrd_usec,
+ userspace_usec,
+ total_usec
+ });
+ if (r < 0)
+ log_debug_errno(r, "Failed to send finished signal: %m");
+}
+
+static int send_reloading(sd_bus *bus, void *userdata) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *message = NULL;
+ int r;
+
+ assert(bus);
+
+ r = sd_bus_message_new_signal(bus, &message, "/org/freedesktop/systemd1", "org.freedesktop.systemd1.Manager", "Reloading");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(message, "b", PTR_TO_INT(userdata));
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(bus, message, NULL);
+}
+
+void bus_manager_send_reloading(Manager *m, bool active) {
+ int r;
+
+ assert(m);
+
+ r = bus_foreach_bus(m, NULL, send_reloading, INT_TO_PTR(active));
+ if (r < 0)
+ log_debug_errno(r, "Failed to send reloading signal: %m");
+}
+
+static int send_changed_signal(sd_bus *bus, void *userdata) {
+ assert(bus);
+
+ return sd_bus_emit_properties_changed_strv(bus,
+ "/org/freedesktop/systemd1",
+ "org.freedesktop.systemd1.Manager",
+ NULL);
+}
+
+void bus_manager_send_change_signal(Manager *m) {
+ int r;
+
+ assert(m);
+
+ r = bus_foreach_bus(m, NULL, send_changed_signal, NULL);
+ if (r < 0)
+ log_debug_errno(r, "Failed to send manager change signal: %m");
+}
diff --git a/src/core/dbus-manager.h b/src/core/dbus-manager.h
new file mode 100644
index 0000000..9b05080
--- /dev/null
+++ b/src/core/dbus-manager.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus-vtable.h"
+
+#include "manager.h"
+
+extern const sd_bus_vtable bus_manager_vtable[];
+extern const sd_bus_vtable bus_manager_log_control_vtable[];
+
+void bus_manager_send_finished(Manager *m, usec_t firmware_usec, usec_t loader_usec, usec_t kernel_usec, usec_t initrd_usec, usec_t userspace_usec, usec_t total_usec);
+void bus_manager_send_reloading(Manager *m, bool active);
+void bus_manager_send_change_signal(Manager *m);
+
+int verify_run_space_and_log(const char *message);
+
+int bus_property_get_oom_policy(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error);
+int bus_property_get_emergency_action(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error);
diff --git a/src/core/dbus-mount.c b/src/core/dbus-mount.c
new file mode 100644
index 0000000..7dbbdd0
--- /dev/null
+++ b/src/core/dbus-mount.c
@@ -0,0 +1,174 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bus-get-properties.h"
+#include "dbus-cgroup.h"
+#include "dbus-execute.h"
+#include "dbus-kill.h"
+#include "dbus-mount.h"
+#include "dbus-util.h"
+#include "mount.h"
+#include "string-util.h"
+#include "unit.h"
+#include "utf8.h"
+
+static int property_get_what(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ _cleanup_free_ char *escaped = NULL;
+ Mount *m = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ escaped = mount_get_what_escaped(m);
+ if (!escaped)
+ return -ENOMEM;
+
+ return sd_bus_message_append_basic(reply, 's', escaped);
+}
+
+static int property_get_options(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ _cleanup_free_ char *escaped = NULL;
+ Mount *m = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ escaped = mount_get_options_escaped(m);
+ if (!escaped)
+ return -ENOMEM;
+
+ return sd_bus_message_append_basic(reply, 's', escaped);
+}
+
+static BUS_DEFINE_PROPERTY_GET(property_get_type, "s", Mount, mount_get_fstype);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, mount_result, MountResult);
+
+const sd_bus_vtable bus_mount_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ SD_BUS_PROPERTY("Where", "s", NULL, offsetof(Mount, where), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("What", "s", property_get_what, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("Options","s", property_get_options, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("Type", "s", property_get_type, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("TimeoutUSec", "t", bus_property_get_usec, offsetof(Mount, timeout_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ControlPID", "u", bus_property_get_pid, offsetof(Mount, control_pid.pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("DirectoryMode", "u", bus_property_get_mode, offsetof(Mount, directory_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SloppyOptions", "b", bus_property_get_bool, offsetof(Mount, sloppy_options), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LazyUnmount", "b", bus_property_get_bool, offsetof(Mount, lazy_unmount), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ForceUnmount", "b", bus_property_get_bool, offsetof(Mount, force_unmount), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ReadWriteOnly", "b", bus_property_get_bool, offsetof(Mount, read_write_only), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Mount, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("UID", "u", bus_property_get_uid, offsetof(Unit, ref_uid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("GID", "u", bus_property_get_gid, offsetof(Unit, ref_gid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ BUS_EXEC_COMMAND_VTABLE("ExecMount", offsetof(Mount, exec_command[MOUNT_EXEC_MOUNT]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_COMMAND_VTABLE("ExecUnmount", offsetof(Mount, exec_command[MOUNT_EXEC_UNMOUNT]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_COMMAND_VTABLE("ExecRemount", offsetof(Mount, exec_command[MOUNT_EXEC_REMOUNT]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ SD_BUS_VTABLE_END
+};
+
+static int bus_mount_set_transient_property(
+ Mount *m,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ Unit *u = UNIT(m);
+
+ assert(m);
+ assert(name);
+ assert(message);
+
+ flags |= UNIT_PRIVATE;
+
+ if (streq(name, "Where"))
+ return bus_set_transient_path(u, name, &m->where, message, flags, error);
+
+ if (streq(name, "What"))
+ return bus_set_transient_string(u, name, &m->parameters_fragment.what, message, flags, error);
+
+ if (streq(name, "Options"))
+ return bus_set_transient_string(u, name, &m->parameters_fragment.options, message, flags, error);
+
+ if (streq(name, "Type"))
+ return bus_set_transient_string(u, name, &m->parameters_fragment.fstype, message, flags, error);
+
+ if (streq(name, "TimeoutUSec"))
+ return bus_set_transient_usec_fix_0(u, name, &m->timeout_usec, message, flags, error);
+
+ if (streq(name, "DirectoryMode"))
+ return bus_set_transient_mode_t(u, name, &m->directory_mode, message, flags, error);
+
+ if (streq(name, "SloppyOptions"))
+ return bus_set_transient_bool(u, name, &m->sloppy_options, message, flags, error);
+
+ if (streq(name, "LazyUnmount"))
+ return bus_set_transient_bool(u, name, &m->lazy_unmount, message, flags, error);
+
+ if (streq(name, "ForceUnmount"))
+ return bus_set_transient_bool(u, name, &m->force_unmount, message, flags, error);
+
+ if (streq(name, "ReadWriteOnly"))
+ return bus_set_transient_bool(u, name, &m->read_write_only, message, flags, error);
+
+ return 0;
+}
+
+int bus_mount_set_property(
+ Unit *u,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ Mount *m = MOUNT(u);
+ int r;
+
+ assert(m);
+ assert(name);
+ assert(message);
+
+ r = bus_cgroup_set_property(u, &m->cgroup_context, name, message, flags, error);
+ if (r != 0)
+ return r;
+
+ if (u->transient && u->load_state == UNIT_STUB) {
+ /* This is a transient unit, let's load a little more */
+
+ r = bus_mount_set_transient_property(m, name, message, flags, error);
+ if (r != 0)
+ return r;
+
+ r = bus_exec_context_set_transient_property(u, &m->exec_context, name, message, flags, error);
+ if (r != 0)
+ return r;
+
+ r = bus_kill_context_set_transient_property(u, &m->kill_context, name, message, flags, error);
+ if (r != 0)
+ return r;
+ }
+
+ return 0;
+}
+
+int bus_mount_commit_properties(Unit *u) {
+ assert(u);
+
+ unit_realize_cgroup(u);
+
+ return 0;
+}
diff --git a/src/core/dbus-mount.h b/src/core/dbus-mount.h
new file mode 100644
index 0000000..5a848d3
--- /dev/null
+++ b/src/core/dbus-mount.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "unit.h"
+
+extern const sd_bus_vtable bus_mount_vtable[];
+
+int bus_mount_set_property(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_mount_commit_properties(Unit *u);
diff --git a/src/core/dbus-path.c b/src/core/dbus-path.c
new file mode 100644
index 0000000..8cb6a26
--- /dev/null
+++ b/src/core/dbus-path.c
@@ -0,0 +1,164 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "bus-get-properties.h"
+#include "dbus-path.h"
+#include "dbus-util.h"
+#include "list.h"
+#include "path.h"
+#include "path-util.h"
+#include "string-util.h"
+#include "unit.h"
+
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, path_result, PathResult);
+
+static int property_get_paths( /* serializes Path.specs as the "Paths" a(ss) D-Bus property */
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Path *p = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(ss)"); /* array of (type-string, path) pairs */
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(spec, k, p->specs) { /* one entry per configured PathSpec */
+ r = sd_bus_message_append(reply, "(ss)", path_type_to_string(k->type), k->path);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+const sd_bus_vtable bus_path_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ SD_BUS_PROPERTY("Unit", "s", bus_property_get_triggered_unit, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Paths", "a(ss)", property_get_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("MakeDirectory", "b", bus_property_get_bool, offsetof(Path, make_directory), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DirectoryMode", "u", bus_property_get_mode, offsetof(Path, directory_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Path, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("TriggerLimitIntervalUSec", "t", bus_property_get_usec, offsetof(Path, trigger_limit.interval), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TriggerLimitBurst", "u", bus_property_get_unsigned, offsetof(Path, trigger_limit.burst), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_VTABLE_END
+};
+
+static int bus_path_set_transient_property( /* handles transient-only settings for path units */
+ Path *p,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ Unit *u = UNIT(p);
+ int r;
+
+ assert(p);
+ assert(name);
+ assert(message);
+
+ flags |= UNIT_PRIVATE;
+
+ if (streq(name, "MakeDirectory"))
+ return bus_set_transient_bool(u, name, &p->make_directory, message, flags, error);
+
+ if (streq(name, "DirectoryMode"))
+ return bus_set_transient_mode_t(u, name, &p->directory_mode, message, flags, error);
+
+ if (streq(name, "Paths")) {
+ const char *type_name, *path;
+ bool empty = true; /* tracks whether the incoming array had any entries */
+
+ r = sd_bus_message_enter_container(message, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(ss)", &type_name, &path)) > 0) { /* validation happens even in NOOP (dry-run) mode */
+ PathType t;
+
+ t = path_type_from_string(type_name);
+ if (t < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown path type: %s", type_name);
+
+ if (isempty(path))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path in %s is empty", type_name);
+
+ if (!path_is_absolute(path))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path in %s is not absolute: %s", type_name, path);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) { /* only mutate the unit when actually writing */
+ _cleanup_free_ char *k = NULL;
+
+ r = path_simplify_alloc(path, &k);
+ if (r < 0)
+ return r;
+
+ PathSpec *s = new(PathSpec, 1);
+ if (!s)
+ return -ENOMEM;
+
+ *s = (PathSpec) {
+ .unit = u,
+ .path = TAKE_PTR(k),
+ .type = t,
+ .inotify_fd = -EBADF, /* no watch established yet */
+ };
+
+ LIST_PREPEND(spec, p->specs, s);
+
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s", type_name, path);
+ }
+
+ empty = false; /* at least one entry was seen, whether or not it was written */
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags) && empty) { /* empty array means: reset all configured paths */
+ path_free_specs(p);
+ unit_write_settingf(u, flags, name, "PathExists="); /* empty assignment serializes the reset */
+ }
+
+ return 1; /* 1: property consumed */
+ }
+
+ if (streq(name, "TriggerLimitBurst"))
+ return bus_set_transient_unsigned(u, name, &p->trigger_limit.burst, message, flags, error);
+
+ if (streq(name, "TriggerLimitIntervalUSec"))
+ return bus_set_transient_usec(u, name, &p->trigger_limit.interval, message, flags, error);
+
+ return 0; /* 0: not a path-unit transient property */
+}
+
+int bus_path_set_property( /* D-Bus property dispatcher for path units */
+ Unit *u,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags mode, /* NOTE: named "mode" here but used exactly like "flags" elsewhere in this file */
+ sd_bus_error *error) {
+
+ Path *p = PATH(u);
+
+ assert(p);
+ assert(name);
+ assert(message);
+
+ if (u->transient && u->load_state == UNIT_STUB) /* path units accept properties only while being created */
+ return bus_path_set_transient_property(p, name, message, mode, error);
+
+ return 0; /* 0: property name not recognized */
+}
diff --git a/src/core/dbus-path.h b/src/core/dbus-path.h
new file mode 100644
index 0000000..b5018b0
--- /dev/null
+++ b/src/core/dbus-path.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "unit.h"
+
+extern const sd_bus_vtable bus_path_vtable[];
+
+int bus_path_set_property(Unit *u, const char *name, sd_bus_message *i, UnitWriteFlags flags, sd_bus_error *error);
diff --git a/src/core/dbus-scope.c b/src/core/dbus-scope.c
new file mode 100644
index 0000000..78196a1
--- /dev/null
+++ b/src/core/dbus-scope.c
@@ -0,0 +1,318 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "bus-common-errors.h"
+#include "bus-get-properties.h"
+#include "dbus-cgroup.h"
+#include "dbus-kill.h"
+#include "dbus-manager.h"
+#include "dbus-scope.h"
+#include "dbus-unit.h"
+#include "dbus-util.h"
+#include "dbus.h"
+#include "scope.h"
+#include "selinux-access.h"
+#include "unit.h"
+
+int bus_scope_method_abandon(sd_bus_message *message, void *userdata, sd_bus_error *error) { /* implements the Scope.Abandon() bus method */
+ Scope *s = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_unit_access_check(UNIT(s), message, "stop", error); /* abandon is gated like "stop" */
+ if (r < 0)
+ return r;
+
+ r = bus_verify_manage_units_async(UNIT(s)->manager, message, error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = scope_abandon(s);
+ if (r == -ESTALE) /* map internal -ESTALE to a descriptive bus error */
+ return sd_bus_error_setf(error, BUS_ERROR_SCOPE_NOT_RUNNING, "Scope %s is not running, cannot abandon.", UNIT(s)->id);
+ if (r < 0)
+ return r;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, scope_result, ScopeResult);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(oom_policy, OOMPolicy, oom_policy_from_string);
+
+const sd_bus_vtable bus_scope_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ SD_BUS_PROPERTY("Controller", "s", NULL, offsetof(Scope, controller), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("TimeoutStopUSec", "t", bus_property_get_usec, offsetof(Scope, timeout_stop_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Scope, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("RuntimeMaxUSec", "t", bus_property_get_usec, offsetof(Scope, runtime_max_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RuntimeRandomizedExtraUSec", "t", bus_property_get_usec, offsetof(Scope, runtime_rand_extra_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("OOMPolicy", "s", bus_property_get_oom_policy, offsetof(Scope, oom_policy), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_SIGNAL("RequestStop", NULL, 0),
+ SD_BUS_METHOD("Abandon", NULL, NULL, bus_scope_method_abandon, SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_VTABLE_END
+};
+
+static int bus_scope_set_transient_property( /* handles transient-only settings for scope units */
+ Scope *s,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ Unit *u = UNIT(s);
+ int r;
+
+ assert(s);
+ assert(name);
+ assert(message);
+
+ flags |= UNIT_PRIVATE;
+
+ if (streq(name, "TimeoutStopUSec"))
+ return bus_set_transient_usec(u, name, &s->timeout_stop_usec, message, flags, error);
+
+ if (streq(name, "RuntimeMaxUSec"))
+ return bus_set_transient_usec(u, name, &s->runtime_max_usec, message, flags, error);
+
+ if (streq(name, "RuntimeRandomizedExtraUSec"))
+ return bus_set_transient_usec(u, name, &s->runtime_rand_extra_usec, message, flags, error);
+
+ if (streq(name, "OOMPolicy"))
+ return bus_set_transient_oom_policy(u, name, &s->oom_policy, message, flags, error);
+
+ if (streq(name, "PIDs")) { /* attach client-supplied PIDs (32-bit) to this scope */
+ _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+ unsigned n = 0;
+
+ r = sd_bus_message_enter_container(message, 'a', "u");
+ if (r < 0)
+ return r;
+
+ for (;;) {
+ _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+ uint32_t upid;
+ pid_t pid;
+
+ r = sd_bus_message_read(message, "u", &upid);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ if (upid == 0) { /* 0 means "the sender itself"; resolve via bus credentials */
+ if (!creds) {
+ r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds);
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_bus_creds_get_pid(creds, &pid);
+ if (r < 0)
+ return r;
+ } else
+ pid = (pid_t) upid; /* fix: cast to pid_t — was mistakenly cast to uid_t */
+
+ r = pidref_set_pid(&pidref, pid);
+ if (r < 0)
+ return r;
+
+ r = unit_pid_attachable(u, &pidref, error);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) { /* only watch the PID when actually writing */
+ r = unit_watch_pidref(u, &pidref, /* exclusive= */ false);
+ if (r < 0 && r != -EEXIST) /* already-watched is fine */
+ return r;
+ }
+
+ n++;
+ }
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ return n <= 0 ? -EINVAL : 1; /* at least one PID is required */
+ }
+
+ if (streq(name, "PIDFDs")) { /* same as "PIDs", but race-free via pidfds */
+ unsigned n = 0;
+
+ r = sd_bus_message_enter_container(message, 'a', "h");
+ if (r < 0)
+ return r;
+
+ for (;;) {
+ _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+ int fd;
+
+ r = sd_bus_message_read(message, "h", &fd);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ r = pidref_set_pidfd(&pidref, fd);
+ if (r < 0)
+ return r;
+
+ r = unit_pid_attachable(u, &pidref, error);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ r = unit_watch_pidref(u, &pidref, /* exclusive= */ false);
+ if (r < 0 && r != -EEXIST)
+ return r;
+ }
+
+ n++;
+ }
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ return n <= 0 ? -EINVAL : 1; /* at least one pidfd is required */
+ }
+
+ if (streq(name, "Controller")) {
+ const char *controller;
+
+ /* We can't support direct connections with this, as direct connections know no service or unique name
+ * concept, but the Controller field stores exactly that. */
+ if (sd_bus_message_get_bus(message) != u->manager->api_bus)
+ return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Sorry, Controller= logic only supported via the bus.");
+
+ r = sd_bus_message_read(message, "s", &controller);
+ if (r < 0)
+ return r;
+
+ if (!isempty(controller) && !sd_bus_service_name_is_valid(controller))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Controller '%s' is not a valid bus name.", controller);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ r = free_and_strdup(&s->controller, empty_to_null(controller)); /* empty string clears the controller */
+ if (r < 0)
+ return r;
+ }
+
+ return 1;
+ }
+
+ return 0; /* 0: not a scope-unit transient property */
+}
+
+int bus_scope_set_property( /* D-Bus property dispatcher for scope units */
+ Unit *u,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ Scope *s = SCOPE(u);
+ int r;
+
+ assert(s);
+ assert(name);
+ assert(message);
+
+ r = bus_cgroup_set_property(u, &s->cgroup_context, name, message, flags, error); /* cgroup properties may be set at any time */
+ if (r != 0)
+ return r;
+
+ if (u->load_state == UNIT_STUB) { /* note: no u->transient check here — scopes are always transient */
+ /* While we are created we still accept PIDs */
+
+ r = bus_scope_set_transient_property(s, name, message, flags, error);
+ if (r != 0)
+ return r;
+
+ r = bus_kill_context_set_transient_property(u, &s->kill_context, name, message, flags, error);
+ if (r != 0)
+ return r;
+
+ if (streq(name, "User"))
+ return bus_set_transient_user_relaxed(u, name, &s->user, message, flags, error);
+
+ if (streq(name, "Group"))
+ return bus_set_transient_user_relaxed(u, name, &s->group, message, flags, error);
+ }
+
+ return 0; /* 0: property name not recognized */
+}
+
+int bus_scope_commit_properties(Unit *u) { /* called once after a batch of SetProperties changes */
+ assert(u);
+
+ unit_realize_cgroup(u); /* re-realize the cgroup; the only commit work scope units need */
+
+ return 0;
+}
+
+int bus_scope_send_request_stop(Scope *s) { /* emits the RequestStop signal, addressed to the controller only */
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
+ _cleanup_free_ char *p = NULL;
+ int r;
+
+ assert(s);
+
+ if (!s->controller) /* nobody to notify */
+ return 0;
+
+ p = unit_dbus_path(UNIT(s));
+ if (!p)
+ return -ENOMEM;
+
+ r = sd_bus_message_new_signal(
+ UNIT(s)->manager->api_bus,
+ &m,
+ p,
+ "org.freedesktop.systemd1.Scope",
+ "RequestStop");
+ if (r < 0)
+ return r;
+
+ return sd_bus_send_to(UNIT(s)->manager->api_bus, m, s->controller, NULL); /* unicast, not broadcast */
+}
+
+static int on_controller_gone(sd_bus_track *track, void *userdata) { /* sd_bus_track callback: controller left the bus */
+ Scope *s = userdata;
+
+ assert(track);
+
+ if (s->controller) {
+ log_unit_debug(UNIT(s), "Controller %s disappeared from bus.", s->controller);
+ unit_add_to_dbus_queue(UNIT(s)); /* Controller property changed, emit PropertiesChanged */
+ s->controller = mfree(s->controller);
+ }
+
+ s->controller_track = sd_bus_track_unref(s->controller_track); /* drop the now-empty tracker */
+
+ return 0;
+}
+
+int bus_scope_track_controller(Scope *s) { /* starts watching the controller's bus name, if not already */
+ int r;
+
+ assert(s);
+
+ if (!s->controller || s->controller_track) /* nothing to track, or already tracking */
+ return 0;
+
+ r = sd_bus_track_new(UNIT(s)->manager->api_bus, &s->controller_track, on_controller_gone, s);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_track_add_name(s->controller_track, s->controller);
+ if (r < 0) { /* undo the tracker on failure */
+ s->controller_track = sd_bus_track_unref(s->controller_track);
+ return r;
+ }
+
+ return 0;
+}
diff --git a/src/core/dbus-scope.h b/src/core/dbus-scope.h
new file mode 100644
index 0000000..8f1bc02
--- /dev/null
+++ b/src/core/dbus-scope.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "scope.h"
+#include "unit.h"
+
+extern const sd_bus_vtable bus_scope_vtable[];
+
+int bus_scope_set_property(Unit *u, const char *name, sd_bus_message *i, UnitWriteFlags flags, sd_bus_error *error);
+int bus_scope_commit_properties(Unit *u);
+
+int bus_scope_send_request_stop(Scope *s);
+
+int bus_scope_method_abandon(sd_bus_message *message, void *userdata, sd_bus_error *error);
+
+int bus_scope_track_controller(Scope *s);
diff --git a/src/core/dbus-service.c b/src/core/dbus-service.c
new file mode 100644
index 0000000..cc478f4
--- /dev/null
+++ b/src/core/dbus-service.c
@@ -0,0 +1,791 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fcntl.h>
+
+#include "alloc-util.h"
+#include "async.h"
+#include "bus-common-errors.h"
+#include "bus-get-properties.h"
+#include "dbus-cgroup.h"
+#include "dbus-execute.h"
+#include "dbus-kill.h"
+#include "dbus-manager.h"
+#include "dbus-service.h"
+#include "dbus-util.h"
+#include "execute.h"
+#include "exit-status.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "locale-util.h"
+#include "missing_fcntl.h"
+#include "mount-util.h"
+#include "open-file.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "selinux-access.h"
+#include "service.h"
+#include "signal-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit.h"
+
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_type, service_type, ServiceType);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exit_type, service_exit_type, ServiceExitType);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, service_result, ServiceResult);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_restart, service_restart, ServiceRestart);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_restart_mode, service_restart_mode, ServiceRestartMode);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_emergency_action, emergency_action, EmergencyAction);
+static BUS_DEFINE_PROPERTY_GET2(property_get_notify_access, "s", Service, service_get_notify_access, notify_access_to_string);
+static BUS_DEFINE_PROPERTY_GET(property_get_restart_usec_next, "t", Service, service_restart_usec_next);
+static BUS_DEFINE_PROPERTY_GET(property_get_timeout_abort_usec, "t", Service, service_timeout_abort_usec);
+static BUS_DEFINE_PROPERTY_GET(property_get_watchdog_usec, "t", Service, service_get_watchdog_usec);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_timeout_failure_mode, service_timeout_failure_mode, ServiceTimeoutFailureMode);
+
+static int property_get_open_files( /* serializes Service.open_files as the "OpenFile" a(sst) property */
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ OpenFile **open_files = ASSERT_PTR(userdata); /* userdata is the head-pointer of the list */
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(sst)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(open_files, of, *open_files) {
+ r = sd_bus_message_append(reply, "(sst)", of->path, of->fdname, (uint64_t) of->flags);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_exit_status_set( /* serializes an ExitStatusSet as the (aiai) struct: statuses, then signals */
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ const ExitStatusSet *status_set = ASSERT_PTR(userdata);
+ unsigned n;
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'r', "aiai"); /* struct of two int arrays */
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_open_container(reply, 'a', "i"); /* first array: exit statuses */
+ if (r < 0)
+ return r;
+
+ BITMAP_FOREACH(n, &status_set->status) {
+ assert(n < 256); /* exit statuses fit in a byte */
+
+ r = sd_bus_message_append_basic(reply, 'i', &n);
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_open_container(reply, 'a', "i"); /* second array: signal numbers */
+ if (r < 0)
+ return r;
+
+ BITMAP_FOREACH(n, &status_set->signal) {
+ const char *str;
+
+ str = signal_to_string(n);
+ if (!str) /* skip signals without a known name */
+ continue;
+
+ r = sd_bus_message_append_basic(reply, 'i', &n);
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int bus_service_method_mount(sd_bus_message *message, void *userdata, sd_bus_error *error, bool is_image) { /* shared impl for BindMount() and MountImage() */
+ _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
+ const char *dest, *src, *propagate_directory;
+ int read_only, make_file_or_directory;
+ Unit *u = ASSERT_PTR(userdata);
+ ExecContext *c;
+ int r;
+
+ assert(message);
+
+ if (!MANAGER_IS_SYSTEM(u->manager))
+ return sd_bus_error_set(error, SD_BUS_ERROR_NOT_SUPPORTED, "Adding bind mounts at runtime is only supported for system managers.");
+
+ r = mac_selinux_unit_access_check(u, message, "start", error); /* mounting into a unit is gated like "start" */
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "ssbb", &src, &dest, &read_only, &make_file_or_directory);
+ if (r < 0)
+ return r;
+
+ if (!path_is_absolute(src) || !path_is_normalized(src))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Source path must be absolute and normalized.");
+
+ if (!is_image && isempty(dest)) /* BindMount(): empty destination means "same as source" */
+ dest = src;
+ else if (!path_is_absolute(dest) || !path_is_normalized(dest))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Destination path must be absolute and normalized.");
+
+ if (is_image) { /* MountImage() additionally carries an a(ss) options array */
+ r = bus_read_mount_options(message, error, &options, NULL, "");
+ if (r < 0)
+ return r;
+ }
+
+ r = bus_verify_manage_units_async_full(
+ u,
+ is_image ? "mount-image" : "bind-mount",
+ CAP_SYS_ADMIN,
+ N_("Authentication is required to mount on '$(unit)'."),
+ true,
+ message,
+ error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ if (u->type != UNIT_SERVICE)
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Unit is not of type .service");
+
+ /* If it would be dropped at startup time, return an error. The context should always be available, but
+ * there's an assert in exec_needs_mount_namespace, so double-check just in case. */
+ c = unit_get_exec_context(u);
+ if (!c)
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot access unit execution context");
+ if (path_startswith_strv(dest, c->inaccessible_paths))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s is not accessible to this unit", dest);
+
+ /* Ensure that the unit was started in a private mount namespace */
+ if (!exec_needs_mount_namespace(c, NULL, unit_get_exec_runtime(u)))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Unit not running in private mount namespace, cannot activate bind mount");
+
+ PidRef* unit_pid = unit_main_pid(u);
+ if (!pidref_is_set(unit_pid) || !UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u)))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Unit is not running");
+
+ propagate_directory = strjoina("/run/systemd/propagate/", u->id); /* per-unit propagation dir */
+ if (is_image)
+ r = mount_image_in_namespace(
+ unit_pid,
+ propagate_directory,
+ "/run/systemd/incoming/",
+ src, dest,
+ read_only,
+ make_file_or_directory,
+ options,
+ c->mount_image_policy ?: &image_policy_service);
+ else
+ r = bind_mount_in_namespace(
+ unit_pid,
+ propagate_directory,
+ "/run/systemd/incoming/",
+ src, dest,
+ read_only,
+ make_file_or_directory);
+ if (r < 0)
+ return sd_bus_error_set_errnof(error, r, "Failed to mount %s on %s in unit's namespace: %m", src, dest);
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+int bus_service_method_bind_mount(sd_bus_message *message, void *userdata, sd_bus_error *error) { /* BindMount() entry point */
+ return bus_service_method_mount(message, userdata, error, false);
+}
+
+int bus_service_method_mount_image(sd_bus_message *message, void *userdata, sd_bus_error *error) { /* MountImage() entry point */
+ return bus_service_method_mount(message, userdata, error, true);
+}
+
+int bus_service_method_dump_file_descriptor_store(sd_bus_message *message, void *userdata, sd_bus_error *error) { /* DumpFileDescriptorStore() bus method */
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+ Service *s = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_unit_access_check(UNIT(s), message, "status", error); /* read-only, gated like "status" */
+ if (r < 0)
+ return r;
+
+ if (s->n_fd_store_max == 0 && s->n_fd_store == 0) /* store disabled and nothing ever stored */
+ return sd_bus_error_setf(error, BUS_ERROR_FILE_DESCRIPTOR_STORE_DISABLED, "File descriptor store not enabled for %s.", UNIT(s)->id);
+
+ r = sd_bus_message_new_method_return(message, &reply);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_open_container(reply, 'a', "(suuutuusu)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(fd_store, i, s->fd_store) {
+ _cleanup_free_ char *path = NULL;
+ struct stat st;
+ int flags;
+
+ if (fstat(i->fd, &st) < 0) { /* best effort: skip entries we cannot stat */
+ log_debug_errno(errno, "Failed to stat() file descriptor entry '%s', skipping.", strna(i->fdname));
+ continue;
+ }
+
+ flags = fcntl(i->fd, F_GETFL);
+ if (flags < 0) {
+ log_debug_errno(errno, "Failed to issue F_GETFL on file descriptor entry '%s', skipping.", strna(i->fdname));
+ continue;
+ }
+
+ /* glibc implies O_LARGEFILE everywhere on 64-bit off_t builds, but forgets to hide it away on
+ * F_GETFL, but provides no definition to check for that. Let's mask the flag away manually,
+ * to not confuse clients. */
+ flags &= ~RAW_O_LARGEFILE;
+
+ (void) fd_get_path(i->fd, &path); /* best effort: path may stay NULL */
+
+ r = sd_bus_message_append(
+ reply,
+ "(suuutuusu)",
+ i->fdname,
+ (uint32_t) st.st_mode,
+ (uint32_t) major(st.st_dev), (uint32_t) minor(st.st_dev),
+ (uint64_t) st.st_ino,
+ (uint32_t) major(st.st_rdev), (uint32_t) minor(st.st_rdev),
+ path,
+ (uint32_t) flags);
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(NULL, reply, NULL);
+}
+
+#if __SIZEOF_SIZE_T__ == 8
+static int property_get_size_as_uint32( /* exposes a size_t field as a 32-bit "u" property, saturating */
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ size_t *value = ASSERT_PTR(userdata);
+ uint32_t sz = *value >= UINT32_MAX ? UINT32_MAX : (uint32_t) *value; /* clamp rather than truncate */
+
+ /* Returns a size_t as a D-Bus "u" type, i.e. as 32-bit value, even if size_t is 64-bit. We'll saturate if it doesn't fit. */
+
+ return sd_bus_message_append_basic(reply, 'u', &sz);
+}
+#elif __SIZEOF_SIZE_T__ == 4
+#define property_get_size_as_uint32 ((sd_bus_property_get_t) NULL) /* size_t already 32-bit: let sd-bus read the field directly */
+#else
+#error "Unexpected size of size_t"
+#endif
+
+const sd_bus_vtable bus_service_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ SD_BUS_PROPERTY("Type", "s", property_get_type, offsetof(Service, type), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ExitType", "s", property_get_exit_type, offsetof(Service, exit_type), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Restart", "s", property_get_restart, offsetof(Service, restart), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RestartMode", "s", property_get_restart_mode, offsetof(Service, restart_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PIDFile", "s", NULL, offsetof(Service, pid_file), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("NotifyAccess", "s", property_get_notify_access, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("RestartUSec", "t", bus_property_get_usec, offsetof(Service, restart_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RestartSteps", "u", bus_property_get_unsigned, offsetof(Service, restart_steps), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RestartMaxDelayUSec", "t", bus_property_get_usec, offsetof(Service, restart_max_delay_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RestartUSecNext", "t", property_get_restart_usec_next, 0, 0),
+ SD_BUS_PROPERTY("TimeoutStartUSec", "t", bus_property_get_usec, offsetof(Service, timeout_start_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TimeoutStopUSec", "t", bus_property_get_usec, offsetof(Service, timeout_stop_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TimeoutAbortUSec", "t", property_get_timeout_abort_usec, 0, 0),
+ SD_BUS_PROPERTY("TimeoutStartFailureMode", "s", property_get_timeout_failure_mode, offsetof(Service, timeout_start_failure_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TimeoutStopFailureMode", "s", property_get_timeout_failure_mode, offsetof(Service, timeout_stop_failure_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RuntimeMaxUSec", "t", bus_property_get_usec, offsetof(Service, runtime_max_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RuntimeRandomizedExtraUSec", "t", bus_property_get_usec, offsetof(Service, runtime_rand_extra_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("WatchdogUSec", "t", property_get_watchdog_usec, 0, 0),
+ BUS_PROPERTY_DUAL_TIMESTAMP("WatchdogTimestamp", offsetof(Service, watchdog_timestamp), 0),
+ SD_BUS_PROPERTY("PermissionsStartOnly", "b", bus_property_get_bool, offsetof(Service, permissions_start_only), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), /* 😷 deprecated */
+ SD_BUS_PROPERTY("RootDirectoryStartOnly", "b", bus_property_get_bool, offsetof(Service, root_directory_start_only), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RemainAfterExit", "b", bus_property_get_bool, offsetof(Service, remain_after_exit), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("GuessMainPID", "b", bus_property_get_bool, offsetof(Service, guess_main_pid), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RestartPreventExitStatus", "(aiai)", property_get_exit_status_set, offsetof(Service, restart_prevent_status), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RestartForceExitStatus", "(aiai)", property_get_exit_status_set, offsetof(Service, restart_force_status), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SuccessExitStatus", "(aiai)", property_get_exit_status_set, offsetof(Service, success_status), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("MainPID", "u", bus_property_get_pid, offsetof(Service, main_pid.pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("ControlPID", "u", bus_property_get_pid, offsetof(Service, control_pid.pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("BusName", "s", NULL, offsetof(Service, bus_name), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("FileDescriptorStoreMax", "u", bus_property_get_unsigned, offsetof(Service, n_fd_store_max), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("NFileDescriptorStore", "u", property_get_size_as_uint32, offsetof(Service, n_fd_store), 0),
+ SD_BUS_PROPERTY("FileDescriptorStorePreserve", "s", bus_property_get_exec_preserve_mode, offsetof(Service, fd_store_preserve_mode), 0),
+ SD_BUS_PROPERTY("StatusText", "s", NULL, offsetof(Service, status_text), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("StatusErrno", "i", bus_property_get_int, offsetof(Service, status_errno), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Service, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("ReloadResult", "s", property_get_result, offsetof(Service, reload_result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("CleanResult", "s", property_get_result, offsetof(Service, clean_result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("USBFunctionDescriptors", "s", NULL, offsetof(Service, usb_function_descriptors), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("USBFunctionStrings", "s", NULL, offsetof(Service, usb_function_strings), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("UID", "u", bus_property_get_uid, offsetof(Unit, ref_uid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("GID", "u", bus_property_get_gid, offsetof(Unit, ref_gid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("NRestarts", "u", bus_property_get_unsigned, offsetof(Service, n_restarts), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("OOMPolicy", "s", bus_property_get_oom_policy, offsetof(Service, oom_policy), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("OpenFile", "a(sst)", property_get_open_files, offsetof(Service, open_files), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ReloadSignal", "i", bus_property_get_int, offsetof(Service, reload_signal), SD_BUS_VTABLE_PROPERTY_CONST),
+
+ BUS_EXEC_STATUS_VTABLE("ExecMain", offsetof(Service, main_exec_status), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ BUS_EXEC_COMMAND_LIST_VTABLE("ExecCondition", offsetof(Service, exec_command[SERVICE_EXEC_CONDITION]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecConditionEx", offsetof(Service, exec_command[SERVICE_EXEC_CONDITION]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_COMMAND_LIST_VTABLE("ExecStartPre", offsetof(Service, exec_command[SERVICE_EXEC_START_PRE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecStartPreEx", offsetof(Service, exec_command[SERVICE_EXEC_START_PRE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_COMMAND_LIST_VTABLE("ExecStart", offsetof(Service, exec_command[SERVICE_EXEC_START]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecStartEx", offsetof(Service, exec_command[SERVICE_EXEC_START]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_COMMAND_LIST_VTABLE("ExecStartPost", offsetof(Service, exec_command[SERVICE_EXEC_START_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecStartPostEx", offsetof(Service, exec_command[SERVICE_EXEC_START_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_COMMAND_LIST_VTABLE("ExecReload", offsetof(Service, exec_command[SERVICE_EXEC_RELOAD]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecReloadEx", offsetof(Service, exec_command[SERVICE_EXEC_RELOAD]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_COMMAND_LIST_VTABLE("ExecStop", offsetof(Service, exec_command[SERVICE_EXEC_STOP]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecStopEx", offsetof(Service, exec_command[SERVICE_EXEC_STOP]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_COMMAND_LIST_VTABLE("ExecStopPost", offsetof(Service, exec_command[SERVICE_EXEC_STOP_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_EX_COMMAND_LIST_VTABLE("ExecStopPostEx", offsetof(Service, exec_command[SERVICE_EXEC_STOP_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+
+ SD_BUS_METHOD_WITH_ARGS("BindMount",
+ SD_BUS_ARGS("s", source, "s", destination, "b", read_only, "b", mkdir),
+ SD_BUS_NO_RESULT,
+ bus_service_method_bind_mount,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+
+ SD_BUS_METHOD_WITH_ARGS("MountImage",
+ SD_BUS_ARGS("s", source, "s", destination, "b", read_only, "b", mkdir, "a(ss)", options),
+ SD_BUS_NO_RESULT,
+ bus_service_method_mount_image,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+
+ SD_BUS_METHOD_WITH_ARGS("DumpFileDescriptorStore",
+ SD_BUS_NO_ARGS,
+ SD_BUS_ARGS("a(suuutuusu)", entries),
+ bus_service_method_dump_file_descriptor_store,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+
+ /* The following four are obsolete, and thus marked hidden here. They moved into the Unit interface */
+ SD_BUS_PROPERTY("StartLimitInterval", "t", bus_property_get_usec, offsetof(Unit, start_ratelimit.interval), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_PROPERTY("StartLimitBurst", "u", bus_property_get_unsigned, offsetof(Unit, start_ratelimit.burst), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_PROPERTY("StartLimitAction", "s", property_get_emergency_action, offsetof(Unit, start_limit_action), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_PROPERTY("FailureAction", "s", property_get_emergency_action, offsetof(Unit, failure_action), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_PROPERTY("RebootArgument", "s", NULL, offsetof(Unit, reboot_arg), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_VTABLE_END
+};
+
+/* Reads an (ai ai) struct of (exit statuses, signal numbers) from MESSAGE and,
+ * unless the write flags are a no-op, merges them into STATUS_SET, persisting
+ * each accepted entry as a unit setting. An empty pair resets the set.
+ * Returns 1 when the property was handled, <0 on error. */
+static int bus_set_transient_exit_status(
+ Unit *u,
+ const char *name,
+ ExitStatusSet *status_set,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ const int32_t *status, *signal;
+ size_t n_status, n_signal, i;
+ int r;
+
+ r = sd_bus_message_enter_container(message, 'r', "aiai");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read_array(message, 'i', (const void **) &status, &n_status);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read_array(message, 'i', (const void **) &signal, &n_signal);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ /* sd_bus_message_read_array() reports sizes in bytes; convert to element counts. */
+ n_status /= sizeof(int32_t);
+ n_signal /= sizeof(int32_t);
+
+ /* Both lists empty means "reset": clear the set and write an empty assignment. */
+ if (n_status == 0 && n_signal == 0 && !UNIT_WRITE_FLAGS_NOOP(flags)) {
+ exit_status_set_free(status_set);
+ unit_write_settingf(u, flags, name, "%s=", name);
+ return 1;
+ }
+
+ for (i = 0; i < n_status; i++) {
+ /* Exit statuses must fit in one byte. */
+ if (status[i] < 0 || status[i] > 255)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid status code in %s: %"PRIi32, name, status[i]);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ r = bitmap_set(&status_set->status, status[i]);
+ if (r < 0)
+ return r;
+
+ unit_write_settingf(u, flags, name, "%s=%"PRIi32, name, status[i]);
+ }
+ }
+
+ for (i = 0; i < n_signal; i++) {
+ const char *str;
+
+ /* Signals are validated by whether they have a known name; the name is
+ * also what gets written back as the setting value. */
+ str = signal_to_string((int) signal[i]);
+ if (!str)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid signal in %s: %"PRIi32, name, signal[i]);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ r = bitmap_set(&status_set->signal, signal[i]);
+ if (r < 0)
+ return r;
+
+ unit_write_settingf(u, flags, name, "%s=%s", name, str);
+ }
+ }
+
+ return 1;
+}
+
+/* Reads a file descriptor ("h") from MESSAGE and, unless the write flags are a
+ * no-op, stores a CLOEXEC duplicate of it in *p (closing any fd previously
+ * stored there) and sets *b. Callers pass the stdin/stdout/stderr fd slot as P
+ * and the exec context's stdio_as_fds flag as B. Returns 1 when handled. */
+static int bus_set_transient_std_fd(
+ Unit *u,
+ const char *name,
+ int *p,
+ bool *b,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ int fd, r;
+
+ assert(p);
+ assert(b);
+
+ r = sd_bus_message_read(message, "h", &fd);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ int copy;
+
+ /* Duplicate above fd 2: the message owns the original fd, we need our own. */
+ copy = fcntl(fd, F_DUPFD_CLOEXEC, 3);
+ if (copy < 0)
+ return -errno;
+
+ /* Release any fd previously stored in this slot before replacing it. */
+ asynchronous_close(*p);
+ *p = copy;
+ *b = true;
+ }
+
+ return 1;
+}
+/* Boilerplate setters for simple enum-/string-typed transient service
+ * properties, generated from the corresponding from_string/to_string helpers
+ * by the shared bus-util macro templates. */
+static BUS_DEFINE_SET_TRANSIENT_PARSE(notify_access, NotifyAccess, notify_access_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(service_type, ServiceType, service_type_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(service_exit_type, ServiceExitType, service_exit_type_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(service_restart, ServiceRestart, service_restart_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(service_restart_mode, ServiceRestartMode, service_restart_mode_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(oom_policy, OOMPolicy, oom_policy_from_string);
+static BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(bus_name, sd_bus_service_name_is_valid);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(timeout_failure_mode, ServiceTimeoutFailureMode, service_timeout_failure_mode_from_string);
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING(reload_signal, "i", int32_t, int, "%" PRIi32, signal_to_string_with_check);
+
+/* Handles service-specific transient properties by name. Returns >0 when the
+ * property was consumed, 0 when NAME is not a service property (so the caller
+ * can try other property tables), <0 on error. When the write flags are a
+ * no-op, values are validated/parsed but not applied. */
+static int bus_service_set_transient_property(
+ Service *s,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ Unit *u = UNIT(s);
+ ServiceExecCommand ci;
+ int r;
+
+ assert(s);
+ assert(name);
+ assert(message);
+
+ /* Everything written from here on is marked as unit-private. */
+ flags |= UNIT_PRIVATE;
+
+ if (streq(name, "PermissionsStartOnly"))
+ return bus_set_transient_bool(u, name, &s->permissions_start_only, message, flags, error);
+
+ if (streq(name, "RootDirectoryStartOnly"))
+ return bus_set_transient_bool(u, name, &s->root_directory_start_only, message, flags, error);
+
+ if (streq(name, "RemainAfterExit"))
+ return bus_set_transient_bool(u, name, &s->remain_after_exit, message, flags, error);
+
+ if (streq(name, "GuessMainPID"))
+ return bus_set_transient_bool(u, name, &s->guess_main_pid, message, flags, error);
+
+ if (streq(name, "Type"))
+ return bus_set_transient_service_type(u, name, &s->type, message, flags, error);
+
+ if (streq(name, "ExitType"))
+ return bus_set_transient_service_exit_type(u, name, &s->exit_type, message, flags, error);
+
+ if (streq(name, "OOMPolicy"))
+ return bus_set_transient_oom_policy(u, name, &s->oom_policy, message, flags, error);
+
+ if (streq(name, "RestartUSec"))
+ return bus_set_transient_usec(u, name, &s->restart_usec, message, flags, error);
+
+ if (streq(name, "RestartSteps"))
+ return bus_set_transient_unsigned(u, name, &s->restart_steps, message, flags, error);
+
+ if (streq(name, "RestartMaxDelayUSec"))
+ return bus_set_transient_usec(u, name, &s->restart_max_delay_usec, message, flags, error);
+
+ if (streq(name, "TimeoutStartUSec")) {
+ r = bus_set_transient_usec(u, name, &s->timeout_start_usec, message, flags, error);
+ /* Remember that the start timeout was set explicitly. */
+ if (r >= 0 && !UNIT_WRITE_FLAGS_NOOP(flags))
+ s->start_timeout_defined = true;
+
+ return r;
+ }
+
+ if (streq(name, "TimeoutStopUSec"))
+ return bus_set_transient_usec(u, name, &s->timeout_stop_usec, message, flags, error);
+
+ if (streq(name, "TimeoutAbortUSec")) {
+ r = bus_set_transient_usec(u, name, &s->timeout_abort_usec, message, flags, error);
+ /* Remember that the abort timeout was set explicitly. */
+ if (r >= 0 && !UNIT_WRITE_FLAGS_NOOP(flags))
+ s->timeout_abort_set = true;
+ return r;
+ }
+
+ if (streq(name, "TimeoutStartFailureMode"))
+ return bus_set_transient_timeout_failure_mode(u, name, &s->timeout_start_failure_mode, message, flags, error);
+
+ if (streq(name, "TimeoutStopFailureMode"))
+ return bus_set_transient_timeout_failure_mode(u, name, &s->timeout_stop_failure_mode, message, flags, error);
+
+ if (streq(name, "RuntimeMaxUSec"))
+ return bus_set_transient_usec(u, name, &s->runtime_max_usec, message, flags, error);
+
+ if (streq(name, "RuntimeRandomizedExtraUSec"))
+ return bus_set_transient_usec(u, name, &s->runtime_rand_extra_usec, message, flags, error);
+
+ if (streq(name, "WatchdogUSec"))
+ return bus_set_transient_usec(u, name, &s->watchdog_usec, message, flags, error);
+
+ if (streq(name, "FileDescriptorStoreMax"))
+ return bus_set_transient_unsigned(u, name, &s->n_fd_store_max, message, flags, error);
+
+ if (streq(name, "FileDescriptorStorePreserve"))
+ return bus_set_transient_exec_preserve_mode(u, name, &s->fd_store_preserve_mode, message, flags, error);
+
+ if (streq(name, "NotifyAccess"))
+ return bus_set_transient_notify_access(u, name, &s->notify_access, message, flags, error);
+
+ if (streq(name, "PIDFile")) {
+ _cleanup_free_ char *n = NULL;
+ const char *v, *e;
+
+ r = sd_bus_message_read(message, "s", &v);
+ if (r < 0)
+ return r;
+
+ if (!isempty(v)) {
+ /* Relative paths are taken relative to the runtime directory prefix. */
+ n = path_make_absolute(v, u->manager->prefix[EXEC_DIRECTORY_RUNTIME]);
+ if (!n)
+ return -ENOMEM;
+
+ path_simplify(n);
+
+ if (!path_is_normalized(n))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "PIDFile= path '%s' is not valid", n);
+
+ /* Accept legacy /var/run/ paths, but rewrite them to /run/ and warn. */
+ e = path_startswith(n, "/var/run/");
+ if (e) {
+ char *z;
+
+ z = path_join("/run", e);
+ if (!z)
+ return log_oom();
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags))
+ log_unit_notice(u, "Transient unit's PIDFile= property references path below legacy directory /var/run, updating %s %s %s; please update client accordingly.",
+ n, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), z);
+
+ free_and_replace(n, z);
+ }
+ }
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ free_and_replace(s->pid_file, n);
+ unit_write_settingf(u, flags, name, "%s=%s", name, strempty(s->pid_file));
+ }
+
+ return 1;
+ }
+
+ if (streq(name, "USBFunctionDescriptors"))
+ return bus_set_transient_path(u, name, &s->usb_function_descriptors, message, flags, error);
+
+ if (streq(name, "USBFunctionStrings"))
+ return bus_set_transient_path(u, name, &s->usb_function_strings, message, flags, error);
+
+ if (streq(name, "BusName"))
+ return bus_set_transient_bus_name(u, name, &s->bus_name, message, flags, error);
+
+ if (streq(name, "Restart"))
+ return bus_set_transient_service_restart(u, name, &s->restart, message, flags, error);
+
+ if (streq(name, "RestartMode"))
+ return bus_set_transient_service_restart_mode(u, name, &s->restart_mode, message, flags, error);
+
+ if (streq(name, "RestartPreventExitStatus"))
+ return bus_set_transient_exit_status(u, name, &s->restart_prevent_status, message, flags, error);
+
+ if (streq(name, "RestartForceExitStatus"))
+ return bus_set_transient_exit_status(u, name, &s->restart_force_status, message, flags, error);
+
+ if (streq(name, "SuccessExitStatus"))
+ return bus_set_transient_exit_status(u, name, &s->success_status, message, flags, error);
+
+ /* Exec*= property names and their *Ex variants map directly onto slots of
+ * the exec_command[] array. */
+ ci = service_exec_command_from_string(name);
+ if (ci < 0)
+ ci = service_exec_ex_command_from_string(name);
+ if (ci >= 0)
+ return bus_set_transient_exec_command(u, name, &s->exec_command[ci], message, flags, error);
+
+ if (streq(name, "StandardInputFileDescriptor"))
+ return bus_set_transient_std_fd(u, name, &s->stdin_fd, &s->exec_context.stdio_as_fds, message, flags, error);
+
+ if (streq(name, "StandardOutputFileDescriptor"))
+ return bus_set_transient_std_fd(u, name, &s->stdout_fd, &s->exec_context.stdio_as_fds, message, flags, error);
+
+ if (streq(name, "StandardErrorFileDescriptor"))
+ return bus_set_transient_std_fd(u, name, &s->stderr_fd, &s->exec_context.stdio_as_fds, message, flags, error);
+
+ if (streq(name, "OpenFile")) {
+ const char *path, *fdname;
+ uint64_t offlags;
+
+ r = sd_bus_message_enter_container(message, 'a', "(sst)");
+ if (r < 0)
+ return r;
+
+ /* Each (path, fdname, flags) triple is validated and appended to the
+ * unit's open_files list. */
+ while ((r = sd_bus_message_read(message, "(sst)", &path, &fdname, &offlags)) > 0) {
+ _cleanup_(open_file_freep) OpenFile *of = NULL;
+ _cleanup_free_ char *ofs = NULL;
+
+ of = new(OpenFile, 1);
+ if (!of)
+ return -ENOMEM;
+
+ *of = (OpenFile) {
+ .path = strdup(path),
+ .fdname = strdup(fdname),
+ .flags = offlags,
+ };
+
+ if (!of->path || !of->fdname)
+ return -ENOMEM;
+
+ r = open_file_validate(of);
+ if (r < 0)
+ return r;
+
+ if (UNIT_WRITE_FLAGS_NOOP(flags))
+ continue;
+
+ r = open_file_to_string(of, &ofs);
+ if (r < 0)
+ return sd_bus_error_set_errnof(
+ error, r, "Failed to convert OpenFile= value to string: %m");
+
+ LIST_APPEND(open_files, s->open_files, TAKE_PTR(of));
+ unit_write_settingf(u, flags | UNIT_ESCAPE_SPECIFIERS, name, "OpenFile=%s", ofs);
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ return 1;
+ }
+
+ if (streq(name, "ReloadSignal"))
+ return bus_set_transient_reload_signal(u, name, &s->reload_signal, message, flags, error);
+
+ return 0;
+}
+
+/* Property dispatcher for .service units: cgroup properties are tried first;
+ * units that are transient and still stubs additionally accept the
+ * service-specific, exec-context and kill-context transient properties.
+ * Returns non-zero when a property was consumed, 0 when NAME is unknown. */
+int bus_service_set_property(
+ Unit *u,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ Service *s = SERVICE(u);
+ int ret;
+
+ assert(s);
+ assert(name);
+ assert(message);
+
+ ret = bus_cgroup_set_property(u, &s->cgroup_context, name, message, flags, error);
+ if (ret != 0)
+ return ret;
+
+ /* Anything beyond cgroup properties is only settable on transient units
+ * that have not been fully loaded yet. */
+ if (!u->transient || u->load_state != UNIT_STUB)
+ return 0;
+
+ ret = bus_service_set_transient_property(s, name, message, flags, error);
+ if (ret != 0)
+ return ret;
+
+ ret = bus_exec_context_set_transient_property(u, &s->exec_context, name, message, flags, error);
+ if (ret != 0)
+ return ret;
+
+ return bus_kill_context_set_transient_property(u, &s->kill_context, name, message, flags, error);
+}
+
+/* Commits a batch of property changes applied via bus_service_set_property()
+ * by re-realizing the unit's cgroup. Always returns 0. */
+int bus_service_commit_properties(Unit *u) {
+ assert(u);
+
+ unit_realize_cgroup(u);
+
+ return 0;
+}
diff --git a/src/core/dbus-service.h b/src/core/dbus-service.h
new file mode 100644
index 0000000..aea6cf7
--- /dev/null
+++ b/src/core/dbus-service.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "unit.h"
+
+extern const sd_bus_vtable bus_service_vtable[];
+
+int bus_service_set_property(Unit *u, const char *name, sd_bus_message *i, UnitWriteFlags flags, sd_bus_error *error);
+int bus_service_method_bind_mount(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_service_method_mount_image(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_service_commit_properties(Unit *u);
+int bus_service_method_dump_file_descriptor_store(sd_bus_message *message, void *userdata, sd_bus_error *error);
diff --git a/src/core/dbus-slice.c b/src/core/dbus-slice.c
new file mode 100644
index 0000000..de41d65
--- /dev/null
+++ b/src/core/dbus-slice.c
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "dbus-cgroup.h"
+#include "dbus-slice.h"
+#include "slice.h"
+#include "unit.h"
+
+/* Slices define no type-specific D-Bus properties of their own; everything
+ * they expose comes from the shared unit/cgroup vtables. */
+const sd_bus_vtable bus_slice_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ SD_BUS_VTABLE_END
+};
+
+/* Property dispatcher for .slice units. Slices carry only cgroup properties,
+ * so this delegates straight to the shared cgroup property handler. Returns
+ * non-zero when the property was consumed, 0 when NAME is unknown. */
+int bus_slice_set_property(
+ Unit *u,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ Slice *s = SLICE(u);
+
+ /* Same precondition set (and order) as the sibling service/socket
+ * dispatchers; the original omitted the message assertion. */
+ assert(u);
+ assert(name);
+ assert(message);
+
+ return bus_cgroup_set_property(u, &s->cgroup_context, name, message, flags, error);
+}
+
+/* Commits a batch of property changes applied via bus_slice_set_property()
+ * by re-realizing the unit's cgroup. Always returns 0. */
+int bus_slice_commit_properties(Unit *u) {
+ assert(u);
+
+ unit_realize_cgroup(u);
+
+ return 0;
+}
diff --git a/src/core/dbus-slice.h b/src/core/dbus-slice.h
new file mode 100644
index 0000000..eb71916
--- /dev/null
+++ b/src/core/dbus-slice.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "unit.h"
+
+extern const sd_bus_vtable bus_slice_vtable[];
+
+int bus_slice_set_property(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_slice_commit_properties(Unit *u);
diff --git a/src/core/dbus-socket.c b/src/core/dbus-socket.c
new file mode 100644
index 0000000..e77e9e5
--- /dev/null
+++ b/src/core/dbus-socket.c
@@ -0,0 +1,470 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "bus-get-properties.h"
+#include "dbus-cgroup.h"
+#include "dbus-execute.h"
+#include "dbus-kill.h"
+#include "dbus-socket.h"
+#include "dbus-util.h"
+#include "fd-util.h"
+#include "ip-protocol-list.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "socket.h"
+#include "socket-netlink.h"
+#include "socket-util.h"
+#include "string-util.h"
+#include "unit.h"
+
+/* Macro-generated D-Bus property getters: enum-to-string conversions for
+ * result/bind-ipv6-only/timestamping, and the fd name via socket_fdname(). */
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, socket_result, SocketResult);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_bind_ipv6_only, socket_address_bind_ipv6_only, SocketAddressBindIPv6Only);
+static BUS_DEFINE_PROPERTY_GET(property_get_fdname, "s", Socket, socket_fdname);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_timestamping, socket_timestamping, SocketTimestamping);
+
+/* D-Bus getter for the "Listen" property: serializes every configured socket
+ * port as a (type string, address string) pair into an a(ss) array. */
+static int property_get_listen(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Socket *s = SOCKET(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+ assert(s);
+
+ r = sd_bus_message_open_container(reply, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(port, p, s->ports) {
+ _cleanup_free_ char *address = NULL;
+
+ r = socket_port_to_address(p, &address);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(reply, "(ss)", socket_port_type_to_string(p), address);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+/* D-Bus property table for .socket units. Most entries map a property name
+ * directly onto a Socket struct field via offsetof(); "Listen" and
+ * "FileDescriptorName" use custom getters, and the Exec* command lists are
+ * exported via the shared exec-command vtable macros. */
+const sd_bus_vtable bus_socket_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ SD_BUS_PROPERTY("BindIPv6Only", "s", property_get_bind_ipv6_only, offsetof(Socket, bind_ipv6_only), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Backlog", "u", bus_property_get_unsigned, offsetof(Socket, backlog), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TimeoutUSec", "t", bus_property_get_usec, offsetof(Socket, timeout_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("BindToDevice", "s", NULL, offsetof(Socket, bind_to_device), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SocketUser", "s", NULL, offsetof(Socket, user), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SocketGroup", "s", NULL, offsetof(Socket, group), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SocketMode", "u", bus_property_get_mode, offsetof(Socket, socket_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DirectoryMode", "u", bus_property_get_mode, offsetof(Socket, directory_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Accept", "b", bus_property_get_bool, offsetof(Socket, accept), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("FlushPending", "b", bus_property_get_bool, offsetof(Socket, flush_pending), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Writable", "b", bus_property_get_bool, offsetof(Socket, writable), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("KeepAlive", "b", bus_property_get_bool, offsetof(Socket, keep_alive), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("KeepAliveTimeUSec", "t", bus_property_get_usec, offsetof(Socket, keep_alive_time), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("KeepAliveIntervalUSec", "t", bus_property_get_usec, offsetof(Socket, keep_alive_interval), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("KeepAliveProbes", "u", bus_property_get_unsigned, offsetof(Socket, keep_alive_cnt), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DeferAcceptUSec" , "t", bus_property_get_usec, offsetof(Socket, defer_accept), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("NoDelay", "b", bus_property_get_bool, offsetof(Socket, no_delay), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Priority", "i", bus_property_get_int, offsetof(Socket, priority), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ReceiveBuffer", "t", bus_property_get_size, offsetof(Socket, receive_buffer), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SendBuffer", "t", bus_property_get_size, offsetof(Socket, send_buffer), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("IPTOS", "i", bus_property_get_int, offsetof(Socket, ip_tos), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("IPTTL", "i", bus_property_get_int, offsetof(Socket, ip_ttl), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PipeSize", "t", bus_property_get_size, offsetof(Socket, pipe_size), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("FreeBind", "b", bus_property_get_bool, offsetof(Socket, free_bind), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Transparent", "b", bus_property_get_bool, offsetof(Socket, transparent), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Broadcast", "b", bus_property_get_bool, offsetof(Socket, broadcast), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PassCredentials", "b", bus_property_get_bool, offsetof(Socket, pass_cred), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PassSecurity", "b", bus_property_get_bool, offsetof(Socket, pass_sec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PassPacketInfo", "b", bus_property_get_bool, offsetof(Socket, pass_pktinfo), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Timestamping", "s", property_get_timestamping, offsetof(Socket, timestamping), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RemoveOnStop", "b", bus_property_get_bool, offsetof(Socket, remove_on_stop), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Listen", "a(ss)", property_get_listen, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Symlinks", "as", NULL, offsetof(Socket, symlinks), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Mark", "i", bus_property_get_int, offsetof(Socket, mark), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("MaxConnections", "u", bus_property_get_unsigned, offsetof(Socket, max_connections), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("MaxConnectionsPerSource", "u", bus_property_get_unsigned, offsetof(Socket, max_connections_per_source), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("MessageQueueMaxMessages", "x", bus_property_get_long, offsetof(Socket, mq_maxmsg), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("MessageQueueMessageSize", "x", bus_property_get_long, offsetof(Socket, mq_msgsize), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TCPCongestion", "s", NULL, offsetof(Socket, tcp_congestion), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ReusePort", "b", bus_property_get_bool, offsetof(Socket, reuse_port), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SmackLabel", "s", NULL, offsetof(Socket, smack), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SmackLabelIPIn", "s", NULL, offsetof(Socket, smack_ip_in), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SmackLabelIPOut", "s", NULL, offsetof(Socket, smack_ip_out), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ControlPID", "u", bus_property_get_pid, offsetof(Socket, control_pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Socket, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("NConnections", "u", bus_property_get_unsigned, offsetof(Socket, n_connections), 0),
+ SD_BUS_PROPERTY("NAccepted", "u", bus_property_get_unsigned, offsetof(Socket, n_accepted), 0),
+ SD_BUS_PROPERTY("NRefused", "u", bus_property_get_unsigned, offsetof(Socket, n_refused), 0),
+ SD_BUS_PROPERTY("FileDescriptorName", "s", property_get_fdname, 0, 0),
+ SD_BUS_PROPERTY("SocketProtocol", "i", bus_property_get_int, offsetof(Socket, socket_protocol), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TriggerLimitIntervalUSec", "t", bus_property_get_usec, offsetof(Socket, trigger_limit.interval), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TriggerLimitBurst", "u", bus_property_get_unsigned, offsetof(Socket, trigger_limit.burst), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PollLimitIntervalUSec", "t", bus_property_get_usec, offsetof(Socket, poll_limit_interval), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PollLimitBurst", "u", bus_property_get_unsigned, offsetof(Socket, poll_limit_burst), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("UID", "u", bus_property_get_uid, offsetof(Unit, ref_uid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("GID", "u", bus_property_get_gid, offsetof(Unit, ref_gid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ BUS_EXEC_COMMAND_LIST_VTABLE("ExecStartPre", offsetof(Socket, exec_command[SOCKET_EXEC_START_PRE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_COMMAND_LIST_VTABLE("ExecStartPost", offsetof(Socket, exec_command[SOCKET_EXEC_START_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_COMMAND_LIST_VTABLE("ExecStopPre", offsetof(Socket, exec_command[SOCKET_EXEC_STOP_PRE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_COMMAND_LIST_VTABLE("ExecStopPost", offsetof(Socket, exec_command[SOCKET_EXEC_STOP_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ SD_BUS_VTABLE_END
+};
+
+/* Returns true if T can be stored in a size_t without losing any bits. */
+static bool check_size_t_truncation(uint64_t t) {
+ return t <= (uint64_t) SIZE_MAX;
+}
+
+/* Formats the SocketProtocol= value: the default IPPROTO_IP becomes the empty
+ * string, the two supported protocols get their names, anything else is NULL. */
+static const char* socket_protocol_to_string(int32_t i) {
+ switch (i) {
+ case IPPROTO_IP:
+ return "";
+ case IPPROTO_UDPLITE:
+ case IPPROTO_SCTP:
+ return ip_protocol_to_name(i);
+ default:
+ return NULL;
+ }
+}
+
+/* Boilerplate setters for scalar/enum/string transient socket properties,
+ * generated from the shared bus-util macro templates with the matching
+ * validation or parse helper. */
+static BUS_DEFINE_SET_TRANSIENT(int, "i", int32_t, int, "%" PRIi32);
+static BUS_DEFINE_SET_TRANSIENT(message_queue, "x", int64_t, long, "%" PRIi64);
+static BUS_DEFINE_SET_TRANSIENT_IS_VALID(size_t_check_truncation, "t", uint64_t, size_t, "%" PRIu64, check_size_t_truncation);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(bind_ipv6_only, SocketAddressBindIPv6Only, socket_address_bind_ipv6_only_or_bool_from_string);
+static BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(fdname, fdname_is_valid);
+static BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(ifname, ifname_valid);
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(ip_tos, "i", int32_t, int, "%" PRIi32, ip_tos_to_string_alloc);
+static BUS_DEFINE_SET_TRANSIENT_TO_STRING(socket_protocol, "i", int32_t, int, "%" PRIi32, socket_protocol_to_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(socket_timestamping, SocketTimestamping, socket_timestamping_from_string_harder);
+
+/* Handles socket-specific transient properties by name. Returns >0 when the
+ * property was consumed, 0 when NAME is not a socket property (so the caller
+ * can try other property tables), <0 on error. When the write flags are a
+ * no-op, values are validated/parsed but not applied. */
+static int bus_socket_set_transient_property(
+ Socket *s,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ SocketExecCommand ci;
+ Unit *u = UNIT(s);
+ int r;
+
+ assert(s);
+ assert(name);
+ assert(message);
+
+ /* Everything written from here on is marked as unit-private. */
+ flags |= UNIT_PRIVATE;
+
+ if (streq(name, "Accept"))
+ return bus_set_transient_bool(u, name, &s->accept, message, flags, error);
+
+ if (streq(name, "FlushPending"))
+ return bus_set_transient_bool(u, name, &s->flush_pending, message, flags, error);
+
+ if (streq(name, "Writable"))
+ return bus_set_transient_bool(u, name, &s->writable, message, flags, error);
+
+ if (streq(name, "KeepAlive"))
+ return bus_set_transient_bool(u, name, &s->keep_alive, message, flags, error);
+
+ if (streq(name, "NoDelay"))
+ return bus_set_transient_bool(u, name, &s->no_delay, message, flags, error);
+
+ if (streq(name, "FreeBind"))
+ return bus_set_transient_bool(u, name, &s->free_bind, message, flags, error);
+
+ if (streq(name, "Transparent"))
+ return bus_set_transient_bool(u, name, &s->transparent, message, flags, error);
+
+ if (streq(name, "Broadcast"))
+ return bus_set_transient_bool(u, name, &s->broadcast, message, flags, error);
+
+ if (streq(name, "PassCredentials"))
+ return bus_set_transient_bool(u, name, &s->pass_cred, message, flags, error);
+
+ if (streq(name, "PassSecurity"))
+ return bus_set_transient_bool(u, name, &s->pass_sec, message, flags, error);
+
+ if (streq(name, "PassPacketInfo"))
+ return bus_set_transient_bool(u, name, &s->pass_pktinfo, message, flags, error);
+
+ if (streq(name, "Timestamping"))
+ return bus_set_transient_socket_timestamping(u, name, &s->timestamping, message, flags, error);
+
+ if (streq(name, "ReusePort"))
+ return bus_set_transient_bool(u, name, &s->reuse_port, message, flags, error);
+
+ if (streq(name, "RemoveOnStop"))
+ return bus_set_transient_bool(u, name, &s->remove_on_stop, message, flags, error);
+
+ if (streq(name, "SELinuxContextFromNet"))
+ return bus_set_transient_bool(u, name, &s->selinux_context_from_net, message, flags, error);
+
+ if (streq(name, "Priority"))
+ return bus_set_transient_int(u, name, &s->priority, message, flags, error);
+
+ if (streq(name, "IPTTL"))
+ return bus_set_transient_int(u, name, &s->ip_ttl, message, flags, error);
+
+ if (streq(name, "Mark"))
+ return bus_set_transient_int(u, name, &s->mark, message, flags, error);
+
+ if (streq(name, "Backlog"))
+ return bus_set_transient_unsigned(u, name, &s->backlog, message, flags, error);
+
+ if (streq(name, "MaxConnections"))
+ return bus_set_transient_unsigned(u, name, &s->max_connections, message, flags, error);
+
+ if (streq(name, "MaxConnectionsPerSource"))
+ return bus_set_transient_unsigned(u, name, &s->max_connections_per_source, message, flags, error);
+
+ if (streq(name, "KeepAliveProbes"))
+ return bus_set_transient_unsigned(u, name, &s->keep_alive_cnt, message, flags, error);
+
+ if (streq(name, "TriggerLimitBurst"))
+ return bus_set_transient_unsigned(u, name, &s->trigger_limit.burst, message, flags, error);
+
+ if (streq(name, "PollLimitBurst"))
+ return bus_set_transient_unsigned(u, name, &s->poll_limit_burst, message, flags, error);
+
+ if (streq(name, "SocketMode"))
+ return bus_set_transient_mode_t(u, name, &s->socket_mode, message, flags, error);
+
+ if (streq(name, "DirectoryMode"))
+ return bus_set_transient_mode_t(u, name, &s->directory_mode, message, flags, error);
+
+ if (streq(name, "MessageQueueMaxMessages"))
+ return bus_set_transient_message_queue(u, name, &s->mq_maxmsg, message, flags, error);
+
+ if (streq(name, "MessageQueueMessageSize"))
+ return bus_set_transient_message_queue(u, name, &s->mq_msgsize, message, flags, error);
+
+ if (streq(name, "TimeoutUSec"))
+ return bus_set_transient_usec_fix_0(u, name, &s->timeout_usec, message, flags, error);
+
+ if (streq(name, "KeepAliveTimeUSec"))
+ return bus_set_transient_usec(u, name, &s->keep_alive_time, message, flags, error);
+
+ if (streq(name, "KeepAliveIntervalUSec"))
+ return bus_set_transient_usec(u, name, &s->keep_alive_interval, message, flags, error);
+
+ if (streq(name, "DeferAcceptUSec"))
+ return bus_set_transient_usec(u, name, &s->defer_accept, message, flags, error);
+
+ if (streq(name, "TriggerLimitIntervalUSec"))
+ return bus_set_transient_usec(u, name, &s->trigger_limit.interval, message, flags, error);
+
+ if (streq(name, "PollLimitIntervalUSec"))
+ return bus_set_transient_usec(u, name, &s->poll_limit_interval, message, flags, error);
+
+ if (streq(name, "SmackLabel"))
+ return bus_set_transient_string(u, name, &s->smack, message, flags, error);
+
+ /* Fix: the property is named "SmackLabelIPIn" (capital 'I'), matching the
+ * vtable entry above and systemd.socket(5); the previous "SmackLabelIPin"
+ * spelling could never match, so the property was silently unsettable. */
+ if (streq(name, "SmackLabelIPIn"))
+ return bus_set_transient_string(u, name, &s->smack_ip_in, message, flags, error);
+
+ if (streq(name, "SmackLabelIPOut"))
+ return bus_set_transient_string(u, name, &s->smack_ip_out, message, flags, error);
+
+ if (streq(name, "TCPCongestion"))
+ return bus_set_transient_string(u, name, &s->tcp_congestion, message, flags, error);
+
+ if (streq(name, "FileDescriptorName"))
+ return bus_set_transient_fdname(u, name, &s->fdname, message, flags, error);
+
+ if (streq(name, "SocketUser"))
+ return bus_set_transient_user_relaxed(u, name, &s->user, message, flags, error);
+
+ if (streq(name, "SocketGroup"))
+ return bus_set_transient_user_relaxed(u, name, &s->group, message, flags, error);
+
+ if (streq(name, "BindIPv6Only"))
+ return bus_set_transient_bind_ipv6_only(u, name, &s->bind_ipv6_only, message, flags, error);
+
+ if (streq(name, "ReceiveBuffer"))
+ return bus_set_transient_size_t_check_truncation(u, name, &s->receive_buffer, message, flags, error);
+
+ if (streq(name, "SendBuffer"))
+ return bus_set_transient_size_t_check_truncation(u, name, &s->send_buffer, message, flags, error);
+
+ if (streq(name, "PipeSize"))
+ return bus_set_transient_size_t_check_truncation(u, name, &s->pipe_size, message, flags, error);
+
+ if (streq(name, "BindToDevice"))
+ return bus_set_transient_ifname(u, name, &s->bind_to_device, message, flags, error);
+
+ if (streq(name, "IPTOS"))
+ return bus_set_transient_ip_tos(u, name, &s->ip_tos, message, flags, error);
+
+ if (streq(name, "SocketProtocol"))
+ return bus_set_transient_socket_protocol(u, name, &s->socket_protocol, message, flags, error);
+
+ /* Exec*= property names map directly onto slots of the exec_command[] array. */
+ ci = socket_exec_command_from_string(name);
+ if (ci >= 0)
+ return bus_set_transient_exec_command(u, name,
+ &s->exec_command[ci],
+ message, flags, error);
+
+ if (streq(name, "Symlinks")) {
+ _cleanup_strv_free_ char **l = NULL;
+
+ r = sd_bus_message_read_strv(message, &l);
+ if (r < 0)
+ return r;
+
+ STRV_FOREACH(p, l)
+ if (!path_is_absolute(*p))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Symlink path is not absolute: %s", *p);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ /* An empty list resets; otherwise the entries are appended. */
+ if (strv_isempty(l)) {
+ s->symlinks = strv_free(s->symlinks);
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=", name);
+ } else {
+ _cleanup_free_ char *joined = NULL;
+
+ r = strv_extend_strv(&s->symlinks, l, true);
+ if (r < 0)
+ return -ENOMEM;
+
+ joined = strv_join(l, " ");
+ if (!joined)
+ return -ENOMEM;
+
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "%s=%s", name, joined);
+ }
+ }
+
+ return 1;
+
+ } else if (streq(name, "Listen")) {
+ const char *t, *a;
+ bool empty = true;
+
+ r = sd_bus_message_enter_container(message, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ /* Each (type, address) pair becomes one SocketPort appended to s->ports. */
+ while ((r = sd_bus_message_read(message, "(ss)", &t, &a)) > 0) {
+ _cleanup_(socket_port_freep) SocketPort *p = NULL;
+
+ p = new(SocketPort, 1);
+ if (!p)
+ return log_oom();
+
+ *p = (SocketPort) {
+ .fd = -EBADF,
+ .socket = s,
+ };
+
+ p->type = socket_port_type_from_string(t);
+ if (p->type < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown Socket type: %s", t);
+
+ if (p->type != SOCKET_SOCKET) {
+ /* FIFOs, special files, mqueues etc. take a file system path. */
+ if (!path_is_absolute(a) || !path_is_valid(a))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid socket path: %s", a);
+
+ r = path_simplify_alloc(a, &p->path);
+ if (r < 0)
+ return r;
+
+ } else if (streq(t, "Netlink")) {
+ r = socket_address_parse_netlink(&p->address, a);
+ if (r < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid netlink address: %s", a);
+
+ } else {
+ r = socket_address_parse(&p->address, a);
+ if (r < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid address: %s", a);
+
+ p->address.type = socket_address_type_from_string(t);
+ if (p->address.type < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid address type: %s", t);
+
+ if (socket_address_family(&p->address) != AF_UNIX && p->address.type == SOCK_SEQPACKET)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Address family not supported: %s", a);
+ }
+
+ empty = false;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ LIST_APPEND(port, s->ports, TAKE_PTR(p));
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "Listen%s=%s", t, a);
+ }
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ /* An empty array resets all previously configured ports. */
+ if (!UNIT_WRITE_FLAGS_NOOP(flags) && empty) {
+ socket_free_ports(s);
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "ListenStream=");
+ }
+
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Property dispatcher for .socket units. Tries the cgroup properties first;
+ * transient units that are still stubs additionally accept the
+ * socket-specific, exec-context and kill-context transient properties.
+ * Returns non-zero when a property was consumed, 0 when NAME is unknown. */
+int bus_socket_set_property(
+ Unit *u,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ Socket *s = SOCKET(u);
+ int r;
+
+ /* The original repeated this assertion triplet twice; once is enough. */
+ assert(s);
+ assert(name);
+ assert(message);
+
+ r = bus_cgroup_set_property(u, &s->cgroup_context, name, message, flags, error);
+ if (r != 0)
+ return r;
+
+ if (u->transient && u->load_state == UNIT_STUB) {
+ /* This is a transient unit, let's load a little more */
+
+ r = bus_socket_set_transient_property(s, name, message, flags, error);
+ if (r != 0)
+ return r;
+
+ r = bus_exec_context_set_transient_property(u, &s->exec_context, name, message, flags, error);
+ if (r != 0)
+ return r;
+
+ r = bus_kill_context_set_transient_property(u, &s->kill_context, name, message, flags, error);
+ if (r != 0)
+ return r;
+ }
+
+ return 0;
+}
+
+/* Commits a batch of property changes applied via bus_socket_set_property()
+ * by re-realizing the unit's cgroup. Always returns 0. */
+int bus_socket_commit_properties(Unit *u) {
+ assert(u);
+
+ unit_realize_cgroup(u);
+
+ return 0;
+}
diff --git a/src/core/dbus-socket.h b/src/core/dbus-socket.h
new file mode 100644
index 0000000..f9f36a2
--- /dev/null
+++ b/src/core/dbus-socket.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "unit.h"
+
+extern const sd_bus_vtable bus_socket_vtable[];
+
+int bus_socket_set_property(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_socket_commit_properties(Unit *u);
diff --git a/src/core/dbus-swap.c b/src/core/dbus-swap.c
new file mode 100644
index 0000000..7230352
--- /dev/null
+++ b/src/core/dbus-swap.c
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+/***
+ Copyright © 2010 Maarten Lankhorst
+***/
+
+#include "bus-get-properties.h"
+#include "dbus-cgroup.h"
+#include "dbus-execute.h"
+#include "dbus-swap.h"
+#include "string-util.h"
+#include "swap.h"
+#include "unit.h"
+
+static BUS_DEFINE_PROPERTY_GET(property_get_priority, "i", Swap, swap_get_priority);
+static BUS_DEFINE_PROPERTY_GET(property_get_options, "s", Swap, swap_get_options);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, swap_result, SwapResult);
+
+const sd_bus_vtable bus_swap_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ SD_BUS_PROPERTY("What", "s", NULL, offsetof(Swap, what), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("Priority", "i", property_get_priority, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("Options", "s", property_get_options, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("TimeoutUSec", "t", bus_property_get_usec, offsetof(Swap, timeout_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ControlPID", "u", bus_property_get_pid, offsetof(Swap, control_pid.pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Swap, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("UID", "u", bus_property_get_uid, offsetof(Unit, ref_uid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("GID", "u", bus_property_get_gid, offsetof(Unit, ref_gid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ BUS_EXEC_COMMAND_VTABLE("ExecActivate", offsetof(Swap, exec_command[SWAP_EXEC_ACTIVATE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ BUS_EXEC_COMMAND_VTABLE("ExecDeactivate", offsetof(Swap, exec_command[SWAP_EXEC_DEACTIVATE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ SD_BUS_VTABLE_END
+};
+
+int bus_swap_set_property(
+ Unit *u,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ Swap *s = SWAP(u);
+
+ assert(s);
+ assert(name);
+ assert(message);
+
+ return bus_cgroup_set_property(u, &s->cgroup_context, name, message, flags, error);
+}
+
+int bus_swap_commit_properties(Unit *u) {
+ assert(u);
+
+ unit_realize_cgroup(u);
+
+ return 0;
+}
diff --git a/src/core/dbus-swap.h b/src/core/dbus-swap.h
new file mode 100644
index 0000000..9d651b5
--- /dev/null
+++ b/src/core/dbus-swap.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+/***
+ Copyright © 2010 Maarten Lankhorst
+***/
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "unit.h"
+
+extern const sd_bus_vtable bus_swap_vtable[];
+
+int bus_swap_set_property(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_swap_commit_properties(Unit *u);
diff --git a/src/core/dbus-target.c b/src/core/dbus-target.c
new file mode 100644
index 0000000..e979fb7
--- /dev/null
+++ b/src/core/dbus-target.c
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "dbus-target.h"
+#include "unit.h"
+
+const sd_bus_vtable bus_target_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ SD_BUS_VTABLE_END
+};
diff --git a/src/core/dbus-target.h b/src/core/dbus-target.h
new file mode 100644
index 0000000..fedd4a9
--- /dev/null
+++ b/src/core/dbus-target.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus-vtable.h"
+
+extern const sd_bus_vtable bus_target_vtable[];
diff --git a/src/core/dbus-timer.c b/src/core/dbus-timer.c
new file mode 100644
index 0000000..4f78a52
--- /dev/null
+++ b/src/core/dbus-timer.c
@@ -0,0 +1,364 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "bus-get-properties.h"
+#include "dbus-timer.h"
+#include "dbus-util.h"
+#include "strv.h"
+#include "timer.h"
+#include "unit.h"
+
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, timer_result, TimerResult);
+
+static int property_get_monotonic_timers(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Timer *t = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(stt)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(value, v, t->values) {
+ _cleanup_free_ char *usec = NULL;
+
+ if (v->base == TIMER_CALENDAR)
+ continue;
+
+ usec = timer_base_to_usec_string(v->base);
+ if (!usec)
+ return -ENOMEM;
+
+ r = sd_bus_message_append(reply, "(stt)", usec, v->value, v->next_elapse);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_calendar_timers(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Timer *t = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "(sst)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(value, v, t->values) {
+ _cleanup_free_ char *buf = NULL;
+
+ if (v->base != TIMER_CALENDAR)
+ continue;
+
+ r = calendar_spec_to_string(v->calendar_spec, &buf);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(reply, "(sst)", timer_base_to_string(v->base), buf, v->next_elapse);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_next_elapse_monotonic(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Timer *t = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ return sd_bus_message_append(reply, "t", timer_next_elapse_monotonic(t));
+}
+
+const sd_bus_vtable bus_timer_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ SD_BUS_PROPERTY("Unit", "s", bus_property_get_triggered_unit, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TimersMonotonic", "a(stt)", property_get_monotonic_timers, 0, SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ SD_BUS_PROPERTY("TimersCalendar", "a(sst)", property_get_calendar_timers, 0, SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ SD_BUS_PROPERTY("OnClockChange", "b", bus_property_get_bool, offsetof(Timer, on_clock_change), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("OnTimezoneChange", "b", bus_property_get_bool, offsetof(Timer, on_timezone_change), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("NextElapseUSecRealtime", "t", bus_property_get_usec, offsetof(Timer, next_elapse_realtime), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("NextElapseUSecMonotonic", "t", property_get_next_elapse_monotonic, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ BUS_PROPERTY_DUAL_TIMESTAMP("LastTriggerUSec", offsetof(Timer, last_trigger), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Timer, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("AccuracyUSec", "t", bus_property_get_usec, offsetof(Timer, accuracy_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RandomizedDelayUSec", "t", bus_property_get_usec, offsetof(Timer, random_usec), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("FixedRandomDelay", "b", bus_property_get_bool, offsetof(Timer, fixed_random_delay), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Persistent", "b", bus_property_get_bool, offsetof(Timer, persistent), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("WakeSystem", "b", bus_property_get_bool, offsetof(Timer, wake_system), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RemainAfterElapse", "b", bus_property_get_bool, offsetof(Timer, remain_after_elapse), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_VTABLE_END
+};
+
+static int timer_add_one_monotonic_spec(
+ Timer *t,
+ const char *name,
+ TimerBase base,
+ UnitWriteFlags flags,
+ usec_t usec,
+ sd_bus_error *error) {
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ TimerValue *v;
+
+ unit_write_settingf(UNIT(t), flags|UNIT_ESCAPE_SPECIFIERS, name,
+ "%s=%s",
+ timer_base_to_string(base),
+ FORMAT_TIMESPAN(usec, USEC_PER_MSEC));
+
+ v = new(TimerValue, 1);
+ if (!v)
+ return -ENOMEM;
+
+ *v = (TimerValue) {
+ .base = base,
+ .value = usec,
+ };
+
+ LIST_PREPEND(value, t->values, v);
+ }
+
+ return 1;
+}
+
+static int timer_add_one_calendar_spec(
+ Timer *t,
+ const char *name,
+ TimerBase base,
+ UnitWriteFlags flags,
+ const char *str,
+ sd_bus_error *error) {
+
+ _cleanup_(calendar_spec_freep) CalendarSpec *c = NULL;
+ int r;
+
+ r = calendar_spec_from_string(str, &c);
+ if (r == -EINVAL)
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid calendar spec");
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ unit_write_settingf(UNIT(t), flags|UNIT_ESCAPE_SPECIFIERS, name,
+ "%s=%s", timer_base_to_string(base), str);
+
+ TimerValue *v = new(TimerValue, 1);
+ if (!v)
+ return -ENOMEM;
+
+ *v = (TimerValue) {
+ .base = base,
+ .calendar_spec = TAKE_PTR(c),
+ };
+
+ LIST_PREPEND(value, t->values, v);
+ }
+
+ return 1;
+};
+
+static int bus_timer_set_transient_property(
+ Timer *t,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ Unit *u = UNIT(t);
+ int r;
+
+ assert(t);
+ assert(name);
+ assert(message);
+
+ flags |= UNIT_PRIVATE;
+
+ if (streq(name, "AccuracyUSec"))
+ return bus_set_transient_usec(u, name, &t->accuracy_usec, message, flags, error);
+
+ if (streq(name, "AccuracySec")) {
+ log_notice("Client is using obsolete AccuracySec= transient property, please use AccuracyUSec= instead.");
+ return bus_set_transient_usec(u, "AccuracyUSec", &t->accuracy_usec, message, flags, error);
+ }
+
+ if (streq(name, "RandomizedDelayUSec"))
+ return bus_set_transient_usec(u, name, &t->random_usec, message, flags, error);
+
+ if (streq(name, "FixedRandomDelay"))
+ return bus_set_transient_bool(u, name, &t->fixed_random_delay, message, flags, error);
+
+ if (streq(name, "WakeSystem"))
+ return bus_set_transient_bool(u, name, &t->wake_system, message, flags, error);
+
+ if (streq(name, "Persistent"))
+ return bus_set_transient_bool(u, name, &t->persistent, message, flags, error);
+
+ if (streq(name, "RemainAfterElapse"))
+ return bus_set_transient_bool(u, name, &t->remain_after_elapse, message, flags, error);
+
+ if (streq(name, "OnTimezoneChange"))
+ return bus_set_transient_bool(u, name, &t->on_timezone_change, message, flags, error);
+
+ if (streq(name, "OnClockChange"))
+ return bus_set_transient_bool(u, name, &t->on_clock_change, message, flags, error);
+
+ if (streq(name, "TimersMonotonic")) {
+ const char *base_name;
+ usec_t usec;
+ bool empty = true;
+
+ r = sd_bus_message_enter_container(message, 'a', "(st)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(st)", &base_name, &usec)) > 0) {
+ TimerBase b;
+
+ b = timer_base_from_string(base_name);
+ if (b < 0 || b == TIMER_CALENDAR)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Invalid timer base: %s", base_name);
+
+ r = timer_add_one_monotonic_spec(t, name, b, flags, usec, error);
+ if (r < 0)
+ return r;
+
+ empty = false;
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags) && empty) {
+ timer_free_values(t);
+ unit_write_setting(u, flags, name, "OnActiveSec=");
+ }
+
+ return 1;
+
+ } else if (streq(name, "TimersCalendar")) {
+ const char *base_name, *str;
+ bool empty = true;
+
+ r = sd_bus_message_enter_container(message, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(ss)", &base_name, &str)) > 0) {
+ TimerBase b;
+
+ b = timer_base_from_string(base_name);
+ if (b != TIMER_CALENDAR)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Invalid timer base: %s", base_name);
+
+ r = timer_add_one_calendar_spec(t, name, b, flags, str, error);
+ if (r < 0)
+ return r;
+
+ empty = false;
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags) && empty) {
+ timer_free_values(t);
+ unit_write_setting(u, flags, name, "OnCalendar=");
+ }
+
+ return 1;
+
+ } else if (STR_IN_SET(name,
+ "OnActiveSec",
+ "OnBootSec",
+ "OnStartupSec",
+ "OnUnitActiveSec",
+ "OnUnitInactiveSec")) {
+
+ TimerBase b;
+ usec_t usec;
+
+ log_notice("Client is using obsolete %s= transient property, please use TimersMonotonic= instead.", name);
+
+ b = timer_base_from_string(name);
+ if (b < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown timer base %s", name);
+
+ r = sd_bus_message_read(message, "t", &usec);
+ if (r < 0)
+ return r;
+
+ return timer_add_one_monotonic_spec(t, name, b, flags, usec, error);
+
+ } else if (streq(name, "OnCalendar")) {
+
+ const char *str;
+
+ log_notice("Client is using obsolete %s= transient property, please use TimersCalendar= instead.", name);
+
+ r = sd_bus_message_read(message, "s", &str);
+ if (r < 0)
+ return r;
+
+ return timer_add_one_calendar_spec(t, name, TIMER_CALENDAR, flags, str, error);
+ }
+
+ return 0;
+}
+
+int bus_timer_set_property(
+ Unit *u,
+ const char *name,
+ sd_bus_message *message,
+ UnitWriteFlags mode,
+ sd_bus_error *error) {
+
+ Timer *t = TIMER(u);
+
+ assert(t);
+ assert(name);
+ assert(message);
+
+ if (u->transient && u->load_state == UNIT_STUB)
+ return bus_timer_set_transient_property(t, name, message, mode, error);
+
+ return 0;
+}
diff --git a/src/core/dbus-timer.h b/src/core/dbus-timer.h
new file mode 100644
index 0000000..ac436f1
--- /dev/null
+++ b/src/core/dbus-timer.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-bus-vtable.h"
+
+#include "unit.h"
+
+extern const sd_bus_vtable bus_timer_vtable[];
+
+int bus_timer_set_property(Unit *u, const char *name, sd_bus_message *i, UnitWriteFlags flags, sd_bus_error *error);
diff --git a/src/core/dbus-unit.c b/src/core/dbus-unit.c
new file mode 100644
index 0000000..1a037b7
--- /dev/null
+++ b/src/core/dbus-unit.c
@@ -0,0 +1,2629 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "sd-bus.h"
+
+#include "alloc-util.h"
+#include "bpf-firewall.h"
+#include "bus-common-errors.h"
+#include "bus-get-properties.h"
+#include "bus-polkit.h"
+#include "cgroup-util.h"
+#include "condition.h"
+#include "dbus-job.h"
+#include "dbus-manager.h"
+#include "dbus-unit.h"
+#include "dbus-util.h"
+#include "dbus.h"
+#include "fd-util.h"
+#include "install.h"
+#include "locale-util.h"
+#include "log.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "selinux-access.h"
+#include "service.h"
+#include "signal-util.h"
+#include "special.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "user-util.h"
+#include "web-util.h"
+
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_collect_mode, collect_mode, CollectMode);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_load_state, unit_load_state, UnitLoadState);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_job_mode, job_mode, JobMode);
+static BUS_DEFINE_PROPERTY_GET(property_get_description, "s", Unit, unit_description);
+static BUS_DEFINE_PROPERTY_GET2(property_get_active_state, "s", Unit, unit_active_state, unit_active_state_to_string);
+static BUS_DEFINE_PROPERTY_GET2(property_get_freezer_state, "s", Unit, unit_freezer_state, freezer_state_to_string);
+static BUS_DEFINE_PROPERTY_GET(property_get_sub_state, "s", Unit, unit_sub_state_to_string);
+static BUS_DEFINE_PROPERTY_GET2(property_get_unit_file_state, "s", Unit, unit_get_unit_file_state, unit_file_state_to_string);
+static BUS_DEFINE_PROPERTY_GET(property_get_can_reload, "b", Unit, unit_can_reload);
+static BUS_DEFINE_PROPERTY_GET(property_get_can_start, "b", Unit, unit_can_start_refuse_manual);
+static BUS_DEFINE_PROPERTY_GET(property_get_can_stop, "b", Unit, unit_can_stop_refuse_manual);
+static BUS_DEFINE_PROPERTY_GET(property_get_can_isolate, "b", Unit, unit_can_isolate_refuse_manual);
+static BUS_DEFINE_PROPERTY_GET(property_get_can_freeze, "b", Unit, unit_can_freeze);
+static BUS_DEFINE_PROPERTY_GET(property_get_need_daemon_reload, "b", Unit, unit_need_daemon_reload);
+static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_empty_strv, "as", 0);
+
+static int property_get_can_clean(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Unit *u = userdata;
+ ExecCleanMask mask;
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = unit_can_clean(u, &mask);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_open_container(reply, 'a', "s");
+ if (r < 0)
+ return r;
+
+ for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
+ if (!FLAGS_SET(mask, 1U << t))
+ continue;
+
+ r = sd_bus_message_append(reply, "s", exec_resource_type_to_string(t));
+ if (r < 0)
+ return r;
+ }
+
+ if (FLAGS_SET(mask, EXEC_CLEAN_FDSTORE)) {
+ r = sd_bus_message_append(reply, "s", "fdstore");
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_names(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Unit *u = ASSERT_PTR(userdata);
+ const char *t;
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "s");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(reply, "s", u->id);
+ if (r < 0)
+ return r;
+
+ SET_FOREACH(t, u->aliases) {
+ r = sd_bus_message_append(reply, "s", t);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_following(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Unit *u = userdata, *f;
+
+ assert(bus);
+ assert(reply);
+ assert(u);
+
+ f = unit_following(u);
+ return sd_bus_message_append(reply, "s", f ? f->id : NULL);
+}
+
+static int property_get_dependencies(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Unit *u = userdata, *other;
+ UnitDependency d;
+ Hashmap *deps;
+ void *v;
+ int r;
+
+ assert(bus);
+ assert(reply);
+ assert(u);
+
+ d = unit_dependency_from_string(property);
+ assert_se(d >= 0);
+
+ deps = unit_get_dependencies(u, d);
+
+ r = sd_bus_message_open_container(reply, 'a', "s");
+ if (r < 0)
+ return r;
+
+ HASHMAP_FOREACH_KEY(v, other, deps) {
+ r = sd_bus_message_append(reply, "s", other->id);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_requires_mounts_for(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Hashmap **h = ASSERT_PTR(userdata);
+ const char *p;
+ void *v;
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "s");
+ if (r < 0)
+ return r;
+
+ HASHMAP_FOREACH_KEY(v, p, *h) {
+ r = sd_bus_message_append(reply, "s", p);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_unit_file_preset(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = unit_get_unit_file_preset(u);
+
+ return sd_bus_message_append(reply, "s", preset_action_past_tense_to_string(r));
+}
+
+static int property_get_job(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ _cleanup_free_ char *p = NULL;
+ Job **j = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ if (!*j)
+ return sd_bus_message_append(reply, "(uo)", 0, "/");
+
+ p = job_dbus_path(*j);
+ if (!p)
+ return -ENOMEM;
+
+ return sd_bus_message_append(reply, "(uo)", (*j)->id, p);
+}
+
+static int property_get_conditions(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ const char *(*to_string)(ConditionType type) = NULL;
+ Condition **list = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ to_string = streq(property, "Asserts") ? assert_type_to_string : condition_type_to_string;
+
+ r = sd_bus_message_open_container(reply, 'a', "(sbbsi)");
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(conditions, c, *list) {
+ int tristate;
+
+ tristate =
+ c->result == CONDITION_UNTESTED ? 0 :
+ c->result == CONDITION_SUCCEEDED ? 1 : -1;
+
+ r = sd_bus_message_append(reply, "(sbbsi)",
+ to_string(c->type),
+ c->trigger, c->negate,
+ c->parameter, tristate);
+ if (r < 0)
+ return r;
+
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static int property_get_load_error(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ _cleanup_(sd_bus_error_free) sd_bus_error e = SD_BUS_ERROR_NULL;
+ Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = bus_unit_validate_load_state(u, &e);
+ if (r < 0)
+ return sd_bus_message_append(reply, "(ss)", e.name, e.message);
+
+ return sd_bus_message_append(reply, "(ss)", NULL, NULL);
+}
+
+static int property_get_markers(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ unsigned *markers = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "s");
+ if (r < 0)
+ return r;
+
+ /* Make sure out values fit in the bitfield. */
+ assert_cc(_UNIT_MARKER_MAX <= sizeof(((Unit){}).markers) * 8);
+
+ for (UnitMarker m = 0; m < _UNIT_MARKER_MAX; m++)
+ if (FLAGS_SET(*markers, 1u << m)) {
+ r = sd_bus_message_append(reply, "s", unit_marker_to_string(m));
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+static const char *const polkit_message_for_job[_JOB_TYPE_MAX] = {
+ [JOB_START] = N_("Authentication is required to start '$(unit)'."),
+ [JOB_STOP] = N_("Authentication is required to stop '$(unit)'."),
+ [JOB_RELOAD] = N_("Authentication is required to reload '$(unit)'."),
+ [JOB_RESTART] = N_("Authentication is required to restart '$(unit)'."),
+ [JOB_TRY_RESTART] = N_("Authentication is required to restart '$(unit)'."),
+};
+
+int bus_unit_method_start_generic(
+ sd_bus_message *message,
+ Unit *u,
+ JobType job_type,
+ bool reload_if_possible,
+ sd_bus_error *error) {
+
+ BusUnitQueueFlags job_flags = reload_if_possible ? BUS_UNIT_QUEUE_RELOAD_IF_POSSIBLE : 0;
+ const char *smode, *verb;
+ JobMode mode;
+ int r;
+
+ assert(message);
+ assert(u);
+ assert(job_type >= 0 && job_type < _JOB_TYPE_MAX);
+
+ r = mac_selinux_unit_access_check(
+ u, message,
+ job_type_to_access_method(job_type),
+ error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "s", &smode);
+ if (r < 0)
+ return r;
+
+ mode = job_mode_from_string(smode);
+ if (mode < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Job mode %s invalid", smode);
+
+ if (reload_if_possible)
+ verb = strjoina("reload-or-", job_type_to_string(job_type));
+ else
+ verb = job_type_to_string(job_type);
+
+ if (sd_bus_message_is_method_call(message, NULL, "StartUnitWithFlags")) {
+ uint64_t input_flags = 0;
+
+ r = sd_bus_message_read(message, "t", &input_flags);
+ if (r < 0)
+ return r;
+ /* Let clients know that this version doesn't support any flags at the moment. */
+ if (input_flags != 0)
+ return sd_bus_reply_method_errorf(message, SD_BUS_ERROR_INVALID_ARGS,
+ "Invalid 'flags' parameter '%" PRIu64 "'",
+ input_flags);
+ }
+
+ r = bus_verify_manage_units_async_full(
+ u,
+ verb,
+ CAP_SYS_ADMIN,
+ polkit_message_for_job[job_type],
+ true,
+ message,
+ error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ return bus_unit_queue_job(message, u, job_type, mode, job_flags, error);
+}
+
+static int bus_unit_method_start(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return bus_unit_method_start_generic(message, userdata, JOB_START, false, error);
+}
+
+static int bus_unit_method_stop(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return bus_unit_method_start_generic(message, userdata, JOB_STOP, false, error);
+}
+
+static int bus_unit_method_reload(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return bus_unit_method_start_generic(message, userdata, JOB_RELOAD, false, error);
+}
+
+static int bus_unit_method_restart(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return bus_unit_method_start_generic(message, userdata, JOB_RESTART, false, error);
+}
+
+static int bus_unit_method_try_restart(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return bus_unit_method_start_generic(message, userdata, JOB_TRY_RESTART, false, error);
+}
+
+static int bus_unit_method_reload_or_restart(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return bus_unit_method_start_generic(message, userdata, JOB_RESTART, true, error);
+}
+
+static int bus_unit_method_reload_or_try_restart(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return bus_unit_method_start_generic(message, userdata, JOB_TRY_RESTART, true, error);
+}
+
+int bus_unit_method_enqueue_job(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ BusUnitQueueFlags flags = BUS_UNIT_QUEUE_VERBOSE_REPLY;
+ const char *jtype, *smode;
+ Unit *u = ASSERT_PTR(userdata);
+ JobType type;
+ JobMode mode;
+ int r;
+
+ assert(message);
+
+ r = sd_bus_message_read(message, "ss", &jtype, &smode);
+ if (r < 0)
+ return r;
+
+ /* Parse the two magic reload types "reload-or-…" manually */
+ if (streq(jtype, "reload-or-restart")) {
+ type = JOB_RESTART;
+ flags |= BUS_UNIT_QUEUE_RELOAD_IF_POSSIBLE;
+ } else if (streq(jtype, "reload-or-try-restart")) {
+ type = JOB_TRY_RESTART;
+ flags |= BUS_UNIT_QUEUE_RELOAD_IF_POSSIBLE;
+ } else {
+ /* And the rest generically */
+ type = job_type_from_string(jtype);
+ if (type < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Job type %s invalid", jtype);
+ }
+
+ mode = job_mode_from_string(smode);
+ if (mode < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Job mode %s invalid", smode);
+
+ r = mac_selinux_unit_access_check(
+ u, message,
+ job_type_to_access_method(type),
+ error);
+ if (r < 0)
+ return r;
+
+ r = bus_verify_manage_units_async_full(
+ u,
+ jtype,
+ CAP_SYS_ADMIN,
+ polkit_message_for_job[type],
+ true,
+ message,
+ error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ return bus_unit_queue_job(message, u, type, mode, flags, error);
+}
+
+int bus_unit_method_kill(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Unit *u = ASSERT_PTR(userdata);
+ int32_t value = 0;
+ const char *swho;
+ int32_t signo;
+ KillWho who;
+ int r, code;
+
+ assert(message);
+
+ r = mac_selinux_unit_access_check(u, message, "stop", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "si", &swho, &signo);
+ if (r < 0)
+ return r;
+
+ if (startswith(sd_bus_message_get_member(message), "QueueSignal")) {
+ r = sd_bus_message_read(message, "i", &value);
+ if (r < 0)
+ return r;
+
+ code = SI_QUEUE;
+ } else
+ code = SI_USER;
+
+ if (isempty(swho))
+ who = KILL_ALL;
+ else {
+ who = kill_who_from_string(swho);
+ if (who < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid who argument: %s", swho);
+ }
+
+ if (!SIGNAL_VALID(signo))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Signal number out of range.");
+
+ if (code == SI_QUEUE && !((signo >= SIGRTMIN) && (signo <= SIGRTMAX)))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Value parameter only accepted for realtime signals (SIGRTMIN…SIGRTMAX), refusing for signal SIG%s.", signal_to_string(signo));
+
+ r = bus_verify_manage_units_async_full(
+ u,
+ "kill",
+ CAP_KILL,
+ N_("Authentication is required to send a UNIX signal to the processes of '$(unit)'."),
+ true,
+ message,
+ error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = unit_kill(u, who, signo, code, value, error);
+ if (r < 0)
+ return r;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+int bus_unit_method_reset_failed(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_unit_access_check(u, message, "reload", error);
+ if (r < 0)
+ return r;
+
+ r = bus_verify_manage_units_async_full(
+ u,
+ "reset-failed",
+ CAP_SYS_ADMIN,
+ N_("Authentication is required to reset the \"failed\" state of '$(unit)'."),
+ true,
+ message,
+ error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ unit_reset_failed(u);
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+int bus_unit_method_set_properties(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Unit *u = ASSERT_PTR(userdata);
+ int runtime, r;
+
+ assert(message);
+
+ r = mac_selinux_unit_access_check(u, message, "start", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "b", &runtime);
+ if (r < 0)
+ return r;
+
+ r = bus_verify_manage_units_async_full(
+ u,
+ "set-property",
+ CAP_SYS_ADMIN,
+ N_("Authentication is required to set properties on '$(unit)'."),
+ true,
+ message,
+ error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = bus_unit_set_properties(u, message, runtime ? UNIT_RUNTIME : UNIT_PERSISTENT, true, error);
+ if (r < 0)
+ return r;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+int bus_unit_method_ref(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_unit_access_check(u, message, "start", error);
+ if (r < 0)
+ return r;
+
+ r = bus_verify_manage_units_async_full(
+ u,
+ "ref",
+ CAP_SYS_ADMIN,
+ NULL,
+ false,
+ message,
+ error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = bus_unit_track_add_sender(u, message);
+ if (r < 0)
+ return r;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+int bus_unit_method_unref(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = bus_unit_track_remove_sender(u, message);
+ if (r == -EUNATCH)
+ return sd_bus_error_set(error, BUS_ERROR_NOT_REFERENCED, "Unit has not been referenced yet.");
+ if (r < 0)
+ return r;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+/* D-Bus handler for Clean(as mask): parses an array of resource-type strings into an
+ * ExecCleanMask, asks polkit for authorization, then removes the unit's matching
+ * on-disk resources via unit_clean(). Returns 1 when parked for async polkit. */
+int bus_unit_method_clean(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ ExecCleanMask mask = 0;
+ Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_unit_access_check(u, message, "stop", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_enter_container(message, 'a', "s");
+ if (r < 0)
+ return r;
+
+ /* Accumulate all requested resource types into a single bit mask. */
+ for (;;) {
+ ExecCleanMask m;
+ const char *i;
+
+ r = sd_bus_message_read(message, "s", &i);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ m = exec_clean_mask_from_string(i);
+ if (m < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid resource type: %s", i);
+
+ mask |= m;
+ }
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ r = bus_verify_manage_units_async_full(
+ u,
+ "clean",
+ CAP_DAC_OVERRIDE,
+ N_("Authentication is required to delete files and directories associated with '$(unit)'."),
+ true,
+ message,
+ error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ /* Map the well-known unit_clean() failure modes to descriptive bus errors. */
+ r = unit_clean(u, mask);
+ if (r == -EOPNOTSUPP)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Unit '%s' does not support cleaning.", u->id);
+ if (r == -EUNATCH)
+ return sd_bus_error_set(error, BUS_ERROR_NOTHING_TO_CLEAN, "No matching resources found.");
+ if (r == -EBUSY)
+ return sd_bus_error_set(error, BUS_ERROR_UNIT_BUSY, "Unit is not inactive or has pending job.");
+ if (r < 0)
+ return r;
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+/* Shared implementation behind Freeze() and Thaw(): checks SELinux ("stop" for
+ * freeze, "start" for thaw) and polkit, invokes unit_freeze()/unit_thaw(), and
+ * defers the method reply until the freezer operation settles, unless the call
+ * completed immediately. Always returns 1 on the success path (reply sent later
+ * or already queued via the pending-freezer-message machinery). */
+static int bus_unit_method_freezer_generic(sd_bus_message *message, void *userdata, sd_bus_error *error, FreezerAction action) {
+ const char* perm;
+ int (*method)(Unit*);
+ Unit *u = ASSERT_PTR(userdata);
+ bool reply_no_delay = false;
+ int r;
+
+ assert(message);
+ assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW));
+
+ if (action == FREEZER_FREEZE) {
+ perm = "stop";
+ method = unit_freeze;
+ } else {
+ perm = "start";
+ method = unit_thaw;
+ }
+
+ r = mac_selinux_unit_access_check(u, message, perm, error);
+ if (r < 0)
+ return r;
+
+ r = bus_verify_manage_units_async_full(
+ u,
+ perm,
+ CAP_SYS_ADMIN,
+ N_("Authentication is required to freeze or thaw the processes of '$(unit)' unit."),
+ true,
+ message,
+ error);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return 1; /* No authorization for now, but the async polkit stuff will call us again when it has it */
+
+ r = method(u);
+ if (r == -EOPNOTSUPP)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Unit '%s' does not support freezing.", u->id);
+ if (r == -EBUSY)
+ return sd_bus_error_set(error, BUS_ERROR_UNIT_BUSY, "Unit has a pending job.");
+ if (r == -EHOSTDOWN)
+ return sd_bus_error_set(error, BUS_ERROR_UNIT_INACTIVE, "Unit is inactive.");
+ if (r == -EALREADY)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_FAILED, "Previously requested freezer operation for unit '%s' is still in progress.", u->id);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ /* r == 0: presumably the operation finished synchronously, so reply right away — confirm against unit_freeze()/unit_thaw(). */
+ reply_no_delay = true;
+
+ /* If an earlier freezer request is still awaiting its reply, flush it first so only
+ * one invocation is pending at a time. */
+ if (u->pending_freezer_invocation) {
+ bus_unit_send_pending_freezer_message(u, true);
+ assert(!u->pending_freezer_invocation);
+ }
+
+ u->pending_freezer_invocation = sd_bus_message_ref(message);
+
+ if (reply_no_delay) {
+ r = bus_unit_send_pending_freezer_message(u, false);
+ if (r < 0)
+ return r;
+ }
+
+ return 1;
+}
+
+/* D-Bus handler for Thaw(): thin wrapper over the shared freezer implementation. */
+int bus_unit_method_thaw(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return bus_unit_method_freezer_generic(message, userdata, error, FREEZER_THAW);
+}
+
+/* D-Bus handler for Freeze(): thin wrapper over the shared freezer implementation. */
+int bus_unit_method_freeze(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ return bus_unit_method_freezer_generic(message, userdata, error, FREEZER_FREEZE);
+}
+
+/* Property getter for "Refs" ("as"): lists the bus names currently holding a
+ * reference on this unit, repeating each name once per reference it holds.
+ * sd_bus_track_first()/next() on a NULL track simply yield nothing. */
+static int property_get_refs(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ /* Guard userdata like every other property getter in this file does. */
+ Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = sd_bus_message_open_container(reply, 'a', "s");
+ if (r < 0)
+ return r;
+
+ for (const char *i = sd_bus_track_first(u->bus_track); i; i = sd_bus_track_next(u->bus_track)) {
+ int c;
+
+ c = sd_bus_track_count_name(u->bus_track, i);
+ if (c < 0)
+ return c;
+
+ /* Add the item multiple times if the ref count for each is above 1 */
+ for (int k = 0; k < c; k++) {
+ r = sd_bus_message_append(reply, "s", i);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ return sd_bus_message_close_container(reply);
+}
+
+/* Vtable for the generic org.freedesktop.systemd1.Unit D-Bus interface: the
+ * per-unit properties (identity, dependencies, state, timestamps, policy knobs)
+ * followed by the lifecycle methods. Entry order and signatures are ABI. */
+const sd_bus_vtable bus_unit_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+
+ SD_BUS_PROPERTY("Id", "s", NULL, offsetof(Unit, id), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Names", "as", property_get_names, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Following", "s", property_get_following, 0, 0),
+ /* Dependency lists, all served by the shared property_get_dependencies getter. */
+ SD_BUS_PROPERTY("Requires", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Requisite", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Wants", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("BindsTo", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PartOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Upholds", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RequiredBy", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RequisiteOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("WantedBy", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("BoundBy", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("UpheldBy", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ConsistsOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Conflicts", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ConflictedBy", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Before", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("After", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("OnSuccess", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("OnSuccessOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("OnFailure", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("OnFailureOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Triggers", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("TriggeredBy", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PropagatesReloadTo", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ReloadPropagatedFrom", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PropagatesStopTo", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("StopPropagatedFrom", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("JoinsNamespaceOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SliceOf", "as", property_get_dependencies, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RequiresMountsFor", "as", property_get_requires_mounts_for, offsetof(Unit, requires_mounts_for), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Documentation", "as", NULL, offsetof(Unit, documentation), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Description", "s", property_get_description, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("AccessSELinuxContext", "s", NULL, offsetof(Unit, access_selinux_context), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("LoadState", "s", property_get_load_state, offsetof(Unit, load_state), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ActiveState", "s", property_get_active_state, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("FreezerState", "s", property_get_freezer_state, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("SubState", "s", property_get_sub_state, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("FragmentPath", "s", NULL, offsetof(Unit, fragment_path), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SourcePath", "s", NULL, offsetof(Unit, source_path), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DropInPaths", "as", NULL, offsetof(Unit, dropin_paths), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("UnitFileState", "s", property_get_unit_file_state, 0, 0),
+ SD_BUS_PROPERTY("UnitFilePreset", "s", property_get_unit_file_preset, 0, 0),
+ BUS_PROPERTY_DUAL_TIMESTAMP("StateChangeTimestamp", offsetof(Unit, state_change_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ BUS_PROPERTY_DUAL_TIMESTAMP("InactiveExitTimestamp", offsetof(Unit, inactive_exit_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ BUS_PROPERTY_DUAL_TIMESTAMP("ActiveEnterTimestamp", offsetof(Unit, active_enter_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ BUS_PROPERTY_DUAL_TIMESTAMP("ActiveExitTimestamp", offsetof(Unit, active_exit_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ BUS_PROPERTY_DUAL_TIMESTAMP("InactiveEnterTimestamp", offsetof(Unit, inactive_enter_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("CanStart", "b", property_get_can_start, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CanStop", "b", property_get_can_stop, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CanReload", "b", property_get_can_reload, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CanIsolate", "b", property_get_can_isolate, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CanClean", "as", property_get_can_clean, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("CanFreeze", "b", property_get_can_freeze, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Job", "(uo)", property_get_job, offsetof(Unit, job), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("StopWhenUnneeded", "b", bus_property_get_bool, offsetof(Unit, stop_when_unneeded), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RefuseManualStart", "b", bus_property_get_bool, offsetof(Unit, refuse_manual_start), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RefuseManualStop", "b", bus_property_get_bool, offsetof(Unit, refuse_manual_stop), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("AllowIsolate", "b", bus_property_get_bool, offsetof(Unit, allow_isolate), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("DefaultDependencies", "b", bus_property_get_bool, offsetof(Unit, default_dependencies), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SurviveFinalKillSignal", "b", bus_property_get_bool, offsetof(Unit, survive_final_kill_signal), SD_BUS_VTABLE_PROPERTY_CONST),
+ /* "OnSuccesJobMode" (note the missing 's') is a deliberately kept misspelled compat alias, hidden from introspection. */
+ SD_BUS_PROPERTY("OnSuccesJobMode", "s", property_get_job_mode, offsetof(Unit, on_success_job_mode), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), /* deprecated */
+ SD_BUS_PROPERTY("OnSuccessJobMode", "s", property_get_job_mode, offsetof(Unit, on_success_job_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("OnFailureJobMode", "s", property_get_job_mode, offsetof(Unit, on_failure_job_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("IgnoreOnIsolate", "b", bus_property_get_bool, offsetof(Unit, ignore_on_isolate), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("NeedDaemonReload", "b", property_get_need_daemon_reload, 0, 0),
+ SD_BUS_PROPERTY("Markers", "as", property_get_markers, offsetof(Unit, markers), 0),
+ SD_BUS_PROPERTY("JobTimeoutUSec", "t", bus_property_get_usec, offsetof(Unit, job_timeout), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("JobRunningTimeoutUSec", "t", bus_property_get_usec, offsetof(Unit, job_running_timeout), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("JobTimeoutAction", "s", bus_property_get_emergency_action, offsetof(Unit, job_timeout_action), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("JobTimeoutRebootArgument", "s", NULL, offsetof(Unit, job_timeout_reboot_arg), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ConditionResult", "b", bus_property_get_bool, offsetof(Unit, condition_result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("AssertResult", "b", bus_property_get_bool, offsetof(Unit, assert_result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ BUS_PROPERTY_DUAL_TIMESTAMP("ConditionTimestamp", offsetof(Unit, condition_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ BUS_PROPERTY_DUAL_TIMESTAMP("AssertTimestamp", offsetof(Unit, assert_timestamp), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("Conditions", "a(sbbsi)", property_get_conditions, offsetof(Unit, conditions), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ SD_BUS_PROPERTY("Asserts", "a(sbbsi)", property_get_conditions, offsetof(Unit, asserts), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
+ SD_BUS_PROPERTY("LoadError", "(ss)", property_get_load_error, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Transient", "b", bus_property_get_bool, offsetof(Unit, transient), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Perpetual", "b", bus_property_get_bool, offsetof(Unit, perpetual), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("StartLimitIntervalUSec", "t", bus_property_get_usec, offsetof(Unit, start_ratelimit.interval), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("StartLimitBurst", "u", bus_property_get_unsigned, offsetof(Unit, start_ratelimit.burst), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("StartLimitAction", "s", bus_property_get_emergency_action, offsetof(Unit, start_limit_action), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("FailureAction", "s", bus_property_get_emergency_action, offsetof(Unit, failure_action), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("FailureActionExitStatus", "i", bus_property_get_int, offsetof(Unit, failure_action_exit_status), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SuccessAction", "s", bus_property_get_emergency_action, offsetof(Unit, success_action), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("SuccessActionExitStatus", "i", bus_property_get_int, offsetof(Unit, success_action_exit_status), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RebootArgument", "s", NULL, offsetof(Unit, reboot_arg), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("InvocationID", "ay", bus_property_get_id128, offsetof(Unit, invocation_id), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+ SD_BUS_PROPERTY("CollectMode", "s", property_get_collect_mode, offsetof(Unit, collect_mode), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("Refs", "as", property_get_refs, 0, 0),
+ SD_BUS_PROPERTY("ActivationDetails", "a(ss)", bus_property_get_activation_details, offsetof(Unit, activation_details), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+
+ /* Lifecycle methods; all are unprivileged on the bus level and gate access via polkit internally. */
+ SD_BUS_METHOD_WITH_ARGS("Start",
+ SD_BUS_ARGS("s", mode),
+ SD_BUS_RESULT("o", job),
+ bus_unit_method_start,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("Stop",
+ SD_BUS_ARGS("s", mode),
+ SD_BUS_RESULT("o", job),
+ bus_unit_method_stop,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("Reload",
+ SD_BUS_ARGS("s", mode),
+ SD_BUS_RESULT("o", job),
+ bus_unit_method_reload,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("Restart",
+ SD_BUS_ARGS("s", mode),
+ SD_BUS_RESULT("o", job),
+ bus_unit_method_restart,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("TryRestart",
+ SD_BUS_ARGS("s", mode),
+ SD_BUS_RESULT("o", job),
+ bus_unit_method_try_restart,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("ReloadOrRestart",
+ SD_BUS_ARGS("s", mode),
+ SD_BUS_RESULT("o", job),
+ bus_unit_method_reload_or_restart,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("ReloadOrTryRestart",
+ SD_BUS_ARGS("s", mode),
+ SD_BUS_RESULT("o", job),
+ bus_unit_method_reload_or_try_restart,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("EnqueueJob",
+ SD_BUS_ARGS("s", job_type, "s", job_mode),
+ SD_BUS_RESULT("u", job_id, "o", job_path, "s", unit_id, "o", unit_path, "s", job_type, "a(uosos)", affected_jobs),
+ bus_unit_method_enqueue_job,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("Kill",
+ SD_BUS_ARGS("s", whom, "i", signal),
+ SD_BUS_NO_RESULT,
+ bus_unit_method_kill,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ /* QueueSignal shares the Kill handler; the extra "value" argument distinguishes it there. */
+ SD_BUS_METHOD_WITH_ARGS("QueueSignal",
+ SD_BUS_ARGS("s", whom, "i", signal, "i", value),
+ SD_BUS_NO_RESULT,
+ bus_unit_method_kill,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD("ResetFailed",
+ NULL,
+ NULL,
+ bus_unit_method_reset_failed,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("SetProperties",
+ SD_BUS_ARGS("b", runtime, "a(sv)", properties),
+ SD_BUS_NO_RESULT,
+ bus_unit_method_set_properties,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD("Ref",
+ NULL,
+ NULL,
+ bus_unit_method_ref,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD("Unref",
+ NULL,
+ NULL,
+ bus_unit_method_unref,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD_WITH_ARGS("Clean",
+ SD_BUS_ARGS("as", mask),
+ SD_BUS_NO_RESULT,
+ bus_unit_method_clean,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD("Freeze",
+ NULL,
+ NULL,
+ bus_unit_method_freeze,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_METHOD("Thaw",
+ NULL,
+ NULL,
+ bus_unit_method_thaw,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+
+ /* For dependency types we don't support anymore always return an empty array */
+ SD_BUS_PROPERTY("RequiresOverridable", "as", property_get_empty_strv, 0, SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_PROPERTY("RequisiteOverridable", "as", property_get_empty_strv, 0, SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_PROPERTY("RequiredByOverridable", "as", property_get_empty_strv, 0, SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_PROPERTY("RequisiteOfOverridable", "as", property_get_empty_strv, 0, SD_BUS_VTABLE_HIDDEN),
+ /* Obsolete alias names */
+ SD_BUS_PROPERTY("StartLimitInterval", "t", bus_property_get_usec, offsetof(Unit, start_ratelimit.interval), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+ SD_BUS_PROPERTY("StartLimitIntervalSec", "t", bus_property_get_usec, offsetof(Unit, start_ratelimit.interval), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN),
+
+ SD_BUS_VTABLE_END
+};
+
+/* Property getter for "Slice" ("s"): reports the name of the slice this unit sits in. */
+static int property_get_slice(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Unit *unit = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ return sd_bus_message_append(reply, "s", unit_slice_name(unit));
+}
+
+/* Property getter for "MemoryCurrent" ("t"): current memory usage from the unit's
+ * cgroup, or UINT64_MAX when unavailable. -ENODATA is expected and stays silent. */
+static int property_get_current_memory(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Unit *u = ASSERT_PTR(userdata);
+ uint64_t bytes = UINT64_MAX;
+ int q;
+
+ assert(bus);
+ assert(reply);
+
+ q = unit_get_memory_current(u, &bytes);
+ if (q < 0 && q != -ENODATA)
+ log_unit_warning_errno(u, q, "Failed to get current memory usage from cgroup: %m");
+
+ return sd_bus_message_append(reply, "t", bytes);
+}
+
+/* Property getter for "MemoryAvailable" ("t"): total memory still available to the
+ * unit per cgroup limits, or UINT64_MAX when unknown. -ENODATA is expected. */
+static int property_get_available_memory(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ uint64_t sz = UINT64_MAX;
+ Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = unit_get_memory_available(u, &sz);
+ if (r < 0 && r != -ENODATA)
+ log_unit_warning_errno(u, r, "Failed to get total available memory from cgroup: %m");
+
+ return sd_bus_message_append(reply, "t", sz);
+}
+
+/* Shared getter for the memory-accounting properties (MemoryPeak, MemorySwapCurrent,
+ * etc.): the property name itself selects the metric; failures are ignored and
+ * reported as UINT64_MAX. */
+static int property_get_memory_accounting(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Unit *u = ASSERT_PTR(userdata);
+ CGroupMemoryAccountingMetric metric;
+ uint64_t sz = UINT64_MAX;
+
+ assert(bus);
+ assert(reply);
+
+ /* Only wired up for property names that map to a known metric, hence assert_se(). */
+ assert_se((metric = cgroup_memory_accounting_metric_from_string(property)) >= 0);
+ (void) unit_get_memory_accounting(u, metric, &sz);
+ return sd_bus_message_append(reply, "t", sz);
+}
+
+/* Property getter for "TasksCurrent" ("t"): the unit's current task count from the
+ * pids cgroup controller, or UINT64_MAX when unavailable. -ENODATA is expected. */
+static int property_get_current_tasks(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ uint64_t cn = UINT64_MAX;
+ Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = unit_get_tasks_current(u, &cn);
+ if (r < 0 && r != -ENODATA)
+ log_unit_warning_errno(u, r, "Failed to get pids.current attribute: %m");
+
+ return sd_bus_message_append(reply, "t", cn);
+}
+
+/* Property getter for "CPUUsageNSec" ("t"): accumulated CPU time of the unit in
+ * nanoseconds, or NSEC_INFINITY when unavailable. -ENODATA is expected. */
+static int property_get_cpu_usage(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ nsec_t ns = NSEC_INFINITY;
+ Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(bus);
+ assert(reply);
+
+ r = unit_get_cpu_usage(u, &ns);
+ if (r < 0 && r != -ENODATA)
+ log_unit_warning_errno(u, r, "Failed to get cpuacct.usage attribute: %m");
+
+ return sd_bus_message_append(reply, "t", ns);
+}
+
+/* Property getter for "EffectiveCPUs" ("ay"): serializes cpuset.cpus.effective as a
+ * byte array bitmap; lookup/serialization failures yield an empty array. */
+static int property_get_cpuset_cpus(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Unit *u = ASSERT_PTR(userdata);
+ _cleanup_(cpu_set_reset) CPUSet cpus = {};
+ _cleanup_free_ uint8_t *array = NULL;
+ size_t allocated;
+
+ assert(bus);
+ assert(reply);
+
+ (void) unit_get_cpuset(u, &cpus, "cpuset.cpus.effective");
+ (void) cpu_set_to_dbus(&cpus, &array, &allocated);
+ return sd_bus_message_append_array(reply, 'y', array, allocated);
+}
+
+/* Property getter for "EffectiveMemoryNodes" ("ay"): serializes
+ * cpuset.mems.effective as a byte array bitmap; failures yield an empty array. */
+static int property_get_cpuset_mems(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Unit *u = ASSERT_PTR(userdata);
+ _cleanup_(cpu_set_reset) CPUSet mems = {};
+ _cleanup_free_ uint8_t *array = NULL;
+ size_t allocated;
+
+ assert(bus);
+ assert(reply);
+
+ (void) unit_get_cpuset(u, &mems, "cpuset.mems.effective");
+ (void) cpu_set_to_dbus(&mems, &array, &allocated);
+ return sd_bus_message_append_array(reply, 'y', array, allocated);
+}
+
+/* Property getter for "ControlGroup" ("s"). Three cases: a NULL cgroup_path means
+ * the unit has no control group at all and is reported as the empty string (NULL
+ * append); an empty cgroup_path denotes the root cgroup and is reported as "/"
+ * via empty_to_root(); anything else is passed through as-is. */
+static int property_get_cgroup(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Unit *u = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(reply);
+
+ return sd_bus_message_append(reply, "s", u->cgroup_path ? empty_to_root(u->cgroup_path) : NULL);
+}
+
+/* Appends one (cgroup-path, pid, cmdline) triple to the reply, deduplicating PIDs
+ * via 'pids'. When 'p' is NULL the process's cgroup path is looked up (a process
+ * that vanished meanwhile is silently skipped). The cmdline lookup is best-effort;
+ * a NULL cmdline is appended as an empty string by sd-bus. */
+static int append_process(sd_bus_message *reply, const char *p, PidRef *pid, Set *pids) {
+ _cleanup_free_ char *buf = NULL, *cmdline = NULL;
+ int r;
+
+ assert(reply);
+ assert(pidref_is_set(pid));
+
+ /* set_put() reporting "already present" means this PID was emitted before — skip it. */
+ r = set_put(pids, PID_TO_PTR(pid->pid));
+ if (IN_SET(r, 0, -EEXIST))
+ return 0;
+ if (r < 0)
+ return r;
+
+ if (!p) {
+ r = cg_pidref_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &buf);
+ if (r == -ESRCH)
+ return 0;
+ if (r < 0)
+ return r;
+
+ p = buf;
+ }
+
+ (void) pidref_get_cmdline(
+ pid,
+ SIZE_MAX,
+ PROCESS_CMDLINE_COMM_FALLBACK | PROCESS_CMDLINE_QUOTE,
+ &cmdline);
+
+ return sd_bus_message_append(reply,
+ "(sus)",
+ p,
+ (uint32_t) pid->pid,
+ cmdline);
+}
+
+/* Recursively appends all processes of cgroup 'p' and its subgroups to the reply,
+ * skipping kernel threads and deduplicating via 'pids'. A cgroup that disappears
+ * mid-walk (-ENOENT) is treated as empty. */
+static int append_cgroup(sd_bus_message *reply, const char *p, Set *pids) {
+ _cleanup_closedir_ DIR *d = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+ int r;
+
+ assert(reply);
+ assert(p);
+
+ r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, p, &f);
+ if (r == -ENOENT)
+ return 0;
+ if (r < 0)
+ return r;
+
+ for (;;) {
+ _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+
+ /* libvirt / qemu uses threaded mode and cgroup.procs cannot be read at the lower levels.
+ * From https://docs.kernel.org/admin-guide/cgroup-v2.html#threads, “cgroup.procs” in a
+ * threaded domain cgroup contains the PIDs of all processes in the subtree and is not
+ * readable in the subtree proper. */
+
+ r = cg_read_pidref(f, &pidref);
+ if (IN_SET(r, 0, -EOPNOTSUPP))
+ break;
+ if (r < 0)
+ return r;
+
+ /* Kernel threads are not interesting to report; be lenient if the check fails. */
+ r = pidref_is_kernel_thread(&pidref);
+ if (r == -ESRCH) /* gone by now */
+ continue;
+ if (r < 0)
+ log_debug_errno(r, "Failed to determine if " PID_FMT " is a kernel thread, assuming not: %m", pidref.pid);
+ if (r > 0)
+ continue;
+
+ r = append_process(reply, p, &pidref, pids);
+ if (r < 0)
+ return r;
+ }
+
+ r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, p, &d);
+ if (r == -ENOENT)
+ return 0;
+ if (r < 0)
+ return r;
+
+ /* Recurse into each child cgroup. */
+ for (;;) {
+ _cleanup_free_ char *g = NULL, *j = NULL;
+
+ r = cg_read_subgroup(d, &g);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ j = path_join(empty_to_root(p), g);
+ if (!j)
+ return -ENOMEM;
+
+ r = append_cgroup(reply, j, pids);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+/* D-Bus handler for GetProcesses(): replies with an a(sus) array of
+ * (cgroup-path, pid, cmdline) for every process in the unit's cgroup tree, plus
+ * the main and control PIDs which may live outside of it. Duplicates are
+ * filtered through the 'pids' set. */
+int bus_unit_method_get_processes(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+ _cleanup_set_free_ Set *pids = NULL;
+ /* Guard userdata like the sibling handlers in this file do. */
+ Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ r = mac_selinux_unit_access_check(u, message, "status", error);
+ if (r < 0)
+ return r;
+
+ pids = set_new(NULL);
+ if (!pids)
+ return -ENOMEM;
+
+ r = sd_bus_message_new_method_return(message, &reply);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_open_container(reply, 'a', "(sus)");
+ if (r < 0)
+ return r;
+
+ if (u->cgroup_path) {
+ r = append_cgroup(reply, u->cgroup_path, pids);
+ if (r < 0)
+ return r;
+ }
+
+ /* The main and control pids might live outside of the cgroup, hence fetch them separately */
+ PidRef *pid = unit_main_pid(u);
+ if (pidref_is_set(pid)) {
+ r = append_process(reply, NULL, pid, pids);
+ if (r < 0)
+ return r;
+ }
+
+ pid = unit_control_pid(u);
+ if (pidref_is_set(pid)) {
+ r = append_process(reply, NULL, pid, pids);
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_bus_message_close_container(reply);
+ if (r < 0)
+ return r;
+
+ return sd_bus_send(NULL, reply, NULL);
+}
+
+/* Shared getter for the IP accounting counters (IPIngressBytes, IPEgressPackets,
+ * etc.): the property name selects the metric; failures are ignored and reported
+ * as UINT64_MAX. */
+static int property_get_ip_counter(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ uint64_t value = UINT64_MAX;
+ Unit *u = ASSERT_PTR(userdata);
+ CGroupIPAccountingMetric metric;
+
+ assert(bus);
+ assert(reply);
+ assert(property);
+
+ /* Only wired up for property names that map to a known metric, hence assert_se(). */
+ assert_se((metric = cgroup_ip_accounting_metric_from_string(property)) >= 0);
+ (void) unit_get_ip_accounting(u, metric, &value);
+ return sd_bus_message_append(reply, "t", value);
+}
+
+/* Shared getter for the IO accounting counters (IOReadBytes, IOWriteOperations,
+ * etc.): the property name selects the metric; failures are ignored and reported
+ * as UINT64_MAX. The value is re-read rather than served from cache. */
+static int property_get_io_counter(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ uint64_t value = UINT64_MAX;
+ Unit *u = ASSERT_PTR(userdata);
+ ssize_t metric;
+
+ assert(bus);
+ assert(reply);
+ assert(property);
+
+ assert_se((metric = cgroup_io_accounting_metric_from_string(property)) >= 0);
+ (void) unit_get_io_accounting(u, metric, /* allow_cache= */ false, &value);
+ return sd_bus_message_append(reply, "t", value);
+}
+
+/* D-Bus handler for AttachProcesses(s subcgroup, au pids): migrates the given PIDs
+ * into this unit's cgroup (optionally below a subcgroup). Only allowed on
+ * delegated, active units; non-root senders may only attach their own processes
+ * into units owned by the same UID. */
+int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+ _cleanup_set_free_ Set *pids = NULL;
+ /* Guard userdata like the sibling handlers in this file do. */
+ Unit *u = ASSERT_PTR(userdata);
+ const char *path;
+ int r;
+
+ assert(message);
+
+ /* This migrates the processes with the specified PIDs into the cgroup of this unit, optionally below a
+ * specified cgroup path. Obviously this only works for units that actually maintain a cgroup
+ * representation. If a process is already in the cgroup no operation is executed – in this case the specified
+ * subcgroup path has no effect! */
+
+ r = mac_selinux_unit_access_check(u, message, "start", error);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_read(message, "s", &path);
+ if (r < 0)
+ return r;
+
+ path = empty_to_null(path);
+ if (path) {
+ if (!path_is_absolute(path))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Control group path is not absolute: %s", path);
+
+ if (!path_is_normalized(path))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Control group path is not normalized: %s", path);
+ }
+
+ if (!unit_cgroup_delegate(u))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Process migration not available on non-delegated units.");
+
+ if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u)))
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Unit is not active, refusing.");
+
+ r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID|SD_BUS_CREDS_PID, &creds);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_enter_container(message, 'a', "u");
+ if (r < 0)
+ return r;
+ for (;;) {
+ _cleanup_(pidref_freep) PidRef *pidref = NULL;
+ uid_t process_uid, sender_uid;
+ uint32_t upid;
+ pid_t pid;
+
+ r = sd_bus_message_read(message, "u", &upid);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ /* PID 0 is shorthand for "the sender itself". */
+ if (upid == 0) {
+ r = sd_bus_creds_get_pid(creds, &pid);
+ if (r < 0)
+ return r;
+ } else
+ /* Fix: the wire value is a PID, so cast to pid_t, not uid_t. */
+ pid = (pid_t) upid;
+
+ r = pidref_new_from_pid(pid, &pidref);
+ if (r < 0)
+ return r;
+
+ /* Filter out duplicates */
+ if (set_contains(pids, pidref))
+ continue;
+
+ /* Check if this process is suitable for attaching to this unit */
+ r = unit_pid_attachable(u, pidref, error);
+ if (r < 0)
+ return r;
+
+ /* Let's query the sender's UID, so that we can make our security decisions */
+ r = sd_bus_creds_get_euid(creds, &sender_uid);
+ if (r < 0)
+ return r;
+
+ /* Let's validate security: if the sender is root, then all is OK. If the sender is any other unit,
+ * then the process' UID and the target unit's UID have to match the sender's UID */
+ if (sender_uid != 0 && sender_uid != getuid()) {
+ r = pidref_get_uid(pidref, &process_uid);
+ if (r < 0)
+ return sd_bus_error_set_errnof(error, r, "Failed to retrieve process UID: %m");
+
+ if (process_uid != sender_uid)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Process " PID_FMT " not owned by client's UID. Refusing.", pid);
+ if (process_uid != u->ref_uid)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Process " PID_FMT " not owned by target unit's UID. Refusing.", pid);
+ }
+
+ r = set_ensure_consume(&pids, &pidref_hash_ops_free, TAKE_PTR(pidref));
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ r = unit_attach_pids_to_cgroup(u, pids, path);
+ if (r < 0)
+ return sd_bus_error_set_errnof(error, r, "Failed to attach processes to control group: %m");
+
+ return sd_bus_reply_method_return(message, NULL);
+}
+
+/* Vtable for the cgroup-related portion of the unit's D-Bus interface: resource
+ * accounting properties plus the process enumeration/migration methods. */
+const sd_bus_vtable bus_unit_cgroup_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ SD_BUS_PROPERTY("Slice", "s", property_get_slice, 0, 0),
+ SD_BUS_PROPERTY("ControlGroup", "s", property_get_cgroup, 0, 0),
+ SD_BUS_PROPERTY("ControlGroupId", "t", NULL, offsetof(Unit, cgroup_id), 0),
+ SD_BUS_PROPERTY("MemoryCurrent", "t", property_get_current_memory, 0, 0),
+ SD_BUS_PROPERTY("MemoryPeak", "t", property_get_memory_accounting, 0, 0),
+ SD_BUS_PROPERTY("MemorySwapCurrent", "t", property_get_memory_accounting, 0, 0),
+ SD_BUS_PROPERTY("MemorySwapPeak", "t", property_get_memory_accounting, 0, 0),
+ SD_BUS_PROPERTY("MemoryZSwapCurrent", "t", property_get_memory_accounting, 0, 0),
+ SD_BUS_PROPERTY("MemoryAvailable", "t", property_get_available_memory, 0, 0),
+ SD_BUS_PROPERTY("CPUUsageNSec", "t", property_get_cpu_usage, 0, 0),
+ SD_BUS_PROPERTY("EffectiveCPUs", "ay", property_get_cpuset_cpus, 0, 0),
+ SD_BUS_PROPERTY("EffectiveMemoryNodes", "ay", property_get_cpuset_mems, 0, 0),
+ SD_BUS_PROPERTY("TasksCurrent", "t", property_get_current_tasks, 0, 0),
+ SD_BUS_PROPERTY("IPIngressBytes", "t", property_get_ip_counter, 0, 0),
+ SD_BUS_PROPERTY("IPIngressPackets", "t", property_get_ip_counter, 0, 0),
+ SD_BUS_PROPERTY("IPEgressBytes", "t", property_get_ip_counter, 0, 0),
+ SD_BUS_PROPERTY("IPEgressPackets", "t", property_get_ip_counter, 0, 0),
+ SD_BUS_PROPERTY("IOReadBytes", "t", property_get_io_counter, 0, 0),
+ SD_BUS_PROPERTY("IOReadOperations", "t", property_get_io_counter, 0, 0),
+ SD_BUS_PROPERTY("IOWriteBytes", "t", property_get_io_counter, 0, 0),
+ SD_BUS_PROPERTY("IOWriteOperations", "t", property_get_io_counter, 0, 0),
+
+ SD_BUS_METHOD_WITH_ARGS("GetProcesses",
+ SD_BUS_NO_ARGS,
+ SD_BUS_ARGS("a(sus)", processes),
+ bus_unit_method_get_processes,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+
+ SD_BUS_METHOD_WITH_ARGS("AttachProcesses",
+ SD_BUS_ARGS("s", subcgroup, "au", pids),
+ SD_BUS_NO_RESULT,
+ bus_unit_method_attach_processes,
+ SD_BUS_VTABLE_UNPRIVILEGED),
+
+ SD_BUS_VTABLE_END
+};
+
+/* Emits the "UnitNew" signal for unit 'userdata' on one bus connection.
+ * Invoked once per connection via bus_foreach_bus(); returns negative
+ * errno on failure. */
+static int send_new_signal(sd_bus *bus, void *userdata) {
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
+        _cleanup_free_ char *p = NULL;
+        Unit *u = ASSERT_PTR(userdata);
+        int r;
+
+        assert(bus);
+
+        p = unit_dbus_path(u);
+        if (!p)
+                return -ENOMEM;
+
+        r = sd_bus_message_new_signal(
+                        bus,
+                        &m,
+                        "/org/freedesktop/systemd1",
+                        "org.freedesktop.systemd1.Manager",
+                        "UnitNew");
+        if (r < 0)
+                return r;
+
+        /* Signal payload: unit id plus its object path */
+        r = sd_bus_message_append(m, "so", u->id, p);
+        if (r < 0)
+                return r;
+
+        return sd_bus_send(bus, m, NULL);
+}
+
+/* Emits PropertiesChanged for unit 'userdata' on one bus connection.
+ * Invoked once per connection via bus_foreach_bus(). */
+static int send_changed_signal(sd_bus *bus, void *userdata) {
+        _cleanup_free_ char *p = NULL;
+        Unit *u = ASSERT_PTR(userdata);
+        int r;
+
+        assert(bus);
+
+        p = unit_dbus_path(u);
+        if (!p)
+                return -ENOMEM;
+
+        /* Send a properties changed signal. First for the specific
+         * type, then for the generic unit. The clients may rely on
+         * this order to get atomic behavior if needed. */
+
+        r = sd_bus_emit_properties_changed_strv(
+                        bus, p,
+                        unit_dbus_interface_from_type(u->type),
+                        NULL);
+        if (r < 0)
+                return r;
+
+        return sd_bus_emit_properties_changed_strv(
+                        bus, p,
+                        "org.freedesktop.systemd1.Unit",
+                        NULL);
+}
+
+/* Broadcasts either UnitNew (first announcement) or PropertiesChanged
+ * (subsequent changes) for the unit on all attached buses, and dequeues
+ * the unit from the dbus queue. Failures are logged at debug level only. */
+void bus_unit_send_change_signal(Unit *u) {
+        int r;
+        assert(u);
+
+        if (u->in_dbus_queue) {
+                LIST_REMOVE(dbus_queue, u->manager->dbus_unit_queue, u);
+                u->in_dbus_queue = false;
+
+                /* The unit might be good to be GC once its pending signals have been sent */
+                unit_add_to_gc_queue(u);
+        }
+
+        /* Units without an id cannot be announced on the bus */
+        if (!u->id)
+                return;
+
+        r = bus_foreach_bus(u->manager, u->bus_track, u->sent_dbus_new_signal ? send_changed_signal : send_new_signal, u);
+        if (r < 0)
+                log_unit_debug_errno(u, r, "Failed to send unit change signal for %s: %m", u->id);
+
+        u->sent_dbus_new_signal = true;
+}
+
+/* Flushes a queued change signal for the unit, if any. 'including_new'
+ * forces the signal out even for units that were never announced via
+ * UnitNew before. */
+void bus_unit_send_pending_change_signal(Unit *u, bool including_new) {
+
+        /* Sends out any pending change signals, but only if they really are pending. This call is used when we are
+         * about to change state in order to force out a PropertiesChanged signal beforehand if there was one pending
+         * so that clients can follow the full state transition */
+
+        if (!u->in_dbus_queue) /* If not enqueued, don't bother */
+                return;
+
+        if (!u->sent_dbus_new_signal && !including_new) /* If the unit was never announced, don't bother, it's fine if
+                                                         * the unit appears in the new state right-away (except if the
+                                                         * caller explicitly asked us to send it anyway) */
+                return;
+
+        if (MANAGER_IS_RELOADING(u->manager)) /* Don't generate unnecessary PropertiesChanged signals for the same unit
+                                               * when we are reloading. */
+                return;
+
+        bus_unit_send_change_signal(u);
+}
+
+/* Replies to a queued Freeze()/Thaw() method invocation, either with a
+ * success return or — if 'cancelled' — with BUS_ERROR_FREEZE_CANCELLED.
+ * Clears the stored pending invocation in either case. Returns 0 if there
+ * was nothing pending; send failures are logged but not propagated. */
+int bus_unit_send_pending_freezer_message(Unit *u, bool cancelled) {
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+        int r;
+
+        assert(u);
+
+        if (!u->pending_freezer_invocation)
+                return 0;
+
+        if (cancelled)
+                r = sd_bus_message_new_method_error(
+                                u->pending_freezer_invocation,
+                                &reply,
+                                &SD_BUS_ERROR_MAKE_CONST(
+                                                BUS_ERROR_FREEZE_CANCELLED, "Freeze operation aborted"));
+        else
+                r = sd_bus_message_new_method_return(u->pending_freezer_invocation, &reply);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_send(NULL, reply, NULL);
+        if (r < 0)
+                log_warning_errno(r, "Failed to send queued message, ignoring: %m");
+
+        u->pending_freezer_invocation = sd_bus_message_unref(u->pending_freezer_invocation);
+
+        return 0;
+}
+
+/* Emits the "UnitRemoved" signal for unit 'userdata' on one bus
+ * connection. Invoked once per connection via bus_foreach_bus(). */
+static int send_removed_signal(sd_bus *bus, void *userdata) {
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
+        _cleanup_free_ char *p = NULL;
+        Unit *u = ASSERT_PTR(userdata);
+        int r;
+
+        assert(bus);
+
+        p = unit_dbus_path(u);
+        if (!p)
+                return -ENOMEM;
+
+        r = sd_bus_message_new_signal(
+                        bus,
+                        &m,
+                        "/org/freedesktop/systemd1",
+                        "org.freedesktop.systemd1.Manager",
+                        "UnitRemoved");
+        if (r < 0)
+                return r;
+
+        /* Signal payload: unit id plus its object path */
+        r = sd_bus_message_append(m, "so", u->id, p);
+        if (r < 0)
+                return r;
+
+        return sd_bus_send(bus, m, NULL);
+}
+
+/* Broadcasts UnitRemoved for the unit on all attached buses. Any still
+ * pending new/changed signal is flushed first so clients observe a
+ * consistent lifecycle. Failures are logged at debug level only. */
+void bus_unit_send_removed_signal(Unit *u) {
+        int r;
+        assert(u);
+
+        if (!u->sent_dbus_new_signal || u->in_dbus_queue)
+                bus_unit_send_change_signal(u);
+
+        if (!u->id)
+                return;
+
+        r = bus_foreach_bus(u->manager, u->bus_track, send_removed_signal, u);
+        if (r < 0)
+                log_unit_debug_errno(u, r, "Failed to send unit remove signal for %s: %m", u->id);
+}
+
+/* Enqueues one job of the given type/mode for unit 'u' and appends the
+ * reply payload to 'reply': just the job object path in classic mode, or
+ * the anchor job plus all affected jobs when BUS_UNIT_QUEUE_VERBOSE_REPLY
+ * is set. Returns negative errno (with 'error' filled in where a bus error
+ * is appropriate) on failure. */
+int bus_unit_queue_job_one(
+                sd_bus_message *message,
+                Unit *u,
+                JobType type,
+                JobMode mode,
+                BusUnitQueueFlags flags,
+                sd_bus_message *reply,
+                sd_bus_error *error) {
+
+        _cleanup_set_free_ Set *affected = NULL;
+        _cleanup_free_ char *job_path = NULL, *unit_path = NULL;
+        Job *j, *a;
+        int r;
+
+        /* Upgrade restart requests to reload variants if requested and supported */
+        if (FLAGS_SET(flags, BUS_UNIT_QUEUE_RELOAD_IF_POSSIBLE) && unit_can_reload(u)) {
+                if (type == JOB_RESTART)
+                        type = JOB_RELOAD_OR_START;
+                else if (type == JOB_TRY_RESTART)
+                        type = JOB_TRY_RELOAD;
+        }
+
+        if (type == JOB_STOP &&
+            IN_SET(u->load_state, UNIT_NOT_FOUND, UNIT_ERROR, UNIT_BAD_SETTING) &&
+            unit_active_state(u) == UNIT_INACTIVE)
+                return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Unit %s not loaded.", u->id);
+
+        if ((type == JOB_START && u->refuse_manual_start) ||
+            (type == JOB_STOP && u->refuse_manual_stop) ||
+            (IN_SET(type, JOB_RESTART, JOB_TRY_RESTART) && (u->refuse_manual_start || u->refuse_manual_stop)) ||
+            (type == JOB_RELOAD_OR_START && job_type_collapse(type, u) == JOB_START && u->refuse_manual_start))
+                return sd_bus_error_setf(error,
+                                         BUS_ERROR_ONLY_BY_DEPENDENCY,
+                                         "Operation refused, unit %s may be requested by dependency only (it is configured to refuse manual start/stop).",
+                                         u->id);
+
+        /* dbus-broker issues StartUnit for activation requests, and Type=dbus services automatically
+         * gain dependency on dbus.socket. Therefore, if dbus has a pending stop job, the new start
+         * job that pulls in dbus again would cause job type conflict. Let's avoid that by rejecting
+         * job enqueuing early.
+         *
+         * Note that unlike signal_activation_request(), we can't use unit_inactive_or_pending()
+         * here. StartUnit is a more generic interface, and thus users are allowed to use e.g. systemctl
+         * to start Type=dbus services even when dbus is inactive. */
+        if (type == JOB_START && u->type == UNIT_SERVICE && SERVICE(u)->type == SERVICE_DBUS)
+                FOREACH_STRING(dbus_unit, SPECIAL_DBUS_SOCKET, SPECIAL_DBUS_SERVICE) {
+                        Unit *dbus;
+
+                        dbus = manager_get_unit(u->manager, dbus_unit);
+                        if (dbus && unit_stop_pending(dbus))
+                                return sd_bus_error_setf(error,
+                                                         BUS_ERROR_SHUTTING_DOWN,
+                                                         "Operation for unit %s refused, D-Bus is shutting down.",
+                                                         u->id);
+                }
+
+        /* Only collect the set of affected jobs when the caller asked for the verbose reply */
+        if (FLAGS_SET(flags, BUS_UNIT_QUEUE_VERBOSE_REPLY)) {
+                affected = set_new(NULL);
+                if (!affected)
+                        return -ENOMEM;
+        }
+
+        r = manager_add_job(u->manager, type, u, mode, affected, error, &j);
+        if (r < 0)
+                return r;
+
+        r = bus_job_track_sender(j, message);
+        if (r < 0)
+                return r;
+
+        /* Before we send the method reply, force out the announcement JobNew for this job */
+        bus_job_send_pending_change_signal(j, true);
+
+        job_path = job_dbus_path(j);
+        if (!job_path)
+                return -ENOMEM;
+
+        /* The classic response is just a job object path */
+        if (!FLAGS_SET(flags, BUS_UNIT_QUEUE_VERBOSE_REPLY))
+                return sd_bus_message_append(reply, "o", job_path);
+
+        /* In verbose mode respond with the anchor job plus everything that has been affected */
+
+        unit_path = unit_dbus_path(j->unit);
+        if (!unit_path)
+                return -ENOMEM;
+
+        r = sd_bus_message_append(reply, "uosos",
+                                  j->id, job_path,
+                                  j->unit->id, unit_path,
+                                  job_type_to_string(j->type));
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_open_container(reply, 'a', "(uosos)");
+        if (r < 0)
+                return r;
+
+        SET_FOREACH(a, affected) {
+                /* The anchor job was already reported above */
+                if (a->id == j->id)
+                        continue;
+
+                /* Free paths from previous iteration */
+                job_path = mfree(job_path);
+                unit_path = mfree(unit_path);
+
+                job_path = job_dbus_path(a);
+                if (!job_path)
+                        return -ENOMEM;
+
+                unit_path = unit_dbus_path(a->unit);
+                if (!unit_path)
+                        return -ENOMEM;
+
+                r = sd_bus_message_append(reply, "(uosos)",
+                                          a->id, job_path,
+                                          a->unit->id, unit_path,
+                                          job_type_to_string(a->type));
+                if (r < 0)
+                        return r;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
+/* Method-call entry point around bus_unit_queue_job_one(): performs the
+ * SELinux access check for the job type, enqueues the job, and sends the
+ * method reply. Returns negative errno / fills 'error' on failure. */
+int bus_unit_queue_job(
+                sd_bus_message *message,
+                Unit *u,
+                JobType type,
+                JobMode mode,
+                BusUnitQueueFlags flags,
+                sd_bus_error *error) {
+
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+        int r;
+
+        assert(message);
+        assert(u);
+        assert(type >= 0 && type < _JOB_TYPE_MAX);
+        assert(mode >= 0 && mode < _JOB_MODE_MAX);
+
+        /* MAC check first: map the job type to the access verb it implies */
+        r = mac_selinux_unit_access_check(
+                        u, message,
+                        job_type_to_access_method(type),
+                        error);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_new_method_return(message, &reply);
+        if (r < 0)
+                return r;
+
+        r = bus_unit_queue_job_one(message, u, type, mode, flags, reply, error);
+        if (r < 0)
+                return r;
+
+        return sd_bus_send(NULL, reply, NULL);
+}
+
+/* Handles properties that may be set at any time during runtime (and also
+ * during transient unit creation): currently "Description" and "Markers".
+ * Returns 1 if the property was consumed, 0 if unknown here, negative on
+ * error. */
+static int bus_unit_set_live_property(
+                Unit *u,
+                const char *name,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        int r;
+
+        assert(u);
+        assert(name);
+        assert(message);
+
+        /* Handles setting properties both "live" (i.e. at any time during runtime), and during creation (for
+         * transient units that are being created). */
+
+        if (streq(name, "Description")) {
+                const char *d;
+
+                r = sd_bus_message_read(message, "s", &d);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        r = unit_set_description(u, d);
+                        if (r < 0)
+                                return r;
+
+                        unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, "Description=%s", d);
+                }
+
+                return 1;
+        }
+
+        /* A setting that only applies to active units. We don't actually write this to /run, this state is
+         * managed internally. "+foo" sets flag foo, "-foo" unsets flag foo, just "foo" resets flags to
+         * foo. The last type cannot be mixed with "+" or "-". */
+
+        if (streq(name, "Markers")) {
+                unsigned settings = 0, mask = 0;
+                bool some_plus_minus = false, some_absolute = false;
+
+                r = sd_bus_message_enter_container(message, 'a', "s");
+                if (r < 0)
+                        return r;
+
+                for (;;) {
+                        const char *word;
+                        bool b;
+
+                        r = sd_bus_message_read(message, "s", &word);
+                        if (r < 0)
+                                return r;
+                        if (r == 0)
+                                break;
+
+                        /* Leading '+'/'-' selects set/unset mode; a bare word is absolute */
+                        if (IN_SET(word[0], '+', '-')) {
+                                b = word[0] == '+';
+                                word++;
+                                some_plus_minus = true;
+                        } else {
+                                b = true;
+                                some_absolute = true;
+                        }
+
+                        UnitMarker m = unit_marker_from_string(word);
+                        if (m < 0)
+                                return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING,
+                                                         "Unknown marker \"%s\".", word);
+
+                        SET_FLAG(settings, 1u << m, b);
+                        SET_FLAG(mask, 1u << m, true);
+                }
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                /* Absolute and relative syntax may not be mixed in one call */
+                if (some_plus_minus && some_absolute)
+                        return sd_bus_error_set(error, BUS_ERROR_BAD_UNIT_SETTING, "Bad marker syntax.");
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        if (some_absolute)
+                                u->markers = settings;
+                        else
+                                u->markers = settings | (u->markers & ~mask);
+                }
+
+                return 1;
+        }
+
+        return 0;
+}
+
+/* Reads a string from the message, parses it as an EmergencyAction valid
+ * for the current runtime scope, stores it into *p and persists the
+ * setting. Returns 1 on success, negative on error. */
+static int bus_set_transient_emergency_action(
+                Unit *u,
+                const char *name,
+                EmergencyAction *p,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        const char *s;
+        EmergencyAction v;
+        int r;
+
+        assert(p);
+
+        r = sd_bus_message_read(message, "s", &s);
+        if (r < 0)
+                return r;
+
+        r = parse_emergency_action(s, u->manager->runtime_scope, &v);
+        if (r < 0)
+                /* -EOPNOTSUPP means the action exists but is not allowed for this manager type */
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+                                         r == -EOPNOTSUPP ? "%s setting invalid for manager type: %s"
+                                                          : "Invalid %s setting: %s",
+                                         name, s);
+
+        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                *p = v;
+                unit_write_settingf(u, flags, name,
+                                    "%s=%s", name, s);
+        }
+
+        return 1;
+}
+
+/* Reads an int32 exit status from the message; any negative value is
+ * normalized to -1, meaning "unset". Values above 255 are rejected.
+ * Returns 1 on success, negative on error. */
+static int bus_set_transient_exit_status(
+                Unit *u,
+                const char *name,
+                int *p,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        int32_t k;
+        int r;
+
+        assert(p);
+
+        r = sd_bus_message_read(message, "i", &k);
+        if (r < 0)
+                return r;
+
+        if (k > 255)
+                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Exit status must be in range 0…255 or negative.");
+
+        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                *p = k < 0 ? -1 : k;
+
+                /* An empty assignment resets the setting in the serialized form */
+                if (k < 0)
+                        unit_write_settingf(u, flags, name, "%s=", name);
+                else
+                        unit_write_settingf(u, flags, name, "%s=%i", name, k);
+        }
+
+        return 1;
+}
+
+/* Generated transient setters for enum-valued properties, built around the
+ * respective from_string() parsers. */
+static BUS_DEFINE_SET_TRANSIENT_PARSE(collect_mode, CollectMode, collect_mode_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(job_mode, JobMode, job_mode_from_string);
+
+/* Reads an array of (type, trigger, negate, parameter) tuples and prepends
+ * each as a Condition (or Assert, when !is_condition) to *list. An empty
+ * array resets the list. Returns 1 on success, negative on error. */
+static int bus_set_transient_conditions(
+                Unit *u,
+                const char *name,
+                Condition **list,
+                bool is_condition,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        const char *type_name, *param;
+        int trigger, negate, r;
+        bool empty = true;
+
+        assert(list);
+
+        r = sd_bus_message_enter_container(message, 'a', "(sbbs)");
+        if (r < 0)
+                return r;
+
+        while ((r = sd_bus_message_read(message, "(sbbs)", &type_name, &trigger, &negate, &param)) > 0) {
+                ConditionType t;
+
+                t = is_condition ? condition_type_from_string(type_name) : assert_type_from_string(type_name);
+                if (t < 0)
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid condition type: %s", type_name);
+
+                if (isempty(param))
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Condition parameter in %s is empty", type_name);
+
+                if (condition_takes_path(t) && !path_is_absolute(param))
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path in condition %s is not absolute: %s", type_name, param);
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        Condition *c;
+
+                        c = condition_new(t, param, trigger, negate);
+                        if (!c)
+                                return -ENOMEM;
+
+                        LIST_PREPEND(conditions, *list, c);
+
+                        /* Serialized form uses "|" for trigger and "!" for negation */
+                        unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name,
+                                            "%s=%s%s%s", type_name,
+                                            trigger ? "|" : "", negate ? "!" : "", param);
+                }
+
+                empty = false;
+        }
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_exit_container(message);
+        if (r < 0)
+                return r;
+
+        /* An empty array clears all previously set conditions/asserts */
+        if (!UNIT_WRITE_FLAGS_NOOP(flags) && empty) {
+                *list = condition_free_list(*list);
+                unit_write_settingf(u, flags, name, "%sNull=", is_condition ? "Condition" : "Assert");
+        }
+
+        return 1;
+}
+
+/* Handles settings that may only be provided while a transient unit is
+ * being created; they cannot be altered anymore after the unit exists.
+ * Returns 1 if the property was consumed, 0 if it is not known here,
+ * negative (possibly with 'error' set) on failure. */
+static int bus_unit_set_transient_property(
+                Unit *u,
+                const char *name,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                sd_bus_error *error) {
+
+        UnitDependency d;
+        int r;
+
+        assert(u);
+        assert(name);
+        assert(message);
+
+        /* Handles settings when transient units are created. This settings cannot be altered anymore after
+         * the unit has been created. */
+
+        if (streq(name, "SourcePath"))
+                return bus_set_transient_path(u, name, &u->source_path, message, flags, error);
+
+        if (streq(name, "StopWhenUnneeded"))
+                return bus_set_transient_bool(u, name, &u->stop_when_unneeded, message, flags, error);
+
+        if (streq(name, "RefuseManualStart"))
+                return bus_set_transient_bool(u, name, &u->refuse_manual_start, message, flags, error);
+
+        if (streq(name, "RefuseManualStop"))
+                return bus_set_transient_bool(u, name, &u->refuse_manual_stop, message, flags, error);
+
+        if (streq(name, "AllowIsolate"))
+                return bus_set_transient_bool(u, name, &u->allow_isolate, message, flags, error);
+
+        if (streq(name, "DefaultDependencies"))
+                return bus_set_transient_bool(u, name, &u->default_dependencies, message, flags, error);
+
+        if (streq(name, "SurviveFinalKillSignal"))
+                return bus_set_transient_bool(u, name, &u->survive_final_kill_signal, message, flags, error);
+
+        if (streq(name, "OnSuccessJobMode"))
+                return bus_set_transient_job_mode(u, name, &u->on_success_job_mode, message, flags, error);
+
+        if (streq(name, "OnFailureJobMode"))
+                return bus_set_transient_job_mode(u, name, &u->on_failure_job_mode, message, flags, error);
+
+        if (streq(name, "IgnoreOnIsolate"))
+                return bus_set_transient_bool(u, name, &u->ignore_on_isolate, message, flags, error);
+
+        if (streq(name, "JobTimeoutUSec")) {
+                r = bus_set_transient_usec_fix_0(u, name, &u->job_timeout, message, flags, error);
+                /* Unless JobRunningTimeoutUSec= was set explicitly, it follows JobTimeoutUSec= */
+                if (r >= 0 && !UNIT_WRITE_FLAGS_NOOP(flags) && !u->job_running_timeout_set)
+                        u->job_running_timeout = u->job_timeout;
+
+                /* BUGFIX: previously fell through without returning, so the property
+                 * was reported as unknown even though it had just been applied. */
+                return r;
+        }
+
+        if (streq(name, "JobRunningTimeoutUSec")) {
+                r = bus_set_transient_usec_fix_0(u, name, &u->job_running_timeout, message, flags, error);
+                if (r >= 0 && !UNIT_WRITE_FLAGS_NOOP(flags))
+                        u->job_running_timeout_set = true;
+
+                return r;
+        }
+
+        if (streq(name, "JobTimeoutAction"))
+                return bus_set_transient_emergency_action(u, name, &u->job_timeout_action, message, flags, error);
+
+        if (streq(name, "JobTimeoutRebootArgument"))
+                return bus_set_transient_string(u, name, &u->job_timeout_reboot_arg, message, flags, error);
+
+        if (streq(name, "StartLimitIntervalUSec"))
+                return bus_set_transient_usec(u, name, &u->start_ratelimit.interval, message, flags, error);
+
+        if (streq(name, "StartLimitBurst"))
+                return bus_set_transient_unsigned(u, name, &u->start_ratelimit.burst, message, flags, error);
+
+        if (streq(name, "StartLimitAction"))
+                return bus_set_transient_emergency_action(u, name, &u->start_limit_action, message, flags, error);
+
+        if (streq(name, "FailureAction"))
+                return bus_set_transient_emergency_action(u, name, &u->failure_action, message, flags, error);
+
+        if (streq(name, "SuccessAction"))
+                return bus_set_transient_emergency_action(u, name, &u->success_action, message, flags, error);
+
+        if (streq(name, "FailureActionExitStatus"))
+                return bus_set_transient_exit_status(u, name, &u->failure_action_exit_status, message, flags, error);
+
+        if (streq(name, "SuccessActionExitStatus"))
+                return bus_set_transient_exit_status(u, name, &u->success_action_exit_status, message, flags, error);
+
+        if (streq(name, "RebootArgument"))
+                return bus_set_transient_string(u, name, &u->reboot_arg, message, flags, error);
+
+        if (streq(name, "CollectMode"))
+                return bus_set_transient_collect_mode(u, name, &u->collect_mode, message, flags, error);
+
+        if (streq(name, "Conditions"))
+                return bus_set_transient_conditions(u, name, &u->conditions, true, message, flags, error);
+
+        if (streq(name, "Asserts"))
+                return bus_set_transient_conditions(u, name, &u->asserts, false, message, flags, error);
+
+        if (streq(name, "Documentation")) {
+                _cleanup_strv_free_ char **l = NULL;
+
+                r = sd_bus_message_read_strv(message, &l);
+                if (r < 0)
+                        return r;
+
+                STRV_FOREACH(p, l)
+                        if (!documentation_url_is_valid(*p))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid URL in %s: %s", name, *p);
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        if (strv_isempty(l)) {
+                                /* Empty list resets the documentation URLs */
+                                u->documentation = strv_free(u->documentation);
+                                unit_write_settingf(u, flags, name, "%s=", name);
+                        } else {
+                                /* BUGFIX: the return value of strv_extend_strv() was
+                                 * previously ignored, silently dropping -ENOMEM. */
+                                r = strv_extend_strv(&u->documentation, l, false);
+                                if (r < 0)
+                                        return r;
+
+                                STRV_FOREACH(p, l)
+                                        unit_write_settingf(u, flags, name, "%s=%s", name, *p);
+                        }
+                }
+
+                return 1;
+
+        } else if (streq(name, "Slice")) {
+                Unit *slice;
+                const char *s;
+
+                if (!UNIT_HAS_CGROUP_CONTEXT(u))
+                        return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "The slice property is only available for units with control groups.");
+                if (u->type == UNIT_SLICE)
+                        return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Slice may not be set for slice units.");
+                if (unit_has_name(u, SPECIAL_INIT_SCOPE))
+                        return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot set slice for init.scope");
+
+                r = sd_bus_message_read(message, "s", &s);
+                if (r < 0)
+                        return r;
+
+                if (!unit_name_is_valid(s, UNIT_NAME_PLAIN))
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid unit name '%s'", s);
+
+                /* Note that we do not dispatch the load queue here yet, as we don't want our own transient unit to be
+                 * loaded while we are still setting it up. Or in other words, we use manager_load_unit_prepare()
+                 * instead of manager_load_unit() on purpose, here. */
+                r = manager_load_unit_prepare(u->manager, s, NULL, error, &slice);
+                if (r < 0)
+                        return r;
+
+                if (slice->type != UNIT_SLICE)
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unit name '%s' is not a slice", s);
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                        r = unit_set_slice(u, slice);
+                        if (r < 0)
+                                return r;
+
+                        unit_write_settingf(u, flags|UNIT_PRIVATE, name, "Slice=%s", s);
+                }
+
+                return 1;
+
+        } else if (streq(name, "RequiresMountsFor")) {
+                _cleanup_strv_free_ char **l = NULL;
+
+                r = sd_bus_message_read_strv(message, &l);
+                if (r < 0)
+                        return r;
+
+                STRV_FOREACH(p, l) {
+                        path_simplify(*p);
+
+                        if (!path_is_absolute(*p))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path specified in %s is not absolute: %s", name, *p);
+
+                        if (!path_is_valid(*p))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path specified in %s has invalid length: %s", name, *p);
+
+                        if (!path_is_normalized(*p))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Path specified in %s is not normalized: %s", name, *p);
+
+                        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                                r = unit_require_mounts_for(u, *p, UNIT_DEPENDENCY_FILE);
+                                if (r < 0)
+                                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Failed to add required mount \"%s\": %m", *p);
+
+                                unit_write_settingf(u, flags, name, "%s=%s", name, *p);
+                        }
+                }
+
+                return 1;
+        }
+
+        if (streq(name, "RequiresOverridable"))
+                d = UNIT_REQUIRES; /* redirect for obsolete unit dependency type */
+        else if (streq(name, "RequisiteOverridable"))
+                d = UNIT_REQUISITE; /* same here */
+        else
+                d = unit_dependency_from_string(name);
+
+        if (d >= 0) {
+                const char *other;
+
+                /* Only a subset of dependency types may be created transiently */
+                if (!IN_SET(d,
+                            UNIT_REQUIRES,
+                            UNIT_REQUISITE,
+                            UNIT_WANTS,
+                            UNIT_BINDS_TO,
+                            UNIT_PART_OF,
+                            UNIT_UPHOLDS,
+                            UNIT_CONFLICTS,
+                            UNIT_BEFORE,
+                            UNIT_AFTER,
+                            UNIT_ON_SUCCESS,
+                            UNIT_ON_FAILURE,
+                            UNIT_PROPAGATES_RELOAD_TO,
+                            UNIT_RELOAD_PROPAGATED_FROM,
+                            UNIT_PROPAGATES_STOP_TO,
+                            UNIT_STOP_PROPAGATED_FROM,
+                            UNIT_JOINS_NAMESPACE_OF))
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Dependency type %s may not be created transiently.", unit_dependency_to_string(d));
+
+                r = sd_bus_message_enter_container(message, 'a', "s");
+                if (r < 0)
+                        return r;
+
+                while ((r = sd_bus_message_read(message, "s", &other)) > 0) {
+                        if (!unit_name_is_valid(other, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
+                                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid unit name %s", other);
+
+                        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+                                _cleanup_free_ char *label = NULL;
+
+                                r = unit_add_dependency_by_name(u, d, other, true, UNIT_DEPENDENCY_FILE);
+                                if (r < 0)
+                                        return r;
+
+                                /* Use a distinct drop-in label per (dependency, target) pair */
+                                label = strjoin(name, "-", other);
+                                if (!label)
+                                        return -ENOMEM;
+
+                                unit_write_settingf(u, flags, label, "%s=%s", unit_dependency_to_string(d), other);
+                        }
+
+                }
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        return r;
+
+                return 1;
+
+        } else if (streq(name, "AddRef")) {
+
+                int b;
+
+                /* Why is this called "AddRef" rather than just "Ref", or "Reference"? There's already a "Ref()" method
+                 * on the Unit interface, and it's probably not a good idea to expose a property and a method on the
+                 * same interface (well, strictly speaking AddRef isn't exposed as full property, we just read it for
+                 * transient units, but still). And "References" and "ReferencedBy" is already used as unit reference
+                 * dependency type, hence let's not confuse things with that.
+                 *
+                 * Note that we don't actually add the reference to the bus track. We do that only after the setup of
+                 * the transient unit is complete, so that setting this property multiple times in the same transient
+                 * unit creation call doesn't count as individual references. */
+
+                r = sd_bus_message_read(message, "b", &b);
+                if (r < 0)
+                        return r;
+
+                if (!UNIT_WRITE_FLAGS_NOOP(flags))
+                        u->bus_track_add = b;
+
+                return 1;
+        }
+
+        return 0;
+}
+
+/* Applies an array of (name, variant) property assignments to the unit.
+ * The array is traversed twice: a validation-only dry run first, then —
+ * after rewinding the message — a second pass that actually applies the
+ * settings, giving transaction-like all-or-nothing behaviour. Returns the
+ * number of properties applied, or negative with 'error' filled in. */
+int bus_unit_set_properties(
+                Unit *u,
+                sd_bus_message *message,
+                UnitWriteFlags flags,
+                bool commit,
+                sd_bus_error *error) {
+
+        bool for_real = false;
+        unsigned n = 0;
+        int r;
+
+        assert(u);
+        assert(message);
+
+        /* We iterate through the array twice. First run just checks if all passed data is valid, second run
+         * actually applies it. This implements transaction-like behaviour without actually providing full
+         * transactions. */
+
+        r = sd_bus_message_enter_container(message, 'a', "(sv)");
+        if (r < 0)
+                goto error;
+
+        for (;;) {
+                const char *name;
+                UnitWriteFlags f;
+
+                r = sd_bus_message_enter_container(message, 'r', "sv");
+                if (r < 0)
+                        goto error;
+                if (r == 0) {
+                        if (for_real || UNIT_WRITE_FLAGS_NOOP(flags))
+                                break;
+
+                        /* Reached EOF. Let's try again, and this time for realz... */
+                        r = sd_bus_message_rewind(message, false);
+                        if (r < 0)
+                                goto error;
+
+                        for_real = true;
+                        continue;
+                }
+
+                r = sd_bus_message_read(message, "s", &name);
+                if (r < 0)
+                        goto error;
+
+                r = sd_bus_message_enter_container(message, 'v', NULL);
+                if (r < 0)
+                        goto error;
+
+                /* If not for real, then mask out the two target flags */
+                f = for_real ? flags : (flags & ~(UNIT_RUNTIME|UNIT_PERSISTENT));
+
+                /* Dispatch order: type-specific handler, then transient, then live properties */
+                if (UNIT_VTABLE(u)->bus_set_property)
+                        r = UNIT_VTABLE(u)->bus_set_property(u, name, message, f, error);
+                else
+                        r = 0;
+                if (r == 0 && u->transient && u->load_state == UNIT_STUB)
+                        r = bus_unit_set_transient_property(u, name, message, f, error);
+                if (r == 0)
+                        r = bus_unit_set_live_property(u, name, message, f, error);
+                if (r < 0)
+                        goto error;
+
+                if (r == 0)
+                        return sd_bus_error_setf(error, SD_BUS_ERROR_PROPERTY_READ_ONLY,
+                                                 "Cannot set property %s, or unknown property.", name);
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        goto error;
+
+                r = sd_bus_message_exit_container(message);
+                if (r < 0)
+                        goto error;
+
+                /* Count only the assignments actually applied in the second pass */
+                n += for_real;
+        }
+
+        r = sd_bus_message_exit_container(message);
+        if (r < 0)
+                goto error;
+
+        if (commit && n > 0 && UNIT_VTABLE(u)->bus_commit_properties)
+                UNIT_VTABLE(u)->bus_commit_properties(u);
+
+        return n;
+
+ error:
+        /* Pretty much any of the calls above can fail if the message is not formed properly
+         * or if it has unexpected contents. Fill in a more informative error message here. */
+        if (sd_bus_error_is_set(error))
+                return r;
+        return sd_bus_error_set_errnof(error, r,
+                                       r == -ENXIO ? "Failed to set unit properties: Unexpected message contents"
+                                                   : "Failed to set unit properties: %m");
+}
+
+/* Returns 0 if the unit is properly loaded, otherwise fills 'error' with a
+ * descriptive bus error matching the unit's load state and returns its
+ * negative mapping. */
+int bus_unit_validate_load_state(Unit *u, sd_bus_error *error) {
+        assert(u);
+
+        /* Generates a pretty error if a unit isn't properly loaded. */
+
+        switch (u->load_state) {
+
+        case UNIT_LOADED:
+                return 0;
+
+        case UNIT_NOT_FOUND:
+                return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Unit %s not found.", u->id);
+
+        case UNIT_BAD_SETTING:
+                return sd_bus_error_setf(error, BUS_ERROR_BAD_UNIT_SETTING, "Unit %s has a bad unit file setting.", u->id);
+
+        case UNIT_ERROR: /* Only show .load_error in UNIT_ERROR state */
+                return sd_bus_error_set_errnof(error, u->load_error,
+                                               "Unit %s failed to load properly, please adjust/correct and reload service manager: %m", u->id);
+
+        case UNIT_MASKED:
+                return sd_bus_error_setf(error, BUS_ERROR_UNIT_MASKED, "Unit %s is masked.", u->id);
+
+        case UNIT_STUB:
+        case UNIT_MERGED:
+        default:
+                return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Unexpected load state of unit %s", u->id);
+        }
+}
+
+/* sd_bus_track callback: runs when the last client tracking the unit
+ * disappeared from the bus. Drops the track object and requeues the unit
+ * for cgroup-empty and GC checks. */
+static int bus_unit_track_handler(sd_bus_track *t, void *userdata) {
+        Unit *u = ASSERT_PTR(userdata);
+
+        assert(t);
+
+        u->bus_track = sd_bus_track_unref(u->bus_track); /* make sure we aren't called again */
+
+        /* If the client that tracks us disappeared, then there's reason to believe that the cgroup is empty now too,
+         * let's see */
+        unit_add_to_cgroup_empty_queue(u);
+
+        /* Also add the unit to the GC queue, after all if the client left it might be time to GC this unit */
+        unit_add_to_gc_queue(u);
+
+        return 0;
+}
+
+/* Lazily allocates the unit's recursive sd_bus_track object on the API
+ * bus. Idempotent: returns 0 immediately if one already exists. */
+static int bus_unit_allocate_bus_track(Unit *u) {
+        int r;
+
+        assert(u);
+
+        if (u->bus_track)
+                return 0;
+
+        r = sd_bus_track_new(u->manager->api_bus, &u->bus_track, bus_unit_track_handler, u);
+        if (r < 0)
+                return r;
+
+        /* Recursive mode: the same peer may hold multiple references */
+        r = sd_bus_track_set_recursive(u->bus_track, true);
+        if (r < 0) {
+                /* Undo the allocation so a later call can retry cleanly */
+                u->bus_track = sd_bus_track_unref(u->bus_track);
+                return r;
+        }
+
+        return 0;
+}
+
+/* Adds the given bus name as a tracked reference on the unit, allocating
+ * the track object first if necessary. */
+int bus_unit_track_add_name(Unit *u, const char *name) {
+        int r;
+
+        assert(u);
+
+        r = bus_unit_allocate_bus_track(u);
+        if (r < 0)
+                return r;
+
+        return sd_bus_track_add_name(u->bus_track, name);
+}
+
+/* Adds the sender of message 'm' as a tracked reference on the unit,
+ * allocating the track object first if necessary. */
+int bus_unit_track_add_sender(Unit *u, sd_bus_message *m) {
+        int r;
+
+        assert(u);
+
+        r = bus_unit_allocate_bus_track(u);
+        if (r < 0)
+                return r;
+
+        return sd_bus_track_add_sender(u->bus_track, m);
+}
+
+/* Drops the tracked reference held by the sender of 'm'. Returns -EUNATCH
+ * if no track object was ever allocated, i.e. no reference can exist. */
+int bus_unit_track_remove_sender(Unit *u, sd_bus_message *m) {
+        assert(u);
+
+        /* If we haven't allocated the bus track object yet, then there's definitely no reference taken yet,
+         * return an error */
+        if (!u->bus_track)
+                return -EUNATCH;
+
+        return sd_bus_track_remove_sender(u->bus_track, m);
+}
diff --git a/src/core/dbus-unit.h b/src/core/dbus-unit.h
new file mode 100644
index 0000000..6b7828e
--- /dev/null
+++ b/src/core/dbus-unit.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+
+#include "unit.h"
+
+extern const sd_bus_vtable bus_unit_vtable[];
+extern const sd_bus_vtable bus_unit_cgroup_vtable[];
+
+void bus_unit_send_change_signal(Unit *u);
+void bus_unit_send_pending_change_signal(Unit *u, bool including_new);
+int bus_unit_send_pending_freezer_message(Unit *u, bool cancelled);
+void bus_unit_send_removed_signal(Unit *u);
+
+int bus_unit_method_start_generic(sd_bus_message *message, Unit *u, JobType job_type, bool reload_if_possible, sd_bus_error *error);
+int bus_unit_method_enqueue_job(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_unit_method_kill(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_unit_method_reset_failed(sd_bus_message *message, void *userdata, sd_bus_error *error);
+
+int bus_unit_set_properties(Unit *u, sd_bus_message *message, UnitWriteFlags flags, bool commit, sd_bus_error *error);
+int bus_unit_method_set_properties(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_unit_method_get_processes(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_unit_method_ref(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_unit_method_unref(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_unit_method_clean(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_unit_method_freeze(sd_bus_message *message, void *userdata, sd_bus_error *error);
+int bus_unit_method_thaw(sd_bus_message *message, void *userdata, sd_bus_error *error);
+
+typedef enum BusUnitQueueFlags {
+ BUS_UNIT_QUEUE_RELOAD_IF_POSSIBLE = 1 << 0,
+ BUS_UNIT_QUEUE_VERBOSE_REPLY = 1 << 1,
+} BusUnitQueueFlags;
+
+int bus_unit_queue_job_one(
+ sd_bus_message *message,
+ Unit *u,
+ JobType type,
+ JobMode mode,
+ BusUnitQueueFlags flags,
+ sd_bus_message *reply,
+ sd_bus_error *error);
+int bus_unit_queue_job(
+ sd_bus_message *message,
+ Unit *u,
+ JobType type,
+ JobMode mode,
+ BusUnitQueueFlags flags,
+ sd_bus_error *error);
+int bus_unit_validate_load_state(Unit *u, sd_bus_error *error);
+
+int bus_unit_track_add_name(Unit *u, const char *name);
+int bus_unit_track_add_sender(Unit *u, sd_bus_message *m);
+int bus_unit_track_remove_sender(Unit *u, sd_bus_message *m);
diff --git a/src/core/dbus-util.c b/src/core/dbus-util.c
new file mode 100644
index 0000000..d680a64
--- /dev/null
+++ b/src/core/dbus-util.c
@@ -0,0 +1,286 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bus-polkit.h"
+#include "bus-util.h"
+#include "dbus-util.h"
+#include "escape.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "unit-printf.h"
+#include "user-util.h"
+#include "unit.h"
+
+int bus_property_get_triggered_unit(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ Unit *u = userdata, *trigger;
+
+ assert(bus);
+ assert(reply);
+ assert(u);
+
+ trigger = UNIT_TRIGGER(u);
+
+ return sd_bus_message_append(reply, "s", trigger ? trigger->id : NULL);
+}
+
+BUS_DEFINE_SET_TRANSIENT(mode_t, "u", uint32_t, mode_t, "%04o");
+BUS_DEFINE_SET_TRANSIENT(unsigned, "u", uint32_t, unsigned, "%" PRIu32);
+
+static bool valid_user_group_name_or_id_relaxed(const char *u) {
+ return valid_user_group_name(u, VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX);
+}
+
+BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(user_relaxed, valid_user_group_name_or_id_relaxed);
+BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(path, path_is_absolute);
+
+int bus_set_transient_string(
+ Unit *u,
+ const char *name,
+ char **p,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ const char *v;
+ int r;
+
+ assert(p);
+
+ r = sd_bus_message_read(message, "s", &v);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ r = free_and_strdup(p, empty_to_null(v));
+ if (r < 0)
+ return r;
+
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name,
+ "%s=%s", name, strempty(v));
+ }
+
+ return 1;
+}
+
+int bus_set_transient_bool(
+ Unit *u,
+ const char *name,
+ bool *p,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ int v, r;
+
+ assert(p);
+
+ r = sd_bus_message_read(message, "b", &v);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ *p = v;
+ unit_write_settingf(u, flags, name, "%s=%s", name, yes_no(v));
+ }
+
+ return 1;
+}
+
+int bus_set_transient_tristate(
+ Unit *u,
+ const char *name,
+ int *p,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ int v, r;
+
+ assert(p);
+
+ r = sd_bus_message_read(message, "b", &v);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ *p = v;
+ unit_write_settingf(u, flags, name, "%s=%s", name, yes_no(v));
+ }
+
+ return 1;
+}
+
+int bus_set_transient_usec_internal(
+ Unit *u,
+ const char *name,
+ usec_t *p,
+ bool fix_0,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ uint64_t v;
+ int r;
+
+ assert(p);
+
+ r = sd_bus_message_read(message, "t", &v);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (fix_0)
+ *p = v != 0 ? v: USEC_INFINITY;
+ else
+ *p = v;
+
+ char *n = strndupa_safe(name, strlen(name) - 4);
+ unit_write_settingf(u, flags, name, "%sSec=%s", n, FORMAT_TIMESPAN(v, USEC_PER_MSEC));
+ }
+
+ return 1;
+}
+
+int bus_verify_manage_units_async_full(
+ Unit *u,
+ const char *verb,
+ int capability,
+ const char *polkit_message,
+ bool interactive,
+ sd_bus_message *call,
+ sd_bus_error *error) {
+
+ const char *details[9] = {
+ "unit", u->id,
+ "verb", verb,
+ };
+
+ if (polkit_message) {
+ details[4] = "polkit.message";
+ details[5] = polkit_message;
+ details[6] = "polkit.gettext_domain";
+ details[7] = GETTEXT_PACKAGE;
+ }
+
+ return bus_verify_polkit_async(
+ call,
+ capability,
+ "org.freedesktop.systemd1.manage-units",
+ details,
+ interactive,
+ UID_INVALID,
+ &u->manager->polkit_registry,
+ error);
+}
+
+/* ret_format_str is an accumulator, so if it has any pre-existing content, new options will be appended to it */
+int bus_read_mount_options(
+ sd_bus_message *message,
+ sd_bus_error *error,
+ MountOptions **ret_options,
+ char **ret_format_str,
+ const char *separator) {
+
+ _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
+ _cleanup_free_ char *format_str = NULL;
+ const char *mount_options, *partition;
+ int r;
+
+ assert(message);
+ assert(ret_options);
+ assert(separator);
+
+ r = sd_bus_message_enter_container(message, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read(message, "(ss)", &partition, &mount_options)) > 0) {
+ _cleanup_free_ char *escaped = NULL;
+ _cleanup_free_ MountOptions *o = NULL;
+ PartitionDesignator partition_designator;
+
+ if (chars_intersect(mount_options, WHITESPACE))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
+ "Invalid mount options string, contains whitespace character(s): %s", mount_options);
+
+ partition_designator = partition_designator_from_string(partition);
+ if (partition_designator < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid partition name %s", partition);
+
+ /* Need to store the options with the escapes, so that they can be parsed again */
+ escaped = shell_escape(mount_options, ":");
+ if (!escaped)
+ return -ENOMEM;
+
+ if (!strextend_with_separator(&format_str, separator, partition, ":", escaped))
+ return -ENOMEM;
+
+ o = new(MountOptions, 1);
+ if (!o)
+ return -ENOMEM;
+ *o = (MountOptions) {
+ .partition_designator = partition_designator,
+ .options = strdup(mount_options),
+ };
+ if (!o->options)
+ return -ENOMEM;
+ LIST_APPEND(mount_options, options, TAKE_PTR(o));
+ }
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_exit_container(message);
+ if (r < 0)
+ return r;
+
+ if (options) {
+ if (ret_format_str) {
+ char *final = strjoin(*ret_format_str, !isempty(*ret_format_str) ? separator : "", format_str);
+ if (!final)
+ return -ENOMEM;
+ free_and_replace(*ret_format_str, final);
+ }
+ LIST_JOIN(mount_options, *ret_options, options);
+ }
+
+ return 0;
+}
+
+int bus_property_get_activation_details(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ ActivationDetails **details = ASSERT_PTR(userdata);
+ _cleanup_strv_free_ char **pairs = NULL;
+ int r;
+
+ assert(reply);
+
+ r = activation_details_append_pair(*details, &pairs);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_open_container(reply, 'a', "(ss)");
+ if (r < 0)
+ return r;
+
+ STRV_FOREACH_PAIR(key, value, pairs) {
+ r = sd_bus_message_append(reply, "(ss)", *key, *value);
+ if (r < 0)
+ return r;
+ }
+
+ return sd_bus_message_close_container(reply);
+}
diff --git a/src/core/dbus-util.h b/src/core/dbus-util.h
new file mode 100644
index 0000000..9464b25
--- /dev/null
+++ b/src/core/dbus-util.h
@@ -0,0 +1,256 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+
+#include "dissect-image.h"
+#include "unit.h"
+
+int bus_property_get_triggered_unit(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error);
+
+#define BUS_DEFINE_SET_TRANSIENT(function, bus_type, type, cast_type, fmt) \
+ int bus_set_transient_##function( \
+ Unit *u, \
+ const char *name, \
+ cast_type *p, \
+ sd_bus_message *message, \
+ UnitWriteFlags flags, \
+ sd_bus_error *error) { \
+ \
+ type v; \
+ int r; \
+ \
+ assert(p); \
+ \
+ r = sd_bus_message_read(message, bus_type, &v); \
+ if (r < 0) \
+ return r; \
+ \
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \
+ *p = (cast_type) v; \
+ unit_write_settingf(u, flags, name, \
+ "%s=" fmt, name, v); \
+ } \
+ \
+ return 1; \
+ }
+
+#define BUS_DEFINE_SET_TRANSIENT_IS_VALID(function, bus_type, type, cast_type, fmt, check) \
+ int bus_set_transient_##function( \
+ Unit *u, \
+ const char *name, \
+ cast_type *p, \
+ sd_bus_message *message, \
+ UnitWriteFlags flags, \
+ sd_bus_error *error) { \
+ \
+ type v; \
+ int r; \
+ \
+ assert(p); \
+ \
+ r = sd_bus_message_read(message, bus_type, &v); \
+ if (r < 0) \
+ return r; \
+ \
+ if (!check(v)) \
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \
+ "Invalid %s setting: " fmt, name, v); \
+ \
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \
+ *p = (cast_type) v; \
+ unit_write_settingf(u, flags, name, \
+ "%s=" fmt, name, v); \
+ } \
+ \
+ return 1; \
+ }
+
+#define BUS_DEFINE_SET_TRANSIENT_TO_STRING(function, bus_type, type, cast_type, fmt, to_string) \
+ int bus_set_transient_##function( \
+ Unit *u, \
+ const char *name, \
+ cast_type *p, \
+ sd_bus_message *message, \
+ UnitWriteFlags flags, \
+ sd_bus_error *error) { \
+ \
+ const char *s; \
+ type v; \
+ int r; \
+ \
+ assert(p); \
+ \
+ r = sd_bus_message_read(message, bus_type, &v); \
+ if (r < 0) \
+ return r; \
+ \
+ s = to_string(v); \
+ if (!s) \
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \
+ "Invalid %s setting: " fmt, name, v); \
+ \
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \
+ *p = (cast_type) v; \
+ unit_write_settingf(u, flags, name, \
+ "%s=%s", name, s); \
+ } \
+ \
+ return 1; \
+ }
+
+#define BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(function, bus_type, type, cast_type, fmt, to_string) \
+ int bus_set_transient_##function( \
+ Unit *u, \
+ const char *name, \
+ cast_type *p, \
+ sd_bus_message *message, \
+ UnitWriteFlags flags, \
+ sd_bus_error *error) { \
+ \
+ _cleanup_free_ char *s = NULL; \
+ type v; \
+ int r; \
+ \
+ assert(p); \
+ \
+ r = sd_bus_message_read(message, bus_type, &v); \
+ if (r < 0) \
+ return r; \
+ \
+ r = to_string(v, &s); \
+ if (r == -EINVAL) \
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \
+ "Invalid %s setting: " fmt, name, v); \
+ if (r < 0) \
+ return r; \
+ \
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \
+ *p = (cast_type) v; \
+ unit_write_settingf(u, flags, name, \
+ "%s=%s", \
+ name, strempty(s)); \
+ } \
+ \
+ return 1; \
+ }
+
+#define BUS_DEFINE_SET_TRANSIENT_PARSE(function, type, parse) \
+ int bus_set_transient_##function( \
+ Unit *u, \
+ const char *name, \
+ type *p, \
+ sd_bus_message *message, \
+ UnitWriteFlags flags, \
+ sd_bus_error *error) { \
+ \
+ const char *s; \
+ type v; \
+ int r; \
+ \
+ assert(p); \
+ \
+ r = sd_bus_message_read(message, "s", &s); \
+ if (r < 0) \
+ return r; \
+ \
+ v = parse(s); \
+ if (v < 0) \
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \
+ "Invalid %s setting: %s", name, s); \
+ \
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \
+ *p = v; \
+ unit_write_settingf(u, flags, name, \
+ "%s=%s", name, s); \
+ } \
+ \
+ return 1; \
+ }
+
+#define BUS_DEFINE_SET_TRANSIENT_PARSE_PTR(function, type, parse) \
+ int bus_set_transient_##function( \
+ Unit *u, \
+ const char *name, \
+ type *p, \
+ sd_bus_message *message, \
+ UnitWriteFlags flags, \
+ sd_bus_error *error) { \
+ \
+ const char *s; \
+ type v; \
+ int r; \
+ \
+ assert(p); \
+ \
+ r = sd_bus_message_read(message, "s", &s); \
+ if (r < 0) \
+ return r; \
+ \
+ r = parse(s, &v); \
+ if (r < 0) \
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \
+ "Invalid %s setting: %s", name, s); \
+ \
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \
+ *p = v; \
+ unit_write_settingf(u, flags, name, \
+ "%s=%s", name, strempty(s)); \
+ } \
+ \
+ return 1; \
+ }
+
+#define BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(function, check) \
+ int bus_set_transient_##function( \
+ Unit *u, \
+ const char *name, \
+ char **p, \
+ sd_bus_message *message, \
+ UnitWriteFlags flags, \
+ sd_bus_error *error) { \
+ \
+ const char *v; \
+ int r; \
+ \
+ assert(p); \
+ \
+ r = sd_bus_message_read(message, "s", &v); \
+ if (r < 0) \
+ return r; \
+ \
+ if (!isempty(v) && !check(v)) \
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, \
+ "Invalid %s setting: %s", name, v); \
+ \
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) { \
+ r = free_and_strdup(p, empty_to_null(v)); \
+ if (r < 0) \
+ return r; \
+ \
+ unit_write_settingf(u, flags|UNIT_ESCAPE_SPECIFIERS, name, \
+ "%s=%s", name, strempty(v)); \
+ } \
+ \
+ return 1; \
+ }
+
+int bus_set_transient_mode_t(Unit *u, const char *name, mode_t *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_set_transient_unsigned(Unit *u, const char *name, unsigned *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_set_transient_user_relaxed(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_set_transient_path(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_set_transient_string(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_set_transient_bool(Unit *u, const char *name, bool *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_set_transient_tristate(Unit *u, const char *name, int *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_set_transient_usec_internal(Unit *u, const char *name, usec_t *p, bool fix_0, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+static inline int bus_set_transient_usec(Unit *u, const char *name, usec_t *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error) {
+ return bus_set_transient_usec_internal(u, name, p, false, message, flags, error);
+}
+static inline int bus_set_transient_usec_fix_0(Unit *u, const char *name, usec_t *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error) {
+ return bus_set_transient_usec_internal(u, name, p, true, message, flags, error);
+}
+int bus_verify_manage_units_async_full(Unit *u, const char *verb, int capability, const char *polkit_message, bool interactive, sd_bus_message *call, sd_bus_error *error);
+
+int bus_read_mount_options(sd_bus_message *message, sd_bus_error *error, MountOptions **ret_options, char **ret_format_str, const char *separator);
+
+int bus_property_get_activation_details(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error);
diff --git a/src/core/dbus.c b/src/core/dbus.c
new file mode 100644
index 0000000..ba2cec4
--- /dev/null
+++ b/src/core/dbus.c
@@ -0,0 +1,1273 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <sys/epoll.h>
+#include <unistd.h>
+
+#include "sd-bus.h"
+
+#include "alloc-util.h"
+#include "bus-common-errors.h"
+#include "bus-error.h"
+#include "bus-internal.h"
+#include "bus-polkit.h"
+#include "bus-util.h"
+#include "dbus-automount.h"
+#include "dbus-cgroup.h"
+#include "dbus-device.h"
+#include "dbus-execute.h"
+#include "dbus-job.h"
+#include "dbus-kill.h"
+#include "dbus-manager.h"
+#include "dbus-mount.h"
+#include "dbus-path.h"
+#include "dbus-scope.h"
+#include "dbus-service.h"
+#include "dbus-slice.h"
+#include "dbus-socket.h"
+#include "dbus-swap.h"
+#include "dbus-target.h"
+#include "dbus-timer.h"
+#include "dbus-unit.h"
+#include "dbus.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "log.h"
+#include "mkdir-label.h"
+#include "process-util.h"
+#include "selinux-access.h"
+#include "serialize.h"
+#include "service.h"
+#include "special.h"
+#include "string-util.h"
+#include "strv.h"
+#include "strxcpyx.h"
+#include "umask-util.h"
+#include "user-util.h"
+
+#define CONNECTIONS_MAX 4096
+
+static void destroy_bus(Manager *m, sd_bus **bus);
+
+int bus_send_pending_reload_message(Manager *m) {
+ int r;
+
+ assert(m);
+
+ if (!m->pending_reload_message)
+ return 0;
+
+ /* If we cannot get rid of this message we won't dispatch any D-Bus messages, so that we won't end up wanting
+ * to queue another message. */
+
+ r = sd_bus_send(NULL, m->pending_reload_message, NULL);
+ if (r < 0)
+ log_warning_errno(r, "Failed to send queued message, ignoring: %m");
+
+ m->pending_reload_message = sd_bus_message_unref(m->pending_reload_message);
+
+ return 0;
+}
+
+int bus_forward_agent_released(Manager *m, const char *path) {
+ int r;
+
+ assert(m);
+ assert(path);
+
+ if (!MANAGER_IS_SYSTEM(m))
+ return 0;
+
+ if (!m->system_bus)
+ return 0;
+
+ /* If we are running a system instance we forward the agent message on the system bus, so that the user
+ * instances get notified about this, too */
+
+ r = sd_bus_emit_signal(m->system_bus,
+ "/org/freedesktop/systemd1/agent",
+ "org.freedesktop.systemd1.Agent",
+ "Released",
+ "s", path);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to propagate agent release message: %m");
+
+ return 1;
+}
+
+static int signal_agent_released(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ const char *cgroup;
+ uid_t sender_uid;
+ int r;
+
+ assert(message);
+
+ /* only accept org.freedesktop.systemd1.Agent from UID=0 */
+ r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID, &creds);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_creds_get_euid(creds, &sender_uid);
+ if (r < 0 || sender_uid != 0)
+ return 0;
+
+ /* parse 'cgroup-empty' notification */
+ r = sd_bus_message_read(message, "s", &cgroup);
+ if (r < 0) {
+ bus_log_parse_error(r);
+ return 0;
+ }
+
+ manager_notify_cgroup_empty(m, cgroup);
+ return 0;
+}
+
+static int signal_disconnected(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ sd_bus *bus;
+
+ assert(message);
+ assert_se(bus = sd_bus_message_get_bus(message));
+
+ if (bus == m->api_bus)
+ bus_done_api(m);
+ if (bus == m->system_bus)
+ bus_done_system(m);
+
+ if (set_remove(m->private_buses, bus)) {
+ log_debug("Got disconnect on private connection.");
+ destroy_bus(m, &bus);
+ }
+
+ return 0;
+}
+
+static int signal_activation_request(sd_bus_message *message, void *userdata, sd_bus_error *ret_error) {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ const char *name;
+ Unit *u;
+ int r;
+
+ assert(message);
+
+ r = sd_bus_message_read(message, "s", &name);
+ if (r < 0) {
+ bus_log_parse_error(r);
+ return 0;
+ }
+
+ if (manager_unit_inactive_or_pending(m, SPECIAL_DBUS_SOCKET) ||
+ manager_unit_inactive_or_pending(m, SPECIAL_DBUS_SERVICE)) {
+ r = sd_bus_error_set(&error, BUS_ERROR_SHUTTING_DOWN, "Refusing activation, D-Bus is shutting down.");
+ goto failed;
+ }
+
+ r = manager_load_unit(m, name, NULL, &error, &u);
+ if (r < 0)
+ goto failed;
+
+ if (u->refuse_manual_start) {
+ r = sd_bus_error_setf(&error, BUS_ERROR_ONLY_BY_DEPENDENCY, "Operation refused, %s may be requested by dependency only (it is configured to refuse manual start/stop).", u->id);
+ goto failed;
+ }
+
+ r = manager_add_job(m, JOB_START, u, JOB_REPLACE, NULL, &error, NULL);
+ if (r < 0)
+ goto failed;
+
+ /* Successfully queued, that's it for us */
+ return 0;
+
+failed:
+ if (!sd_bus_error_is_set(&error))
+ sd_bus_error_set_errno(&error, r);
+
+ log_debug("D-Bus activation failed for %s: %s", name, bus_error_message(&error, r));
+
+ r = sd_bus_message_new_signal(sd_bus_message_get_bus(message), &reply, "/org/freedesktop/systemd1", "org.freedesktop.systemd1.Activator", "ActivationFailure");
+ if (r < 0) {
+ bus_log_create_error(r);
+ return 0;
+ }
+
+ r = sd_bus_message_append(reply, "sss", name, error.name, error.message);
+ if (r < 0) {
+ bus_log_create_error(r);
+ return 0;
+ }
+
+ r = sd_bus_send_to(NULL, reply, "org.freedesktop.DBus", NULL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to respond with to bus activation request: %m");
+
+ return 0;
+}
+
+#if HAVE_SELINUX
+static int mac_selinux_filter(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ Manager *m = userdata;
+ const char *verb, *path;
+ Unit *u = NULL;
+ Job *j;
+ int r;
+
+ assert(message);
+
+ /* Our own method calls are all protected individually with
+ * selinux checks, but the built-in interfaces need to be
+ * protected too. */
+
+ if (sd_bus_message_is_method_call(message, "org.freedesktop.DBus.Properties", "Set"))
+ verb = "reload";
+ else if (sd_bus_message_is_method_call(message, "org.freedesktop.DBus.Introspectable", NULL) ||
+ sd_bus_message_is_method_call(message, "org.freedesktop.DBus.Properties", NULL) ||
+ sd_bus_message_is_method_call(message, "org.freedesktop.DBus.ObjectManager", NULL) ||
+ sd_bus_message_is_method_call(message, "org.freedesktop.DBus.Peer", NULL))
+ verb = "status";
+ else
+ return 0;
+
+ path = sd_bus_message_get_path(message);
+
+ if (object_path_startswith("/org/freedesktop/systemd1", path)) {
+ r = mac_selinux_access_check(message, verb, error);
+ if (r < 0)
+ return r;
+
+ return 0;
+ }
+
+ if (streq_ptr(path, "/org/freedesktop/systemd1/unit/self")) {
+ _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+ pid_t pid;
+
+ r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds);
+ if (r < 0)
+ return 0;
+
+ r = sd_bus_creds_get_pid(creds, &pid);
+ if (r < 0)
+ return 0;
+
+ u = manager_get_unit_by_pid(m, pid);
+ } else {
+ r = manager_get_job_from_dbus_path(m, path, &j);
+ if (r >= 0)
+ u = j->unit;
+ else
+ manager_load_unit_from_dbus_path(m, path, NULL, &u);
+ }
+ if (!u)
+ return 0;
+
+ r = mac_selinux_unit_access_check(u, message, verb, error);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+#endif
+
+static int find_unit(Manager *m, sd_bus *bus, const char *path, Unit **unit, sd_bus_error *error) {
+ Unit *u = NULL; /* just to appease gcc, initialization is not really necessary */
+ int r;
+
+ assert(m);
+ assert(bus);
+ assert(path);
+
+ if (streq_ptr(path, "/org/freedesktop/systemd1/unit/self")) {
+ _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+ sd_bus_message *message;
+ pid_t pid;
+
+ message = sd_bus_get_current_message(bus);
+ if (!message)
+ return 0;
+
+ r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_creds_get_pid(creds, &pid);
+ if (r < 0)
+ return r;
+
+ u = manager_get_unit_by_pid(m, pid);
+ if (!u)
+ return 0;
+ } else {
+ r = manager_load_unit_from_dbus_path(m, path, error, &u);
+ if (r < 0)
+ return 0;
+ assert(u);
+ }
+
+ *unit = u;
+ return 1;
+}
+
+static int bus_unit_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+
+ assert(bus);
+ assert(path);
+ assert(interface);
+ assert(found);
+
+ return find_unit(m, bus, path, (Unit**) found, error);
+}
+
+static int bus_unit_interface_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ Unit *u;
+ int r;
+
+ assert(bus);
+ assert(path);
+ assert(interface);
+ assert(found);
+
+ r = find_unit(m, bus, path, &u, error);
+ if (r <= 0)
+ return r;
+
+ if (!streq_ptr(interface, unit_dbus_interface_from_type(u->type)))
+ return 0;
+
+ *found = u;
+ return 1;
+}
+
+static int bus_unit_cgroup_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ Unit *u;
+ int r;
+
+ assert(bus);
+ assert(path);
+ assert(interface);
+ assert(found);
+
+ r = find_unit(m, bus, path, &u, error);
+ if (r <= 0)
+ return r;
+
+ if (!streq_ptr(interface, unit_dbus_interface_from_type(u->type)))
+ return 0;
+
+ if (!UNIT_HAS_CGROUP_CONTEXT(u))
+ return 0;
+
+ *found = u;
+ return 1;
+}
+
+static int bus_cgroup_context_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ CGroupContext *c;
+ Unit *u;
+ int r;
+
+ assert(bus);
+ assert(path);
+ assert(interface);
+ assert(found);
+
+ r = find_unit(m, bus, path, &u, error);
+ if (r <= 0)
+ return r;
+
+ if (!streq_ptr(interface, unit_dbus_interface_from_type(u->type)))
+ return 0;
+
+ c = unit_get_cgroup_context(u);
+ if (!c)
+ return 0;
+
+ *found = c;
+ return 1;
+}
+
+static int bus_exec_context_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ ExecContext *c;
+ Unit *u;
+ int r;
+
+ assert(bus);
+ assert(path);
+ assert(interface);
+ assert(found);
+
+ r = find_unit(m, bus, path, &u, error);
+ if (r <= 0)
+ return r;
+
+ if (!streq_ptr(interface, unit_dbus_interface_from_type(u->type)))
+ return 0;
+
+ c = unit_get_exec_context(u);
+ if (!c)
+ return 0;
+
+ *found = c;
+ return 1;
+}
+
+static int bus_kill_context_find(sd_bus *bus, const char *path, const char *interface, void *userdata, void **found, sd_bus_error *error) {
+ Manager *m = ASSERT_PTR(userdata);
+ KillContext *c;
+ Unit *u;
+ int r;
+
+ assert(bus);
+ assert(path);
+ assert(interface);
+ assert(found);
+
+ r = find_unit(m, bus, path, &u, error);
+ if (r <= 0)
+ return r;
+
+ if (!streq_ptr(interface, unit_dbus_interface_from_type(u->type)))
+ return 0;
+
+ c = unit_get_kill_context(u);
+ if (!c)
+ return 0;
+
+ *found = c;
+ return 1;
+}
+
+static int bus_unit_enumerate(sd_bus *bus, const char *path, void *userdata, char ***nodes, sd_bus_error *error) {
+ _cleanup_strv_free_ char **l = NULL;
+ Manager *m = userdata;
+ unsigned k = 0;
+ Unit *u;
+
+ l = new0(char*, hashmap_size(m->units)+1);
+ if (!l)
+ return -ENOMEM;
+
+ HASHMAP_FOREACH(u, m->units) {
+ l[k] = unit_dbus_path(u);
+ if (!l[k])
+ return -ENOMEM;
+
+ k++;
+ }
+
+ *nodes = TAKE_PTR(l);
+
+ return k;
+}
+
+static const BusObjectImplementation unit_object = {
+ "/org/freedesktop/systemd1/unit",
+ "org.freedesktop.systemd1.Unit",
+ .fallback_vtables = BUS_FALLBACK_VTABLES(
+ { bus_unit_vtable, bus_unit_find }),
+ .node_enumerator = bus_unit_enumerate,
+};
+
+static const BusObjectImplementation bus_automount_object = {
+ "/org/freedesktop/systemd1/unit",
+ "org.freedesktop.systemd1.Automount",
+ .fallback_vtables = BUS_FALLBACK_VTABLES(
+ { bus_automount_vtable, bus_unit_interface_find }),
+};
+
+static const BusObjectImplementation bus_device_object = {
+ "/org/freedesktop/systemd1/unit",
+ "org.freedesktop.systemd1.Device",
+ .fallback_vtables = BUS_FALLBACK_VTABLES(
+ { bus_device_vtable, bus_unit_interface_find }),
+};
+
+static const BusObjectImplementation bus_mount_object = {
+ "/org/freedesktop/systemd1/unit",
+ "org.freedesktop.systemd1.Mount",
+ .fallback_vtables = BUS_FALLBACK_VTABLES(
+ { bus_mount_vtable, bus_unit_interface_find },
+ { bus_unit_cgroup_vtable, bus_unit_cgroup_find },
+ { bus_cgroup_vtable, bus_cgroup_context_find },
+ { bus_exec_vtable, bus_exec_context_find },
+ { bus_kill_vtable, bus_kill_context_find }),
+};
+
+static const BusObjectImplementation bus_path_object = {
+ "/org/freedesktop/systemd1/unit",
+ "org.freedesktop.systemd1.Path",
+ .fallback_vtables = BUS_FALLBACK_VTABLES(
+ { bus_path_vtable, bus_unit_interface_find }),
+};
+
+static const BusObjectImplementation bus_scope_object = {
+ "/org/freedesktop/systemd1/unit",
+ "org.freedesktop.systemd1.Scope",
+ .fallback_vtables = BUS_FALLBACK_VTABLES(
+ { bus_scope_vtable, bus_unit_interface_find },
+ { bus_unit_cgroup_vtable, bus_unit_cgroup_find },
+ { bus_cgroup_vtable, bus_cgroup_context_find },
+ { bus_kill_vtable, bus_kill_context_find }),
+};
+
+static const BusObjectImplementation bus_service_object = {
+ "/org/freedesktop/systemd1/unit",
+ "org.freedesktop.systemd1.Service",
+ .fallback_vtables = BUS_FALLBACK_VTABLES(
+ { bus_service_vtable, bus_unit_interface_find },
+ { bus_unit_cgroup_vtable, bus_unit_cgroup_find },
+ { bus_cgroup_vtable, bus_cgroup_context_find },
+ { bus_exec_vtable, bus_exec_context_find },
+ { bus_kill_vtable, bus_kill_context_find }),
+};
+
+static const BusObjectImplementation bus_slice_object = {
+ "/org/freedesktop/systemd1/unit",
+ "org.freedesktop.systemd1.Slice",
+ .fallback_vtables = BUS_FALLBACK_VTABLES(
+ { bus_slice_vtable, bus_unit_interface_find },
+ { bus_unit_cgroup_vtable, bus_unit_cgroup_find },
+ { bus_cgroup_vtable, bus_cgroup_context_find }),
+};
+
+static const BusObjectImplementation bus_socket_object = {
+ "/org/freedesktop/systemd1/unit",
+ "org.freedesktop.systemd1.Socket",
+ .fallback_vtables = BUS_FALLBACK_VTABLES(
+ { bus_socket_vtable, bus_unit_interface_find },
+ { bus_unit_cgroup_vtable, bus_unit_cgroup_find },
+ { bus_cgroup_vtable, bus_cgroup_context_find },
+ { bus_exec_vtable, bus_exec_context_find },
+ { bus_kill_vtable, bus_kill_context_find }),
+};
+
+static const BusObjectImplementation bus_swap_object = {
+ "/org/freedesktop/systemd1/unit",
+ "org.freedesktop.systemd1.Swap",
+ .fallback_vtables = BUS_FALLBACK_VTABLES(
+ { bus_swap_vtable, bus_unit_interface_find },
+ { bus_unit_cgroup_vtable, bus_unit_cgroup_find },
+ { bus_cgroup_vtable, bus_cgroup_context_find },
+ { bus_exec_vtable, bus_exec_context_find },
+ { bus_kill_vtable, bus_kill_context_find }),
+};
+
+static const BusObjectImplementation bus_target_object = {
+ "/org/freedesktop/systemd1/unit",
+ "org.freedesktop.systemd1.Target",
+ .fallback_vtables = BUS_FALLBACK_VTABLES(
+ { bus_target_vtable, bus_unit_interface_find }),
+};
+
+static const BusObjectImplementation bus_timer_object = {
+ "/org/freedesktop/systemd1/unit",
+ "org.freedesktop.systemd1.Timer",
+ .fallback_vtables = BUS_FALLBACK_VTABLES(
+ { bus_timer_vtable, bus_unit_interface_find }),
+};
+
+static const BusObjectImplementation bus_manager_object = {
+ "/org/freedesktop/systemd1",
+ "org.freedesktop.systemd1.Manager",
+ .vtables = BUS_VTABLES(bus_manager_vtable),
+ .children = BUS_IMPLEMENTATIONS(
+ &job_object,
+ &unit_object,
+ &bus_automount_object,
+ &bus_device_object,
+ &bus_mount_object,
+ &bus_path_object,
+ &bus_scope_object,
+ &bus_service_object,
+ &bus_slice_object,
+ &bus_socket_object,
+ &bus_swap_object,
+ &bus_target_object,
+ &bus_timer_object),
+};
+
+static const BusObjectImplementation manager_log_control_object = {
+ "/org/freedesktop/LogControl1",
+ "org.freedesktop.LogControl1",
+ .vtables = BUS_VTABLES(bus_manager_log_control_vtable),
+};
+
+int bus_manager_introspect_implementations(FILE *out, const char *pattern) {
+ return bus_introspect_implementations(
+ out,
+ pattern,
+ BUS_IMPLEMENTATIONS(&bus_manager_object,
+ &manager_log_control_object));
+}
+
+static int bus_setup_api_vtables(Manager *m, sd_bus *bus) {
+ int r;
+
+ assert(m);
+ assert(bus);
+
+#if HAVE_SELINUX
+ r = sd_bus_add_filter(bus, NULL, mac_selinux_filter, m);
+ if (r < 0)
+ return log_error_errno(r, "Failed to add SELinux access filter: %m");
+#endif
+
+ r = bus_add_implementation(bus, &bus_manager_object, m);
+ if (r < 0)
+ return r;
+
+ return bus_add_implementation(bus, &manager_log_control_object, m);
+}
+
+static int bus_setup_disconnected_match(Manager *m, sd_bus *bus) {
+ int r;
+
+ assert(m);
+ assert(bus);
+
+ r = sd_bus_match_signal_async(
+ bus,
+ NULL,
+ "org.freedesktop.DBus.Local",
+ "/org/freedesktop/DBus/Local",
+ "org.freedesktop.DBus.Local",
+ "Disconnected",
+ signal_disconnected, NULL, m);
+ if (r < 0)
+ return log_error_errno(r, "Failed to request match for Disconnected message: %m");
+
+ return 0;
+}
+
+/* sd-event I/O callback for the private listening socket: accepts one pending connection and turns
+ * it into a full direct sd_bus server connection (credential check, server handshake, vtables,
+ * disconnect match), then stores it in m->private_buses. Deliberately returns 0 on every failure
+ * path so the event source stays installed and later connection attempts are still served. */
+static int bus_on_connection(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
+ _cleanup_(sd_bus_close_unrefp) sd_bus *bus = NULL;
+ _cleanup_close_ int nfd = -EBADF;
+ Manager *m = ASSERT_PTR(userdata);
+ sd_id128_t id;
+ int r;
+
+ assert(s);
+
+ nfd = accept4(fd, NULL, NULL, SOCK_NONBLOCK|SOCK_CLOEXEC);
+ if (nfd < 0) {
+ /* Spurious wake-up or transient error: nothing to accept right now. */
+ if (ERRNO_IS_ACCEPT_AGAIN(errno))
+ return 0;
+
+ log_warning_errno(errno, "Failed to accept private connection, ignoring: %m");
+ return 0;
+ }
+
+ if (set_size(m->private_buses) >= CONNECTIONS_MAX) {
+ log_warning("Too many concurrent connections, refusing");
+ return 0;
+ }
+
+ r = sd_bus_new(&bus);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to allocate new private connection bus: %m");
+ return 0;
+ }
+
+ (void) sd_bus_set_description(bus, "private-bus-connection");
+
+ r = sd_bus_set_fd(bus, nfd, nfd);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to set fd on new connection bus: %m");
+ return 0;
+ }
+
+ /* The bus object owns the fd from here on; stop the _cleanup_close_ handler from closing it. */
+ TAKE_FD(nfd);
+
+ r = bus_check_peercred(bus);
+ if (r < 0) {
+ log_warning_errno(r, "Incoming private connection from unprivileged client, refusing: %m");
+ return 0;
+ }
+
+ /* Each server connection needs a unique server ID for the sd-bus handshake. */
+ assert_se(sd_id128_randomize(&id) >= 0);
+
+ r = sd_bus_set_server(bus, 1, id);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to enable server support for new connection bus: %m");
+ return 0;
+ }
+
+ r = sd_bus_negotiate_creds(bus, 1,
+ SD_BUS_CREDS_PID|SD_BUS_CREDS_UID|
+ SD_BUS_CREDS_EUID|SD_BUS_CREDS_EFFECTIVE_CAPS|
+ SD_BUS_CREDS_SELINUX_CONTEXT|
+ SD_BUS_CREDS_COMM|SD_BUS_CREDS_DESCRIPTION);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to enable credentials for new connection: %m");
+ return 0;
+ }
+
+ r = sd_bus_set_sender(bus, "org.freedesktop.systemd1");
+ if (r < 0) {
+ log_warning_errno(r, "Failed to set direct connection sender: %m");
+ return 0;
+ }
+
+ r = sd_bus_start(bus);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to start new connection bus: %m");
+ return 0;
+ }
+
+ if (DEBUG_LOGGING) {
+ _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *c = NULL;
+ const char *comm = NULL, *description = NULL;
+ pid_t pid = 0;
+
+ r = sd_bus_get_owner_creds(bus, SD_BUS_CREDS_PID|SD_BUS_CREDS_COMM|SD_BUS_CREDS_DESCRIPTION, &c);
+ if (r < 0)
+ log_warning_errno(r, "Failed to get peer creds, ignoring: %m");
+ else {
+ (void) sd_bus_creds_get_pid(c, &pid);
+ (void) sd_bus_creds_get_comm(c, &comm);
+ (void) sd_bus_creds_get_description(c, &description);
+ }
+
+ log_debug("Accepting direct incoming connection from " PID_FMT " (%s) [%s]", pid, strna(comm), strna(description));
+ }
+
+ r = sd_bus_attach_event(bus, m->event, SD_EVENT_PRIORITY_NORMAL);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to attach new connection bus to event loop: %m");
+ return 0;
+ }
+
+ r = bus_setup_disconnected_match(m, bus);
+ if (r < 0)
+ return 0;
+
+ r = bus_setup_api_vtables(m, bus);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to set up API vtables on new connection bus: %m");
+ return 0;
+ }
+
+ /* Best effort; the connection works without the MemoryAllocation1 interface. */
+ r = bus_register_malloc_status(bus, "org.freedesktop.systemd1");
+ if (r < 0)
+ log_warning_errno(r, "Failed to register MemoryAllocation1, ignoring: %m");
+
+ r = set_ensure_put(&m->private_buses, NULL, bus);
+ if (r == -ENOMEM) {
+ log_oom();
+ return 0;
+ }
+ if (r < 0) {
+ log_warning_errno(r, "Failed to add new connection bus to set: %m");
+ return 0;
+ }
+
+ /* Ownership passed to m->private_buses; disarm the _cleanup_ handler. */
+ TAKE_PTR(bus);
+
+ log_debug("Accepted new private connection.");
+
+ return 0;
+}
+
+/* Configures a freshly connected API bus: negotiates credential passing, installs the manager
+ * vtables, re-installs the per-unit NameOwnerChanged matches, subscribes to ActivationRequest and
+ * asynchronously requests the org.freedesktop.systemd1 well-known name. */
+static int bus_setup_api(Manager *m, sd_bus *bus) {
+ char *name;
+ Unit *u;
+ int r;
+
+ assert(m);
+ assert(bus);
+
+ /* Let's make sure we have enough credential bits so that we can make security and selinux decisions */
+ r = sd_bus_negotiate_creds(bus, 1,
+ SD_BUS_CREDS_PID|SD_BUS_CREDS_UID|
+ SD_BUS_CREDS_EUID|SD_BUS_CREDS_EFFECTIVE_CAPS|
+ SD_BUS_CREDS_SELINUX_CONTEXT);
+ if (r < 0)
+ log_warning_errno(r, "Failed to enable credential passing, ignoring: %m");
+
+ r = bus_setup_api_vtables(m, bus);
+ if (r < 0)
+ return r;
+
+ /* Re-subscribe to name ownership changes for all units that watch bus names. */
+ HASHMAP_FOREACH_KEY(u, name, m->watch_bus) {
+ r = unit_install_bus_match(u, bus, name);
+ if (r < 0)
+ log_error_errno(r, "Failed to subscribe to NameOwnerChanged signal for '%s': %m", name);
+ }
+
+ r = sd_bus_match_signal_async(
+ bus,
+ NULL,
+ "org.freedesktop.DBus",
+ "/org/freedesktop/DBus",
+ "org.freedesktop.systemd1.Activator",
+ "ActivationRequest",
+ signal_activation_request, NULL, m);
+ if (r < 0)
+ log_warning_errno(r, "Failed to subscribe to activation signal: %m");
+
+ /* Allow replacing of our name, to ease implementation of reexecution, where we keep the old connection open
+ * until after the new connection is set up and the name installed to allow clients to synchronously wait for
+ * reexecution to finish */
+ r = sd_bus_request_name_async(bus, NULL, "org.freedesktop.systemd1", SD_BUS_NAME_REPLACE_EXISTING|SD_BUS_NAME_ALLOW_REPLACEMENT, NULL, NULL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to request name: %m");
+
+ /* Best effort; the bus works without the MemoryAllocation1 interface. */
+ r = bus_register_malloc_status(bus, "org.freedesktop.systemd1");
+ if (r < 0)
+ log_warning_errno(r, "Failed to register MemoryAllocation1, ignoring: %m");
+
+ log_debug("Successfully connected to API bus.");
+
+ return 0;
+}
+
+/* Establishes m->api_bus, either by reusing the already-connected system bus (system mode) or by
+ * opening a fresh user/system connection. Idempotent: returns immediately if already set up. */
+int bus_init_api(Manager *m) {
+ _cleanup_(sd_bus_close_unrefp) sd_bus *bus = NULL;
+ int r;
+
+ if (m->api_bus)
+ return 0;
+
+ /* The API and system bus is the same if we are running in system mode */
+ if (MANAGER_IS_SYSTEM(m) && m->system_bus)
+ bus = sd_bus_ref(m->system_bus);
+ else {
+ if (MANAGER_IS_SYSTEM(m))
+ r = sd_bus_open_system_with_description(&bus, "bus-api-system");
+ else
+ r = sd_bus_open_user_with_description(&bus, "bus-api-user");
+ if (r < 0)
+ return log_error_errno(r, "Failed to connect to API bus: %m");
+
+ r = sd_bus_attach_event(bus, m->event, SD_EVENT_PRIORITY_NORMAL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to attach API bus to event loop: %m");
+
+ r = bus_setup_disconnected_match(m, bus);
+ if (r < 0)
+ return r;
+ }
+
+ r = bus_setup_api(m, bus);
+ if (r < 0)
+ return log_error_errno(r, "Failed to set up API bus: %m");
+
+ m->api_bus = TAKE_PTR(bus);
+
+ return 0;
+}
+
+/* Configures a system bus connection. For user managers this additionally subscribes to the
+ * "Released" agent signal, which is delivered via the system bus. */
+static int bus_setup_system(Manager *m, sd_bus *bus) {
+ int r;
+
+ assert(m);
+ assert(bus);
+
+ /* if we are a user instance we get the Released message via the system bus */
+ if (MANAGER_IS_USER(m)) {
+ r = sd_bus_match_signal_async(
+ bus,
+ NULL,
+ NULL,
+ "/org/freedesktop/systemd1/agent",
+ "org.freedesktop.systemd1.Agent",
+ "Released",
+ signal_agent_released, NULL, m);
+ if (r < 0)
+ log_warning_errno(r, "Failed to request Released match on system bus: %m");
+ }
+
+ log_debug("Successfully connected to system bus.");
+ return 0;
+}
+
+/* Establishes m->system_bus, reusing the API bus when both are the same connection (system mode).
+ * Idempotent: returns immediately if already set up. */
+int bus_init_system(Manager *m) {
+ _cleanup_(sd_bus_close_unrefp) sd_bus *bus = NULL;
+ int r;
+
+ if (m->system_bus)
+ return 0;
+
+ /* The API and system bus is the same if we are running in system mode */
+ if (MANAGER_IS_SYSTEM(m) && m->api_bus)
+ bus = sd_bus_ref(m->api_bus);
+ else {
+ r = sd_bus_open_system_with_description(&bus, "bus-system");
+ if (r < 0)
+ return log_error_errno(r, "Failed to connect to system bus: %m");
+
+ r = sd_bus_attach_event(bus, m->event, SD_EVENT_PRIORITY_NORMAL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to attach system bus to event loop: %m");
+
+ r = bus_setup_disconnected_match(m, bus);
+ if (r < 0)
+ return r;
+ }
+
+ r = bus_setup_system(m, bus);
+ if (r < 0)
+ return log_error_errno(r, "Failed to set up system bus: %m");
+
+ m->system_bus = TAKE_PTR(bus);
+
+ return 0;
+}
+
+/* Creates the private AF_UNIX listening socket (/run/systemd/private for PID 1,
+ * $XDG_RUNTIME_DIR/systemd/private for user managers) and hooks it into the event loop via
+ * bus_on_connection(). Idempotent: returns immediately if the listener already exists. */
+int bus_init_private(Manager *m) {
+ _cleanup_close_ int fd = -EBADF;
+ union sockaddr_union sa;
+ socklen_t sa_len;
+ sd_event_source *s;
+ int r;
+
+ assert(m);
+
+ if (m->private_listen_fd >= 0)
+ return 0;
+
+ if (MANAGER_IS_SYSTEM(m)) {
+
+ /* We want the private bus only when running as init */
+ if (getpid_cached() != 1)
+ return 0;
+
+ r = sockaddr_un_set_path(&sa.un, "/run/systemd/private");
+ } else {
+ _cleanup_free_ char *joined = NULL;
+ const char *e;
+
+ e = secure_getenv("XDG_RUNTIME_DIR");
+ if (!e)
+ return log_error_errno(SYNTHETIC_ERRNO(EHOSTDOWN),
+ "XDG_RUNTIME_DIR is not set, refusing.");
+
+ joined = path_join(e, "/systemd/private");
+ if (!joined)
+ return log_oom();
+
+ r = sockaddr_un_set_path(&sa.un, joined);
+ }
+ if (r < 0)
+ return log_error_errno(r, "Can't set path for AF_UNIX socket to bind to: %m");
+ /* sockaddr_un_set_path() returns the address length on success. */
+ sa_len = r;
+
+ (void) mkdir_parents_label(sa.un.sun_path, 0755);
+ /* Remove any stale socket left behind by a previous instance. */
+ (void) sockaddr_un_unlink(&sa.un);
+
+ fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
+ if (fd < 0)
+ return log_error_errno(errno, "Failed to allocate private socket: %m");
+
+ WITH_UMASK(0077)
+ r = bind(fd, &sa.sa, sa_len);
+ if (r < 0)
+ return log_error_errno(errno, "Failed to bind private socket: %m");
+
+ r = listen(fd, SOMAXCONN_DELUXE);
+ if (r < 0)
+ return log_error_errno(errno, "Failed to make private socket listening: %m");
+
+ /* Generate an inotify event in case somebody waits for this socket to appear using inotify() */
+ (void) touch(sa.un.sun_path);
+
+ r = sd_event_add_io(m->event, &s, fd, EPOLLIN, bus_on_connection, m);
+ if (r < 0)
+ return log_error_errno(r, "Failed to allocate event source: %m");
+
+ (void) sd_event_source_set_description(s, "bus-connection");
+
+ m->private_listen_fd = TAKE_FD(fd);
+ m->private_listen_event_source = s;
+
+ log_debug("Successfully created private D-Bus server.");
+
+ return 0;
+}
+
+/* Tears down one bus connection: releases every slot, tracker and queued message that references
+ * this bus from units, jobs and the manager, then closes and unrefs the bus and resets *bus to
+ * NULL. Safe to call with *bus == NULL. */
+static void destroy_bus(Manager *m, sd_bus **bus) {
+ Unit *u;
+ Job *j;
+
+ assert(m);
+ assert(bus);
+
+ if (!*bus)
+ return;
+
+ /* Make sure all bus slots watching names are released. */
+ HASHMAP_FOREACH(u, m->watch_bus) {
+ if (u->match_bus_slot && sd_bus_slot_get_bus(u->match_bus_slot) == *bus)
+ u->match_bus_slot = sd_bus_slot_unref(u->match_bus_slot);
+ if (u->get_name_owner_slot && sd_bus_slot_get_bus(u->get_name_owner_slot) == *bus)
+ u->get_name_owner_slot = sd_bus_slot_unref(u->get_name_owner_slot);
+ }
+
+ /* Get rid of tracked clients on this bus */
+ if (m->subscribed && sd_bus_track_get_bus(m->subscribed) == *bus)
+ m->subscribed = sd_bus_track_unref(m->subscribed);
+
+ HASHMAP_FOREACH(j, m->jobs)
+ if (j->bus_track && sd_bus_track_get_bus(j->bus_track) == *bus)
+ j->bus_track = sd_bus_track_unref(j->bus_track);
+
+ HASHMAP_FOREACH(u, m->units) {
+ if (u->bus_track && sd_bus_track_get_bus(u->bus_track) == *bus)
+ u->bus_track = sd_bus_track_unref(u->bus_track);
+
+ /* Get rid of pending freezer messages on this bus */
+ if (u->pending_freezer_invocation && sd_bus_message_get_bus(u->pending_freezer_invocation) == *bus)
+ u->pending_freezer_invocation = sd_bus_message_unref(u->pending_freezer_invocation);
+ }
+
+ /* Get rid of queued message on this bus */
+ if (m->pending_reload_message && sd_bus_message_get_bus(m->pending_reload_message) == *bus)
+ m->pending_reload_message = sd_bus_message_unref(m->pending_reload_message);
+
+ /* Possibly flush unwritten data, but only if we are
+ * unprivileged, since we don't want to sync here */
+ if (!MANAGER_IS_SYSTEM(m))
+ sd_bus_flush(*bus);
+
+ /* And destroy the object */
+ *bus = sd_bus_close_unref(*bus);
+}
+
+/* Tears down the API bus connection. */
+void bus_done_api(Manager *m) {
+ destroy_bus(m, &m->api_bus);
+}
+
+/* Tears down the system bus connection. */
+void bus_done_system(Manager *m) {
+ destroy_bus(m, &m->system_bus);
+}
+
+/* Tears down all accepted private connections plus the private listening socket itself. */
+void bus_done_private(Manager *m) {
+ sd_bus *b;
+
+ assert(m);
+
+ while ((b = set_steal_first(m->private_buses)))
+ destroy_bus(m, &b);
+
+ m->private_buses = set_free(m->private_buses);
+
+ m->private_listen_event_source = sd_event_source_disable_unref(m->private_listen_event_source);
+ m->private_listen_fd = safe_close(m->private_listen_fd);
+}
+
+/* Tears down every bus connection the manager owns and frees related bus bookkeeping. */
+void bus_done(Manager *m) {
+ assert(m);
+
+ bus_done_api(m);
+ bus_done_system(m);
+ bus_done_private(m);
+
+ /* destroy_bus() must have released the subscription tracker by now. */
+ assert(!m->subscribed);
+
+ m->deserialized_subscribed = strv_free(m->deserialized_subscribed);
+ bus_verify_polkit_async_registry_free(m->polkit_registry);
+}
+
+/* Duplicates the fds of the API bus and of all private connections into the fd set that is passed
+ * across reexecution. Returns 0 on success, negative errno on failure. */
+int bus_fdset_add_all(Manager *m, FDSet *fds) {
+ sd_bus *b;
+ int fd;
+
+ assert(m);
+ assert(fds);
+
+ /* When we are about to reexecute we add all D-Bus fds to the
+ * set to pass over to the newly executed systemd. They won't
+ * be used there however, except that they are closed at the
+ * very end of deserialization, thus making it possible for
+ * clients to synchronously wait for systemd to reexec by
+ * simply waiting for disconnection */
+
+ if (m->api_bus) {
+ fd = sd_bus_get_fd(m->api_bus);
+ if (fd >= 0) {
+ fd = fdset_put_dup(fds, fd);
+ if (fd < 0)
+ return fd;
+ }
+ }
+
+ SET_FOREACH(b, m->private_buses) {
+ fd = sd_bus_get_fd(b);
+ if (fd >= 0) {
+ fd = fdset_put_dup(fds, fd);
+ if (fd < 0)
+ return fd;
+ }
+ }
+
+ /* We don't offer any APIs on the system bus (well, unless it
+ * is the same as the API bus) hence we don't bother with it
+ * here */
+
+ return 0;
+}
+
+/* Invokes send_message() on every ready private connection, and on the API bus if at least one
+ * client is tracked in m->subscribed or in the optional subscribed2 tracker. Returns 0 if all
+ * invocations succeeded, otherwise the last error returned by send_message(). */
+int bus_foreach_bus(
+ Manager *m,
+ sd_bus_track *subscribed2,
+ int (*send_message)(sd_bus *bus, void *userdata),
+ void *userdata) {
+
+ sd_bus *b;
+ int r, ret = 0;
+
+ /* Send to all direct buses, unconditionally */
+ SET_FOREACH(b, m->private_buses) {
+
+ /* Don't bother with enqueuing these messages to clients that haven't started yet */
+ if (sd_bus_is_ready(b) <= 0)
+ continue;
+
+ r = send_message(b, userdata);
+ if (r < 0)
+ ret = r;
+ }
+
+ /* Send to API bus, but only if somebody is subscribed */
+ if (m->api_bus &&
+ (sd_bus_track_count(m->subscribed) > 0 ||
+ sd_bus_track_count(subscribed2) > 0)) {
+ r = send_message(m->api_bus, userdata);
+ if (r < 0)
+ ret = r;
+ }
+
+ return ret;
+}
+
+/* Serializes every name tracked by t into f, one "prefix" item per tracked reference (names
+ * tracked multiple times are written multiple times). Accepts t == NULL as an empty tracker. */
+void bus_track_serialize(sd_bus_track *t, FILE *f, const char *prefix) {
+ const char *n;
+
+ assert(f);
+ assert(prefix);
+
+ for (n = sd_bus_track_first(t); n; n = sd_bus_track_next(t)) {
+ int c, j;
+
+ c = sd_bus_track_count_name(t, n);
+ for (j = 0; j < c; j++)
+ (void) serialize_item(f, prefix, n);
+ }
+}
+
+/* Re-populates a bus name tracker from the deserialized name list l after reexec/reload,
+ * allocating *t on the API bus if needed. No-op when l is empty or the API bus is not up yet. */
+int bus_track_coldplug(Manager *m, sd_bus_track **t, bool recursive, char **l) {
+ int r;
+
+ assert(m);
+ assert(t);
+
+ if (strv_isempty(l))
+ return 0;
+
+ if (!m->api_bus)
+ return 0;
+
+ if (!*t) {
+ r = sd_bus_track_new(m->api_bus, t, NULL, NULL);
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_bus_track_set_recursive(*t, recursive);
+ if (r < 0)
+ return r;
+
+ return bus_track_add_name_many(*t, l);
+}
+
+/* The following helpers wrap bus_verify_polkit_async() with the polkit action ID that guards the
+ * respective group of manager D-Bus calls; all of them fall back to CAP_SYS_ADMIN. */
+int bus_verify_manage_units_async(Manager *m, sd_bus_message *call, sd_bus_error *error) {
+ return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.manage-units", NULL, false, UID_INVALID, &m->polkit_registry, error);
+}
+
+int bus_verify_manage_unit_files_async(Manager *m, sd_bus_message *call, sd_bus_error *error) {
+ return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.manage-unit-files", NULL, false, UID_INVALID, &m->polkit_registry, error);
+}
+
+int bus_verify_reload_daemon_async(Manager *m, sd_bus_message *call, sd_bus_error *error) {
+ return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.reload-daemon", NULL, false, UID_INVALID, &m->polkit_registry, error);
+}
+
+int bus_verify_set_environment_async(Manager *m, sd_bus_message *call, sd_bus_error *error) {
+ return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.set-environment", NULL, false, UID_INVALID, &m->polkit_registry, error);
+}
+int bus_verify_bypass_dump_ratelimit_async(Manager *m, sd_bus_message *call, sd_bus_error *error) {
+ return bus_verify_polkit_async(call, CAP_SYS_ADMIN, "org.freedesktop.systemd1.bypass-dump-ratelimit", NULL, false, UID_INVALID, &m->polkit_registry, error);
+}
+
+/* Returns the total number of messages queued for writing on all our direct and API buses.
+ * Per-bus query failures are logged at debug level and treated as zero. */
+uint64_t manager_bus_n_queued_write(Manager *m) {
+ uint64_t c = 0;
+ sd_bus *b;
+ int r;
+
+ /* Returns the total number of messages queued for writing on all our direct and API buses. */
+
+ SET_FOREACH(b, m->private_buses) {
+ uint64_t k;
+
+ r = sd_bus_get_n_queued_write(b, &k);
+ if (r < 0)
+ log_debug_errno(r, "Failed to query queued messages for private bus: %m");
+ else
+ c += k;
+ }
+
+ if (m->api_bus) {
+ uint64_t k;
+
+ r = sd_bus_get_n_queued_write(m->api_bus, &k);
+ if (r < 0)
+ log_debug_errno(r, "Failed to query queued messages for API bus: %m");
+ else
+ c += k;
+ }
+
+ return c;
+}
+
+/* Writes the member name of each readable or writable property in table to f, one per line,
+ * skipping properties flagged as deprecated or hidden. */
+static void vtable_dump_bus_properties(FILE *f, const sd_bus_vtable *table) {
+ const sd_bus_vtable *i;
+
+ for (i = table; i->type != _SD_BUS_VTABLE_END; i++) {
+ if (!IN_SET(i->type, _SD_BUS_VTABLE_PROPERTY, _SD_BUS_VTABLE_WRITABLE_PROPERTY) ||
+ (i->flags & (SD_BUS_VTABLE_DEPRECATED | SD_BUS_VTABLE_HIDDEN)) != 0)
+ continue;
+
+ fprintf(f, "%s\n", i->x.property.member);
+ }
+}
+
+/* Dumps the property names of every manager/unit-type vtable, for introspection tooling. */
+void dump_bus_properties(FILE *f) {
+ assert(f);
+
+ vtable_dump_bus_properties(f, bus_automount_vtable);
+ vtable_dump_bus_properties(f, bus_cgroup_vtable);
+ vtable_dump_bus_properties(f, bus_device_vtable);
+ vtable_dump_bus_properties(f, bus_exec_vtable);
+ vtable_dump_bus_properties(f, bus_job_vtable);
+ vtable_dump_bus_properties(f, bus_kill_vtable);
+ vtable_dump_bus_properties(f, bus_manager_vtable);
+ vtable_dump_bus_properties(f, bus_mount_vtable);
+ vtable_dump_bus_properties(f, bus_path_vtable);
+ vtable_dump_bus_properties(f, bus_scope_vtable);
+ vtable_dump_bus_properties(f, bus_service_vtable);
+ vtable_dump_bus_properties(f, bus_slice_vtable);
+ vtable_dump_bus_properties(f, bus_socket_vtable);
+ vtable_dump_bus_properties(f, bus_swap_vtable);
+ vtable_dump_bus_properties(f, bus_target_vtable);
+ vtable_dump_bus_properties(f, bus_timer_vtable);
+ vtable_dump_bus_properties(f, bus_unit_vtable);
+ vtable_dump_bus_properties(f, bus_unit_cgroup_vtable);
+}
diff --git a/src/core/dbus.h b/src/core/dbus.h
new file mode 100644
index 0000000..50e7bb4
--- /dev/null
+++ b/src/core/dbus.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+
+#include "manager.h"
+
+int bus_send_pending_reload_message(Manager *m);
+
+int bus_init_private(Manager *m);
+int bus_init_api(Manager *m);
+int bus_init_system(Manager *m);
+
+void bus_done_private(Manager *m);
+void bus_done_api(Manager *m);
+void bus_done_system(Manager *m);
+void bus_done(Manager *m);
+
+int bus_fdset_add_all(Manager *m, FDSet *fds);
+
+void bus_track_serialize(sd_bus_track *t, FILE *f, const char *prefix);
+int bus_track_coldplug(Manager *m, sd_bus_track **t, bool recursive, char **l);
+
+int bus_foreach_bus(Manager *m, sd_bus_track *subscribed2, int (*send_message)(sd_bus *bus, void *userdata), void *userdata);
+
+int bus_verify_manage_units_async(Manager *m, sd_bus_message *call, sd_bus_error *error);
+int bus_verify_manage_unit_files_async(Manager *m, sd_bus_message *call, sd_bus_error *error);
+int bus_verify_reload_daemon_async(Manager *m, sd_bus_message *call, sd_bus_error *error);
+int bus_verify_set_environment_async(Manager *m, sd_bus_message *call, sd_bus_error *error);
+int bus_verify_bypass_dump_ratelimit_async(Manager *m, sd_bus_message *call, sd_bus_error *error);
+
+int bus_forward_agent_released(Manager *m, const char *path);
+
+uint64_t manager_bus_n_queued_write(Manager *m);
+
+void dump_bus_properties(FILE *f);
+int bus_manager_introspect_implementations(FILE *out, const char *pattern);
diff --git a/src/core/device.c b/src/core/device.c
new file mode 100644
index 0000000..6b2d7c3
--- /dev/null
+++ b/src/core/device.c
@@ -0,0 +1,1301 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <sys/epoll.h>
+
+#include "sd-messages.h"
+
+#include "alloc-util.h"
+#include "bus-common-errors.h"
+#include "dbus-device.h"
+#include "dbus-unit.h"
+#include "device-private.h"
+#include "device-util.h"
+#include "device.h"
+#include "log.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "ratelimit.h"
+#include "serialize.h"
+#include "stat-util.h"
+#include "string-util.h"
+#include "swap.h"
+#include "udev-util.h"
+#include "unit-name.h"
+#include "unit.h"
+
+/* Maps device-unit states onto generic unit active states. */
+static const UnitActiveState state_translation_table[_DEVICE_STATE_MAX] = {
+ [DEVICE_DEAD] = UNIT_INACTIVE,
+ [DEVICE_TENTATIVE] = UNIT_ACTIVATING,
+ [DEVICE_PLUGGED] = UNIT_ACTIVE,
+};
+
+static int device_dispatch_io(sd_device_monitor *monitor, sd_device *dev, void *userdata);
+
+/* Looks up the device unit corresponding to a device node path. Returns 0 and stores the unit in
+ * *ret (if non-NULL) on success, -ENOENT if no such unit is loaded, or another negative errno if
+ * the path cannot be converted to a unit name. */
+static int device_by_path(Manager *m, const char *path, Unit **ret) {
+ _cleanup_free_ char *e = NULL;
+ Unit *u;
+ int r;
+
+ assert(m);
+ assert(path);
+
+ r = unit_name_from_path(path, ".device", &e);
+ if (r < 0)
+ return r;
+
+ u = manager_get_unit(m, e);
+ if (!u)
+ return -ENOENT;
+
+ if (ret)
+ *ret = u;
+ return 0;
+}
+
+/* Detaches d from the per-sysfs-path linked list in manager->devices_by_sysfs and frees d->sysfs. */
+static void device_unset_sysfs(Device *d) {
+ Hashmap *devices;
+
+ assert(d);
+
+ if (!d->sysfs)
+ return;
+
+ /* Remove this unit from the chain of devices which share the same sysfs path. */
+
+ devices = UNIT(d)->manager->devices_by_sysfs;
+
+ if (d->same_sysfs_prev)
+ /* If this is not the first unit, then simply remove this unit. */
+ d->same_sysfs_prev->same_sysfs_next = d->same_sysfs_next;
+ else if (d->same_sysfs_next)
+ /* If this is the first unit, replace with the next unit. */
+ assert_se(hashmap_replace(devices, d->same_sysfs_next->sysfs, d->same_sysfs_next) >= 0);
+ else
+ /* Otherwise, remove the entry. */
+ hashmap_remove(devices, d->sysfs);
+
+ if (d->same_sysfs_next)
+ d->same_sysfs_next->same_sysfs_prev = d->same_sysfs_prev;
+
+ d->same_sysfs_prev = d->same_sysfs_next = NULL;
+
+ d->sysfs = mfree(d->sysfs);
+}
+
+/* Associates d with a sysfs path: unlinks it from any previous path's chain and prepends it to the
+ * chain of units sharing the new path in manager->devices_by_sysfs. No-op if the path is unchanged. */
+static int device_set_sysfs(Device *d, const char *sysfs) {
+ _cleanup_free_ char *copy = NULL;
+ Device *first;
+ int r;
+
+ assert(d);
+
+ if (streq_ptr(d->sysfs, sysfs))
+ return 0;
+
+ r = hashmap_ensure_allocated(&UNIT(d)->manager->devices_by_sysfs, &path_hash_ops);
+ if (r < 0)
+ return r;
+
+ copy = strdup(sysfs);
+ if (!copy)
+ return -ENOMEM;
+
+ device_unset_sysfs(d);
+
+ first = hashmap_get(UNIT(d)->manager->devices_by_sysfs, sysfs);
+ LIST_PREPEND(same_sysfs, first, d);
+
+ r = hashmap_replace(UNIT(d)->manager->devices_by_sysfs, copy, first);
+ if (r < 0) {
+ /* Roll back the list insertion so the chain stays consistent. */
+ LIST_REMOVE(same_sysfs, first, d);
+ return r;
+ }
+
+ d->sysfs = TAKE_PTR(copy);
+ unit_add_to_dbus_queue(UNIT(d));
+
+ return 0;
+}
+
+/* Unit vtable hook: initializes a freshly allocated device unit. */
+static void device_init(Unit *u) {
+ Device *d = DEVICE(u);
+
+ assert(d);
+ assert(UNIT(d)->load_state == UNIT_STUB);
+
+ /* In contrast to all other unit types we timeout jobs waiting
+ * for devices by default. This is because they otherwise wait
+ * indefinitely for plugged in devices, something which cannot
+ * happen for the other units since their operations time out
+ * anyway. */
+ u->job_running_timeout = u->manager->defaults.device_timeout_usec;
+
+ u->ignore_on_isolate = true;
+
+ d->deserialized_state = _DEVICE_STATE_INVALID;
+}
+
+/* Unit vtable hook: releases all memory owned by the device unit. */
+static void device_done(Unit *u) {
+ Device *d = DEVICE(u);
+
+ assert(d);
+
+ device_unset_sysfs(d);
+ d->deserialized_sysfs = mfree(d->deserialized_sysfs);
+ d->wants_property = strv_free(d->wants_property);
+ d->path = mfree(d->path);
+}
+
+/* Unit vtable hook: loads the (optional) unit file and derives a fallback description from the
+ * unit name's device path. */
+static int device_load(Unit *u) {
+ int r;
+
+ r = unit_load_fragment_and_dropin(u, false);
+ if (r < 0)
+ return r;
+
+ if (!u->description) {
+ /* Generate a description based on the path, to be used until the device is initialized
+ properly */
+ r = unit_name_to_path(u->id, &u->description);
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to unescape name: %m");
+ }
+
+ return 0;
+}
+
+/* Moves the device unit to the given state, emitting change signals and notifying the unit core.
+ * Entering DEVICE_DEAD drops the sysfs association. */
+static void device_set_state(Device *d, DeviceState state) {
+ DeviceState old_state;
+
+ assert(d);
+
+ if (d->state != state)
+ bus_unit_send_pending_change_signal(UNIT(d), false);
+
+ old_state = d->state;
+ d->state = state;
+
+ if (state == DEVICE_DEAD)
+ device_unset_sysfs(d);
+
+ if (state != old_state)
+ log_unit_debug(UNIT(d), "Changed %s -> %s", device_state_to_string(old_state), device_state_to_string(state));
+
+ unit_notify(UNIT(d), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true);
+}
+
+/* Derives the new device unit state from the transition of the "found" bitmask. */
+static void device_found_changed(Device *d, DeviceFound previous, DeviceFound now) {
+ assert(d);
+
+ /* Didn't exist before, but does now? if so, generate a new invocation ID for it */
+ if (previous == DEVICE_NOT_FOUND && now != DEVICE_NOT_FOUND)
+ (void) unit_acquire_invocation_id(UNIT(d));
+
+ if (FLAGS_SET(now, DEVICE_FOUND_UDEV))
+ /* When the device is known to udev we consider it plugged. */
+ device_set_state(d, DEVICE_PLUGGED);
+ else if (now != DEVICE_NOT_FOUND && !FLAGS_SET(previous, DEVICE_FOUND_UDEV))
+ /* If the device has not been seen by udev yet, but is now referenced by the kernel, then we assume the
+ * kernel knows it now, and udev might soon too. */
+ device_set_state(d, DEVICE_TENTATIVE);
+ else
+ /* If nobody sees the device, or if the device was previously seen by udev and now is only referenced
+ * from the kernel, then we consider the device is gone, the kernel just hasn't noticed it yet. */
+ device_set_state(d, DEVICE_DEAD);
+}
+
+/* Applies (found & mask) to the unit's "found" bits: directly (with state transitions) while the
+ * manager is running, otherwise into the shadow enumerated_found mask for later catch-up. */
+static void device_update_found_one(Device *d, DeviceFound found, DeviceFound mask) {
+ assert(d);
+
+ if (MANAGER_IS_RUNNING(UNIT(d)->manager)) {
+ DeviceFound n, previous;
+
+ /* When we are already running, then apply the new mask right-away, and trigger state changes
+ * right-away */
+
+ n = (d->found & ~mask) | (found & mask);
+ if (n == d->found)
+ return;
+
+ previous = d->found;
+ d->found = n;
+
+ device_found_changed(d, previous, n);
+ } else
+ /* We aren't running yet, let's apply the new mask to the shadow variable instead, which we'll apply as
+ * soon as we catch-up with the state. */
+ d->enumerated_found = (d->enumerated_found & ~mask) | (found & mask);
+}
+
+/* Applies a "found" update to every device unit sharing the given sysfs path. */
+static void device_update_found_by_sysfs(Manager *m, const char *sysfs, DeviceFound found, DeviceFound mask) {
+ Device *l;
+
+ assert(m);
+ assert(sysfs);
+
+ if (mask == 0)
+ return;
+
+ l = hashmap_get(m->devices_by_sysfs, sysfs);
+ LIST_FOREACH(same_sysfs, d, l)
+ device_update_found_one(d, found, mask);
+}
+
+/* Applies a "found" update to the device unit named after the given device node path, if loaded. */
+static void device_update_found_by_name(Manager *m, const char *path, DeviceFound found, DeviceFound mask) {
+ Unit *u;
+
+ assert(m);
+ assert(path);
+
+ if (mask == 0)
+ return;
+
+ if (device_by_path(m, path, &u) < 0)
+ return;
+
+ device_update_found_one(DEVICE(u), found, mask);
+}
+
+/* Unit vtable hook: applies the deserialized state and "found" mask after reload/reexec/
+ * switch-root. See the long comment below for the full state-machine rationale. */
+static int device_coldplug(Unit *u) {
+ Device *d = DEVICE(u);
+
+ assert(d);
+ assert(d->state == DEVICE_DEAD);
+
+ /* First, let's put the deserialized state and found mask into effect, if we have it. */
+ if (d->deserialized_state < 0)
+ return 0;
+
+ Manager *m = u->manager;
+ DeviceFound found = d->deserialized_found;
+ DeviceState state = d->deserialized_state;
+
+ /* On initial boot, switch-root, reload, reexecute, the following happen:
+ * 1. MANAGER_IS_RUNNING() == false
+ * 2. enumerate devices: manager_enumerate() -> device_enumerate()
+ * Device.enumerated_found is set.
+ * 3. deserialize devices: manager_deserialize() -> device_deserialize_item()
+ * Device.deserialize_state and Device.deserialized_found are set.
+ * 4. coldplug devices: manager_coldplug() -> device_coldplug()
+ * deserialized properties are copied to the main properties.
+ * 5. MANAGER_IS_RUNNING() == true: manager_ready()
+ * 6. catchup devices: manager_catchup() -> device_catchup()
+ * Device.enumerated_found is applied to Device.found, and state is updated based on that.
+ *
+ * Notes:
+ * - On initial boot, no udev database exists. Hence, no devices are enumerated in the step 2.
+ * Also, there is no deserialized device. Device units are (a) generated based on dependencies of
+ * other units, or (b) generated when uevents are received.
+ *
+ * - On switch-root, the udev database may be cleared, except for devices with sticky bit, i.e.
+ * OPTIONS="db_persist". Hence, almost no devices are enumerated in the step 2. However, in
+ * general, we have several serialized devices. So, DEVICE_FOUND_UDEV bit in the
+ * Device.deserialized_found must be ignored, as udev rules in initrd and the main system are often
+ * different. If the deserialized state is DEVICE_PLUGGED, we need to downgrade it to
+ * DEVICE_TENTATIVE. Unlike the other starting mode, MANAGER_IS_SWITCHING_ROOT() is true when
+ * device_coldplug() and device_catchup() are called. Hence, let's conditionalize the operations by
+ * using the flag. After switch-root, systemd-udevd will (re-)process all devices, and the
+ * Device.found and Device.state will be adjusted.
+ *
+ * - On reload or reexecute, we can trust Device.enumerated_found, Device.deserialized_found, and
+ * Device.deserialized_state. Of course, deserialized parameters may be outdated, but the unit
+ * state can be adjusted later by device_catchup() or uevents. */
+
+ if (MANAGER_IS_SWITCHING_ROOT(m) &&
+ !FLAGS_SET(d->enumerated_found, DEVICE_FOUND_UDEV)) {
+
+ /* The device has not been enumerated. On switching-root, such situation is natural. See the
+ * above comment. To prevent problematic state transition active → dead → active, let's
+ * drop the DEVICE_FOUND_UDEV flag and downgrade state to DEVICE_TENTATIVE(activating). See
+ * issue #12953 and #23208. */
+ found &= ~DEVICE_FOUND_UDEV;
+ if (state == DEVICE_PLUGGED)
+ state = DEVICE_TENTATIVE;
+
+ /* Also check the validity of the device syspath. Without this check, if the device was
+ * removed while switching root, it would never go to inactive state, as both Device.found
+ * and Device.enumerated_found do not have the DEVICE_FOUND_UDEV flag, so device_catchup() in
+ * device_update_found_one() does nothing in most cases. See issue #25106. Note that the
+ * syspath field is only serialized when systemd is sufficiently new and the device has been
+ * already processed by udevd. */
+ if (d->deserialized_sysfs) {
+ _cleanup_(sd_device_unrefp) sd_device *dev = NULL;
+
+ if (sd_device_new_from_syspath(&dev, d->deserialized_sysfs) < 0)
+ state = DEVICE_DEAD;
+ }
+ }
+
+ if (d->found == found && d->state == state)
+ return 0;
+
+ d->found = found;
+ device_set_state(d, state);
+ return 0;
+}
+
+/* Unit vtable hook: applies the "found" bits collected during enumeration, now that the manager is
+ * running (step 6 of the sequence described in device_coldplug()). */
+static void device_catchup(Unit *u) {
+ Device *d = DEVICE(u);
+
+ assert(d);
+
+ /* Second, let's update the state with the enumerated state */
+ device_update_found_one(d, d->enumerated_found, DEVICE_FOUND_MASK);
+}
+
+/* Serialization names for the individual DeviceFound flags. */
+static const struct {
+ DeviceFound flag;
+ const char *name;
+} device_found_map[] = {
+ { DEVICE_FOUND_UDEV, "found-udev" },
+ { DEVICE_FOUND_MOUNT, "found-mount" },
+ { DEVICE_FOUND_SWAP, "found-swap" },
+};
+
+/* Formats a DeviceFound bitmask as a comma-separated list of flag names into *ret. */
+static int device_found_to_string_many(DeviceFound flags, char **ret) {
+ _cleanup_free_ char *s = NULL;
+
+ assert(ret);
+
+ for (size_t i = 0; i < ELEMENTSOF(device_found_map); i++) {
+ if (!FLAGS_SET(flags, device_found_map[i].flag))
+ continue;
+
+ if (!strextend_with_separator(&s, ",", device_found_map[i].name))
+ return -ENOMEM;
+ }
+
+ *ret = TAKE_PTR(s);
+
+ return 0;
+}
+
+/* Parses a comma-separated list of flag names back into a DeviceFound bitmask. Returns -EINVAL on
+ * an unknown flag name. */
+static int device_found_from_string_many(const char *name, DeviceFound *ret) {
+ DeviceFound flags = 0;
+ int r;
+
+ assert(ret);
+
+ for (;;) {
+ _cleanup_free_ char *word = NULL;
+ DeviceFound f = 0;
+ unsigned i;
+
+ r = extract_first_word(&name, &word, ",", 0);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ for (i = 0; i < ELEMENTSOF(device_found_map); i++)
+ if (streq(word, device_found_map[i].name)) {
+ f = device_found_map[i].flag;
+ break;
+ }
+
+ if (f == 0)
+ return -EINVAL;
+
+ flags |= f;
+ }
+
+ *ret = flags;
+ return 0;
+}
+
+static int device_serialize(Unit *u, FILE *f, FDSet *fds) {
+ _cleanup_free_ char *s = NULL;
+ Device *d = DEVICE(u);
+
+ assert(d);
+ assert(u);
+ assert(f);
+ assert(fds);
+
+ if (d->sysfs)
+ (void) serialize_item(f, "sysfs", d->sysfs);
+
+ if (d->path)
+ (void) serialize_item(f, "path", d->path);
+
+ (void) serialize_item(f, "state", device_state_to_string(d->state));
+
+ if (device_found_to_string_many(d->found, &s) >= 0)
+ (void) serialize_item(f, "found", s);
+
+ return 0;
+}
+
+static int device_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
+ Device *d = DEVICE(u);
+ int r;
+
+ assert(d);
+ assert(u);
+ assert(key);
+ assert(value);
+ assert(fds);
+
+ if (streq(key, "sysfs")) {
+ if (!d->deserialized_sysfs) {
+ d->deserialized_sysfs = strdup(value);
+ if (!d->deserialized_sysfs)
+ log_oom_debug();
+ }
+
+ } else if (streq(key, "path")) {
+ if (!d->path) {
+ d->path = strdup(value);
+ if (!d->path)
+ log_oom_debug();
+ }
+
+ } else if (streq(key, "state")) {
+ DeviceState state;
+
+ state = device_state_from_string(value);
+ if (state < 0)
+ log_unit_debug(u, "Failed to parse state value, ignoring: %s", value);
+ else
+ d->deserialized_state = state;
+
+ } else if (streq(key, "found")) {
+ r = device_found_from_string_many(value, &d->deserialized_found);
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to parse found value '%s', ignoring: %m", value);
+
+ } else
+ log_unit_debug(u, "Unknown serialization key: %s", key);
+
+ return 0;
+}
+
+static void device_dump(Unit *u, FILE *f, const char *prefix) {
+ Device *d = DEVICE(u);
+ _cleanup_free_ char *s = NULL;
+
+ assert(d);
+
+ (void) device_found_to_string_many(d->found, &s);
+
+ fprintf(f,
+ "%sDevice State: %s\n"
+ "%sDevice Path: %s\n"
+ "%sSysfs Path: %s\n"
+ "%sFound: %s\n",
+ prefix, device_state_to_string(d->state),
+ prefix, strna(d->path),
+ prefix, strna(d->sysfs),
+ prefix, strna(s));
+
+ STRV_FOREACH(i, d->wants_property)
+ fprintf(f, "%sudev SYSTEMD_WANTS: %s\n",
+ prefix, *i);
+}
+
+static UnitActiveState device_active_state(Unit *u) {
+ assert(u);
+
+ return state_translation_table[DEVICE(u)->state];
+}
+
+ /* Return the device-specific sub-state name for status output. */
+static const char *device_sub_state_to_string(Unit *u) {
+ assert(u);
+
+ return device_state_to_string(DEVICE(u)->state);
+}
+
+ /* Derive a friendly unit description: prefer the device model string
+  * (optionally combined with a filesystem/partition label), falling back to
+  * the raw path. 'dev' may be NULL, in which case the path is used as-is. */
+static int device_update_description(Unit *u, sd_device *dev, const char *path) {
+ _cleanup_free_ char *j = NULL;
+ const char *model, *label, *desc;
+ int r;
+
+ assert(u);
+ assert(path);
+
+ desc = path;
+
+ if (dev && device_get_model_string(dev, &model) >= 0) {
+ desc = model;
+
+ /* Try to concatenate the device model string with a label, if there is one */
+ if (sd_device_get_property_value(dev, "ID_FS_LABEL", &label) >= 0 ||
+ sd_device_get_property_value(dev, "ID_PART_ENTRY_NAME", &label) >= 0 ||
+ sd_device_get_property_value(dev, "ID_PART_ENTRY_NUMBER", &label) >= 0) {
+
+ desc = j = strjoin(model, " ", label);
+ if (!j)
+ return log_oom();
+ }
+ }
+
+ r = unit_set_description(u, desc);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to set device description: %m");
+
+ return 0;
+}
+
+ /* Parse the SYSTEMD_WANTS (or SYSTEMD_USER_WANTS for user managers) udev
+  * property into Wants= dependencies on this device unit. Template names get
+  * the escaped sysfs path as instance; other names are mangled into valid
+  * unit names. If the device was already up, newly-appearing entries are
+  * additionally started right away (see comment below). The parsed list is
+  * cached in d->wants_property for the next comparison. */
+static int device_add_udev_wants(Unit *u, sd_device *dev) {
+ _cleanup_strv_free_ char **added = NULL;
+ const char *wants, *property;
+ Device *d = DEVICE(u);
+ int r;
+
+ assert(d);
+ assert(dev);
+
+ property = MANAGER_IS_USER(u->manager) ? "SYSTEMD_USER_WANTS" : "SYSTEMD_WANTS";
+
+ r = sd_device_get_property_value(dev, property, &wants);
+ if (r < 0)
+ return 0;
+
+ for (;;) {
+ _cleanup_free_ char *word = NULL, *k = NULL;
+
+ r = extract_first_word(&wants, &word, NULL, EXTRACT_UNQUOTE);
+ if (r == 0)
+ break;
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to parse property %s with value %s: %m", property, wants);
+
+ if (unit_name_is_valid(word, UNIT_NAME_TEMPLATE) && d->sysfs) {
+ _cleanup_free_ char *escaped = NULL;
+
+ /* If the unit name is specified as template, then automatically fill in the sysfs path of the
+ * device as instance name, properly escaped. */
+
+ r = unit_name_path_escape(d->sysfs, &escaped);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to escape %s: %m", d->sysfs);
+
+ r = unit_name_replace_instance(word, escaped, &k);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to build %s instance of template %s: %m", escaped, word);
+ } else {
+ /* If this is not a template, then let's mangle it so, that it becomes a valid unit name. */
+
+ r = unit_name_mangle(word, UNIT_NAME_MANGLE_WARN, &k);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to mangle unit name \"%s\": %m", word);
+ }
+
+ r = unit_add_dependency_by_name(u, UNIT_WANTS, k, true, UNIT_DEPENDENCY_UDEV);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to add Wants= dependency: %m");
+
+ r = strv_consume(&added, TAKE_PTR(k));
+ if (r < 0)
+ return log_oom();
+ }
+
+ if (d->state != DEVICE_DEAD)
+ /* So here's a special hack, to compensate for the fact that the udev database's reload cycles are not
+ * synchronized with our own reload cycles: when we detect that the SYSTEMD_WANTS property of a device
+ * changes while the device unit is already up, let's skip to trigger units that were already listed
+ * and are active, and start units otherwise. This typically happens during the boot-time switch root
+ * transition, as udev devices will generally already be up in the initrd, but SYSTEMD_WANTS properties
+ * get then added through udev rules only available on the host system, and thus only when the initial
+ * udev coldplug trigger runs.
+ *
+ * We do this only if the device has been up already when we parse this, as otherwise the usual
+ * dependency logic that is run from the dead → plugged transition will trigger these deps. */
+ STRV_FOREACH(i, added) {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+
+ if (strv_contains(d->wants_property, *i)) {
+ Unit *v;
+
+ v = manager_get_unit(u->manager, *i);
+ if (v && UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(v)))
+ continue; /* The unit was already listed and is running. */
+ }
+
+ r = manager_add_job_by_name(u->manager, JOB_START, *i, JOB_FAIL, NULL, &error, NULL);
+ if (r < 0)
+ log_unit_full_errno(u, sd_bus_error_has_name(&error, BUS_ERROR_NO_SUCH_UNIT) ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to enqueue %s job, ignoring: %s", property, bus_error_message(&error, r));
+ }
+
+ return strv_free_and_replace(d->wants_property, added);
+}
+
+ /* Query the SYSTEMD_MOUNT_DEVICE_BOUND udev property and cache the result in
+  * d->bind_mounts (note the side effect: the flag is updated on every call).
+  * An unset property (-ENOENT) silently counts as false. */
+static bool device_is_bound_by_mounts(Device *d, sd_device *dev) {
+ int r;
+
+ assert(d);
+ assert(dev);
+
+ r = device_get_property_bool(dev, "SYSTEMD_MOUNT_DEVICE_BOUND");
+ if (r < 0 && r != -ENOENT)
+ log_device_warning_errno(dev, r, "Failed to parse SYSTEMD_MOUNT_DEVICE_BOUND= udev property, ignoring: %m");
+
+ d->bind_mounts = r > 0;
+
+ return d->bind_mounts;
+}
+
+ /* For every mount unit that has Requires= on this device, add the stronger
+  * BindsTo= (tagged as udev-originated). Per-unit failures are logged and
+  * skipped. */
+static void device_upgrade_mount_deps(Unit *u) {
+ Unit *other;
+ void *v;
+ int r;
+
+ /* Let's upgrade Requires= to BindsTo= on us. (Used when SYSTEMD_MOUNT_DEVICE_BOUND is set) */
+
+ HASHMAP_FOREACH_KEY(v, other, unit_get_dependencies(u, UNIT_REQUIRED_BY)) {
+ if (other->type != UNIT_MOUNT)
+ continue;
+
+ r = unit_add_dependency(other, UNIT_BINDS_TO, u, true, UNIT_DEPENDENCY_UDEV);
+ if (r < 0)
+ log_unit_warning_errno(u, r, "Failed to add BindsTo= dependency between device and mount unit, ignoring: %m");
+ }
+}
+
+ /* Create or refresh the .device unit named after 'path' (a syspath, device
+  * node, alias or devlink). 'dev' may be NULL for a unit created from a path
+  * only. When 'main' is set, SYSTEMD_WANTS processing is applied (only the
+  * unit named after the syspath gets that). On success the unit is optionally
+  * stored in *units. */
+static int device_setup_unit(Manager *m, sd_device *dev, const char *path, bool main, Set **units) {
+ _cleanup_(unit_freep) Unit *new_unit = NULL;
+ _cleanup_free_ char *e = NULL;
+ const char *sysfs = NULL;
+ Unit *u;
+ int r;
+
+ assert(m);
+ assert(path);
+
+ if (dev) {
+ r = sd_device_get_syspath(dev, &sysfs);
+ if (r < 0)
+ return log_device_debug_errno(dev, r, "Couldn't get syspath from device, ignoring: %m");
+ }
+
+ r = unit_name_from_path(path, ".device", &e);
+ if (r < 0)
+ return log_struct_errno(
+ LOG_WARNING, r,
+ "MESSAGE_ID=" SD_MESSAGE_DEVICE_PATH_NOT_SUITABLE_STR,
+ "DEVICE=%s", path,
+ LOG_MESSAGE("Failed to generate valid unit name from device path '%s', ignoring device: %m",
+ path));
+
+ u = manager_get_unit(m, e);
+ if (u) {
+ /* The device unit can still be present even if the device was unplugged: a mount unit can reference it
+ * hence preventing the GC to have garbaged it. That's desired since the device unit may have a
+ * dependency on the mount unit which was added during the loading of the later. When the device is
+ * plugged the sysfs might not be initialized yet, as we serialize the device's state but do not
+ * serialize the sysfs path across reloads/reexecs. Hence, when coming back from a reload/restart we
+ * might have the state valid, but not the sysfs path. Also, there is another possibility; when multiple
+ * devices have the same devlink (e.g. /dev/disk/by-uuid/xxxx), adding/updating/removing one of the
+ * device causes syspath change. Hence, let's always update sysfs path. */
+
+ /* Let's remove all dependencies generated due to udev properties. We'll re-add whatever is configured
+ * now below. */
+ unit_remove_dependencies(u, UNIT_DEPENDENCY_UDEV);
+
+ } else {
+ r = unit_new_for_name(m, sizeof(Device), e, &new_unit);
+ if (r < 0)
+ return log_device_error_errno(dev, r, "Failed to allocate device unit %s: %m", e);
+
+ u = new_unit;
+
+ unit_add_to_load_queue(u);
+ }
+
+ /* Remember the path this unit was created for, if not set yet. */
+ if (!DEVICE(u)->path) {
+ DEVICE(u)->path = strdup(path);
+ if (!DEVICE(u)->path)
+ return log_oom();
+ }
+
+ /* If this was created via some dependency and has not actually been seen yet ->sysfs will not be
+ * initialized. Hence initialize it if necessary. */
+ if (sysfs) {
+ r = device_set_sysfs(DEVICE(u), sysfs);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to set sysfs path %s: %m", sysfs);
+
+ /* The additional systemd udev properties we only interpret for the main object */
+ if (main)
+ (void) device_add_udev_wants(u, dev);
+ }
+
+ (void) device_update_description(u, dev, path);
+
+ /* So the user wants the mount units to be bound to the device but a mount unit might has been seen
+ * by systemd before the device appears on its radar. In this case the device unit is partially
+ * initialized and includes the deps on the mount unit but at that time the "bind mounts" flag wasn't
+ * present. Fix this up now. */
+ if (dev && device_is_bound_by_mounts(DEVICE(u), dev))
+ device_upgrade_mount_deps(u);
+
+ if (units) {
+ r = set_ensure_put(units, NULL, DEVICE(u));
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to store unit: %m");
+ }
+
+ /* Success: hand ownership of a freshly allocated unit over to the manager. */
+ TAKE_PTR(new_unit);
+ return 0;
+}
+
+ /* A device counts as "ready" iff it is not being removed, not renaming, is
+  * currently tagged "systemd", and its SYSTEMD_READY property is not false.
+  * Failures of the individual checks are logged and treated pessimistically
+  * (renaming check) or optimistically (SYSTEMD_READY check), as coded below. */
+static bool device_is_ready(sd_device *dev) {
+ int r;
+
+ assert(dev);
+
+ if (device_for_action(dev, SD_DEVICE_REMOVE))
+ return false;
+
+ r = device_is_renaming(dev);
+ if (r < 0)
+ log_device_warning_errno(dev, r, "Failed to check if device is renaming, assuming device is not renaming: %m");
+ if (r > 0) {
+ log_device_debug(dev, "Device busy: device is renaming");
+ return false;
+ }
+
+ /* Is it really tagged as 'systemd' right now? */
+ r = sd_device_has_current_tag(dev, "systemd");
+ if (r < 0)
+ log_device_warning_errno(dev, r, "Failed to check if device has \"systemd\" tag, assuming device is not tagged with \"systemd\": %m");
+ if (r == 0)
+ log_device_debug(dev, "Device busy: device is not tagged with \"systemd\"");
+ if (r <= 0)
+ return false;
+
+ r = device_get_property_bool(dev, "SYSTEMD_READY");
+ if (r < 0 && r != -ENOENT)
+ log_device_warning_errno(dev, r, "Failed to get device SYSTEMD_READY property, assuming device does not have \"SYSTEMD_READY\" property: %m");
+ if (r == 0)
+ log_device_debug(dev, "Device busy: SYSTEMD_READY property from device is false");
+
+ /* Note: r < 0 (including unset property) counts as ready here. */
+ return r != 0;
+}
+
+ /* Set up (or classify) the unit for a single devlink: if the devlink resolves
+  * to a ready device, create/update the unit and store it in ready_units;
+  * otherwise, if a corresponding .device unit already exists, store it in
+  * not_ready_units. */
+static int device_setup_devlink_unit_one(Manager *m, const char *devlink, Set **ready_units, Set **not_ready_units) {
+ _cleanup_(sd_device_unrefp) sd_device *dev = NULL;
+ Unit *u;
+
+ assert(m);
+ assert(devlink);
+ assert(ready_units);
+ assert(not_ready_units);
+
+ if (sd_device_new_from_devname(&dev, devlink) >= 0 && device_is_ready(dev))
+ return device_setup_unit(m, dev, devlink, /* main = */ false, ready_units);
+
+ /* the devlink is already removed or not ready */
+ if (device_by_path(m, devlink, &u) < 0)
+ return 0; /* The corresponding .device unit not found. That's fine. */
+
+ return set_ensure_put(not_ready_units, NULL, DEVICE(u));
+}
+
+ /* Set up the auxiliary units that follow the main device unit: one per
+  * devlink, one per SYSTEMD_ALIAS entry, plus a sweep over previously known
+  * same-sysfs units that no longer match, which get classified as not ready. */
+static int device_setup_extra_units(Manager *m, sd_device *dev, Set **ready_units, Set **not_ready_units) {
+ _cleanup_strv_free_ char **aliases = NULL;
+ const char *syspath, *devname = NULL;
+ Device *l;
+ int r;
+
+ assert(m);
+ assert(dev);
+ assert(ready_units);
+ assert(not_ready_units);
+
+ r = sd_device_get_syspath(dev, &syspath);
+ if (r < 0)
+ return r;
+
+ (void) sd_device_get_devname(dev, &devname);
+
+ /* devlink units */
+ FOREACH_DEVICE_DEVLINK(dev, devlink) {
+ /* These are a kind of special devlink. They should be always unique, but neither persistent
+ * nor predictable. Hence, let's refuse them. See also the comments for alias units below. */
+ if (PATH_STARTSWITH_SET(devlink, "/dev/block/", "/dev/char/"))
+ continue;
+
+ (void) device_setup_devlink_unit_one(m, devlink, ready_units, not_ready_units);
+ }
+
+ if (device_is_ready(dev)) {
+ const char *s;
+
+ r = sd_device_get_property_value(dev, "SYSTEMD_ALIAS", &s);
+ if (r < 0 && r != -ENOENT)
+ log_device_warning_errno(dev, r, "Failed to get SYSTEMD_ALIAS property, ignoring: %m");
+ if (r >= 0) {
+ r = strv_split_full(&aliases, s, NULL, EXTRACT_UNQUOTE);
+ if (r < 0)
+ log_device_warning_errno(dev, r, "Failed to parse SYSTEMD_ALIAS property, ignoring: %m");
+ }
+ }
+
+ /* alias units */
+ STRV_FOREACH(alias, aliases) {
+ if (!path_is_absolute(*alias)) {
+ log_device_warning(dev, "The alias \"%s\" specified in SYSTEMD_ALIAS is not an absolute path, ignoring.", *alias);
+ continue;
+ }
+
+ if (!path_is_safe(*alias)) {
+ log_device_warning(dev, "The alias \"%s\" specified in SYSTEMD_ALIAS is not safe, ignoring.", *alias);
+ continue;
+ }
+
+ /* Note, even if the devlink is not persistent, LVM expects /dev/block/ symlink units exist.
+ * To achieve that, they set the path to SYSTEMD_ALIAS. Hence, we cannot refuse aliases start
+ * with /dev/, unfortunately. */
+
+ (void) device_setup_unit(m, dev, *alias, /* main = */ false, ready_units);
+ }
+
+ /* Finally, sweep all units previously chained to this sysfs path that were
+ * not covered above; stale ones are marked not ready. */
+ l = hashmap_get(m->devices_by_sysfs, syspath);
+ LIST_FOREACH(same_sysfs, d, l) {
+ if (!d->path)
+ continue;
+
+ if (path_equal(d->path, syspath))
+ continue; /* This is the main unit. */
+
+ if (devname && path_equal(d->path, devname))
+ continue; /* This is the real device node. */
+
+ if (device_has_devlink(dev, d->path))
+ continue; /* The devlink was already processed in the above loop. */
+
+ if (strv_contains(aliases, d->path))
+ continue; /* This is already processed in the above, and ready. */
+
+ if (path_startswith(d->path, "/dev/"))
+ /* This is a devlink unit. Check existence and update syspath. */
+ (void) device_setup_devlink_unit_one(m, d->path, ready_units, not_ready_units);
+ else
+ /* This is an alias unit of dropped or not ready device. */
+ (void) set_ensure_put(not_ready_units, NULL, d);
+ }
+
+ return 0;
+}
+
+ /* Top-level unit setup for one uevent/enumerated device: handle the main
+  * (syspath) unit and the device-node unit, then delegate devlinks/aliases to
+  * device_setup_extra_units(). Units end up in exactly one of *ready_units or
+  * *not_ready_units (conflicts are resolved in favor of not-ready below). */
+static int device_setup_units(Manager *m, sd_device *dev, Set **ready_units, Set **not_ready_units) {
+ const char *syspath, *devname = NULL;
+ int r;
+
+ assert(m);
+ assert(dev);
+ assert(ready_units);
+ assert(not_ready_units);
+
+ r = sd_device_get_syspath(dev, &syspath);
+ if (r < 0)
+ return log_device_debug_errno(dev, r, "Couldn't get syspath from device, ignoring: %m");
+
+ /* First, process the main (that is, points to the syspath) and (real, not symlink) devnode units. */
+ if (device_for_action(dev, SD_DEVICE_REMOVE))
+ /* If the device is removed, the main and devnode units will be removed by
+ * device_update_found_by_sysfs() in device_dispatch_io(). Hence, it is not necessary to
+ * store them to not_ready_units, and we have nothing to do here.
+ *
+ * Note, still we need to process devlink units below, as a devlink previously points to this
+ * device may still exist and now point to another device node. That is, do not forget to
+ * call device_setup_extra_units(). */
+ ;
+ else if (device_is_ready(dev)) {
+ /* Add the main unit named after the syspath. If this one fails, don't bother with the rest,
+ * as this one shall be the main device unit the others just follow. (Compare with how
+ * device_following() is implemented, see below, which looks for the sysfs device.) */
+ r = device_setup_unit(m, dev, syspath, /* main = */ true, ready_units);
+ if (r < 0)
+ return r;
+
+ /* Add an additional unit for the device node */
+ if (sd_device_get_devname(dev, &devname) >= 0)
+ (void) device_setup_unit(m, dev, devname, /* main = */ false, ready_units);
+
+ } else {
+ Unit *u;
+
+ /* If the device exists but not ready, then save the units and unset udev bits later. */
+
+ if (device_by_path(m, syspath, &u) >= 0) {
+ r = set_ensure_put(not_ready_units, NULL, DEVICE(u));
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to store unit, ignoring: %m");
+ }
+
+ if (sd_device_get_devname(dev, &devname) >= 0 &&
+ device_by_path(m, devname, &u) >= 0) {
+ r = set_ensure_put(not_ready_units, NULL, DEVICE(u));
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to store unit, ignoring: %m");
+ }
+ }
+
+ /* Next, add/update additional .device units point to aliases and symlinks. */
+ (void) device_setup_extra_units(m, dev, ready_units, not_ready_units);
+
+ /* Safety check: no unit should be in ready_units and not_ready_units simultaneously. */
+ Unit *u;
+ SET_FOREACH(u, *not_ready_units)
+ if (set_remove(*ready_units, u))
+ log_unit_error(u, "Cannot activate and deactivate the unit simultaneously. Deactivating.");
+
+ return 0;
+}
+
+ /* Return the unit this one "follows": all units sharing a sysfs path follow
+  * the one whose id starts with "sys-", i.e. the unit named after the syspath
+  * itself. Returns NULL if this unit already is that one. */
+static Unit *device_following(Unit *u) {
+ Device *d = DEVICE(u);
+ Device *first = NULL;
+
+ assert(d);
+
+ if (startswith(u->id, "sys-"))
+ return NULL;
+
+ /* Make everybody follow the unit that's named after the sysfs path */
+ LIST_FOREACH(same_sysfs, other, d->same_sysfs_next)
+ if (startswith(UNIT(other)->id, "sys-"))
+ return UNIT(other);
+
+ LIST_FOREACH_BACKWARDS(same_sysfs, other, d->same_sysfs_prev) {
+ if (startswith(UNIT(other)->id, "sys-"))
+ return UNIT(other);
+
+ first = other;
+ }
+
+ /* No "sys-" unit in the chain: fall back to the list head (possibly NULL). */
+ return UNIT(first);
+}
+
+ /* Collect all other units chained to the same sysfs path into a freshly
+  * allocated set. Returns 0 with *_set = NULL when this unit is alone in the
+  * chain, 1 with the populated set otherwise, negative errno on OOM. */
+static int device_following_set(Unit *u, Set **_set) {
+ Device *d = DEVICE(u);
+ _cleanup_set_free_ Set *set = NULL;
+ int r;
+
+ assert(d);
+ assert(_set);
+
+ if (LIST_JUST_US(same_sysfs, d)) {
+ *_set = NULL;
+ return 0;
+ }
+
+ set = set_new(NULL);
+ if (!set)
+ return -ENOMEM;
+
+ LIST_FOREACH(same_sysfs, other, d->same_sysfs_next) {
+ r = set_put(set, other);
+ if (r < 0)
+ return r;
+ }
+
+ LIST_FOREACH_BACKWARDS(same_sysfs, other, d->same_sysfs_prev) {
+ r = set_put(set, other);
+ if (r < 0)
+ return r;
+ }
+
+ *_set = TAKE_PTR(set);
+ return 1;
+}
+
+ /* Release the manager-level device resources: the udev monitor and the
+  * sysfs-path → unit-chain hashmap. Also used as the failure path of
+  * device_enumerate(). */
+static void device_shutdown(Manager *m) {
+ assert(m);
+
+ m->device_monitor = sd_device_monitor_unref(m->device_monitor);
+ m->devices_by_sysfs = hashmap_free(m->devices_by_sysfs);
+}
+
+ /* Enumerate all devices tagged "systemd": lazily set up the udev monitor
+  * (filtered on that tag, delivering to device_dispatch_io()), then walk the
+  * current udev database and update the found-bits of every resulting unit.
+  * Any setup failure tears everything down again via device_shutdown(). */
+static void device_enumerate(Manager *m) {
+ _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL;
+ int r;
+
+ assert(m);
+
+ if (!m->device_monitor) {
+ r = sd_device_monitor_new(&m->device_monitor);
+ if (r < 0) {
+ log_error_errno(r, "Failed to allocate device monitor: %m");
+ goto fail;
+ }
+
+ r = sd_device_monitor_filter_add_match_tag(m->device_monitor, "systemd");
+ if (r < 0) {
+ log_error_errno(r, "Failed to add udev tag match: %m");
+ goto fail;
+ }
+
+ r = sd_device_monitor_attach_event(m->device_monitor, m->event);
+ if (r < 0) {
+ log_error_errno(r, "Failed to attach event to device monitor: %m");
+ goto fail;
+ }
+
+ r = sd_device_monitor_start(m->device_monitor, device_dispatch_io, m);
+ if (r < 0) {
+ log_error_errno(r, "Failed to start device monitor: %m");
+ goto fail;
+ }
+ }
+
+ r = sd_device_enumerator_new(&e);
+ if (r < 0) {
+ log_error_errno(r, "Failed to allocate device enumerator: %m");
+ goto fail;
+ }
+
+ r = sd_device_enumerator_add_match_tag(e, "systemd");
+ if (r < 0) {
+ log_error_errno(r, "Failed to set tag for device enumeration: %m");
+ goto fail;
+ }
+
+ FOREACH_DEVICE(e, dev) {
+ _cleanup_set_free_ Set *ready_units = NULL, *not_ready_units = NULL;
+ Device *d;
+
+ if (device_setup_units(m, dev, &ready_units, &not_ready_units) < 0)
+ continue;
+
+ /* Set/clear only the udev bit here; other found-bits are untouched. */
+ SET_FOREACH(d, ready_units)
+ device_update_found_one(d, DEVICE_FOUND_UDEV, DEVICE_FOUND_UDEV);
+ SET_FOREACH(d, not_ready_units)
+ device_update_found_one(d, DEVICE_NOT_FOUND, DEVICE_FOUND_UDEV);
+ }
+
+ return;
+
+fail:
+ device_shutdown(m);
+}
+
+ /* Enqueue reload propagation for an already-active device unit (dead units
+  * are skipped). Failures are logged and ignored. */
+static void device_propagate_reload(Manager *m, Device *d) {
+ int r;
+
+ assert(m);
+ assert(d);
+
+ if (d->state == DEVICE_DEAD)
+ return;
+
+ r = manager_propagate_reload(m, UNIT(d), JOB_REPLACE, NULL);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(d), r, "Failed to propagate reload, ignoring: %m");
+}
+
+ /* On a 'move' uevent, clear all found-bits for the units attached to the old
+  * syspath (reconstructed from the DEVPATH_OLD property). Best-effort: missing
+  * property or OOM just aborts quietly. */
+static void device_remove_old_on_move(Manager *m, sd_device *dev) {
+ _cleanup_free_ char *syspath_old = NULL;
+ const char *devpath_old;
+ int r;
+
+ assert(m);
+ assert(dev);
+
+ r = sd_device_get_property_value(dev, "DEVPATH_OLD", &devpath_old);
+ if (r < 0)
+ return (void) log_device_debug_errno(dev, r, "Failed to get DEVPATH_OLD= property on 'move' uevent, ignoring: %m");
+
+ syspath_old = path_join("/sys", devpath_old);
+ if (!syspath_old)
+ return (void) log_oom();
+
+ device_update_found_by_sysfs(m, syspath_old, DEVICE_NOT_FOUND, DEVICE_FOUND_MASK);
+}
+
+ /* sd-device monitor callback: process one incoming uevent. Classifies the
+  * affected units as ready/not-ready, notifies the swap logic about device
+  * add/remove, propagates reloads on 'change'-style events, and finally
+  * updates the DEVICE_FOUND_UDEV bit of every affected unit. Always returns 0
+  * so the monitor keeps running. */
+static int device_dispatch_io(sd_device_monitor *monitor, sd_device *dev, void *userdata) {
+ _cleanup_set_free_ Set *ready_units = NULL, *not_ready_units = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ sd_device_action_t action;
+ const char *sysfs;
+ bool ready;
+ Device *d;
+ int r;
+
+ assert(dev);
+
+ log_device_uevent(dev, "Processing udev action");
+
+ r = sd_device_get_syspath(dev, &sysfs);
+ if (r < 0) {
+ log_device_warning_errno(dev, r, "Failed to get device syspath, ignoring: %m");
+ return 0;
+ }
+
+ r = sd_device_get_action(dev, &action);
+ if (r < 0) {
+ log_device_warning_errno(dev, r, "Failed to get udev action, ignoring: %m");
+ return 0;
+ }
+
+ log_device_debug(dev, "Got '%s' action on syspath '%s'.", device_action_to_string(action), sysfs);
+
+ if (action == SD_DEVICE_MOVE)
+ device_remove_old_on_move(m, dev);
+
+ /* When udevd failed to process the device, SYSTEMD_ALIAS or any other properties may contain invalid
+ * values. Let's refuse to handle the uevent. */
+ if (sd_device_get_property_value(dev, "UDEV_WORKER_FAILED", NULL) >= 0) {
+ int v;
+
+ if (device_get_property_int(dev, "UDEV_WORKER_ERRNO", &v) >= 0)
+ log_device_warning_errno(dev, v, "systemd-udevd failed to process the device, ignoring: %m");
+ else if (device_get_property_int(dev, "UDEV_WORKER_EXIT_STATUS", &v) >= 0)
+ log_device_warning(dev, "systemd-udevd failed to process the device with exit status %i, ignoring.", v);
+ else if (device_get_property_int(dev, "UDEV_WORKER_SIGNAL", &v) >= 0) {
+ const char *s;
+ (void) sd_device_get_property_value(dev, "UDEV_WORKER_SIGNAL_NAME", &s);
+ log_device_warning(dev, "systemd-udevd failed to process the device with signal %i(%s), ignoring.", v, strna(s));
+ } else
+ log_device_warning(dev, "systemd-udevd failed to process the device with unknown result, ignoring.");
+
+ return 0;
+ }
+
+ /* A change event can signal that a device is becoming ready, in particular if the device is using
+ * the SYSTEMD_READY logic in udev so we need to reach the else block of the following if, even for
+ * change events */
+ ready = device_is_ready(dev);
+
+ (void) device_setup_units(m, dev, &ready_units, &not_ready_units);
+
+ if (action == SD_DEVICE_REMOVE) {
+ r = swap_process_device_remove(m, dev);
+ if (r < 0)
+ log_device_warning_errno(dev, r, "Failed to process swap device remove event, ignoring: %m");
+ } else if (ready) {
+ r = swap_process_device_new(m, dev);
+ if (r < 0)
+ log_device_warning_errno(dev, r, "Failed to process swap device new event, ignoring: %m");
+ }
+
+ /* 'change'/'bind'-style events on an already-set-up device propagate a reload. */
+ if (!IN_SET(action, SD_DEVICE_ADD, SD_DEVICE_REMOVE, SD_DEVICE_MOVE))
+ SET_FOREACH(d, ready_units)
+ device_propagate_reload(m, d);
+
+ if (!set_isempty(ready_units))
+ manager_dispatch_load_queue(m);
+
+ if (action == SD_DEVICE_REMOVE)
+ /* If we get notified that a device was removed by udev, then it's completely gone, hence
+ * unset all found bits. Note this affects all .device units still point to the removed
+ * device. */
+ device_update_found_by_sysfs(m, sysfs, DEVICE_NOT_FOUND, DEVICE_FOUND_MASK);
+
+ /* These devices are found and ready now, set the udev found bit. Note, this is also necessary to do
+ * on remove uevent, as some devlinks may be updated and now point to other device nodes. */
+ SET_FOREACH(d, ready_units)
+ device_update_found_one(d, DEVICE_FOUND_UDEV, DEVICE_FOUND_UDEV);
+
+ /* These devices may be nominally around, but not ready for us. Hence unset the udev bit, but leave
+ * the rest around. This may be redundant for remove uevent, but should be harmless. */
+ SET_FOREACH(d, not_ready_units)
+ device_update_found_one(d, DEVICE_NOT_FOUND, DEVICE_FOUND_UDEV);
+
+ return 0;
+}
+
+ /* External entry point (mount/swap code): report that a device node was seen
+  * (or lost) in /proc/swaps or /proc/self/mountinfo. 'mask' selects which
+  * found-bits of 'found' to apply; the DEVICE_FOUND_UDEV bit is reserved for
+  * the udev paths above and must not be in the mask. No-op without udev. */
+void device_found_node(Manager *m, const char *node, DeviceFound found, DeviceFound mask) {
+ int r;
+
+ assert(m);
+ assert(node);
+ assert(!FLAGS_SET(mask, DEVICE_FOUND_UDEV));
+
+ if (!udev_available())
+ return;
+
+ if (mask == 0)
+ return;
+
+ /* This is called whenever we find a device referenced in /proc/swaps or /proc/self/mounts. Such a device might
+ * be mounted/enabled at a time where udev has not finished probing it yet, and we thus haven't learned about
+ * it yet. In this case we will set the device unit to "tentative" state.
+ *
+ * This takes a pair of DeviceFound flags parameters. The 'mask' parameter is a bit mask that indicates which
+ * bits of 'found' to copy into the per-device DeviceFound flags field. Thus, this function may be used to set
+ * and unset individual bits in a single call, while merging partially with previous state. */
+
+ if ((found & mask) != 0) {
+ _cleanup_(sd_device_unrefp) sd_device *dev = NULL;
+
+ /* If the device is known in the kernel and newly appeared, then we'll create a device unit for it,
+ * under the name referenced in /proc/swaps or /proc/self/mountinfo. But first, let's validate if
+ * everything is alright with the device node. Note that we're fine with missing device nodes,
+ * but not with badly set up ones. */
+
+ r = sd_device_new_from_devname(&dev, node);
+ if (r == -ENODEV)
+ log_debug("Could not find device for %s, continuing without device node", node);
+ else if (r < 0) {
+ /* Reduce log noise from nodes which are not device nodes by skipping EINVAL. */
+ if (r != -EINVAL)
+ log_error_errno(r, "Failed to open %s device, ignoring: %m", node);
+ return;
+ }
+
+ (void) device_setup_unit(m, dev, node, /* main = */ false, NULL); /* 'dev' may be NULL. */
+ }
+
+ /* Update the device unit's state, should it exist */
+ (void) device_update_found_by_name(m, node, found, mask);
+}
+
+ /* True iff 'u' is a mount unit and this device carries the cached
+  * SYSTEMD_MOUNT_DEVICE_BOUND flag (see device_is_bound_by_mounts()). */
+bool device_shall_be_bound_by(Unit *device, Unit *u) {
+ assert(device);
+ assert(u);
+
+ if (u->type != UNIT_MOUNT)
+ return false;
+
+ return DEVICE(device)->bind_mounts;
+}
+
+ /* Unit type vtable for .device units, wiring the functions above into the
+  * generic unit machinery. Device units are only supported when udev is
+  * available (.supported = udev_available). */
+const UnitVTable device_vtable = {
+ .object_size = sizeof(Device),
+ .sections =
+ "Unit\0"
+ "Device\0"
+ "Install\0",
+
+ .gc_jobs = true,
+
+ .init = device_init,
+ .done = device_done,
+ .load = device_load,
+
+ .coldplug = device_coldplug,
+ .catchup = device_catchup,
+
+ .serialize = device_serialize,
+ .deserialize_item = device_deserialize_item,
+
+ .dump = device_dump,
+
+ .active_state = device_active_state,
+ .sub_state_to_string = device_sub_state_to_string,
+
+ .following = device_following,
+ .following_set = device_following_set,
+
+ .enumerate = device_enumerate,
+ .shutdown = device_shutdown,
+ .supported = udev_available,
+
+ .status_message_formats = {
+ .starting_stopping = {
+ [0] = "Expecting device %s...",
+ [1] = "Waiting for device %s to disappear...",
+ },
+ .finished_start_job = {
+ [JOB_DONE] = "Found device %s.",
+ [JOB_TIMEOUT] = "Timed out waiting for device %s.",
+ },
+ },
+};
diff --git a/src/core/device.h b/src/core/device.h
new file mode 100644
index 0000000..9dd6fb5
--- /dev/null
+++ b/src/core/device.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "unit.h"
+
+typedef struct Device Device;
+
+/* A mask specifying where we have seen the device currently. This is a bitmask because the device might show up
+ * asynchronously from each other at various places. For example, in very common case a device might already be mounted
+ * before udev finished probing it (think: a script setting up a loopback block device, formatting it and mounting it
+ * in quick succession). Hence we need to track precisely where it is already visible and where not. */
+typedef enum DeviceFound {
+ DEVICE_NOT_FOUND = 0,
+ DEVICE_FOUND_UDEV = 1 << 0, /* The device has shown up in the udev database */
+ DEVICE_FOUND_MOUNT = 1 << 1, /* The device has shown up in /proc/self/mountinfo */
+ DEVICE_FOUND_SWAP = 1 << 2, /* The device has shown up in /proc/swaps */
+ DEVICE_FOUND_MASK = DEVICE_FOUND_UDEV|DEVICE_FOUND_MOUNT|DEVICE_FOUND_SWAP,
+} DeviceFound;
+
+/* State of a .device unit, embedding the generic Unit as first member. */
+struct Device {
+ Unit meta;
+
+ /* 'deserialized_*' fields hold values restored across reload/reexec until coldplug. */
+ char *sysfs, *deserialized_sysfs;
+ char *path; /* syspath, device node, alias, or devlink */
+
+ /* In order to be able to distinguish dependencies on different device nodes we might end up creating multiple
+ * devices for the same sysfs path. We chain them up here. */
+ LIST_FIELDS(struct Device, same_sysfs);
+
+ DeviceState state, deserialized_state;
+ DeviceFound found, deserialized_found, enumerated_found;
+
+ /* Cached SYSTEMD_MOUNT_DEVICE_BOUND udev property (mounts get BindsTo= rather than Requires=). */
+ bool bind_mounts;
+
+ /* The SYSTEMD_WANTS udev property for this device the last time we saw it */
+ char **wants_property;
+};
+
+extern const UnitVTable device_vtable;
+
+void device_found_node(Manager *m, const char *node, DeviceFound found, DeviceFound mask);
+bool device_shall_be_bound_by(Unit *device, Unit *u);
+
+DEFINE_CAST(DEVICE, Device);
diff --git a/src/core/dynamic-user.c b/src/core/dynamic-user.c
new file mode 100644
index 0000000..12724c6
--- /dev/null
+++ b/src/core/dynamic-user.c
@@ -0,0 +1,871 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "clean-ipc.h"
+#include "dynamic-user.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "fs-util.h"
+#include "iovec-util.h"
+#include "lock-util.h"
+#include "nscd-flush.h"
+#include "parse-util.h"
+#include "random-util.h"
+#include "serialize.h"
+#include "socket-util.h"
+#include "stdio-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "uid-alloc-range.h"
+#include "user-util.h"
+
+/* Takes a value generated randomly or by hashing and turns it into a UID in the right range */
+#define UID_CLAMP_INTO_RANGE(rnd) (((uid_t) (rnd) % (DYNAMIC_UID_MAX - DYNAMIC_UID_MIN + 1)) + DYNAMIC_UID_MIN)
+
+DEFINE_TRIVIAL_REF_FUNC(DynamicUser, dynamic_user);
+
+DynamicUser* dynamic_user_free(DynamicUser *d) {
+ if (!d)
+ return NULL;
+
+ if (d->manager)
+ (void) hashmap_remove(d->manager->dynamic_users, d->name);
+
+ safe_close_pair(d->storage_socket);
+ return mfree(d);
+}
+
+static int dynamic_user_add(Manager *m, const char *name, int storage_socket[static 2], DynamicUser **ret) {
+ DynamicUser *d;
+ int r;
+
+ assert(m || ret);
+ assert(name);
+ assert(storage_socket);
+
+ if (m) { /* Might be called in sd-executor with no manager object */
+ r = hashmap_ensure_allocated(&m->dynamic_users, &string_hash_ops);
+ if (r < 0)
+ return r;
+ }
+
+ d = malloc0(offsetof(DynamicUser, name) + strlen(name) + 1);
+ if (!d)
+ return -ENOMEM;
+
+ strcpy(d->name, name);
+
+ d->storage_socket[0] = storage_socket[0];
+ d->storage_socket[1] = storage_socket[1];
+
+ if (m) { /* Might be called in sd-executor with no manager object */
+ r = hashmap_put(m->dynamic_users, d->name, d);
+ if (r < 0) {
+ free(d);
+ return r;
+ }
+ }
+
+ d->manager = m;
+
+ if (ret)
+ *ret = d;
+
+ return 0;
+}
+
+static int dynamic_user_acquire(Manager *m, const char *name, DynamicUser** ret) {
+ _cleanup_close_pair_ int storage_socket[2] = EBADF_PAIR;
+ DynamicUser *d;
+ int r;
+
+ assert(m);
+ assert(name);
+
+ /* Return the DynamicUser structure for a specific user name. Note that this won't actually allocate a UID for
+ * it, but just prepare the data structure for it. The UID is allocated only on demand, when it's really
+ * needed, and in the child process we fork off, since allocation involves NSS checks which are not OK to do
+ * from PID 1. To allow the children and PID 1 to share information about allocated UIDs we use an anonymous
+ * AF_UNIX/SOCK_DGRAM socket (called the "storage socket") that contains at most one datagram with the
+ * allocated UID number, plus an fd referencing the lock file for the UID
+ * (i.e. /run/systemd/dynamic-uid/$UID). Why involve the socket pair? So that PID 1 and all its children can
+ * share the same storage for the UID and lock fd, simply by inheriting the storage socket fds. The socket pair
+ * may exist in three different states:
+ *
+ * a) no datagram stored. This is the initial state. In this case the dynamic user was never realized.
+ *
+ * b) a datagram containing a UID stored, but no lock fd attached to it. In this case there was already a
+ * statically assigned UID by the same name, which we are reusing.
+ *
+ * c) a datagram containing a UID stored, and a lock fd is attached to it. In this case we allocated a dynamic
+ * UID and locked it in the file system, using the lock fd.
+ *
+ * As PID 1 and various children might access the socket pair simultaneously, and pop the datagram or push it
+ * back in any time, we also maintain a lock on the socket pair. Note one peculiarity regarding locking here:
+ * the UID lock on disk is protected via a BSD file lock (i.e. an fd-bound lock), so that the lock is kept in
+ * place as long as there's a reference to the fd open. The lock on the storage socket pair however is a POSIX
+ * file lock (i.e. a process-bound lock), as all users share the same fd of this (after all it is anonymous,
+ * nobody else could get any access to it except via our own fd) and we want to synchronize access between all
+ * processes that have access to it. */
+
+ d = hashmap_get(m->dynamic_users, name);
+ if (d) {
+ if (ret) {
+ /* We already have a structure for the dynamic user, let's increase the ref count and reuse it */
+ d->n_ref++;
+ *ret = d;
+ }
+ return 0;
+ }
+
+ if (!valid_user_group_name(name, VALID_USER_ALLOW_NUMERIC))
+ return -EINVAL;
+
+ if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, storage_socket) < 0)
+ return -errno;
+
+ r = dynamic_user_add(m, name, storage_socket, &d);
+ if (r < 0)
+ return r;
+
+ storage_socket[0] = storage_socket[1] = -EBADF;
+
+ if (ret) {
+ d->n_ref++;
+ *ret = d;
+ }
+
+ return 1;
+}
+
+static int make_uid_symlinks(uid_t uid, const char *name, bool b) {
+
+ char path1[STRLEN("/run/systemd/dynamic-uid/direct:") + DECIMAL_STR_MAX(uid_t) + 1];
+ const char *path2;
+ int r = 0, k;
+
+ /* Add direct additional symlinks for direct lookups of dynamic UIDs and their names by userspace code. The
+ * only reason we have this is because dbus-daemon cannot use D-Bus for resolving users and groups (since it
+ * would be its own client then). We hence keep these world-readable symlinks in place, so that the
+ * unprivileged dbus user can read the mappings when it needs them via these symlinks instead of having to go
+ * via the bus. Ideally, we'd use the lock files we keep for this anyway, but we can't since we use BSD locks
+ * on them and as those may be taken by any user with read access we can't make them world-readable. */
+
+ xsprintf(path1, "/run/systemd/dynamic-uid/direct:" UID_FMT, uid);
+ if (unlink(path1) < 0 && errno != ENOENT)
+ r = -errno;
+
+ if (b && symlink(name, path1) < 0) {
+ k = log_warning_errno(errno, "Failed to symlink \"%s\": %m", path1);
+ if (r == 0)
+ r = k;
+ }
+
+ path2 = strjoina("/run/systemd/dynamic-uid/direct:", name);
+ if (unlink(path2) < 0 && errno != ENOENT) {
+ k = -errno;
+ if (r == 0)
+ r = k;
+ }
+
+ if (b && symlink(path1 + STRLEN("/run/systemd/dynamic-uid/direct:"), path2) < 0) {
+ k = log_warning_errno(errno, "Failed to symlink \"%s\": %m", path2);
+ if (r == 0)
+ r = k;
+ }
+
+ return r;
+}
+
+static int pick_uid(char **suggested_paths, const char *name, uid_t *ret_uid) {
+
+ /* Find a suitable free UID. We use the following strategy to find a suitable UID:
+ *
+ * 1. Initially, we try to read the UID of a number of specified paths. If any of these UIDs works, we use
+ * them. We do so in order to increase the chance of UID reuse, if StateDirectory=, CacheDirectory= or
+ * LogsDirectory= are used, as reusing the UID these directories are owned by saves us from having to
+ * recursively chown() them to new users.
+ *
+ * 2. If that didn't yield a currently unused UID, we hash the user name, and try to use that. This should be
+ * pretty good, as the user is by default derived from the unit name, and hence the same service and same
+ * user should usually get the same UID as long as our hashing doesn't clash.
+ *
+ * 3. Finally, if that didn't work, we randomly pick UIDs, until we find one that is empty.
+ *
+ * Since the dynamic UID space is relatively small we'll stop trying after 100 iterations, giving up. */
+
+ enum {
+ PHASE_SUGGESTED, /* the first phase, reusing directory ownership UIDs */
+ PHASE_HASHED, /* the second phase, deriving a UID from the username by hashing */
+ PHASE_RANDOM, /* the last phase, randomly picking UIDs */
+ } phase = PHASE_SUGGESTED;
+
+ static const uint8_t hash_key[] = {
+ 0x37, 0x53, 0x7e, 0x31, 0xcf, 0xce, 0x48, 0xf5,
+ 0x8a, 0xbb, 0x39, 0x57, 0x8d, 0xd9, 0xec, 0x59
+ };
+
+ unsigned n_tries = 100, current_suggested = 0;
+ int r;
+
+ (void) mkdir("/run/systemd/dynamic-uid", 0755);
+
+ for (;;) {
+ char lock_path[STRLEN("/run/systemd/dynamic-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
+ _cleanup_close_ int lock_fd = -EBADF;
+ uid_t candidate;
+ ssize_t l;
+
+ if (--n_tries <= 0) /* Give up retrying eventually */
+ return -EBUSY;
+
+ switch (phase) {
+
+ case PHASE_SUGGESTED: {
+ struct stat st;
+
+ if (!suggested_paths || !suggested_paths[current_suggested]) {
+ /* We reached the end of the suggested paths list, let's try by hashing the name */
+ phase = PHASE_HASHED;
+ continue;
+ }
+
+ if (stat(suggested_paths[current_suggested++], &st) < 0)
+ continue; /* We can't read the UID of this path, but that doesn't matter, just try the next */
+
+ candidate = st.st_uid;
+ break;
+ }
+
+ case PHASE_HASHED:
+ /* A static user by this name does not exist yet. Let's find a free ID then, and use that. We
+ * start with a UID generated as hash from the user name. */
+ candidate = UID_CLAMP_INTO_RANGE(siphash24(name, strlen(name), hash_key));
+
+ /* If this one fails, we should proceed with random tries */
+ phase = PHASE_RANDOM;
+ break;
+
+ case PHASE_RANDOM:
+
+ /* Pick another random UID, and see if that works for us. */
+ random_bytes(&candidate, sizeof(candidate));
+ candidate = UID_CLAMP_INTO_RANGE(candidate);
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ /* Make sure whatever we picked here actually is in the right range */
+ if (!uid_is_dynamic(candidate))
+ continue;
+
+ xsprintf(lock_path, "/run/systemd/dynamic-uid/" UID_FMT, candidate);
+
+ for (;;) {
+ struct stat st;
+
+ lock_fd = open(lock_path, O_CREAT|O_RDWR|O_NOFOLLOW|O_CLOEXEC|O_NOCTTY, 0600);
+ if (lock_fd < 0)
+ return -errno;
+
+ r = flock(lock_fd, LOCK_EX|LOCK_NB); /* Try to get a BSD file lock on the UID lock file */
+ if (r < 0) {
+ if (IN_SET(errno, EBUSY, EAGAIN))
+ goto next; /* already in use */
+
+ return -errno;
+ }
+
+ if (fstat(lock_fd, &st) < 0)
+ return -errno;
+ if (st.st_nlink > 0)
+ break;
+
+ /* Oh, bummer, we got the lock, but the file was unlinked between the time we opened it and
+ * got the lock. Close it, and try again. */
+ lock_fd = safe_close(lock_fd);
+ }
+
+ /* Some superficial check whether this UID/GID might already be taken by some static user */
+ if (getpwuid(candidate) ||
+ getgrgid((gid_t) candidate) ||
+ search_ipc(candidate, (gid_t) candidate) != 0) {
+ (void) unlink(lock_path);
+ continue;
+ }
+
+ /* Let's store the user name in the lock file, so that we can use it for looking up the username for a UID */
+ l = pwritev(lock_fd,
+ (struct iovec[2]) {
+ IOVEC_MAKE_STRING(name),
+ IOVEC_MAKE((char[1]) { '\n' }, 1),
+ }, 2, 0);
+ if (l < 0) {
+ r = -errno;
+ (void) unlink(lock_path);
+ return r;
+ }
+
+ (void) ftruncate(lock_fd, l);
+ (void) make_uid_symlinks(candidate, name, true); /* also add direct lookup symlinks */
+
+ *ret_uid = candidate;
+ return TAKE_FD(lock_fd);
+
+ next:
+ ;
+ }
+}
+
+static int dynamic_user_pop(DynamicUser *d, uid_t *ret_uid, int *ret_lock_fd) {
+ uid_t uid = UID_INVALID;
+ struct iovec iov = IOVEC_MAKE(&uid, sizeof(uid));
+ int lock_fd;
+ ssize_t k;
+
+ assert(d);
+ assert(ret_uid);
+ assert(ret_lock_fd);
+
+ /* Read the UID and lock fd that is stored in the storage AF_UNIX socket. This should be called with
+ * the lock on the socket taken. */
+
+ k = receive_one_fd_iov(d->storage_socket[0], &iov, 1, MSG_DONTWAIT, &lock_fd);
+ if (k < 0)
+ return (int) k;
+
+ *ret_uid = uid;
+ *ret_lock_fd = lock_fd;
+
+ return 0;
+}
+
+static int dynamic_user_push(DynamicUser *d, uid_t uid, int lock_fd) {
+ struct iovec iov = IOVEC_MAKE(&uid, sizeof(uid));
+
+ assert(d);
+
+ /* Store the UID and lock_fd in the storage socket. This should be called with the socket pair lock taken. */
+ return send_one_fd_iov(d->storage_socket[1], lock_fd, &iov, 1, MSG_DONTWAIT);
+}
+
+static void unlink_uid_lock(int lock_fd, uid_t uid, const char *name) {
+ char lock_path[STRLEN("/run/systemd/dynamic-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
+
+ if (lock_fd < 0)
+ return;
+
+ xsprintf(lock_path, "/run/systemd/dynamic-uid/" UID_FMT, uid);
+ (void) unlink(lock_path);
+
+ (void) make_uid_symlinks(uid, name, false); /* remove direct lookup symlinks */
+}
+
+static int dynamic_user_realize(
+ DynamicUser *d,
+ char **suggested_dirs,
+ uid_t *ret_uid, gid_t *ret_gid,
+ bool is_user) {
+
+ _cleanup_close_ int uid_lock_fd = -EBADF;
+ _cleanup_close_ int etc_passwd_lock_fd = -EBADF;
+ uid_t num = UID_INVALID; /* a uid if is_user, and a gid otherwise */
+ gid_t gid = GID_INVALID; /* a gid if is_user, ignored otherwise */
+ bool flush_cache = false;
+ int r;
+
+ assert(d);
+ assert(is_user == !!ret_uid);
+ assert(ret_gid);
+
+ /* Acquire a UID for the user name. This will allocate a UID for the user name if the user doesn't exist
+ * yet. If it already exists its existing UID/GID will be reused. */
+
+ r = posix_lock(d->storage_socket[0], LOCK_EX);
+ if (r < 0)
+ return r;
+
+ CLEANUP_POSIX_UNLOCK(d->storage_socket[0]);
+
+ r = dynamic_user_pop(d, &num, &uid_lock_fd);
+ if (r < 0) {
+ int new_uid_lock_fd;
+ uid_t new_uid;
+
+ if (r != -EAGAIN)
+ return r;
+
+ /* OK, nothing stored yet, let's try to find something useful. While we are working on this release the
+ * lock however, so that nobody else blocks on our NSS lookups. */
+ r = posix_lock(d->storage_socket[0], LOCK_UN);
+ if (r < 0)
+ return r;
+
+ /* Let's see if a proper, static user or group by this name exists. Try to take the lock on
+ * /etc/passwd, if that fails with EROFS then /etc is read-only. In that case it's fine if we don't
+ * take the lock, given that users can't be added there anyway in this case. */
+ etc_passwd_lock_fd = take_etc_passwd_lock(NULL);
+ if (etc_passwd_lock_fd < 0 && etc_passwd_lock_fd != -EROFS)
+ return etc_passwd_lock_fd;
+
+ /* First, let's parse this as numeric UID */
+ r = parse_uid(d->name, &num);
+ if (r < 0) {
+ struct passwd *p;
+ struct group *g;
+
+ if (is_user) {
+ /* OK, this is not a numeric UID. Let's see if there's a user by this name */
+ p = getpwnam(d->name);
+ if (p) {
+ num = p->pw_uid;
+ gid = p->pw_gid;
+ } else {
+ /* if the user does not exist but the group with the same name exists, refuse operation */
+ g = getgrnam(d->name);
+ if (g)
+ return -EILSEQ;
+ }
+ } else {
+ /* Let's see if there's a group by this name */
+ g = getgrnam(d->name);
+ if (g)
+ num = (uid_t) g->gr_gid;
+ else {
+ /* if the group does not exist but the user with the same name exists, refuse operation */
+ p = getpwnam(d->name);
+ if (p)
+ return -EILSEQ;
+ }
+ }
+ }
+
+ if (num == UID_INVALID) {
+ /* No static UID assigned yet, excellent. Let's pick a new dynamic one, and lock it. */
+
+ uid_lock_fd = pick_uid(suggested_dirs, d->name, &num);
+ if (uid_lock_fd < 0)
+ return uid_lock_fd;
+ }
+
+ /* So, we found a working UID/lock combination. Let's see if we actually still need it. */
+ r = posix_lock(d->storage_socket[0], LOCK_EX);
+ if (r < 0) {
+ unlink_uid_lock(uid_lock_fd, num, d->name);
+ return r;
+ }
+
+ r = dynamic_user_pop(d, &new_uid, &new_uid_lock_fd);
+ if (r < 0) {
+ if (r != -EAGAIN) {
+ /* OK, something bad happened, let's get rid of the bits we acquired. */
+ unlink_uid_lock(uid_lock_fd, num, d->name);
+ return r;
+ }
+
+ /* Great! Nothing is stored here, still. Store our newly acquired data. */
+ flush_cache = true;
+ } else {
+ /* Hmm, so as it appears there's now something stored in the storage socket. Throw away what we
+ * acquired, and use what's stored now. */
+
+ unlink_uid_lock(uid_lock_fd, num, d->name);
+ safe_close(uid_lock_fd);
+
+ num = new_uid;
+ uid_lock_fd = new_uid_lock_fd;
+ }
+ } else if (is_user && !uid_is_dynamic(num)) {
+ struct passwd *p;
+
+ /* Statically allocated user may have different uid and gid. So, let's obtain the gid. */
+ errno = 0;
+ p = getpwuid(num);
+ if (!p)
+ return errno_or_else(ESRCH);
+
+ gid = p->pw_gid;
+ }
+
+ /* If the UID/GID was already allocated dynamically, push the data we popped out back in. If it was already
+ * allocated statically, push the UID back too, but do not push the lock fd in. If we allocated the UID
+ * dynamically right here, push that in along with the lock fd for it. */
+ r = dynamic_user_push(d, num, uid_lock_fd);
+ if (r < 0)
+ return r;
+
+ if (flush_cache) {
+ /* If we allocated a new dynamic UID, refresh nscd, so that it forgets about potentially cached
+ * negative entries. But let's do so after we release the /etc/passwd lock, so that there's no
+ * potential for nscd wanting to lock that for completing the invalidation. */
+ etc_passwd_lock_fd = safe_close(etc_passwd_lock_fd);
+ (void) nscd_flush_cache(STRV_MAKE("passwd", "group"));
+ }
+
+ if (is_user) {
+ *ret_uid = num;
+ *ret_gid = gid != GID_INVALID ? gid : num;
+ } else
+ *ret_gid = num;
+
+ return 0;
+}
+
+int dynamic_user_current(DynamicUser *d, uid_t *ret) {
+ _cleanup_close_ int lock_fd = -EBADF;
+ uid_t uid;
+ int r;
+
+ assert(d);
+
+ /* Get the currently assigned UID for the user, if there's any. This simply pops the data from the
+ * storage socket, and pushes it back in right-away. */
+
+ r = posix_lock(d->storage_socket[0], LOCK_EX);
+ if (r < 0)
+ return r;
+
+ CLEANUP_POSIX_UNLOCK(d->storage_socket[0]);
+
+ r = dynamic_user_pop(d, &uid, &lock_fd);
+ if (r < 0)
+ return r;
+
+ r = dynamic_user_push(d, uid, lock_fd);
+ if (r < 0)
+ return r;
+
+ if (ret)
+ *ret = uid;
+
+ return 0;
+}
+
+static DynamicUser* dynamic_user_unref(DynamicUser *d) {
+ if (!d)
+ return NULL;
+
+ /* Note that this doesn't actually release any resources itself. If a dynamic user should be fully
+ * destroyed and its UID released, use dynamic_user_destroy() instead. NB: the dynamic user table may
+ * contain entries with no references, which is commonly the case right before a daemon reload. */
+
+ assert(d->n_ref > 0);
+ d->n_ref--;
+
+ return NULL;
+}
+
+static int dynamic_user_close(DynamicUser *d) {
+ _cleanup_close_ int lock_fd = -EBADF;
+ uid_t uid;
+ int r;
+
+ /* Release the user ID, by releasing the lock on it, and emptying the storage socket. After this the
+ * user is unrealized again, much like it was after the DynamicUser object was first allocated. */
+
+ r = posix_lock(d->storage_socket[0], LOCK_EX);
+ if (r < 0)
+ return r;
+
+ CLEANUP_POSIX_UNLOCK(d->storage_socket[0]);
+
+ r = dynamic_user_pop(d, &uid, &lock_fd);
+ if (r == -EAGAIN)
+ /* User wasn't realized yet, nothing to do. */
+ return 0;
+ if (r < 0)
+ return r;
+
+ /* This dynamic user was realized and dynamically allocated. In this case, let's remove the lock file. */
+ unlink_uid_lock(lock_fd, uid, d->name);
+
+ (void) nscd_flush_cache(STRV_MAKE("passwd", "group"));
+ return 1;
+}
+
+static DynamicUser* dynamic_user_destroy(DynamicUser *d) {
+ if (!d)
+ return NULL;
+
+ /* Drop a reference to a DynamicUser object, and destroy the user completely if this was the last
+ * reference. This is called whenever a service is shut down and wants its dynamic UID gone. Note that
+ * dynamic_user_unref() is what is called whenever a service is simply freed, for example during a reload
+ * cycle, where the dynamic users should not be destroyed, but our datastructures should. */
+
+ dynamic_user_unref(d);
+
+ if (d->n_ref > 0)
+ return NULL;
+
+ (void) dynamic_user_close(d);
+ return dynamic_user_free(d);
+}
+
+int dynamic_user_serialize_one(DynamicUser *d, const char *key, FILE *f, FDSet *fds) {
+ int copy0, copy1;
+
+ assert(key);
+ assert(f);
+ assert(fds);
+
+ if (!d)
+ return 0;
+
+ if (d->storage_socket[0] < 0 || d->storage_socket[1] < 0)
+ return 0;
+
+ copy0 = fdset_put_dup(fds, d->storage_socket[0]);
+ if (copy0 < 0)
+ return log_error_errno(copy0, "Failed to add dynamic user storage fd to serialization: %m");
+
+ copy1 = fdset_put_dup(fds, d->storage_socket[1]);
+ if (copy1 < 0)
+ return log_error_errno(copy1, "Failed to add dynamic user storage fd to serialization: %m");
+
+ (void) serialize_item_format(f, key, "%s %i %i", d->name, copy0, copy1);
+
+ return 0;
+}
+
+int dynamic_user_serialize(Manager *m, FILE *f, FDSet *fds) {
+ DynamicUser *d;
+
+ assert(m);
+
+ /* Dump the dynamic user database into the manager serialization, to deal with daemon reloads. */
+
+ HASHMAP_FOREACH(d, m->dynamic_users)
+ (void) dynamic_user_serialize_one(d, "dynamic-user", f, fds);
+
+ return 0;
+}
+
+void dynamic_user_deserialize_one(Manager *m, const char *value, FDSet *fds, DynamicUser **ret) {
+ _cleanup_free_ char *name = NULL, *s0 = NULL, *s1 = NULL;
+ _cleanup_close_ int fd0 = -EBADF, fd1 = -EBADF;
+ int r;
+
+ assert(value);
+ assert(fds);
+
+ /* Parse the serialization again, after a daemon reload */
+
+ r = extract_many_words(&value, NULL, 0, &name, &s0, &s1, NULL);
+ if (r != 3 || !isempty(value)) {
+ log_debug("Unable to parse dynamic user line.");
+ return;
+ }
+
+ fd0 = deserialize_fd(fds, s0);
+ if (fd0 < 0)
+ return;
+
+ fd1 = deserialize_fd(fds, s1);
+ if (fd1 < 0)
+ return;
+
+ r = dynamic_user_add(m, name, (int[]) { fd0, fd1 }, ret);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add dynamic user: %m");
+ return;
+ }
+
+ TAKE_FD(fd0);
+ TAKE_FD(fd1);
+
+ if (ret) /* If the caller uses it directly, increment the refcount */
+ (*ret)->n_ref++;
+}
+
+void dynamic_user_vacuum(Manager *m, bool close_user) {
+ DynamicUser *d;
+
+ assert(m);
+
+ /* Empty the dynamic user database, optionally cleaning up orphaned dynamic users, i.e. destroy and free users
+ * to which no references exist. This is called after a daemon reload finished, in order to destroy users which
+ * might not be referenced anymore. */
+
+ HASHMAP_FOREACH(d, m->dynamic_users) {
+ if (d->n_ref > 0)
+ continue;
+
+ if (close_user) {
+ log_debug("Removing orphaned dynamic user %s", d->name);
+ (void) dynamic_user_close(d);
+ }
+
+ dynamic_user_free(d);
+ }
+}
+
+int dynamic_user_lookup_uid(Manager *m, uid_t uid, char **ret) {
+ char lock_path[STRLEN("/run/systemd/dynamic-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
+ _cleanup_free_ char *user = NULL;
+ uid_t check_uid;
+ int r;
+
+ assert(m);
+ assert(ret);
+
+ /* A friendly way to translate a dynamic user's UID into a name. */
+ if (!uid_is_dynamic(uid))
+ return -ESRCH;
+
+ xsprintf(lock_path, "/run/systemd/dynamic-uid/" UID_FMT, uid);
+ r = read_one_line_file(lock_path, &user);
+ if (IN_SET(r, -ENOENT, 0))
+ return -ESRCH;
+ if (r < 0)
+ return r;
+
+ /* The lock file might be stale, hence let's verify the data before we return it */
+ r = dynamic_user_lookup_name(m, user, &check_uid);
+ if (r < 0)
+ return r;
+ if (check_uid != uid) /* lock file doesn't match our own idea */
+ return -ESRCH;
+
+ *ret = TAKE_PTR(user);
+
+ return 0;
+}
+
+int dynamic_user_lookup_name(Manager *m, const char *name, uid_t *ret) {
+ DynamicUser *d;
+ int r;
+
+ assert(m);
+ assert(name);
+
+ /* A friendly call for translating a dynamic user's name into its UID */
+
+ d = hashmap_get(m->dynamic_users, name);
+ if (!d)
+ return -ESRCH;
+
+ r = dynamic_user_current(d, ret);
+ if (r == -EAGAIN) /* not realized yet? */
+ return -ESRCH;
+
+ return r;
+}
+
+int dynamic_creds_make(Manager *m, const char *user, const char *group, DynamicCreds **ret) {
+ _cleanup_(dynamic_creds_unrefp) DynamicCreds *creds = NULL;
+ bool acquired = false;
+ int r;
+
+ assert(m);
+ assert(ret);
+
+ if (!user && !group) {
+ *ret = NULL;
+ return 0;
+ }
+
+ creds = new0(DynamicCreds, 1);
+ if (!creds)
+ return -ENOMEM;
+
+ /* A DynamicUser object encapsulates an allocation of both a UID and a GID for a specific name. However, some
+ * services use different user and groups. For cases like that there's DynamicCreds containing a pair of user
+ * and group. This call allocates a pair. */
+
+ if (user) {
+ r = dynamic_user_acquire(m, user, &creds->user);
+ if (r < 0)
+ return r;
+
+ acquired = true;
+ }
+
+ if (creds->user && (!group || streq_ptr(user, group)))
+ creds->group = dynamic_user_ref(creds->user);
+ else if (group) {
+ r = dynamic_user_acquire(m, group, &creds->group);
+ if (r < 0) {
+ if (acquired)
+ creds->user = dynamic_user_unref(creds->user);
+ return r;
+ }
+ }
+
+ *ret = TAKE_PTR(creds);
+
+ return 0;
+}
+
+int dynamic_creds_realize(DynamicCreds *creds, char **suggested_paths, uid_t *uid, gid_t *gid) {
+ uid_t u = UID_INVALID;
+ gid_t g = GID_INVALID;
+ int r;
+
+ assert(creds);
+ assert(uid);
+ assert(gid);
+
+ /* Realize both the referenced user and group */
+
+ if (creds->user) {
+ r = dynamic_user_realize(creds->user, suggested_paths, &u, &g, true);
+ if (r < 0)
+ return r;
+ }
+
+ if (creds->group && creds->group != creds->user) {
+ r = dynamic_user_realize(creds->group, suggested_paths, NULL, &g, false);
+ if (r < 0)
+ return r;
+ }
+
+ *uid = u;
+ *gid = g;
+ return 0;
+}
+
+DynamicCreds* dynamic_creds_unref(DynamicCreds *creds) {
+ if (!creds)
+ return NULL;
+
+ creds->user = dynamic_user_unref(creds->user);
+ creds->group = dynamic_user_unref(creds->group);
+
+ return mfree(creds);
+}
+
+DynamicCreds* dynamic_creds_destroy(DynamicCreds *creds) {
+ if (!creds)
+ return NULL;
+
+ creds->user = dynamic_user_destroy(creds->user);
+ creds->group = dynamic_user_destroy(creds->group);
+
+ return mfree(creds);
+}
+
+void dynamic_creds_done(DynamicCreds *creds) {
+ if (!creds)
+ return;
+
+ if (creds->group != creds->user)
+ dynamic_user_free(creds->group);
+ creds->group = creds->user = dynamic_user_free(creds->user);
+}
+
+void dynamic_creds_close(DynamicCreds *creds) {
+ if (!creds)
+ return;
+
+ if (creds->user)
+ safe_close_pair(creds->user->storage_socket);
+
+ if (creds->group && creds->group != creds->user)
+ safe_close_pair(creds->group->storage_socket);
+}
diff --git a/src/core/dynamic-user.h b/src/core/dynamic-user.h
new file mode 100644
index 0000000..303a7d0
--- /dev/null
+++ b/src/core/dynamic-user.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct DynamicUser DynamicUser;
+
+typedef struct DynamicCreds {
+ /* A combination of a dynamic user and group */
+ DynamicUser *user;
+ DynamicUser *group;
+} DynamicCreds;
+
+#include "manager.h"
+
+/* Note that this object always allocates a pair of user and group under the same name, even if one of them isn't
+ * used. This means, if you want to allocate a group and user pair, and they might have two different names, then you
+ * need to allocate two of these objects. DynamicCreds below makes that easy. */
+struct DynamicUser {
+ Manager *manager;
+ unsigned n_ref;
+
+ /* An AF_UNIX socket pair that contains a datagram containing both the numeric ID assigned, as well as a lock
+ * file fd locking the user ID we picked. */
+ int storage_socket[2];
+
+ char name[];
+};
+
+int dynamic_user_serialize(Manager *m, FILE *f, FDSet *fds);
+int dynamic_user_serialize_one(DynamicUser *d, const char *key, FILE *f, FDSet *fds);
+void dynamic_user_deserialize_one(Manager *m, const char *value, FDSet *fds, DynamicUser **ret);
+DynamicUser* dynamic_user_free(DynamicUser *d);
+void dynamic_user_vacuum(Manager *m, bool close_user);
+
+int dynamic_user_current(DynamicUser *d, uid_t *ret);
+int dynamic_user_lookup_uid(Manager *m, uid_t uid, char **ret);
+int dynamic_user_lookup_name(Manager *m, const char *name, uid_t *ret);
+
+int dynamic_creds_make(Manager *m, const char *user, const char *group, DynamicCreds **ret);
+int dynamic_creds_realize(DynamicCreds *creds, char **suggested_paths, uid_t *uid, gid_t *gid);
+
+DynamicCreds *dynamic_creds_unref(DynamicCreds *creds);
+DynamicCreds *dynamic_creds_destroy(DynamicCreds *creds);
+void dynamic_creds_done(DynamicCreds *creds);
+void dynamic_creds_close(DynamicCreds *creds);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(DynamicCreds*, dynamic_creds_unref);
+DEFINE_TRIVIAL_CLEANUP_FUNC(DynamicCreds*, dynamic_creds_destroy);
+
+DynamicUser *dynamic_user_ref(DynamicUser *user);
diff --git a/src/core/efi-random.c b/src/core/efi-random.c
new file mode 100644
index 0000000..dffde57
--- /dev/null
+++ b/src/core/efi-random.c
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fcntl.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "chattr-util.h"
+#include "efi-random.h"
+#include "efivars.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "random-util.h"
+#include "strv.h"
+
+void lock_down_efi_variables(void) {
+ _cleanup_close_ int fd = -EBADF;
+ int r;
+
+ fd = open(EFIVAR_PATH(EFI_LOADER_VARIABLE(LoaderSystemToken)), O_RDONLY|O_CLOEXEC);
+ if (fd < 0) {
+ if (errno != ENOENT)
+ log_warning_errno(errno, "Unable to open LoaderSystemToken EFI variable, ignoring: %m");
+ return;
+ }
+
+ /* Paranoia: let's restrict access modes of these a bit, so that unprivileged users can't use them to
+ * identify the system or gain too much insight into what we might have credited to the entropy
+ * pool. */
+ r = chattr_fd(fd, 0, FS_IMMUTABLE_FL, NULL);
+ if (r < 0)
+ log_warning_errno(r, "Failed to drop FS_IMMUTABLE_FL from LoaderSystemToken EFI variable, ignoring: %m");
+ if (fchmod(fd, 0600) < 0)
+ log_warning_errno(errno, "Failed to reduce access mode of LoaderSystemToken EFI variable, ignoring: %m");
+}
diff --git a/src/core/efi-random.h b/src/core/efi-random.h
new file mode 100644
index 0000000..87166c9
--- /dev/null
+++ b/src/core/efi-random.h
@@ -0,0 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+void lock_down_efi_variables(void);
diff --git a/src/core/emergency-action.c b/src/core/emergency-action.c
new file mode 100644
index 0000000..e2cd931
--- /dev/null
+++ b/src/core/emergency-action.c
@@ -0,0 +1,224 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/reboot.h>
+
+#include "bus-error.h"
+#include "bus-util.h"
+#include "emergency-action.h"
+#include "raw-reboot.h"
+#include "reboot-util.h"
+#include "special.h"
+#include "string-table.h"
+#include "terminal-util.h"
+#include "virt.h"
+
+static const char* const emergency_action_table[_EMERGENCY_ACTION_MAX] = {
+ [EMERGENCY_ACTION_NONE] = "none",
+ [EMERGENCY_ACTION_REBOOT] = "reboot",
+ [EMERGENCY_ACTION_REBOOT_FORCE] = "reboot-force",
+ [EMERGENCY_ACTION_REBOOT_IMMEDIATE] = "reboot-immediate",
+ [EMERGENCY_ACTION_POWEROFF] = "poweroff",
+ [EMERGENCY_ACTION_POWEROFF_FORCE] = "poweroff-force",
+ [EMERGENCY_ACTION_POWEROFF_IMMEDIATE] = "poweroff-immediate",
+ [EMERGENCY_ACTION_EXIT] = "exit",
+ [EMERGENCY_ACTION_EXIT_FORCE] = "exit-force",
+ [EMERGENCY_ACTION_SOFT_REBOOT] = "soft-reboot",
+ [EMERGENCY_ACTION_SOFT_REBOOT_FORCE] = "soft-reboot-force",
+ [EMERGENCY_ACTION_KEXEC] = "kexec",
+ [EMERGENCY_ACTION_KEXEC_FORCE] = "kexec-force",
+ [EMERGENCY_ACTION_HALT] = "halt",
+ [EMERGENCY_ACTION_HALT_FORCE] = "halt-force",
+ [EMERGENCY_ACTION_HALT_IMMEDIATE] = "halt-immediate",
+};
+
+static void log_and_status(Manager *m, bool warn, const char *message, const char *reason) {
+ log_full(warn ? LOG_WARNING : LOG_DEBUG, "%s: %s", message, reason);
+ if (warn)
+ manager_status_printf(m, STATUS_TYPE_EMERGENCY,
+ ANSI_HIGHLIGHT_RED " !! " ANSI_NORMAL,
+ "%s: %s", message, reason);
+}
+
+void emergency_action(
+ Manager *m,
+ EmergencyAction action,
+ EmergencyActionFlags options,
+ const char *reboot_arg,
+ int exit_status,
+ const char *reason) {
+
+ Unit *u;
+
+ assert(m);
+ assert(action >= 0);
+ assert(action < _EMERGENCY_ACTION_MAX);
+
+ /* Is the special shutdown target active or queued? If so, we are in shutdown state */
+ if (IN_SET(action,
+ EMERGENCY_ACTION_REBOOT,
+ EMERGENCY_ACTION_SOFT_REBOOT,
+ EMERGENCY_ACTION_POWEROFF,
+ EMERGENCY_ACTION_EXIT,
+ EMERGENCY_ACTION_KEXEC,
+ EMERGENCY_ACTION_HALT)) {
+ u = manager_get_unit(m, SPECIAL_SHUTDOWN_TARGET);
+ if (u && unit_active_or_pending(u)) {
+ log_notice("Shutdown is already active. Skipping emergency action request %s.",
+ emergency_action_table[action]);
+ return;
+ }
+ }
+
+ if (action == EMERGENCY_ACTION_NONE)
+ return;
+
+ if (FLAGS_SET(options, EMERGENCY_ACTION_IS_WATCHDOG) && !m->service_watchdogs) {
+ log_warning("Watchdog disabled! Not acting on: %s", reason);
+ return;
+ }
+
+ bool warn = FLAGS_SET(options, EMERGENCY_ACTION_WARN);
+
+ switch (action) {
+
+ case EMERGENCY_ACTION_REBOOT:
+ log_and_status(m, warn, "Rebooting", reason);
+
+ (void) update_reboot_parameter_and_warn(reboot_arg, true);
+ (void) manager_add_job_by_name_and_warn(m, JOB_START, SPECIAL_REBOOT_TARGET, JOB_REPLACE_IRREVERSIBLY, NULL, NULL);
+ break;
+
+ case EMERGENCY_ACTION_REBOOT_FORCE:
+ log_and_status(m, warn, "Forcibly rebooting", reason);
+
+ (void) update_reboot_parameter_and_warn(reboot_arg, true);
+ m->objective = MANAGER_REBOOT;
+ break;
+
+ case EMERGENCY_ACTION_REBOOT_IMMEDIATE:
+ log_and_status(m, warn, "Rebooting immediately", reason);
+
+ sync();
+
+ if (!isempty(reboot_arg)) {
+ log_info("Rebooting with argument '%s'.", reboot_arg);
+ (void) raw_reboot(LINUX_REBOOT_CMD_RESTART2, reboot_arg);
+ log_warning_errno(errno, "Failed to reboot with parameter, retrying without: %m");
+ }
+
+ log_info("Rebooting.");
+ (void) reboot(RB_AUTOBOOT);
+ break;
+
+ case EMERGENCY_ACTION_SOFT_REBOOT:
+ log_and_status(m, warn, "Soft-rebooting", reason);
+
+ (void) manager_add_job_by_name_and_warn(m, JOB_START, SPECIAL_SOFT_REBOOT_TARGET, JOB_REPLACE_IRREVERSIBLY, NULL, NULL);
+ break;
+
+ case EMERGENCY_ACTION_SOFT_REBOOT_FORCE:
+ log_and_status(m, warn, "Forcibly soft-rebooting", reason);
+
+ m->objective = MANAGER_SOFT_REBOOT;
+ break;
+
+ case EMERGENCY_ACTION_EXIT:
+
+ if (exit_status >= 0)
+ m->return_value = exit_status;
+
+ if (MANAGER_IS_USER(m) || detect_container() > 0) {
+ log_and_status(m, warn, "Exiting", reason);
+ (void) manager_add_job_by_name_and_warn(m, JOB_START, SPECIAL_EXIT_TARGET, JOB_REPLACE_IRREVERSIBLY, NULL, NULL);
+ break;
+ }
+
+ log_notice("Doing \"poweroff\" action instead of an \"exit\" emergency action.");
+ _fallthrough_;
+
+ case EMERGENCY_ACTION_POWEROFF:
+ log_and_status(m, warn, "Powering off", reason);
+ (void) manager_add_job_by_name_and_warn(m, JOB_START, SPECIAL_POWEROFF_TARGET, JOB_REPLACE_IRREVERSIBLY, NULL, NULL);
+ break;
+
+ case EMERGENCY_ACTION_EXIT_FORCE:
+
+ if (exit_status >= 0)
+ m->return_value = exit_status;
+
+ if (MANAGER_IS_USER(m) || detect_container() > 0) {
+ log_and_status(m, warn, "Exiting immediately", reason);
+ m->objective = MANAGER_EXIT;
+ break;
+ }
+
+ log_notice("Doing \"poweroff-force\" action instead of an \"exit-force\" emergency action.");
+ _fallthrough_;
+
+ case EMERGENCY_ACTION_POWEROFF_FORCE:
+ log_and_status(m, warn, "Forcibly powering off", reason);
+ m->objective = MANAGER_POWEROFF;
+ break;
+
+ case EMERGENCY_ACTION_POWEROFF_IMMEDIATE:
+ log_and_status(m, warn, "Powering off immediately", reason);
+
+ sync();
+
+ log_info("Powering off.");
+ (void) reboot(RB_POWER_OFF);
+ break;
+
+ case EMERGENCY_ACTION_KEXEC:
+ log_and_status(m, warn, "Executing kexec", reason);
+ (void) manager_add_job_by_name_and_warn(m, JOB_START, SPECIAL_KEXEC_TARGET, JOB_REPLACE_IRREVERSIBLY, NULL, NULL);
+ break;
+
+ case EMERGENCY_ACTION_KEXEC_FORCE:
+ log_and_status(m, warn, "Forcibly executing kexec", reason);
+ m->objective = MANAGER_KEXEC;
+ break;
+
+ case EMERGENCY_ACTION_HALT:
+ log_and_status(m, warn, "Halting", reason);
+ (void) manager_add_job_by_name_and_warn(m, JOB_START, SPECIAL_HALT_TARGET, JOB_REPLACE_IRREVERSIBLY, NULL, NULL);
+ break;
+
+ case EMERGENCY_ACTION_HALT_FORCE:
+ log_and_status(m, warn, "Forcibly halting", reason);
+ m->objective = MANAGER_HALT;
+ break;
+
+ case EMERGENCY_ACTION_HALT_IMMEDIATE:
+ log_and_status(m, warn, "Halting immediately", reason);
+
+ sync();
+
+ log_info("Halting.");
+ (void) reboot(RB_HALT_SYSTEM);
+ break;
+
+ default:
+ assert_not_reached();
+ }
+}
+
+DEFINE_STRING_TABLE_LOOKUP(emergency_action, EmergencyAction);
+
+int parse_emergency_action(
+ const char *value,
+ RuntimeScope runtime_scope,
+ EmergencyAction *ret) {
+
+ EmergencyAction x;
+
+ x = emergency_action_from_string(value);
+ if (x < 0)
+ return -EINVAL;
+
+ if (runtime_scope != RUNTIME_SCOPE_SYSTEM && x != EMERGENCY_ACTION_NONE && x < _EMERGENCY_ACTION_FIRST_USER_ACTION)
+ return -EOPNOTSUPP;
+
+ *ret = x;
+ return 0;
+}
diff --git a/src/core/emergency-action.h b/src/core/emergency-action.h
new file mode 100644
index 0000000..33e0ec6
--- /dev/null
+++ b/src/core/emergency-action.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <errno.h>
+
+#include "runtime-scope.h"
+
+typedef enum EmergencyAction {
+ EMERGENCY_ACTION_NONE,
+ EMERGENCY_ACTION_REBOOT,
+ EMERGENCY_ACTION_REBOOT_FORCE,
+ EMERGENCY_ACTION_REBOOT_IMMEDIATE,
+ EMERGENCY_ACTION_POWEROFF,
+ EMERGENCY_ACTION_POWEROFF_FORCE,
+ EMERGENCY_ACTION_POWEROFF_IMMEDIATE,
+ EMERGENCY_ACTION_EXIT,
+ _EMERGENCY_ACTION_FIRST_USER_ACTION = EMERGENCY_ACTION_EXIT,
+ EMERGENCY_ACTION_EXIT_FORCE,
+ EMERGENCY_ACTION_SOFT_REBOOT,
+ EMERGENCY_ACTION_SOFT_REBOOT_FORCE,
+ EMERGENCY_ACTION_KEXEC,
+ EMERGENCY_ACTION_KEXEC_FORCE,
+ EMERGENCY_ACTION_HALT,
+ EMERGENCY_ACTION_HALT_FORCE,
+ EMERGENCY_ACTION_HALT_IMMEDIATE,
+ _EMERGENCY_ACTION_MAX,
+ _EMERGENCY_ACTION_INVALID = -EINVAL,
+} EmergencyAction;
+
+typedef enum EmergencyActionFlags {
+ EMERGENCY_ACTION_IS_WATCHDOG = 1 << 0,
+ EMERGENCY_ACTION_WARN = 1 << 1,
+} EmergencyActionFlags;
+
+#include "macro.h"
+#include "manager.h"
+
+void emergency_action(Manager *m,
+ EmergencyAction action, EmergencyActionFlags options,
+ const char *reboot_arg, int exit_status, const char *reason);
+
+const char* emergency_action_to_string(EmergencyAction i) _const_;
+EmergencyAction emergency_action_from_string(const char *s) _pure_;
+
+int parse_emergency_action(const char *value, RuntimeScope runtime_scope, EmergencyAction *ret);
diff --git a/src/core/exec-credential.c b/src/core/exec-credential.c
new file mode 100644
index 0000000..6bcfb68
--- /dev/null
+++ b/src/core/exec-credential.c
@@ -0,0 +1,1023 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/mount.h>
+
+#include "acl-util.h"
+#include "creds-util.h"
+#include "exec-credential.h"
+#include "execute.h"
+#include "fileio.h"
+#include "glob-util.h"
+#include "io-util.h"
+#include "label-util.h"
+#include "mkdir-label.h"
+#include "mount-util.h"
+#include "mount.h"
+#include "mountpoint-util.h"
+#include "process-util.h"
+#include "random-util.h"
+#include "recurse-dir.h"
+#include "rm-rf.h"
+#include "tmpfile-util.h"
+
+ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
+ if (!sc)
+ return NULL;
+
+ free(sc->id);
+ free(sc->data);
+ return mfree(sc);
+}
+
+ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
+ if (!lc)
+ return NULL;
+
+ free(lc->id);
+ free(lc->path);
+ return mfree(lc);
+}
+
+DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(
+ exec_set_credential_hash_ops,
+ char, string_hash_func, string_compare_func,
+ ExecSetCredential, exec_set_credential_free);
+
+DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(
+ exec_load_credential_hash_ops,
+ char, string_hash_func, string_compare_func,
+ ExecLoadCredential, exec_load_credential_free);
+
+bool exec_context_has_credentials(const ExecContext *c) {
+ assert(c);
+
+ return !hashmap_isempty(c->set_credentials) ||
+ !hashmap_isempty(c->load_credentials) ||
+ !set_isempty(c->import_credentials);
+}
+
+bool exec_context_has_encrypted_credentials(ExecContext *c) {
+ ExecLoadCredential *load_cred;
+ ExecSetCredential *set_cred;
+
+ assert(c);
+
+ HASHMAP_FOREACH(load_cred, c->load_credentials)
+ if (load_cred->encrypted)
+ return true;
+
+ HASHMAP_FOREACH(set_cred, c->set_credentials)
+ if (set_cred->encrypted)
+ return true;
+
+ return false;
+}
+
+static int get_credential_directory(
+ const char *runtime_prefix,
+ const char *unit,
+ char **ret) {
+
+ char *p;
+
+ assert(ret);
+
+ if (!runtime_prefix || !unit) {
+ *ret = NULL;
+ return 0;
+ }
+
+ p = path_join(runtime_prefix, "credentials", unit);
+ if (!p)
+ return -ENOMEM;
+
+ *ret = p;
+ return 1;
+}
+
+int exec_context_get_credential_directory(
+ const ExecContext *context,
+ const ExecParameters *params,
+ const char *unit,
+ char **ret) {
+
+ assert(context);
+ assert(params);
+ assert(unit);
+ assert(ret);
+
+ if (!exec_context_has_credentials(context)) {
+ *ret = NULL;
+ return 0;
+ }
+
+ return get_credential_directory(params->prefix[EXEC_DIRECTORY_RUNTIME], unit, ret);
+}
+
+int unit_add_default_credential_dependencies(Unit *u, const ExecContext *c) {
+ _cleanup_free_ char *p = NULL, *m = NULL;
+ int r;
+
+ assert(u);
+ assert(c);
+
+ if (!exec_context_has_credentials(c))
+ return 0;
+
+ /* Let's make sure the credentials directory of this service is unmounted *after* the service itself
+ * shuts down. This only matters if mount namespacing is not used for the service, and hence the
+ * credentials mount appears on the host. */
+
+ r = get_credential_directory(u->manager->prefix[EXEC_DIRECTORY_RUNTIME], u->id, &p);
+ if (r <= 0)
+ return r;
+
+ r = unit_name_from_path(p, ".mount", &m);
+ if (r < 0)
+ return r;
+
+ return unit_add_dependency_by_name(u, UNIT_AFTER, m, /* add_reference= */ true, UNIT_DEPENDENCY_FILE);
+}
+
+int exec_context_destroy_credentials(Unit *u) {
+ _cleanup_free_ char *p = NULL;
+ int r;
+
+ assert(u);
+
+ r = get_credential_directory(u->manager->prefix[EXEC_DIRECTORY_RUNTIME], u->id, &p);
+ if (r <= 0)
+ return r;
+
+ /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
+ * unmount it, and afterwards remove the mount point */
+ if (umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW) >= 0)
+ (void) mount_invalidate_state_by_path(u->manager, p);
+
+ (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
+
+ return 0;
+}
+
+static int write_credential(
+ int dfd,
+ const char *id,
+ const void *data,
+ size_t size,
+ uid_t uid,
+ gid_t gid,
+ bool ownership_ok) {
+
+ _cleanup_(unlink_and_freep) char *tmp = NULL;
+ _cleanup_close_ int fd = -EBADF;
+ int r;
+
+ r = tempfn_random_child("", "cred", &tmp);
+ if (r < 0)
+ return r;
+
+ fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
+ if (fd < 0) {
+ tmp = mfree(tmp);
+ return -errno;
+ }
+
+ r = loop_write(fd, data, size);
+ if (r < 0)
+ return r;
+
+ if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
+ return -errno;
+
+ if (uid_is_valid(uid) && uid != getuid()) {
+ r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
+ if (r < 0) {
+ if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
+ return r;
+
+ if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
+ * to express: that the user gets read access and nothing
+ * else. But if the backing fs can't support that (e.g. ramfs)
+ * then we can use file ownership instead. But that's only safe if
+ * we can then re-mount the whole thing read-only, so that the
+ * user can no longer chmod() the file to gain write access. */
+ return r;
+
+ if (fchown(fd, uid, gid) < 0)
+ return -errno;
+ }
+ }
+
+ if (renameat(dfd, tmp, dfd, id) < 0)
+ return -errno;
+
+ tmp = mfree(tmp);
+ return 0;
+}
+
+typedef enum CredentialSearchPath {
+ CREDENTIAL_SEARCH_PATH_TRUSTED,
+ CREDENTIAL_SEARCH_PATH_ENCRYPTED,
+ CREDENTIAL_SEARCH_PATH_ALL,
+ _CREDENTIAL_SEARCH_PATH_MAX,
+ _CREDENTIAL_SEARCH_PATH_INVALID = -EINVAL,
+} CredentialSearchPath;
+
+static char **credential_search_path(const ExecParameters *params, CredentialSearchPath path) {
+
+ _cleanup_strv_free_ char **l = NULL;
+
+ assert(params);
+ assert(path >= 0 && path < _CREDENTIAL_SEARCH_PATH_MAX);
+
+        /* Assemble a search path to find credentials in. For non-encrypted credentials, we'll look in
+ * /etc/credstore/ (and similar directories in /usr/lib/ + /run/). If we're looking for encrypted
+ * credentials, we'll look in /etc/credstore.encrypted/ (and similar dirs). */
+
+ if (IN_SET(path, CREDENTIAL_SEARCH_PATH_ENCRYPTED, CREDENTIAL_SEARCH_PATH_ALL)) {
+ if (strv_extend(&l, params->received_encrypted_credentials_directory) < 0)
+ return NULL;
+
+ if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore.encrypted"), /* filter_duplicates= */ true) < 0)
+ return NULL;
+ }
+
+ if (IN_SET(path, CREDENTIAL_SEARCH_PATH_TRUSTED, CREDENTIAL_SEARCH_PATH_ALL)) {
+ if (params->received_credentials_directory)
+ if (strv_extend(&l, params->received_credentials_directory) < 0)
+ return NULL;
+
+ if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0)
+ return NULL;
+ }
+
+ if (DEBUG_LOGGING) {
+ _cleanup_free_ char *t = strv_join(l, ":");
+
+ log_debug("Credential search path is: %s", strempty(t));
+ }
+
+ return TAKE_PTR(l);
+}
+
+static int maybe_decrypt_and_write_credential(
+ int dir_fd,
+ const char *id,
+ bool encrypted,
+ uid_t uid,
+ gid_t gid,
+ bool ownership_ok,
+ const char *data,
+ size_t size,
+ uint64_t *left) {
+
+ _cleanup_free_ void *plaintext = NULL;
+ size_t add;
+ int r;
+
+ if (encrypted) {
+ size_t plaintext_size = 0;
+
+ r = decrypt_credential_and_warn(id, now(CLOCK_REALTIME), NULL, NULL, data, size,
+ &plaintext, &plaintext_size);
+ if (r < 0)
+ return r;
+
+ data = plaintext;
+ size = plaintext_size;
+ }
+
+ add = strlen(id) + size;
+ if (add > *left)
+ return -E2BIG;
+
+ r = write_credential(dir_fd, id, data, size, uid, gid, ownership_ok);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to write credential '%s': %m", id);
+
+ *left -= add;
+ return 0;
+}
+
+static int load_credential_glob(
+ const char *path,
+ bool encrypted,
+ char **search_path,
+ ReadFullFileFlags flags,
+ int write_dfd,
+ uid_t uid,
+ gid_t gid,
+ bool ownership_ok,
+ uint64_t *left) {
+
+ int r;
+
+ STRV_FOREACH(d, search_path) {
+ _cleanup_globfree_ glob_t pglob = {};
+ _cleanup_free_ char *j = NULL;
+
+ j = path_join(*d, path);
+ if (!j)
+ return -ENOMEM;
+
+ r = safe_glob(j, 0, &pglob);
+ if (r == -ENOENT)
+ continue;
+ if (r < 0)
+ return r;
+
+ for (size_t n = 0; n < pglob.gl_pathc; n++) {
+ _cleanup_free_ char *fn = NULL;
+ _cleanup_(erase_and_freep) char *data = NULL;
+ size_t size;
+
+ /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
+ r = read_full_file_full(
+ AT_FDCWD,
+ pglob.gl_pathv[n],
+ UINT64_MAX,
+ encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX,
+ flags,
+ NULL,
+ &data, &size);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to read credential '%s': %m",
+ pglob.gl_pathv[n]);
+
+ r = path_extract_filename(pglob.gl_pathv[n], &fn);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to extract filename from '%s': %m",
+ pglob.gl_pathv[n]);
+
+ r = maybe_decrypt_and_write_credential(
+ write_dfd,
+ fn,
+ encrypted,
+ uid,
+ gid,
+ ownership_ok,
+ data, size,
+ left);
+ if (r == -EEXIST)
+ continue;
+ if (r < 0)
+ return r;
+ }
+ }
+
+ return 0;
+}
+
+static int load_credential(
+ const ExecContext *context,
+ const ExecParameters *params,
+ const char *id,
+ const char *path,
+ bool encrypted,
+ const char *unit,
+ int read_dfd,
+ int write_dfd,
+ uid_t uid,
+ gid_t gid,
+ bool ownership_ok,
+ uint64_t *left) {
+
+ ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
+ _cleanup_strv_free_ char **search_path = NULL;
+ _cleanup_(erase_and_freep) char *data = NULL;
+ _cleanup_free_ char *bindname = NULL;
+ const char *source = NULL;
+ bool missing_ok = true;
+ size_t size, maxsz;
+ int r;
+
+ assert(context);
+ assert(params);
+ assert(id);
+ assert(path);
+ assert(unit);
+ assert(read_dfd >= 0 || read_dfd == AT_FDCWD);
+ assert(write_dfd >= 0);
+ assert(left);
+
+ if (read_dfd >= 0) {
+ /* If a directory fd is specified, then read the file directly from that dir. In this case we
+ * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX
+ * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to
+ * open it. */
+
+ if (!filename_is_valid(path)) /* safety check */
+ return -EINVAL;
+
+ missing_ok = true;
+ source = path;
+
+ } else if (path_is_absolute(path)) {
+ /* If this is an absolute path, read the data directly from it, and support AF_UNIX
+ * sockets */
+
+ if (!path_is_valid(path)) /* safety check */
+ return -EINVAL;
+
+ flags |= READ_FULL_FILE_CONNECT_SOCKET;
+
+ /* Pass some minimal info about the unit and the credential name we are looking to acquire
+ * via the source socket address in case we read off an AF_UNIX socket. */
+ if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, id) < 0)
+ return -ENOMEM;
+
+ missing_ok = false;
+ source = path;
+
+ } else if (credential_name_valid(path)) {
+ /* If this is a relative path, take it as credential name relative to the credentials
+ * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we
+ * are operating on a credential store, i.e. this is guaranteed to be regular files. */
+
+ search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_ALL);
+ if (!search_path)
+ return -ENOMEM;
+
+ missing_ok = true;
+ } else
+ source = NULL;
+
+ if (encrypted)
+ flags |= READ_FULL_FILE_UNBASE64;
+
+ maxsz = encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX;
+
+ if (search_path) {
+ STRV_FOREACH(d, search_path) {
+ _cleanup_free_ char *j = NULL;
+
+ j = path_join(*d, path);
+ if (!j)
+ return -ENOMEM;
+
+ r = read_full_file_full(
+ AT_FDCWD, j, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
+ UINT64_MAX,
+ maxsz,
+ flags,
+ NULL,
+ &data, &size);
+ if (r != -ENOENT)
+ break;
+ }
+ } else if (source)
+ r = read_full_file_full(
+ read_dfd, source,
+ UINT64_MAX,
+ maxsz,
+ flags,
+ bindname,
+ &data, &size);
+ else
+ r = -ENOENT;
+
+ if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, id))) {
+ /* Make a missing inherited credential non-fatal, let's just continue. After all apps
+ * will get clear errors if we don't pass such a missing credential on as they
+ * themselves will get ENOENT when trying to read them, which should not be much
+ * worse than when we handle the error here and make it fatal.
+ *
+ * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
+ * we are fine, too. */
+ log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", path);
+ return 0;
+ }
+ if (r < 0)
+ return log_debug_errno(r, "Failed to read credential '%s': %m", path);
+
+ return maybe_decrypt_and_write_credential(write_dfd, id, encrypted, uid, gid, ownership_ok, data, size, left);
+}
+
+struct load_cred_args {
+ const ExecContext *context;
+ const ExecParameters *params;
+ bool encrypted;
+ const char *unit;
+ int dfd;
+ uid_t uid;
+ gid_t gid;
+ bool ownership_ok;
+ uint64_t *left;
+};
+
+static int load_cred_recurse_dir_cb(
+ RecurseDirEvent event,
+ const char *path,
+ int dir_fd,
+ int inode_fd,
+ const struct dirent *de,
+ const struct statx *sx,
+ void *userdata) {
+
+ struct load_cred_args *args = ASSERT_PTR(userdata);
+ _cleanup_free_ char *sub_id = NULL;
+ int r;
+
+ if (event != RECURSE_DIR_ENTRY)
+ return RECURSE_DIR_CONTINUE;
+
+ if (!IN_SET(de->d_type, DT_REG, DT_SOCK))
+ return RECURSE_DIR_CONTINUE;
+
+ sub_id = strreplace(path, "/", "_");
+ if (!sub_id)
+ return -ENOMEM;
+
+ if (!credential_name_valid(sub_id))
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Credential would get ID %s, which is not valid, refusing", sub_id);
+
+ if (faccessat(args->dfd, sub_id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) {
+ log_debug("Skipping credential with duplicated ID %s at %s", sub_id, path);
+ return RECURSE_DIR_CONTINUE;
+ }
+ if (errno != ENOENT)
+ return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sub_id);
+
+ r = load_credential(
+ args->context,
+ args->params,
+ sub_id,
+ de->d_name,
+ args->encrypted,
+ args->unit,
+ dir_fd,
+ args->dfd,
+ args->uid,
+ args->gid,
+ args->ownership_ok,
+ args->left);
+ if (r < 0)
+ return r;
+
+ return RECURSE_DIR_CONTINUE;
+}
+
+static int acquire_credentials(
+ const ExecContext *context,
+ const ExecParameters *params,
+ const char *unit,
+ const char *p,
+ uid_t uid,
+ gid_t gid,
+ bool ownership_ok) {
+
+ uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
+ _cleanup_close_ int dfd = -EBADF;
+ const char *ic;
+ ExecLoadCredential *lc;
+ ExecSetCredential *sc;
+ int r;
+
+ assert(context);
+ assert(p);
+
+ dfd = open(p, O_DIRECTORY|O_CLOEXEC);
+ if (dfd < 0)
+ return -errno;
+
+ r = fd_acl_make_writable(dfd); /* Add the "w" bit, if we are reusing an already set up credentials dir where it was unset */
+ if (r < 0)
+ return r;
+
+ /* First, load credentials off disk (or acquire via AF_UNIX socket) */
+ HASHMAP_FOREACH(lc, context->load_credentials) {
+ _cleanup_close_ int sub_fd = -EBADF;
+
+ /* If this is an absolute path, then try to open it as a directory. If that works, then we'll
+ * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as
+ * a regular file. Finally, if it's a relative path we will use it as a credential name to
+ * propagate a credential passed to us from further up. */
+
+ if (path_is_absolute(lc->path)) {
+ sub_fd = open(lc->path, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
+ if (sub_fd < 0 && !IN_SET(errno,
+ ENOTDIR, /* Not a directory */
+ ENOENT)) /* Doesn't exist? */
+ return log_debug_errno(errno, "Failed to open '%s': %m", lc->path);
+ }
+
+ if (sub_fd < 0)
+ /* Regular file (incl. a credential passed in from higher up) */
+ r = load_credential(
+ context,
+ params,
+ lc->id,
+ lc->path,
+ lc->encrypted,
+ unit,
+ AT_FDCWD,
+ dfd,
+ uid,
+ gid,
+ ownership_ok,
+ &left);
+ else
+ /* Directory */
+ r = recurse_dir(
+ sub_fd,
+ /* path= */ lc->id, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
+ /* statx_mask= */ 0,
+ /* n_depth_max= */ UINT_MAX,
+ RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE,
+ load_cred_recurse_dir_cb,
+ &(struct load_cred_args) {
+ .context = context,
+ .params = params,
+ .encrypted = lc->encrypted,
+ .unit = unit,
+ .dfd = dfd,
+ .uid = uid,
+ .gid = gid,
+ .ownership_ok = ownership_ok,
+ .left = &left,
+ });
+ if (r < 0)
+ return r;
+ }
+
+ /* Next, look for system credentials and credentials in the credentials store. Note that these do not
+ * override any credentials found earlier. */
+ SET_FOREACH(ic, context->import_credentials) {
+ _cleanup_free_ char **search_path = NULL;
+
+ search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_TRUSTED);
+ if (!search_path)
+ return -ENOMEM;
+
+ r = load_credential_glob(
+ ic,
+ /* encrypted = */ false,
+ search_path,
+ READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER,
+ dfd,
+ uid,
+ gid,
+ ownership_ok,
+ &left);
+ if (r < 0)
+ return r;
+
+ search_path = strv_free(search_path);
+ search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_ENCRYPTED);
+ if (!search_path)
+ return -ENOMEM;
+
+ r = load_credential_glob(
+ ic,
+ /* encrypted = */ true,
+ search_path,
+ READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER|READ_FULL_FILE_UNBASE64,
+ dfd,
+ uid,
+ gid,
+ ownership_ok,
+ &left);
+ if (r < 0)
+ return r;
+ }
+
+ /* Finally, we add in literally specified credentials. If the credentials already exist, we'll not
+ * add them, so that they can act as a "default" if the same credential is specified multiple times. */
+ HASHMAP_FOREACH(sc, context->set_credentials) {
+ _cleanup_(erase_and_freep) void *plaintext = NULL;
+ const char *data;
+ size_t size, add;
+
+ /* Note that we check ahead of time here instead of relying on O_EXCL|O_CREAT later to return
+ * EEXIST if the credential already exists. That's because the TPM2-based decryption is kinda
+ * slow and involved, hence it's nice to be able to skip that if the credential already
+ * exists anyway. */
+ if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
+ continue;
+ if (errno != ENOENT)
+ return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
+
+ if (sc->encrypted) {
+ r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, NULL, sc->data, sc->size, &plaintext, &size);
+ if (r < 0)
+ return r;
+
+ data = plaintext;
+ } else {
+ data = sc->data;
+ size = sc->size;
+ }
+
+ add = strlen(sc->id) + size;
+ if (add > left)
+ return -E2BIG;
+
+ r = write_credential(dfd, sc->id, data, size, uid, gid, ownership_ok);
+ if (r < 0)
+ return r;
+
+ left -= add;
+ }
+
+ r = fd_acl_make_read_only(dfd); /* Now take away the "w" bit */
+ if (r < 0)
+ return r;
+
+ /* After we created all keys with the right perms, also make sure the credential store as a whole is
+ * accessible */
+
+ if (uid_is_valid(uid) && uid != getuid()) {
+ r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
+ if (r < 0) {
+ if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
+ return r;
+
+ if (!ownership_ok)
+ return r;
+
+ if (fchown(dfd, uid, gid) < 0)
+ return -errno;
+ }
+ }
+
+ return 0;
+}
+
+static int setup_credentials_internal(
+ const ExecContext *context,
+ const ExecParameters *params,
+ const char *unit,
+ const char *final, /* This is where the credential store shall eventually end up at */
+ const char *workspace, /* This is where we can prepare it before moving it to the final place */
+ bool reuse_workspace, /* Whether to reuse any existing workspace mount if it already is a mount */
+ bool must_mount, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
+ uid_t uid,
+ gid_t gid) {
+
+ int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
+ * if we mounted something; false if we definitely can't mount anything */
+ bool final_mounted;
+ const char *where;
+
+ assert(context);
+ assert(final);
+ assert(workspace);
+
+ if (reuse_workspace) {
+ r = path_is_mount_point(workspace, NULL, 0);
+ if (r < 0)
+ return r;
+ if (r > 0)
+ workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse
+ * it, let's keep this in mind */
+ else
+ workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
+ } else
+ workspace_mounted = -1; /* ditto */
+
+ r = path_is_mount_point(final, NULL, 0);
+ if (r < 0)
+ return r;
+ if (r > 0) {
+ /* If the final place already has something mounted, we use that. If the workspace also has
+ * something mounted we assume it's actually the same mount (but with MS_RDONLY
+ * different). */
+ final_mounted = true;
+
+ if (workspace_mounted < 0) {
+ /* If the final place is mounted, but the workspace isn't, then let's bind mount
+ * the final version to the workspace, and make it writable, so that we can make
+ * changes */
+
+ r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
+ if (r < 0)
+ return r;
+
+ r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
+ if (r < 0)
+ return r;
+
+ workspace_mounted = true;
+ }
+ } else
+ final_mounted = false;
+
+ if (workspace_mounted < 0) {
+ /* Nothing is mounted on the workspace yet, let's try to mount something now */
+
+ r = mount_credentials_fs(workspace, CREDENTIALS_TOTAL_SIZE_MAX, /* ro= */ false);
+ if (r < 0) {
+ /* If that didn't work, try to make a bind mount from the final to the workspace, so
+ * that we can make it writable there. */
+ r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
+ if (r < 0) {
+ if (!ERRNO_IS_PRIVILEGE(r))
+ /* Propagate anything that isn't a permission problem. */
+ return r;
+
+ if (must_mount)
+ /* If it's not OK to use the plain directory fallback, propagate all
+ * errors too. */
+ return r;
+
+ /* If we lack privileges to bind mount stuff, then let's gracefully proceed
+ * for compat with container envs, and just use the final dir as is. */
+
+ workspace_mounted = false;
+ } else {
+ /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
+ r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
+ if (r < 0)
+ return r;
+
+ workspace_mounted = true;
+ }
+ } else
+ workspace_mounted = true;
+ }
+
+ assert(!must_mount || workspace_mounted > 0);
+ where = workspace_mounted ? workspace : final;
+
+ (void) label_fix_full(AT_FDCWD, where, final, 0);
+
+ r = acquire_credentials(context, params, unit, where, uid, gid, workspace_mounted);
+ if (r < 0)
+ return r;
+
+ if (workspace_mounted) {
+ bool install;
+
+ /* Determine if we should actually install the prepared mount in the final location by bind
+ * mounting it there. We do so only if the mount is not established there already, and if the
+                 * mount is actually non-empty (i.e. carries at least one credential). Note that in the best
+ * case we are doing all this in a mount namespace, thus no one else will see that we
+ * allocated a file system we are getting rid of again here. */
+ if (final_mounted)
+ install = false; /* already installed */
+ else {
+ r = dir_is_empty(where, /* ignore_hidden_or_backup= */ false);
+ if (r < 0)
+ return r;
+
+ install = r == 0; /* install only if non-empty */
+ }
+
+ if (install) {
+ /* Make workspace read-only now, so that any bind mount we make from it defaults to
+ * read-only too */
+ r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ true), NULL);
+ if (r < 0)
+ return r;
+
+ /* And mount it to the final place, read-only */
+ r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
+ } else
+ /* Otherwise get rid of it */
+ r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
+ if (r < 0)
+ return r;
+ } else {
+ _cleanup_free_ char *parent = NULL;
+
+                /* If we do not have our own mount but used the plain directory fallback, then we need to
+ * open access to the top-level credential directory and the per-service directory now */
+
+ r = path_extract_directory(final, &parent);
+ if (r < 0)
+ return r;
+ if (chmod(parent, 0755) < 0)
+ return -errno;
+ }
+
+ return 0;
+}
+
/* Entry point for setting up the credentials directory of a unit: creates
 * $RUNTIME/credentials/<unit> and populates it via setup_credentials_internal(),
 * preferably from a short-lived forked child in a private mount namespace so that
 * half-initialized state is never visible. Falls back to a plain (non-namespaced)
 * directory when namespaces are unsupported or we lack privileges.
 * Returns 0 on success or if the unit has no credentials; negative errno on failure. */
+int exec_setup_credentials(
+ const ExecContext *context,
+ const ExecParameters *params,
+ const char *unit,
+ uid_t uid,
+ gid_t gid) {
+
+ _cleanup_free_ char *p = NULL, *q = NULL;
+ int r;
+
+ assert(context);
+ assert(params);
+
/* Nothing to do if neither SetCredential= nor LoadCredential= is configured. */
+ if (!exec_context_has_credentials(context))
+ return 0;
+
+ if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
+ return -EINVAL;
+
+ /* This is where we'll place stuff when we are done; this main credentials directory is world-readable,
+ * and the subdir we mount over with a read-only file system readable by the service's user */
+ q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
+ if (!q)
+ return -ENOMEM;
+
+ r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
+ if (r < 0 && r != -EEXIST)
+ return r;
+
+ p = path_join(q, unit);
+ if (!p)
+ return -ENOMEM;
+
+ r = mkdir_label(p, 0700); /* per-unit dir: private to user */
+ if (r < 0 && r != -EEXIST)
+ return r;
+
/* safe_fork() returns <0 on failure, 0 in the child, >0 in the parent. */
+ r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG_SIGTERM|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
+ if (r < 0) {
+ _cleanup_(rmdir_and_freep) char *u = NULL; /* remove the temporary workspace if we can */
+ _cleanup_free_ char *t = NULL;
+
+ /* If this is not a privilege or support issue then propagate the error */
+ if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
+ return r;
+
+ /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
+ * it into place, so that users can't access half-initialized credential stores. */
+ t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
+ if (!t)
+ return -ENOMEM;
+
+ /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
+ * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
+ * after it is fully set up */
+ u = path_join(t, unit);
+ if (!u)
+ return -ENOMEM;
+
+ FOREACH_STRING(i, t, u) {
+ r = mkdir_label(i, 0700);
+ if (r < 0 && r != -EEXIST)
+ return r;
+ }
+
+ r = setup_credentials_internal(
+ context,
+ params,
+ unit,
+ p, /* final mount point */
+ u, /* temporary workspace to overmount */
+ true, /* reuse the workspace if it is already a mount */
+ false, /* it's OK to fall back to a plain directory if we can't mount anything */
+ uid,
+ gid);
+ if (r < 0)
+ return r;
+
+ } else if (r == 0) {
+
+ /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
+ * we can use the same directory for all cases, after turning off propagation. Question
+ * though is: where do we turn off propagation exactly, and where do we place the workspace
+ * directory? We need some place that is guaranteed to be a mount point in the host, and
+ * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
+ * since we ultimately want to move the resulting file system there, i.e. we need propagation
+ * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
+ * would be visible in the host mount table all the time, which we want to avoid. Hence, what
+ * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
+ * /dev/ is a mount point and we know for sure that /dev/shm/ exists. Hence we can turn off
+ * propagation on the former, and then overmount the latter.
+ *
+ * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
+ * for this purpose, but there are few other candidates that work equally well for us, and
+ * given that we do this in a privately namespaced short-lived single-threaded process that
+ * no one else sees this should be OK to do. */
+
+ /* Turn off propagation from our namespace to host */
+ r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL);
+ if (r < 0)
+ goto child_fail;
+
+ r = setup_credentials_internal(
+ context,
+ params,
+ unit,
+ p, /* final mount point */
+ "/dev/shm", /* temporary workspace to overmount */
+ false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
+ true, /* insist that something is mounted, do not allow fallback to plain directory */
+ uid,
+ gid);
+ if (r < 0)
+ goto child_fail;
+
+ _exit(EXIT_SUCCESS);
+
+ child_fail:
+ _exit(EXIT_FAILURE);
+ }
+
+ /* If the credentials dir is empty and not a mount point, then there's no point in having it. Let's
+ * try to remove it. This matters in particular if we created the dir as mount point but then didn't
+ * actually end up mounting anything on it. In that case we'd rather have ENOENT than EACCES being
+ * seen by users when trying to access this inode. */
+ (void) rmdir(p);
+ return 0;
+}
diff --git a/src/core/exec-credential.h b/src/core/exec-credential.h
new file mode 100644
index 0000000..6f836fb
--- /dev/null
+++ b/src/core/exec-credential.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <unistd.h>
+
+#include "hash-funcs.h"
+
+typedef struct ExecContext ExecContext;
+typedef struct ExecParameters ExecParameters;
+typedef struct Unit Unit;
+
+/* A credential configured with LoadCredential= */
+typedef struct ExecLoadCredential {
/* id: credential name exposed to the service; path: source file/directory to load from. */
+ char *id, *path;
/* true when configured via LoadCredentialEncrypted= (value must be decrypted first). */
+ bool encrypted;
+} ExecLoadCredential;
+
+/* A credential configured with SetCredential= */
+typedef struct ExecSetCredential {
/* Credential name exposed to the service. */
+ char *id;
/* true when configured via SetCredentialEncrypted= (value must be decrypted first). */
+ bool encrypted;
/* Literal credential payload and its size in bytes. */
+ void *data;
+ size_t size;
+} ExecSetCredential;
+
+ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc);
+DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSetCredential*, exec_set_credential_free);
+
+ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc);
+DEFINE_TRIVIAL_CLEANUP_FUNC(ExecLoadCredential*, exec_load_credential_free);
+
+extern const struct hash_ops exec_set_credential_hash_ops;
+extern const struct hash_ops exec_load_credential_hash_ops;
+
+bool exec_context_has_encrypted_credentials(ExecContext *c);
+bool exec_context_has_credentials(const ExecContext *c);
+
+int exec_context_get_credential_directory(
+ const ExecContext *context,
+ const ExecParameters *params,
+ const char *unit,
+ char **ret);
+
+int unit_add_default_credential_dependencies(Unit *u, const ExecContext *c);
+
+int exec_context_destroy_credentials(Unit *u);
+int exec_setup_credentials(
+ const ExecContext *context,
+ const ExecParameters *params,
+ const char *unit,
+ uid_t uid,
+ gid_t gid);
diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c
new file mode 100644
index 0000000..28d6142
--- /dev/null
+++ b/src/core/exec-invoke.c
@@ -0,0 +1,5235 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/eventfd.h>
+#include <sys/ioctl.h>
+#include <sys/mount.h>
+#include <sys/prctl.h>
+
+#if HAVE_PAM
+#include <security/pam_appl.h>
+#include <security/pam_misc.h>
+#endif
+
+#if HAVE_APPARMOR
+#include <sys/apparmor.h>
+#endif
+
+#include "sd-messages.h"
+
+#if HAVE_APPARMOR
+#include "apparmor-util.h"
+#endif
+#include "argv-util.h"
+#include "barrier.h"
+#include "bpf-dlopen.h"
+#include "bpf-lsm.h"
+#include "btrfs-util.h"
+#include "capability-util.h"
+#include "cgroup-setup.h"
+#include "chase.h"
+#include "chattr-util.h"
+#include "chown-recursive.h"
+#include "copy.h"
+#include "data-fd-util.h"
+#include "env-util.h"
+#include "escape.h"
+#include "exec-credential.h"
+#include "exec-invoke.h"
+#include "execute.h"
+#include "exit-status.h"
+#include "fd-util.h"
+#include "hexdecoct.h"
+#include "io-util.h"
+#include "iovec-util.h"
+#include "missing_ioprio.h"
+#include "missing_prctl.h"
+#include "missing_securebits.h"
+#include "missing_syscall.h"
+#include "mkdir-label.h"
+#include "proc-cmdline.h"
+#include "process-util.h"
+#include "psi-util.h"
+#include "rlimit-util.h"
+#include "seccomp-util.h"
+#include "selinux-util.h"
+#include "signal-util.h"
+#include "smack-util.h"
+#include "socket-util.h"
+#include "string-table.h"
+#include "strv.h"
+#include "terminal-util.h"
+#include "utmp-wtmp.h"
+
+#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
+#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
+
+#define SNDBUF_SIZE (8*1024*1024)
+
/* Moves the passed fds so that fds[i] ends up at fd number i+3 (i.e. directly after
 * stdin/stdout/stderr), in the order given. Modifies the array in place. Returns 0 on
 * success, negative errno on fcntl() failure. */
+static int shift_fds(int fds[], size_t n_fds) {
+ if (n_fds <= 0)
+ return 0;
+
+ /* Modifies the fds array! (sorts it) */
+
+ assert(fds);
+
+ for (int start = 0;;) {
+ int restart_from = -1;
+
+ for (int i = start; i < (int) n_fds; i++) {
+ int nfd;
+
+ /* Already at right index? */
+ if (fds[i] == i+3)
+ continue;
+
/* F_DUPFD picks the lowest free fd >= i+3; may land higher if i+3 is taken. */
+ nfd = fcntl(fds[i], F_DUPFD, i + 3);
+ if (nfd < 0)
+ return -errno;
+
+ safe_close(fds[i]);
+ fds[i] = nfd;
+
+ /* Hmm, the fd we wanted isn't free? Then
+ * let's remember that and try again from here */
+ if (nfd != i+3 && restart_from < 0)
+ restart_from = i;
+ }
+
+ if (restart_from < 0)
+ break;
+
+ start = restart_from;
+ }
+
+ return 0;
+}
+
/* Adjusts flags on the fds we pass to the child: toggles O_NONBLOCK (only on the first
 * n_socket_fds entries, i.e. socket-activation fds) and unconditionally clears FD_CLOEXEC
 * on all of them so they survive the exec(). Returns 0 or negative errno. */
+static int flag_fds(
+ const int fds[],
+ size_t n_socket_fds,
+ size_t n_fds,
+ bool nonblock) {
+
+ int r;
+
+ assert(fds || n_fds == 0);
+
+ /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
+ * O_NONBLOCK only applies to socket activation though. */
+
+ for (size_t i = 0; i < n_fds; i++) {
+
+ if (i < n_socket_fds) {
+ r = fd_nonblock(fds[i], nonblock);
+ if (r < 0)
+ return r;
+ }
+
+ /* We unconditionally drop FD_CLOEXEC from the fds,
+ * since after all we want to pass these fds to our
+ * children */
+
+ r = fd_cloexec(fds[i], false);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
/* Returns true if the given stdin mode reads from a TTY. */
+static bool is_terminal_input(ExecInput i) {
+ return IN_SET(i,
+ EXEC_INPUT_TTY,
+ EXEC_INPUT_TTY_FORCE,
+ EXEC_INPUT_TTY_FAIL);
+}
+
/* Returns true if the given output mode writes (also) to a TTY/console. */
+static bool is_terminal_output(ExecOutput o) {
+ return IN_SET(o,
+ EXEC_OUTPUT_TTY,
+ EXEC_OUTPUT_KMSG_AND_CONSOLE,
+ EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
+}
+
/* Returns true if the given output mode writes (also) to the kernel log buffer. */
+static bool is_kmsg_output(ExecOutput o) {
+ return IN_SET(o,
+ EXEC_OUTPUT_KMSG,
+ EXEC_OUTPUT_KMSG_AND_CONSOLE);
+}
+
+static bool exec_context_needs_term(const ExecContext *c) {
+ assert(c);
+
+ /* Return true if the execution context suggests we should set $TERM to something useful. */
+
+ if (is_terminal_input(c->std_input))
+ return true;
+
+ if (is_terminal_output(c->std_output))
+ return true;
+
+ if (is_terminal_output(c->std_error))
+ return true;
+
/* An explicitly configured TTYPath= also implies a terminal. */
+ return !!c->tty_path;
+}
+
/* Opens /dev/null with the given flags and installs it at fd number nfd
 * (via dup2+close semantics of move_fd()). Returns nfd or negative errno. */
+static int open_null_as(int flags, int nfd) {
+ int fd;
+
+ assert(nfd >= 0);
+
+ fd = open("/dev/null", flags|O_NOCTTY);
+ if (fd < 0)
+ return -errno;
+
+ return move_fd(fd, nfd, false);
+}
+
/* Connects the given socket fd to the journald stdout stream socket (of the configured
 * log namespace, if any), temporarily switching effective uid/gid to the target user so
 * that socket permission checks happen as the service's user. Restores the original
 * euid/egid before returning. Returns 0 or negative errno. */
+static int connect_journal_socket(
+ int fd,
+ const char *log_namespace,
+ uid_t uid,
+ gid_t gid) {
+
+ uid_t olduid = UID_INVALID;
+ gid_t oldgid = GID_INVALID;
+ const char *j;
+ int r;
+
+ j = log_namespace ?
+ strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
+ "/run/systemd/journal/stdout";
+
/* Switch gid first, while we still have the privileges to do so. */
+ if (gid_is_valid(gid)) {
+ oldgid = getgid();
+
+ if (setegid(gid) < 0)
+ return -errno;
+ }
+
+ if (uid_is_valid(uid)) {
+ olduid = getuid();
+
+ if (seteuid(uid) < 0) {
+ r = -errno;
+ goto restore_gid;
+ }
+ }
+
+ r = connect_unix_path(fd, AT_FDCWD, j);
+
+ /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
+ an LSM interferes. */
+
+ if (uid_is_valid(uid))
+ (void) seteuid(olduid);
+
+ restore_gid:
+ if (gid_is_valid(gid))
+ (void) setegid(oldgid);
+
+ return r;
+}
+
/* Creates an AF_UNIX stream socket connected to journald's stdout stream, writes the
 * stream header (identifier, unit, priority, level prefix, kmsg/console forwarding
 * flags) and installs the socket at fd number nfd. Returns nfd or negative errno. */
+static int connect_logger_as(
+ const ExecContext *context,
+ const ExecParameters *params,
+ ExecOutput output,
+ const char *ident,
+ int nfd,
+ uid_t uid,
+ gid_t gid) {
+
+ _cleanup_close_ int fd = -EBADF;
+ int r;
+
+ assert(context);
+ assert(params);
+ assert(output < _EXEC_OUTPUT_MAX);
+ assert(ident);
+ assert(nfd >= 0);
+
+ fd = socket(AF_UNIX, SOCK_STREAM, 0);
+ if (fd < 0)
+ return -errno;
+
+ r = connect_journal_socket(fd, context->log_namespace, uid, gid);
+ if (r < 0)
+ return r;
+
/* Write-only stream: close the read side. */
+ if (shutdown(fd, SHUT_RD) < 0)
+ return -errno;
+
+ (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
+
/* The journald stream protocol header: one field per line. */
+ if (dprintf(fd,
+ "%s\n"
+ "%s\n"
+ "%i\n"
+ "%i\n"
+ "%i\n"
+ "%i\n"
+ "%i\n",
+ context->syslog_identifier ?: ident,
+ params->flags & EXEC_PASS_LOG_UNIT ? params->unit_id : "",
+ context->syslog_priority,
+ !!context->syslog_level_prefix,
+ false,
+ is_kmsg_output(output),
+ is_terminal_output(output)) < 0)
+ return -errno;
+
+ return move_fd(TAKE_FD(fd), nfd, false);
+}
+
/* Opens the given terminal device (without making it the controlling TTY) and installs
 * it at fd number nfd. Returns nfd or negative errno. */
+static int open_terminal_as(const char *path, int flags, int nfd) {
+ int fd;
+
+ assert(path);
+ assert(nfd >= 0);
+
+ fd = open_terminal(path, flags | O_NOCTTY);
+ if (fd < 0)
+ return fd;
+
+ return move_fd(fd, nfd, false);
+}
+
/* Opens a path for use as stdio: regular open() first (creating it if opened for
 * writing), and if that fails with ENXIO the path is assumed to be an AF_UNIX socket
 * which we connect() to instead, shutting down the unused direction according to the
 * requested access mode. Returns an fd or negative errno. */
+static int acquire_path(const char *path, int flags, mode_t mode) {
+ _cleanup_close_ int fd = -EBADF;
+ int r;
+
+ assert(path);
+
+ if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
+ flags |= O_CREAT;
+
+ fd = open(path, flags|O_NOCTTY, mode);
+ if (fd >= 0)
+ return TAKE_FD(fd);
+
+ if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
+ return -errno;
+
+ /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
+
+ fd = socket(AF_UNIX, SOCK_STREAM, 0);
+ if (fd < 0)
+ return -errno;
+
+ r = connect_unix_path(fd, AT_FDCWD, path);
+ if (IN_SET(r, -ENOTSOCK, -EINVAL))
+ /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
+ * wasn't an AF_UNIX socket after all */
+ return -ENXIO;
+ if (r < 0)
+ return r;
+
+ if ((flags & O_ACCMODE) == O_RDONLY)
+ r = shutdown(fd, SHUT_WR);
+ else if ((flags & O_ACCMODE) == O_WRONLY)
+ r = shutdown(fd, SHUT_RD);
+ else
+ r = 0;
+ if (r < 0)
+ return -errno;
+
+ return TAKE_FD(fd);
+}
+
/* Downgrades the configured stdin mode to EXEC_INPUT_NULL when it cannot be honored:
 * TTY input without permission to use the TTY, socket input without a socket fd, or
 * data input with no data. Returns the effective ExecInput value. */
+static int fixup_input(
+ const ExecContext *context,
+ int socket_fd,
+ bool apply_tty_stdin) {
+
+ ExecInput std_input;
+
+ assert(context);
+
+ std_input = context->std_input;
+
+ if (is_terminal_input(std_input) && !apply_tty_stdin)
+ return EXEC_INPUT_NULL;
+
+ if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
+ return EXEC_INPUT_NULL;
+
+ if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
+ return EXEC_INPUT_NULL;
+
+ return std_input;
+}
+
/* Downgrades socket output to inherit when no socket fd is available. */
+static int fixup_output(ExecOutput output, int socket_fd) {
+
+ if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
+ return EXEC_OUTPUT_INHERIT;
+
+ return output;
+}
+
/* Wires up fd 0 (stdin) for the child according to the execution context: an explicitly
 * passed fd, /dev/null, an acquired TTY, the activation socket, a named fd, an in-memory
 * data fd, or a file/socket path. Returns STDIN_FILENO on success, negative errno on
 * failure. */
+static int setup_input(
+ const ExecContext *context,
+ const ExecParameters *params,
+ int socket_fd,
+ const int named_iofds[static 3]) {
+
+ ExecInput i;
+ int r;
+
+ assert(context);
+ assert(params);
+ assert(named_iofds);
+
/* An explicitly passed stdin fd always wins. */
+ if (params->stdin_fd >= 0) {
+ if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
+ return -errno;
+
+ /* Try to make this the controlling tty, if it is a tty, and reset it */
+ if (isatty(STDIN_FILENO)) {
+ (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
+
+ if (context->tty_reset)
+ (void) reset_terminal_fd(STDIN_FILENO, /* switch_to_text= */ true);
+
+ (void) exec_context_apply_tty_size(context, STDIN_FILENO, /* tty_path= */ NULL);
+ }
+
+ return STDIN_FILENO;
+ }
+
+ i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
+
+ switch (i) {
+
+ case EXEC_INPUT_NULL:
+ return open_null_as(O_RDONLY, STDIN_FILENO);
+
+ case EXEC_INPUT_TTY:
+ case EXEC_INPUT_TTY_FORCE:
+ case EXEC_INPUT_TTY_FAIL: {
+ _cleanup_close_ int tty_fd = -EBADF;
+ const char *tty_path;
+
+ tty_path = ASSERT_PTR(exec_context_tty_path(context));
+
/* The three TTY modes map to different acquisition strategies: fail-if-busy,
 * steal-by-force, or wait-until-free. */
+ tty_fd = acquire_terminal(tty_path,
+ i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
+ i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
+ ACQUIRE_TERMINAL_WAIT,
+ USEC_INFINITY);
+ if (tty_fd < 0)
+ return tty_fd;
+
+ r = exec_context_apply_tty_size(context, tty_fd, tty_path);
+ if (r < 0)
+ return r;
+
+ r = move_fd(tty_fd, STDIN_FILENO, /* cloexec= */ false);
+ if (r < 0)
+ return r;
+
+ TAKE_FD(tty_fd);
+ return r;
+ }
+
+ case EXEC_INPUT_SOCKET:
+ assert(socket_fd >= 0);
+
+ return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
+
+ case EXEC_INPUT_NAMED_FD:
+ assert(named_iofds[STDIN_FILENO] >= 0);
+
+ (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
+ return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
+
+ case EXEC_INPUT_DATA: {
+ int fd;
+
+ fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
+ if (fd < 0)
+ return fd;
+
+ return move_fd(fd, STDIN_FILENO, false);
+ }
+
+ case EXEC_INPUT_FILE: {
+ bool rw;
+ int fd;
+
+ assert(context->stdio_file[STDIN_FILENO]);
+
/* Open read-write when stdout or stderr is configured to the very same file,
 * so that the single fd can be shared in both directions. */
+ rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
+ (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
+
+ fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
+ if (fd < 0)
+ return fd;
+
+ return move_fd(fd, STDIN_FILENO, false);
+ }
+
+ default:
+ assert_not_reached();
+ }
+}
+
+static bool can_inherit_stderr_from_stdout(
+ const ExecContext *context,
+ ExecOutput o,
+ ExecOutput e) {
+
+ assert(context);
+
+ /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
+ * stderr fd */
+
+ if (e == EXEC_OUTPUT_INHERIT)
+ return true;
+ if (e != o)
+ return false;
+
/* Same mode on both: for named fds and files they must also refer to the same target. */
+ if (e == EXEC_OUTPUT_NAMED_FD)
+ return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
+
+ if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
+ return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
+
+ return true;
+}
+
/* Wires up fd 1 or fd 2 (selected via 'fileno') for the child: explicit fd, inherit
 * from stdin/stdout, /dev/null, TTY, journal/kmsg logger stream, activation socket,
 * named fd, or file path. On journal connection it also reports the stream's
 * device/inode via journal_stream_dev/ino (used to export $JOURNAL_STREAM).
 * Returns 'fileno' on success, negative errno on failure. */
+static int setup_output(
+ const ExecContext *context,
+ const ExecParameters *params,
+ int fileno,
+ int socket_fd,
+ const int named_iofds[static 3],
+ const char *ident,
+ uid_t uid,
+ gid_t gid,
+ dev_t *journal_stream_dev,
+ ino_t *journal_stream_ino) {
+
+ ExecOutput o;
+ ExecInput i;
+ int r;
+
+ assert(context);
+ assert(params);
+ assert(ident);
+ assert(journal_stream_dev);
+ assert(journal_stream_ino);
+
/* Explicitly passed fds always win. */
+ if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
+
+ if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
+ return -errno;
+
+ return STDOUT_FILENO;
+ }
+
+ if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
+ if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
+ return -errno;
+
+ return STDERR_FILENO;
+ }
+
+ i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
+ o = fixup_output(context->std_output, socket_fd);
+
+ if (fileno == STDERR_FILENO) {
+ ExecOutput e;
+ e = fixup_output(context->std_error, socket_fd);
+
+ /* This expects the input and output are already set up */
+
+ /* Don't change the stderr file descriptor if we inherit all
+ * the way and are not on a tty */
+ if (e == EXEC_OUTPUT_INHERIT &&
+ o == EXEC_OUTPUT_INHERIT &&
+ i == EXEC_INPUT_NULL &&
+ !is_terminal_input(context->std_input) &&
+ getppid() != 1)
+ return fileno;
+
+ /* Duplicate from stdout if possible */
+ if (can_inherit_stderr_from_stdout(context, o, e))
+ return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
+
+ o = e;
+
+ } else if (o == EXEC_OUTPUT_INHERIT) {
+ /* If input got downgraded, inherit the original value */
+ if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
+ return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
+
+ /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
+ if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
+ return RET_NERRNO(dup2(STDIN_FILENO, fileno));
+
+ /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
+ if (getppid() != 1)
+ return fileno;
+
+ /* We need to open /dev/null here anew, to get the right access mode. */
+ return open_null_as(O_WRONLY, fileno);
+ }
+
+ switch (o) {
+
+ case EXEC_OUTPUT_NULL:
+ return open_null_as(O_WRONLY, fileno);
+
+ case EXEC_OUTPUT_TTY:
+ if (is_terminal_input(i))
+ return RET_NERRNO(dup2(STDIN_FILENO, fileno));
+
+ /* We don't reset the terminal if this is just about output */
+ return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
+
+ case EXEC_OUTPUT_KMSG:
+ case EXEC_OUTPUT_KMSG_AND_CONSOLE:
+ case EXEC_OUTPUT_JOURNAL:
+ case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
+ r = connect_logger_as(context, params, o, ident, fileno, uid, gid);
+ if (r < 0) {
+ log_exec_warning_errno(context,
+ params,
+ r,
+ "Failed to connect %s to the journal socket, ignoring: %m",
+ fileno == STDOUT_FILENO ? "stdout" : "stderr");
/* Journal unavailable: degrade gracefully to /dev/null rather than failing the unit. */
+ r = open_null_as(O_WRONLY, fileno);
+ } else {
+ struct stat st;
+
+ /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
+ * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
+ * services to detect whether they are connected to the journal or not.
+ *
+ * If both stdout and stderr are connected to a stream then let's make sure to store the data
+ * about STDERR as that's usually the best way to do logging. */
+
+ if (fstat(fileno, &st) >= 0 &&
+ (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
+ *journal_stream_dev = st.st_dev;
+ *journal_stream_ino = st.st_ino;
+ }
+ }
+ return r;
+
+ case EXEC_OUTPUT_SOCKET:
+ assert(socket_fd >= 0);
+
+ return RET_NERRNO(dup2(socket_fd, fileno));
+
+ case EXEC_OUTPUT_NAMED_FD:
+ assert(named_iofds[fileno] >= 0);
+
+ (void) fd_nonblock(named_iofds[fileno], false);
+ return RET_NERRNO(dup2(named_iofds[fileno], fileno));
+
+ case EXEC_OUTPUT_FILE:
+ case EXEC_OUTPUT_FILE_APPEND:
+ case EXEC_OUTPUT_FILE_TRUNCATE: {
+ bool rw;
+ int fd, flags;
+
+ assert(context->stdio_file[fileno]);
+
/* If stdin already opened the same file read-write, just share that fd. */
+ rw = context->std_input == EXEC_INPUT_FILE &&
+ streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
+
+ if (rw)
+ return RET_NERRNO(dup2(STDIN_FILENO, fileno));
+
+ flags = O_WRONLY;
+ if (o == EXEC_OUTPUT_FILE_APPEND)
+ flags |= O_APPEND;
+ else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
+ flags |= O_TRUNC;
+
+ fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
+ if (fd < 0)
+ return fd;
+
+ return move_fd(fd, fileno, 0);
+ }
+
+ default:
+ assert_not_reached();
+ }
+}
+
/* If fd refers to a TTY, chowns it to the given uid and applies TTY_MODE.
 * Returns 1 when changed, 0 when fd is not a TTY, negative errno on failure. */
+static int chown_terminal(int fd, uid_t uid) {
+ int r;
+
+ assert(fd >= 0);
+
+ /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
+ if (isatty(fd) < 1) {
+ if (IN_SET(errno, EINVAL, ENOTTY))
+ return 0; /* not a tty */
+
+ return -errno;
+ }
+
+ /* This might fail. What matters are the results. */
+ r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
+ if (r < 0)
+ return r;
+
+ return 1;
+}
+
/* Redirects stdin/stdout to the confirmation console 'vc' for interactive spawn
 * confirmation, saving the previous stdin/stdout as duplicated fds (>= 3) so they can
 * be restored afterwards via restore_confirm_stdio(). Returns 0 or negative errno. */
+static int setup_confirm_stdio(
+ const ExecContext *context,
+ const char *vc,
+ int *ret_saved_stdin,
+ int *ret_saved_stdout) {
+
+ _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
+ int r;
+
+ assert(ret_saved_stdin);
+ assert(ret_saved_stdout);
+
/* Duplicate the current stdio out of the way (to fds >= 3) before overmounting them. */
+ saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
+ if (saved_stdin < 0)
+ return -errno;
+
+ saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
+ if (saved_stdout < 0)
+ return -errno;
+
+ fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
+ if (fd < 0)
+ return fd;
+
+ r = chown_terminal(fd, getuid());
+ if (r < 0)
+ return r;
+
+ r = reset_terminal_fd(fd, /* switch_to_text= */ true);
+ if (r < 0)
+ return r;
+
+ r = exec_context_apply_tty_size(context, fd, vc);
+ if (r < 0)
+ return r;
+
+ r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
+ TAKE_FD(fd);
+ if (r < 0)
+ return r;
+
+ *ret_saved_stdin = TAKE_FD(saved_stdin);
+ *ret_saved_stdout = TAKE_FD(saved_stdout);
+ return 0;
+}
+
/* Writes a human-readable explanation for a failed confirmation prompt to the given fd.
 * 'err' must be a negative errno value. */
+static void write_confirm_error_fd(int err, int fd, const char *unit_id) {
+ assert(err < 0);
+ assert(unit_id);
+
+ if (err == -ETIMEDOUT)
+ dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", unit_id);
+ else {
/* Set errno so the %m specifier expands to the right error string. */
+ errno = -err;
+ dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", unit_id);
+ }
+}
+
/* Like write_confirm_error_fd(), but opens the confirmation console 'vc' itself;
 * silently gives up if the console cannot be opened (best effort). */
+static void write_confirm_error(int err, const char *vc, const char *unit_id) {
+ _cleanup_close_ int fd = -EBADF;
+
+ assert(vc);
+
+ fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
+ if (fd < 0)
+ return;
+
+ write_confirm_error_fd(err, fd, unit_id);
+}
+
/* Undoes setup_confirm_stdio(): releases the acquired terminal and restores the saved
 * stdin/stdout fds, closing and invalidating them. Returns 0 or the last dup2() error. */
+static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
+ int r = 0;
+
+ assert(saved_stdin);
+ assert(saved_stdout);
+
+ release_terminal();
+
+ if (*saved_stdin >= 0)
+ if (dup2(*saved_stdin, STDIN_FILENO) < 0)
+ r = -errno;
+
+ if (*saved_stdout >= 0)
+ if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
+ r = -errno;
+
+ *saved_stdin = safe_close(*saved_stdin);
+ *saved_stdout = safe_close(*saved_stdout);
+
+ return r;
+}
+
/* Possible results of ask_for_confirmation(). */
+enum {
+ CONFIRM_PRETEND_FAILURE = -1,
+ CONFIRM_PRETEND_SUCCESS = 0,
+ CONFIRM_EXECUTE = 1,
+};
+
/* Returns true if spawn confirmation was globally disabled at runtime (flag file set
 * when the user picks 'c' at a prompt). */
+static bool confirm_spawn_disabled(void) {
+ return access("/run/systemd/confirm_spawn_disabled", F_OK) >= 0;
+}
+
/* Interactively asks on the confirmation console whether to execute 'cmdline'.
 * Returns CONFIRM_EXECUTE, CONFIRM_PRETEND_SUCCESS or CONFIRM_PRETEND_FAILURE; any
 * internal error is treated as a positive (execute) response. */
+static int ask_for_confirmation(const ExecContext *context, const ExecParameters *params, const char *cmdline) {
+ int saved_stdout = -1, saved_stdin = -1, r;
+ _cleanup_free_ char *e = NULL;
+ char c;
+
+ assert(context);
+ assert(params);
+
+ /* For any internal errors, assume a positive response. */
+ r = setup_confirm_stdio(context, params->confirm_spawn, &saved_stdin, &saved_stdout);
+ if (r < 0) {
+ write_confirm_error(r, params->confirm_spawn, params->unit_id);
+ return CONFIRM_EXECUTE;
+ }
+
+ /* confirm_spawn might have been disabled while we were sleeping. */
+ if (!params->confirm_spawn || confirm_spawn_disabled()) {
+ r = 1;
+ goto restore_stdio;
+ }
+
/* Shorten overly long command lines for the prompt. */
+ e = ellipsize(cmdline, 60, 100);
+ if (!e) {
+ log_oom();
+ r = CONFIRM_EXECUTE;
+ goto restore_stdio;
+ }
+
+ for (;;) {
+ r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
+ if (r < 0) {
+ write_confirm_error_fd(r, STDOUT_FILENO, params->unit_id);
+ r = CONFIRM_EXECUTE;
+ goto restore_stdio;
+ }
+
+ switch (c) {
+ case 'c':
+ printf("Resuming normal execution.\n");
+ manager_disable_confirm_spawn();
+ r = 1;
+ break;
+ case 'D':
+ printf(" Unit: %s\n",
+ params->unit_id);
+ exec_context_dump(context, stdout, " ");
+ exec_params_dump(params, stdout, " ");
+ continue; /* ask again */
+ case 'f':
+ printf("Failing execution.\n");
+ r = CONFIRM_PRETEND_FAILURE;
+ break;
+ case 'h':
+ printf(" c - continue, proceed without asking anymore\n"
+ " D - dump, show the state of the unit\n"
+ " f - fail, don't execute the command and pretend it failed\n"
+ " h - help\n"
+ " i - info, show a short summary of the unit\n"
+ " j - jobs, show jobs that are in progress\n"
+ " s - skip, don't execute the command and pretend it succeeded\n"
+ " y - yes, execute the command\n");
+ continue; /* ask again */
+ case 'i':
+ printf(" Unit: %s\n"
+ " Command: %s\n",
+ params->unit_id, cmdline);
+ continue; /* ask again */
+ case 'j':
/* Ask PID 1 (our parent) to dump the current job list. */
+ if (sigqueue(getppid(),
+ SIGRTMIN+18,
+ (const union sigval) { .sival_int = MANAGER_SIGNAL_COMMAND_DUMP_JOBS }) < 0)
+ return -errno;
+
+ continue; /* ask again */
+ case 'n':
+ /* 'n' was removed in favor of 'f'. */
+ printf("Didn't understand 'n', did you mean 'f'?\n");
+ continue; /* ask again */
+ case 's':
+ printf("Skipping execution.\n");
+ r = CONFIRM_PRETEND_SUCCESS;
+ break;
+ case 'y':
+ r = CONFIRM_EXECUTE;
+ break;
+ default:
+ assert_not_reached();
+ }
+ break;
+ }
+
+restore_stdio:
+ restore_confirm_stdio(&saved_stdin, &saved_stdout);
+ return r;
+}
+
/* Resolves a user name or numeric uid string into normalized user name, uid, gid,
 * home directory and shell (the latter two may be suppressed when uninteresting).
 * Returns 0 or negative errno. */
+static int get_fixed_user(
+ const char *user_or_uid,
+ const char **ret_username,
+ uid_t *ret_uid,
+ gid_t *ret_gid,
+ const char **ret_home,
+ const char **ret_shell) {
+
+ int r;
+
+ assert(user_or_uid);
+ assert(ret_username);
+
+ /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
+ * (i.e. are "/" or "/bin/nologin"). */
+
+ r = get_user_creds(&user_or_uid, ret_uid, ret_gid, ret_home, ret_shell, USER_CREDS_CLEAN);
+ if (r < 0)
+ return r;
+
+ /* user_or_uid is normalized by get_user_creds to username */
+ *ret_username = user_or_uid;
+
+ return 0;
+}
+
/* Resolves a group name or numeric gid string into normalized group name and gid.
 * Returns 0 or negative errno. */
+static int get_fixed_group(
+ const char *group_or_gid,
+ const char **ret_groupname,
+ gid_t *ret_gid) {
+
+ int r;
+
+ assert(group_or_gid);
+ assert(ret_groupname);
+
+ r = get_group_creds(&group_or_gid, ret_gid, /* flags = */ 0);
+ if (r < 0)
+ return r;
+
+ /* group_or_gid is normalized by get_group_creds to groupname */
+ *ret_groupname = group_or_gid;
+
+ return 0;
+}
+
/* Computes the supplementary group list for the child: initializes the kernel group
 * list from NSS for the target user (skipped for gid 0), then appends the groups
 * configured via SupplementaryGroups=. On success stores a newly allocated gid array
 * in *supplementary_gids and its length in *ngids (0 means "drop all").
 * Returns 0 or negative errno (-E2BIG when NGROUPS_MAX would be exceeded). */
+static int get_supplementary_groups(const ExecContext *c, const char *user,
+ const char *group, gid_t gid,
+ gid_t **supplementary_gids, int *ngids) {
+ int r, k = 0;
+ int ngroups_max;
+ bool keep_groups = false;
+ gid_t *groups = NULL;
+ _cleanup_free_ gid_t *l_gids = NULL;
+
+ assert(c);
+
+ /*
+ * If user is given, then lookup GID and supplementary groups list.
+ * We avoid NSS lookups for gid=0. Also we have to initialize groups
+ * here and as early as possible so we keep the list of supplementary
+ * groups of the caller.
+ */
+ if (user && gid_is_valid(gid) && gid != 0) {
+ /* First step, initialize groups from /etc/groups */
+ if (initgroups(user, gid) < 0)
+ return -errno;
+
+ keep_groups = true;
+ }
+
+ if (strv_isempty(c->supplementary_groups))
+ return 0;
+
+ /*
+ * If SupplementaryGroups= was passed then NGROUPS_MAX has to
+ * be positive, otherwise fail.
+ */
+ errno = 0;
+ ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
+ if (ngroups_max <= 0)
+ return errno_or_else(EOPNOTSUPP);
+
+ l_gids = new(gid_t, ngroups_max);
+ if (!l_gids)
+ return -ENOMEM;
+
+ if (keep_groups) {
+ /*
+ * Lookup the list of groups that the user belongs to, we
+ * avoid NSS lookups here too for gid=0.
+ */
+ k = ngroups_max;
+ if (getgrouplist(user, gid, l_gids, &k) < 0)
+ return -EINVAL;
+ } else
+ k = 0;
+
/* Append the explicitly configured supplementary groups. */
+ STRV_FOREACH(i, c->supplementary_groups) {
+ const char *g;
+
+ if (k >= ngroups_max)
+ return -E2BIG;
+
+ g = *i;
+ r = get_group_creds(&g, l_gids+k, 0);
+ if (r < 0)
+ return r;
+
+ k++;
+ }
+
+ /*
+ * Sets ngids to zero to drop all supplementary groups, happens
+ * when we are under root and SupplementaryGroups= is empty.
+ */
+ if (k == 0) {
+ *ngids = 0;
+ return 0;
+ }
+
+ /* Otherwise get the final list of supplementary groups */
+ groups = memdup(l_gids, sizeof(gid_t) * k);
+ if (!groups)
+ return -ENOMEM;
+
+ *supplementary_gids = groups;
+ *ngids = k;
+
+ groups = NULL;
+
+ return 0;
+}
+
/* Applies the supplementary group list (if any) and sets real/effective/saved gid.
 * Returns 0 or negative errno. */
+static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
+ int r;
+
+ /* Handle SupplementaryGroups= if it is not empty */
+ if (ngids > 0) {
+ r = maybe_setgroups(ngids, supplementary_gids);
+ if (r < 0)
+ return r;
+ }
+
+ if (gid_is_valid(gid)) {
+ /* Then set our gids */
+ if (setresgid(gid, gid, gid) < 0)
+ return -errno;
+ }
+
+ return 0;
+}
+
/* Clears the securebits covered by 'mask' and sets 'bits' on top, via prctl().
 * Returns 1 when changed, 0 when already as requested, negative errno on failure. */
+static int set_securebits(unsigned bits, unsigned mask) {
+ unsigned applied;
+ int current;
+
+ current = prctl(PR_GET_SECUREBITS);
+ if (current < 0)
+ return -errno;
+
+ /* Clear all securebits defined in mask and set bits */
+ applied = ((unsigned) current & ~mask) | bits;
+ if ((unsigned) current == applied)
+ return 0;
+
+ if (prctl(PR_SET_SECUREBITS, applied) < 0)
+ return -errno;
+
+ return 1;
+}
+
/* Switches real/effective/saved uid to the given uid; when an ambient capability set
 * or secure bits are configured and we drop to non-root, SECURE_KEEP_CAPS is set first
 * so capabilities survive the setresuid(). Caller is responsible for cleaning up the
 * capability sets afterwards. Returns 0 or negative errno. */
+static int enforce_user(
+ const ExecContext *context,
+ uid_t uid,
+ uint64_t capability_ambient_set) {
+ assert(context);
+ int r;
+
+ if (!uid_is_valid(uid))
+ return 0;
+
/* NOTE(review): "UIS" below looks like a typo for "UIDs" in the upstream comment. */
+ /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
+ * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
+ * case. */
+
+ if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
+
+ /* First step: If we need to keep capabilities but drop privileges we need to make sure we
+ * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
+ r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
+ if (r < 0)
+ return r;
+ }
+
+ /* Second step: actually set the uids */
+ if (setresuid(uid, uid, uid) < 0)
+ return -errno;
+
+ /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
+ * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
+ * outside of this call. */
+ return 0;
+}
+
+#if HAVE_PAM
+
+/* PAM conversation callback that rejects every conversation attempt: services run
+ * non-interactively, so there is no way to answer PAM prompts here. */
+static int null_conv(
+ int num_msg,
+ const struct pam_message **msg,
+ struct pam_response **resp,
+ void *appdata_ptr) {
+
+ /* We don't support conversations */
+
+ return PAM_CONV_ERR;
+}
+
+/* Closes the PAM session and deletes the credentials established earlier. Both steps are
+ * attempted even if the first fails; failures are logged at debug level only. Returns the
+ * first non-PAM_SUCCESS code encountered (session close takes precedence), or PAM_SUCCESS. */
+static int pam_close_session_and_delete_credentials(pam_handle_t *handle, int flags) {
+ int r, s;
+
+ assert(handle);
+
+ r = pam_close_session(handle, flags);
+ if (r != PAM_SUCCESS)
+ log_debug("pam_close_session() failed: %s", pam_strerror(handle, r));
+
+ s = pam_setcred(handle, PAM_DELETE_CRED | flags);
+ if (s != PAM_SUCCESS)
+ log_debug("pam_setcred(PAM_DELETE_CRED) failed: %s", pam_strerror(handle, s));
+
+ return r != PAM_SUCCESS ? r : s;
+}
+
+#endif
+
+/* Opens a PAM session for 'user' under service 'name' and forks off an "(sd-pam)" child
+ * process that keeps the session open and closes it once the parent (the service payload)
+ * exits. On success, *env is replaced with the environment returned by pam_getenvlist()
+ * (merged PAM environment). Returns 0 on success; -EPERM for PAM-level failures, other
+ * negative errno-style errors otherwise. Compiled to a no-op returning 0 without PAM. */
+static int setup_pam(
+ const char *name,
+ const char *user,
+ uid_t uid,
+ gid_t gid,
+ const char *tty,
+ char ***env, /* updated on success */
+ const int fds[], size_t n_fds) {
+
+#if HAVE_PAM
+
+ static const struct pam_conv conv = {
+ .conv = null_conv,
+ .appdata_ptr = NULL
+ };
+
+ _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
+ _cleanup_strv_free_ char **e = NULL;
+ pam_handle_t *handle = NULL;
+ sigset_t old_ss;
+ int pam_code = PAM_SUCCESS, r;
+ bool close_session = false;
+ pid_t pam_pid = 0, parent_pid;
+ int flags = 0;
+
+ assert(name);
+ assert(user);
+ assert(env);
+
+ /* We set up PAM in the parent process, then fork. The child
+ * will then stay around until killed via PR_GET_PDEATHSIG or
+ * systemd via the cgroup logic. It will then remove the PAM
+ * session again. The parent process will exec() the actual
+ * daemon. We do things this way to ensure that the main PID
+ * of the daemon is the one we initially fork()ed. */
+
+ r = barrier_create(&barrier);
+ if (r < 0)
+ goto fail;
+
+ if (log_get_max_level() < LOG_DEBUG)
+ flags |= PAM_SILENT;
+
+ pam_code = pam_start(name, user, &conv, &handle);
+ if (pam_code != PAM_SUCCESS) {
+ handle = NULL;
+ goto fail;
+ }
+
+ if (!tty) {
+ _cleanup_free_ char *q = NULL;
+
+ /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
+ * out if that's the case, and read the TTY off it. */
+
+ if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
+ tty = strjoina("/dev/", q);
+ }
+
+ if (tty) {
+ pam_code = pam_set_item(handle, PAM_TTY, tty);
+ if (pam_code != PAM_SUCCESS)
+ goto fail;
+ }
+
+ /* Export the caller-provided environment to PAM so modules can see it. */
+ STRV_FOREACH(nv, *env) {
+ pam_code = pam_putenv(handle, *nv);
+ if (pam_code != PAM_SUCCESS)
+ goto fail;
+ }
+
+ pam_code = pam_acct_mgmt(handle, flags);
+ if (pam_code != PAM_SUCCESS)
+ goto fail;
+
+ pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
+ if (pam_code != PAM_SUCCESS)
+ log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));
+
+ pam_code = pam_open_session(handle, flags);
+ if (pam_code != PAM_SUCCESS)
+ goto fail;
+
+ close_session = true;
+
+ e = pam_getenvlist(handle);
+ if (!e) {
+ pam_code = PAM_BUF_ERR;
+ goto fail;
+ }
+
+ /* Block SIGTERM, so that we know that it won't get lost in the child */
+
+ assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);
+
+ parent_pid = getpid_cached();
+
+ r = safe_fork("(sd-pam)", 0, &pam_pid);
+ if (r < 0)
+ goto fail;
+ if (r == 0) {
+ int sig, ret = EXIT_PAM;
+
+ /* The child's job is to reset the PAM session on termination */
+ barrier_set_role(&barrier, BARRIER_CHILD);
+
+ /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
+ * those fds are open here that have been opened by PAM. */
+ (void) close_many(fds, n_fds);
+
+ /* Drop privileges - we don't need any to pam_close_session and this will make
+ * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect sd-pam
+ * threads to fail to exit normally */
+
+ r = maybe_setgroups(0, NULL);
+ if (r < 0)
+ log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
+ if (setresgid(gid, gid, gid) < 0)
+ log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
+ if (setresuid(uid, uid, uid) < 0)
+ log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");
+
+ (void) ignore_signals(SIGPIPE);
+
+ /* Wait until our parent died. This will only work if the above setresuid() succeeds,
+ * otherwise the kernel will not allow unprivileged parents kill their privileged children
+ * this way. We rely on the control groups kill logic to do the rest for us. */
+ if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
+ goto child_finish;
+
+ /* Tell the parent that our setup is done. This is especially important regarding dropping
+ * privileges. Otherwise, unit setup might race against our setresuid(2) call.
+ *
+ * If the parent aborted, we'll detect this below, hence ignore return failure here. */
+ (void) barrier_place(&barrier);
+
+ /* Check if our parent process might already have died? */
+ if (getppid() == parent_pid) {
+ sigset_t ss;
+
+ assert_se(sigemptyset(&ss) >= 0);
+ assert_se(sigaddset(&ss, SIGTERM) >= 0);
+
+ for (;;) {
+ if (sigwait(&ss, &sig) < 0) {
+ if (errno == EINTR)
+ continue;
+
+ goto child_finish;
+ }
+
+ assert(sig == SIGTERM);
+ break;
+ }
+ }
+
+ /* If our parent died we'll end the session */
+ if (getppid() != parent_pid) {
+ pam_code = pam_close_session_and_delete_credentials(handle, flags);
+ if (pam_code != PAM_SUCCESS)
+ goto child_finish;
+ }
+
+ ret = 0;
+
+ child_finish:
+ /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
+ * know about this. See pam_end(3) */
+ (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
+ _exit(ret);
+ }
+
+ barrier_set_role(&barrier, BARRIER_PARENT);
+
+ /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
+ * here. */
+ handle = NULL;
+
+ /* Unblock SIGTERM again in the parent */
+ assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
+
+ /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
+ * this fd around. */
+ closelog();
+
+ /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
+ * recover. However, warn loudly if it happens. */
+ if (!barrier_place_and_sync(&barrier))
+ log_error("PAM initialization failed");
+
+ return strv_free_and_replace(*env, e);
+
+fail:
+ if (pam_code != PAM_SUCCESS) {
+ log_error("PAM failed: %s", pam_strerror(handle, pam_code));
+ r = -EPERM; /* PAM errors do not map to errno */
+ } else
+ log_error_errno(r, "PAM failed: %m");
+
+ if (handle) {
+ if (close_session)
+ pam_code = pam_close_session_and_delete_credentials(handle, flags);
+
+ (void) pam_end(handle, pam_code | flags);
+ }
+
+ closelog();
+ return r;
+#else
+ return 0;
+#endif
+}
+
+/* Renames the current process to "(<name>)" where <name> is derived from the final path
+ * component of 'path', truncated to its LAST 8 characters so the result (including the
+ * parentheses and NUL) fits into 11 bytes. Falls back to "(...)" if no filename can be
+ * extracted. */
+static void rename_process_from_path(const char *path) {
+ _cleanup_free_ char *buf = NULL;
+ const char *p;
+
+ assert(path);
+
+ /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
+ * /bin/ps */
+
+ if (path_extract_filename(path, &buf) < 0) {
+ rename_process("(...)");
+ return;
+ }
+
+ size_t l = strlen(buf);
+ if (l > 8) {
+ /* The end of the process name is usually more interesting, since the first bit might just be
+ * "systemd-" */
+ p = buf + l - 8;
+ l = 8;
+ } else
+ p = buf;
+
+ char process_name[11];
+ process_name[0] = '(';
+ memcpy(process_name+1, p, l);
+ process_name[1+l] = ')';
+ process_name[1+l+1] = 0;
+
+ rename_process(process_name);
+}
+
+/* Returns true if RestrictAddressFamilies= is in effect, i.e. an allow-list was configured
+ * or the family set is non-empty. */
+static bool context_has_address_families(const ExecContext *c) {
+ assert(c);
+
+ return c->address_families_allow_list ||
+ !set_isempty(c->address_families);
+}
+
+/* Returns true if SystemCallFilter= is in effect, i.e. an allow-list was configured or the
+ * filter map is non-empty. */
+static bool context_has_syscall_filters(const ExecContext *c) {
+ assert(c);
+
+ return c->syscall_allow_list ||
+ !hashmap_isempty(c->syscall_filter);
+}
+
+/* Returns true if SystemCallLog= is in effect, i.e. an allow-list was configured or the log
+ * map is non-empty. */
+static bool context_has_syscall_logs(const ExecContext *c) {
+ assert(c);
+
+ return c->syscall_log_allow_list ||
+ !hashmap_isempty(c->syscall_log);
+}
+
+/* Returns true if the context enables any setting that is implemented via seccomp, i.e.
+ * anything that requires no-new-privileges when running unprivileged. */
+static bool context_has_seccomp(const ExecContext *c) {
+ /* All sibling context_has_*() predicates assert their argument; do the same here for
+ * consistency, since every member access below dereferences 'c'. */
+ assert(c);
+
+ /* We need NNP if we have any form of seccomp and are unprivileged */
+ return c->lock_personality ||
+ c->memory_deny_write_execute ||
+ c->private_devices ||
+ c->protect_clock ||
+ c->protect_hostname ||
+ c->protect_kernel_tunables ||
+ c->protect_kernel_modules ||
+ c->protect_kernel_logs ||
+ context_has_address_families(c) ||
+ exec_context_restrict_namespaces_set(c) ||
+ c->restrict_realtime ||
+ c->restrict_suid_sgid ||
+ !set_isempty(c->syscall_archs) ||
+ context_has_syscall_filters(c) ||
+ context_has_syscall_logs(c);
+}
+
+/* Decides whether the no_new_privs flag must be set for the payload: explicitly requested
+ * via NoNewPrivileges=, or implicitly needed because seccomp is used without privileges
+ * (unprivileged processes may only install seccomp filters under NNP). */
+static bool context_has_no_new_privileges(const ExecContext *c) {
+ assert(c);
+
+ if (c->no_new_privileges)
+ return true;
+
+ if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
+ return false;
+
+ return context_has_seccomp(c);
+}
+
+#if HAVE_SECCOMP
+
+/* Returns true if the configured SystemCallFilter= still permits the syscalls needed to
+ * drop privileges (capget, capset, prctl): on an allow-list all three must be listed, on a
+ * deny-list none of them may be listed. With no filter at all, dropping is always allowed. */
+static bool seccomp_allows_drop_privileges(const ExecContext *c) {
+ void *id, *val;
+ bool has_capget = false, has_capset = false, has_prctl = false;
+
+ assert(c);
+
+ /* No syscall filter, we are allowed to drop privileges */
+ if (hashmap_isempty(c->syscall_filter))
+ return true;
+
+ HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
+ _cleanup_free_ char *name = NULL;
+
+ name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
+
+ /* seccomp_syscall_resolve_num_arch() returns NULL for syscall numbers it cannot
+ * resolve on the native architecture; use the NULL-safe comparison so such entries
+ * are skipped instead of dereferencing NULL. */
+ if (streq_ptr(name, "capget"))
+ has_capget = true;
+ else if (streq_ptr(name, "capset"))
+ has_capset = true;
+ else if (streq_ptr(name, "prctl"))
+ has_prctl = true;
+ }
+
+ if (c->syscall_allow_list)
+ return has_capget && has_capset && has_prctl;
+ else
+ return !(has_capget || has_capset || has_prctl);
+}
+
+/* Returns true (and logs at debug level) if seccomp is unavailable in the kernel, so the
+ * caller should silently skip applying the setting named by 'msg'. */
+static bool skip_seccomp_unavailable(const ExecContext *c, const ExecParameters *p, const char* msg) {
+
+ if (is_seccomp_available())
+ return false;
+
+ log_exec_debug(c, p, "SECCOMP features not detected in the kernel, skipping %s", msg);
+ return true;
+}
+
+/* Installs the SystemCallFilter= seccomp filter. Allow-lists make the configured set the
+ * only permitted syscalls (everything else gets the configured errno or kill action);
+ * deny-lists invert that. Returns 0 on success or if nothing is configured / seccomp is
+ * unavailable; negative error otherwise. */
+static int apply_syscall_filter(const ExecContext *c, const ExecParameters *p, bool needs_ambient_hack) {
+ uint32_t negative_action, default_action, action;
+ int r;
+
+ assert(c);
+ assert(p);
+
+ if (!context_has_syscall_filters(c))
+ return 0;
+
+ if (skip_seccomp_unavailable(c, p, "SystemCallFilter="))
+ return 0;
+
+ negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
+
+ if (c->syscall_allow_list) {
+ default_action = negative_action;
+ action = SCMP_ACT_ALLOW;
+ } else {
+ default_action = SCMP_ACT_ALLOW;
+ action = negative_action;
+ }
+
+ if (needs_ambient_hack) {
+ /* The ambient-capability workaround needs the @setuid syscall set added to the filter. */
+ r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
+ if (r < 0)
+ return r;
+ }
+
+ return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
+}
+
+/* Installs the SystemCallLog= seccomp filter (SCMP_ACT_LOG on the configured set, or on its
+ * complement for deny-lists). Silently skipped when nothing is configured, seccomp is
+ * unavailable, or libseccomp is too old to provide SCMP_ACT_LOG. */
+static int apply_syscall_log(const ExecContext *c, const ExecParameters *p) {
+#ifdef SCMP_ACT_LOG
+ uint32_t default_action, action;
+#endif
+
+ assert(c);
+ assert(p);
+
+ if (!context_has_syscall_logs(c))
+ return 0;
+
+#ifdef SCMP_ACT_LOG
+ if (skip_seccomp_unavailable(c, p, "SystemCallLog="))
+ return 0;
+
+ if (c->syscall_log_allow_list) {
+ /* Log nothing but the ones listed */
+ default_action = SCMP_ACT_ALLOW;
+ action = SCMP_ACT_LOG;
+ } else {
+ /* Log everything but the ones listed */
+ default_action = SCMP_ACT_LOG;
+ action = SCMP_ACT_ALLOW;
+ }
+
+ return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
+#else
+ /* old libseccomp */
+ log_exec_debug(c, p, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
+ return 0;
+#endif
+}
+
+/* Applies SystemCallArchitectures=: restricts which syscall ABIs/architectures the process
+ * may invoke. No-op if unset or seccomp is unavailable. */
+static int apply_syscall_archs(const ExecContext *c, const ExecParameters *p) {
+ assert(c);
+ assert(p);
+
+ if (set_isempty(c->syscall_archs))
+ return 0;
+
+ if (skip_seccomp_unavailable(c, p, "SystemCallArchitectures="))
+ return 0;
+
+ return seccomp_restrict_archs(c->syscall_archs);
+}
+
+/* Applies RestrictAddressFamilies= via seccomp. No-op if unset or seccomp is unavailable. */
+static int apply_address_families(const ExecContext *c, const ExecParameters *p) {
+ assert(c);
+ assert(p);
+
+ if (!context_has_address_families(c))
+ return 0;
+
+ if (skip_seccomp_unavailable(c, p, "RestrictAddressFamilies="))
+ return 0;
+
+ return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
+}
+
+/* Applies MemoryDenyWriteExecute=. Prefers the kernel's PR_SET_MDWE prctl (added in Linux
+ * 6.3); if the kernel reports EINVAL (prctl unknown) it falls back to the seccomp-based
+ * implementation. Other prctl errors are propagated (logged at debug level). */
+static int apply_memory_deny_write_execute(const ExecContext *c, const ExecParameters *p) {
+ int r;
+
+ assert(c);
+ assert(p);
+
+ if (!c->memory_deny_write_execute)
+ return 0;
+
+ /* use prctl() if kernel supports it (6.3) */
+ r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
+ if (r == 0) {
+ log_exec_debug(c, p, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
+ return 0;
+ }
+ if (r < 0 && errno != EINVAL)
+ return log_exec_debug_errno(c,
+ p,
+ errno,
+ "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
+ /* else use seccomp */
+ log_exec_debug(c, p, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
+
+ if (skip_seccomp_unavailable(c, p, "MemoryDenyWriteExecute="))
+ return 0;
+
+ return seccomp_memory_deny_write_execute();
+}
+
+/* Applies RestrictRealtime= via seccomp. No-op if unset or seccomp is unavailable. */
+static int apply_restrict_realtime(const ExecContext *c, const ExecParameters *p) {
+ assert(c);
+ assert(p);
+
+ if (!c->restrict_realtime)
+ return 0;
+
+ if (skip_seccomp_unavailable(c, p, "RestrictRealtime="))
+ return 0;
+
+ return seccomp_restrict_realtime();
+}
+
+/* Applies RestrictSUIDSGID= via seccomp. No-op if unset or seccomp is unavailable. */
+static int apply_restrict_suid_sgid(const ExecContext *c, const ExecParameters *p) {
+ assert(c);
+ assert(p);
+
+ if (!c->restrict_suid_sgid)
+ return 0;
+
+ if (skip_seccomp_unavailable(c, p, "RestrictSUIDSGID="))
+ return 0;
+
+ return seccomp_restrict_suid_sgid();
+}
+
+/* Applies the sysctl(2)-blocking part of ProtectKernelTunables= via seccomp. */
+static int apply_protect_sysctl(const ExecContext *c, const ExecParameters *p) {
+ assert(c);
+ assert(p);
+
+ /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
+ * let's protect even those systems where this is left on in the kernel. */
+
+ if (!c->protect_kernel_tunables)
+ return 0;
+
+ if (skip_seccomp_unavailable(c, p, "ProtectKernelTunables="))
+ return 0;
+
+ return seccomp_protect_sysctl();
+}
+
+/* Applies ProtectKernelModules=: denies the @module syscall set (init_module etc.) with
+ * EPERM via seccomp. */
+static int apply_protect_kernel_modules(const ExecContext *c, const ExecParameters *p) {
+ assert(c);
+ assert(p);
+
+ /* Turn off module syscalls on ProtectKernelModules=yes */
+
+ if (!c->protect_kernel_modules)
+ return 0;
+
+ if (skip_seccomp_unavailable(c, p, "ProtectKernelModules="))
+ return 0;
+
+ return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
+}
+
+/* Applies the syslog(2)-blocking part of ProtectKernelLogs= via seccomp. */
+static int apply_protect_kernel_logs(const ExecContext *c, const ExecParameters *p) {
+ assert(c);
+ assert(p);
+
+ if (!c->protect_kernel_logs)
+ return 0;
+
+ if (skip_seccomp_unavailable(c, p, "ProtectKernelLogs="))
+ return 0;
+
+ return seccomp_protect_syslog();
+}
+
+/* Applies ProtectClock=: denies the @clock syscall set (settimeofday etc.) with EPERM via
+ * seccomp. */
+static int apply_protect_clock(const ExecContext *c, const ExecParameters *p) {
+ assert(c);
+ assert(p);
+
+ if (!c->protect_clock)
+ return 0;
+
+ if (skip_seccomp_unavailable(c, p, "ProtectClock="))
+ return 0;
+
+ return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
+}
+
+/* Applies the seccomp part of PrivateDevices=: denies the @raw-io syscall set (iopl,
+ * ioperm, ...) with EPERM. */
+static int apply_private_devices(const ExecContext *c, const ExecParameters *p) {
+ assert(c);
+ assert(p);
+
+ /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
+
+ if (!c->private_devices)
+ return 0;
+
+ if (skip_seccomp_unavailable(c, p, "PrivateDevices="))
+ return 0;
+
+ return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
+}
+
+/* Applies RestrictNamespaces= via seccomp. No-op if unset or seccomp is unavailable. */
+static int apply_restrict_namespaces(const ExecContext *c, const ExecParameters *p) {
+ assert(c);
+ assert(p);
+
+ if (!exec_context_restrict_namespaces_set(c))
+ return 0;
+
+ if (skip_seccomp_unavailable(c, p, "RestrictNamespaces="))
+ return 0;
+
+ return seccomp_restrict_namespaces(c->restrict_namespaces);
+}
+
+/* Applies LockPersonality=: pins the execution domain via seccomp so personality(2) can no
+ * longer change it. If Personality= was not configured, locks to whatever is currently in
+ * effect. */
+static int apply_lock_personality(const ExecContext *c, const ExecParameters *p) {
+ unsigned long personality;
+ int r;
+
+ assert(c);
+ assert(p);
+
+ if (!c->lock_personality)
+ return 0;
+
+ if (skip_seccomp_unavailable(c, p, "LockPersonality="))
+ return 0;
+
+ personality = c->personality;
+
+ /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
+ if (personality == PERSONALITY_INVALID) {
+
+ r = opinionated_personality(&personality);
+ if (r < 0)
+ return r;
+ }
+
+ return seccomp_lock_personality(personality);
+}
+
+#endif
+
+#if HAVE_LIBBPF
+/* Applies RestrictFileSystems= via the LSM BPF program set up by PID 1: registers this
+ * cgroup's allowed/denied filesystem list in the outer BPF map. Silently skipped when
+ * unset or when LSM BPF support is unavailable (bpf_outer_map_fd < 0). */
+static int apply_restrict_filesystems(const ExecContext *c, const ExecParameters *p) {
+ int r;
+
+ assert(c);
+ assert(p);
+
+ if (!exec_context_restrict_filesystems_set(c))
+ return 0;
+
+ if (p->bpf_outer_map_fd < 0) {
+ /* LSM BPF is unsupported or lsm_bpf_setup failed */
+ log_exec_debug(c, p, "LSM BPF not supported, skipping RestrictFileSystems=");
+ return 0;
+ }
+
+ /* We are in a new binary, so dl-open again */
+ r = dlopen_bpf();
+ if (r < 0)
+ return r;
+
+ return lsm_bpf_restrict_filesystems(c->restrict_filesystems, p->cgroup_id, p->bpf_outer_map_fd, c->restrict_filesystems_allow_list);
+}
+#endif
+
+/* Applies ProtectHostname=: detaches the process into a private UTS namespace and (with
+ * seccomp) blocks sethostname()/setdomainname(). Namespace setup failures due to missing
+ * support or privilege are downgraded to warnings; other failures set *ret_exit_status and
+ * return an error. */
+static int apply_protect_hostname(const ExecContext *c, const ExecParameters *p, int *ret_exit_status) {
+ assert(c);
+ assert(p);
+
+ if (!c->protect_hostname)
+ return 0;
+
+ if (ns_type_supported(NAMESPACE_UTS)) {
+ if (unshare(CLONE_NEWUTS) < 0) {
+ if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
+ *ret_exit_status = EXIT_NAMESPACE;
+ return log_exec_error_errno(c,
+ p,
+ errno,
+ "Failed to set up UTS namespacing: %m");
+ }
+
+ log_exec_warning(c,
+ p,
+ "ProtectHostname=yes is configured, but UTS namespace setup is "
+ "prohibited (container manager?), ignoring namespace setup.");
+ }
+ } else
+ log_exec_warning(c,
+ p,
+ "ProtectHostname=yes is configured, but the kernel does not "
+ "support UTS namespaces, ignoring namespace setup.");
+
+#if HAVE_SECCOMP
+ int r;
+
+ if (skip_seccomp_unavailable(c, p, "ProtectHostname="))
+ return 0;
+
+ r = seccomp_protect_hostname();
+ if (r < 0) {
+ *ret_exit_status = EXIT_SECCOMP;
+ return log_exec_error_errno(c, p, r, "Failed to apply hostname restrictions: %m");
+ }
+#endif
+
+ return 0;
+}
+
+/* Implements the "idle" handshake used for Type=idle services: waits (bounded by
+ * IDLE_TIMEOUT_USEC) for PID 1 to hang up on idle_pipe[0]; on timeout it pokes
+ * idle_pipe[3] to tell PID 1 to proceed and waits once more. All four fds are closed
+ * before returning. */
+static void do_idle_pipe_dance(int idle_pipe[static 4]) {
+ assert(idle_pipe);
+
+ idle_pipe[1] = safe_close(idle_pipe[1]);
+ idle_pipe[2] = safe_close(idle_pipe[2]);
+
+ if (idle_pipe[0] >= 0) {
+ int r;
+
+ r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
+
+ if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
+ ssize_t n;
+
+ /* Signal systemd that we are bored and want to continue. */
+ n = write(idle_pipe[3], "x", 1);
+ if (n > 0)
+ /* Wait for systemd to react to the signal above. */
+ (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
+ }
+
+ idle_pipe[0] = safe_close(idle_pipe[0]);
+
+ }
+
+ idle_pipe[3] = safe_close(idle_pipe[3]);
+}
+
+/* Forward declaration; the implementation is generated below by
+ * DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(). */
+static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
+
+/* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
+ * the service payload in. */
+static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
+ [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
+ [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
+ [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
+ [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
+ [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
+};
+
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
+
+/* Assembles the systemd-provided part of the service environment ($LISTEN_FDS, $WATCHDOG_*,
+ * $USER/$HOME/$SHELL/$LOGNAME, $INVOCATION_ID, $TERM, $JOURNAL_STREAM, the *_DIRECTORY
+ * variables, $CREDENTIALS_DIRECTORY, $SYSTEMD_EXEC_PID, $MEMORY_PRESSURE_*). The resulting
+ * NULL-terminated strv is returned in *ret. Returns 0 on success, negative error otherwise.
+ * NOTE(review): N_ENV_VARS must stay >= the number of variables appended outside the
+ * directory loop — the assert at the bottom checks this at runtime. */
+static int build_environment(
+ const ExecContext *c,
+ const ExecParameters *p,
+ const CGroupContext *cgroup_context,
+ size_t n_fds,
+ const char *home,
+ const char *username,
+ const char *shell,
+ dev_t journal_stream_dev,
+ ino_t journal_stream_ino,
+ const char *memory_pressure_path,
+ char ***ret) {
+
+ _cleanup_strv_free_ char **our_env = NULL;
+ size_t n_env = 0;
+ char *x;
+ int r;
+
+ assert(c);
+ assert(p);
+ assert(ret);
+
+#define N_ENV_VARS 19
+ our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
+ if (!our_env)
+ return -ENOMEM;
+
+ if (n_fds > 0) {
+ _cleanup_free_ char *joined = NULL;
+
+ if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
+ return -ENOMEM;
+ our_env[n_env++] = x;
+
+ if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
+ return -ENOMEM;
+ our_env[n_env++] = x;
+
+ joined = strv_join(p->fd_names, ":");
+ if (!joined)
+ return -ENOMEM;
+
+ x = strjoin("LISTEN_FDNAMES=", joined);
+ if (!x)
+ return -ENOMEM;
+ our_env[n_env++] = x;
+ }
+
+ if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
+ if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
+ return -ENOMEM;
+ our_env[n_env++] = x;
+
+ if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
+ return -ENOMEM;
+ our_env[n_env++] = x;
+ }
+
+ /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
+ * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
+ * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
+ if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
+ x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
+ if (!x)
+ return -ENOMEM;
+ our_env[n_env++] = x;
+ }
+
+ /* We query "root" if this is a system unit and User= is not specified. $USER is always set. $HOME
+ * could cause problem for e.g. getty, since login doesn't override $HOME, and $LOGNAME and $SHELL don't
+ * really make much sense since we're not logged in. Hence we conditionalize the three based on
+ * SetLoginEnvironment= switch. */
+ if (!c->user && !c->dynamic_user && p->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
+ r = get_fixed_user("root", &username, NULL, NULL, &home, &shell);
+ if (r < 0)
+ return log_exec_debug_errno(c,
+ p,
+ r,
+ "Failed to determine user credentials for root: %m");
+ }
+
+ bool set_user_login_env = c->set_login_environment >= 0 ? c->set_login_environment : (c->user || c->dynamic_user);
+
+ if (username) {
+ x = strjoin("USER=", username);
+ if (!x)
+ return -ENOMEM;
+ our_env[n_env++] = x;
+
+ if (set_user_login_env) {
+ x = strjoin("LOGNAME=", username);
+ if (!x)
+ return -ENOMEM;
+ our_env[n_env++] = x;
+ }
+ }
+
+ if (home && set_user_login_env) {
+ x = strjoin("HOME=", home);
+ if (!x)
+ return -ENOMEM;
+
+ /* skip the "HOME=" prefix when normalizing the path */
+ path_simplify(x + 5);
+ our_env[n_env++] = x;
+ }
+
+ if (shell && set_user_login_env) {
+ x = strjoin("SHELL=", shell);
+ if (!x)
+ return -ENOMEM;
+
+ /* skip the "SHELL=" prefix when normalizing the path */
+ path_simplify(x + 6);
+ our_env[n_env++] = x;
+ }
+
+ if (!sd_id128_is_null(p->invocation_id)) {
+ assert(p->invocation_id_string);
+
+ x = strjoin("INVOCATION_ID=", p->invocation_id_string);
+ if (!x)
+ return -ENOMEM;
+
+ our_env[n_env++] = x;
+ }
+
+ if (exec_context_needs_term(c)) {
+ _cleanup_free_ char *cmdline = NULL;
+ const char *tty_path, *term = NULL;
+
+ tty_path = exec_context_tty_path(c);
+
+ /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
+ * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
+ * container manager passes to PID 1 ends up all the way in the console login shown. */
+
+ if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
+ term = getenv("TERM");
+ else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
+ _cleanup_free_ char *key = NULL;
+
+ key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
+ if (!key)
+ return -ENOMEM;
+
+ r = proc_cmdline_get_key(key, 0, &cmdline);
+ if (r < 0)
+ log_exec_debug_errno(c,
+ p,
+ r,
+ "Failed to read %s from kernel cmdline, ignoring: %m",
+ key);
+ else if (r > 0)
+ term = cmdline;
+ }
+
+ if (!term)
+ term = default_term_for_tty(tty_path);
+
+ x = strjoin("TERM=", term);
+ if (!x)
+ return -ENOMEM;
+ our_env[n_env++] = x;
+ }
+
+ if (journal_stream_dev != 0 && journal_stream_ino != 0) {
+ if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
+ return -ENOMEM;
+
+ our_env[n_env++] = x;
+ }
+
+ if (c->log_namespace) {
+ x = strjoin("LOG_NAMESPACE=", c->log_namespace);
+ if (!x)
+ return -ENOMEM;
+
+ our_env[n_env++] = x;
+ }
+
+ /* One *_DIRECTORY variable per directory type, each a colon-separated list of the
+ * configured paths prefixed with the per-type prefix. */
+ for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
+ _cleanup_free_ char *joined = NULL;
+ const char *n;
+
+ if (!p->prefix[t])
+ continue;
+
+ if (c->directories[t].n_items == 0)
+ continue;
+
+ n = exec_directory_env_name_to_string(t);
+ if (!n)
+ continue;
+
+ for (size_t i = 0; i < c->directories[t].n_items; i++) {
+ _cleanup_free_ char *prefixed = NULL;
+
+ prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
+ if (!prefixed)
+ return -ENOMEM;
+
+ if (!strextend_with_separator(&joined, ":", prefixed))
+ return -ENOMEM;
+ }
+
+ x = strjoin(n, "=", joined);
+ if (!x)
+ return -ENOMEM;
+
+ our_env[n_env++] = x;
+ }
+
+ _cleanup_free_ char *creds_dir = NULL;
+ r = exec_context_get_credential_directory(c, p, p->unit_id, &creds_dir);
+ if (r < 0)
+ return r;
+ if (r > 0) {
+ x = strjoin("CREDENTIALS_DIRECTORY=", creds_dir);
+ if (!x)
+ return -ENOMEM;
+
+ our_env[n_env++] = x;
+ }
+
+ if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
+ return -ENOMEM;
+
+ our_env[n_env++] = x;
+
+ if (memory_pressure_path) {
+ x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
+ if (!x)
+ return -ENOMEM;
+
+ our_env[n_env++] = x;
+
+ if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
+ _cleanup_free_ char *b = NULL, *e = NULL;
+
+ if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
+ MEMORY_PRESSURE_DEFAULT_TYPE,
+ cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
+ CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
+ MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
+ return -ENOMEM;
+
+ if (base64mem(b, strlen(b) + 1, &e) < 0)
+ return -ENOMEM;
+
+ x = strjoin("MEMORY_PRESSURE_WRITE=", e);
+ if (!x)
+ return -ENOMEM;
+
+ our_env[n_env++] = x;
+ }
+ }
+
+ assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
+#undef N_ENV_VARS
+
+ *ret = TAKE_PTR(our_env);
+
+ return 0;
+}
+
+/* Builds the PassEnvironment= list: for each configured variable name that is set in the
+ * manager's own environment, appends "NAME=value" to a freshly allocated strv returned in
+ * *ret (NULL if nothing matched). Returns 0 on success, -ENOMEM on allocation failure. */
+static int build_pass_environment(const ExecContext *c, char ***ret) {
+ _cleanup_strv_free_ char **pass_env = NULL;
+ size_t n_env = 0;
+
+ /* Both arguments are unconditionally dereferenced below; assert like the sibling
+ * build_environment() does. */
+ assert(c);
+ assert(ret);
+
+ STRV_FOREACH(i, c->pass_environment) {
+ _cleanup_free_ char *x = NULL;
+ char *v;
+
+ v = getenv(*i);
+ if (!v)
+ continue;
+ x = strjoin(*i, "=", v);
+ if (!x)
+ return -ENOMEM;
+
+ /* +2: one slot for the new entry, one for the NULL terminator. */
+ if (!GREEDY_REALLOC(pass_env, n_env + 2))
+ return -ENOMEM;
+
+ pass_env[n_env++] = TAKE_PTR(x);
+ pass_env[n_env] = NULL;
+ }
+
+ *ret = TAKE_PTR(pass_env);
+
+ return 0;
+}
+
+/* Sets up PrivateUsers=: unshares a user namespace mapping the original UID/GID (ouid/ogid)
+ * and, when privileged (CAP_SETUID/CAP_SETGID) and different, the target uid/gid to
+ * themselves; everything else maps to nobody. Since we lose the capability to write the
+ * maps after unsharing, a temporary "(sd-userns)" child remains in the original namespace
+ * and writes setgroups/gid_map/uid_map for us, reporting errors back over a pipe.
+ * Returns 0 on success, a negative errno-style error otherwise. */
+static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
+ _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
+ _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
+ _cleanup_close_ int unshare_ready_fd = -EBADF;
+ _cleanup_(sigkill_waitp) pid_t pid = 0;
+ uint64_t c = 1;
+ ssize_t n;
+ int r;
+
+ /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
+ * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
+ * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
+ * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
+ * which waits for the parent to create the new user namespace while staying in the original namespace. The
+ * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
+ * continues execution normally.
+ * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
+ * does not need CAP_SETUID to write the single line mapping to itself. */
+
+ /* Can only set up multiple mappings with CAP_SETUID. */
+ if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
+ r = asprintf(&uid_map,
+ UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
+ UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
+ ouid, ouid, uid, uid);
+ else
+ r = asprintf(&uid_map,
+ UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
+ ouid, ouid);
+
+ if (r < 0)
+ return -ENOMEM;
+
+ /* Can only set up multiple mappings with CAP_SETGID. */
+ if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
+ r = asprintf(&gid_map,
+ GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
+ GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
+ ogid, ogid, gid, gid);
+ else
+ r = asprintf(&gid_map,
+ GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
+ ogid, ogid);
+
+ if (r < 0)
+ return -ENOMEM;
+
+ /* Create a communication channel so that the parent can tell the child when it finished creating the user
+ * namespace. */
+ unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
+ if (unshare_ready_fd < 0)
+ return -errno;
+
+ /* Create a communication channel so that the child can tell the parent a proper error code in case it
+ * failed. */
+ if (pipe2(errno_pipe, O_CLOEXEC) < 0)
+ return -errno;
+
+ r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, &pid);
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ _cleanup_close_ int fd = -EBADF;
+ const char *a;
+ pid_t ppid;
+
+ /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
+ * here, after the parent opened its own user namespace. */
+
+ ppid = getppid();
+ errno_pipe[0] = safe_close(errno_pipe[0]);
+
+ /* Wait until the parent unshared the user namespace */
+ if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
+ r = -errno;
+ goto child_fail;
+ }
+
+ /* Disable the setgroups() system call in the child user namespace, for good. */
+ a = procfs_file_alloca(ppid, "setgroups");
+ fd = open(a, O_WRONLY|O_CLOEXEC);
+ if (fd < 0) {
+ if (errno != ENOENT) {
+ r = -errno;
+ goto child_fail;
+ }
+
+ /* If the file is missing the kernel is too old, let's continue anyway. */
+ } else {
+ if (write(fd, "deny\n", 5) < 0) {
+ r = -errno;
+ goto child_fail;
+ }
+
+ fd = safe_close(fd);
+ }
+
+ /* First write the GID map */
+ a = procfs_file_alloca(ppid, "gid_map");
+ fd = open(a, O_WRONLY|O_CLOEXEC);
+ if (fd < 0) {
+ r = -errno;
+ goto child_fail;
+ }
+ if (write(fd, gid_map, strlen(gid_map)) < 0) {
+ r = -errno;
+ goto child_fail;
+ }
+ fd = safe_close(fd);
+
+ /* Then write the UID map */
+ a = procfs_file_alloca(ppid, "uid_map");
+ fd = open(a, O_WRONLY|O_CLOEXEC);
+ if (fd < 0) {
+ r = -errno;
+ goto child_fail;
+ }
+ if (write(fd, uid_map, strlen(uid_map)) < 0) {
+ r = -errno;
+ goto child_fail;
+ }
+
+ _exit(EXIT_SUCCESS);
+
+ child_fail:
+ (void) write(errno_pipe[1], &r, sizeof(r));
+ _exit(EXIT_FAILURE);
+ }
+
+ errno_pipe[1] = safe_close(errno_pipe[1]);
+
+ if (unshare(CLONE_NEWUSER) < 0)
+ return -errno;
+
+ /* Let the child know that the namespace is ready now */
+ if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
+ return -errno;
+
+ /* Try to read an error code from the child */
+ n = read(errno_pipe[0], &r, sizeof(r));
+ if (n < 0)
+ return -errno;
+ if (n == sizeof(r)) { /* an error code was sent to us */
+ if (r < 0)
+ return r;
+ return -EIO;
+ }
+ if (n != 0) /* on success we should have read 0 bytes */
+ return -EIO;
+
+ r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
+ if (r < 0)
+ return r;
+ if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
+ return -EIO;
+
+ return 0;
+}
+
+/* Creates a symlink at each path in 'symlinks' pointing at 'source', with both sides
+ * resolved relative to 'root' (may be NULL for the host root). Parent directories are
+ * created as needed; pre-existing identical symlinks are tolerated (idempotent).
+ * Returns 0 on success, a negative errno-style error on the first failure. */
+static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
+ _cleanup_free_ char *src_abs = NULL;
+ int r;
+
+ assert(source);
+
+ src_abs = path_join(root, source);
+ if (!src_abs)
+ return -ENOMEM;
+
+ STRV_FOREACH(dst, symlinks) {
+ _cleanup_free_ char *dst_abs = NULL;
+
+ dst_abs = path_join(root, *dst);
+ if (!dst_abs)
+ return -ENOMEM;
+
+ r = mkdir_parents_label(dst_abs, 0755);
+ if (r < 0)
+ return r;
+
+ r = symlink_idempotent(src_abs, dst_abs, true);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+/* Create and fix up the service's directories of the given ExecDirectoryType
+ * (RuntimeDirectory=, StateDirectory=, CacheDirectory=, LogsDirectory=,
+ * ConfigurationDirectory=), including the "private/" indirection used for
+ * DynamicUser= services and several upgrade/migration compatibility paths.
+ * On failure *exit_status is set to the matching EXIT_*_DIRECTORY code and a
+ * negative errno-style value is returned. */
+static int setup_exec_directory(
+ const ExecContext *context,
+ const ExecParameters *params,
+ uid_t uid,
+ gid_t gid,
+ ExecDirectoryType type,
+ bool needs_mount_namespace,
+ int *exit_status) {
+
+ /* Maps each directory type to the exit status reported when its setup fails. */
+ static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
+ [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
+ [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
+ [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
+ [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
+ [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
+ };
+ int r;
+
+ assert(context);
+ assert(params);
+ assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
+ assert(exit_status);
+
+ /* No prefix configured for this directory type → nothing to do. */
+ if (!params->prefix[type])
+ return 0;
+
+ if (params->flags & EXEC_CHOWN_DIRECTORIES) {
+ if (!uid_is_valid(uid))
+ uid = 0;
+ if (!gid_is_valid(gid))
+ gid = 0;
+ }
+
+ for (size_t i = 0; i < context->directories[type].n_items; i++) {
+ _cleanup_free_ char *p = NULL, *pp = NULL;
+
+ /* p = the "public" path of this directory under the type's prefix. */
+ p = path_join(params->prefix[type], context->directories[type].items[i].path);
+ if (!p) {
+ r = -ENOMEM;
+ goto fail;
+ }
+
+ r = mkdir_parents_label(p, 0755);
+ if (r < 0)
+ goto fail;
+
+ if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {
+
+ /* If we are in user mode, and a configuration directory exists but a state directory
+ * doesn't exist, then we likely are upgrading from an older systemd version that
+ * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
+ * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
+ * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME it is now
+ * separated. If a service has both dirs configured but only the configuration dir
+ * exists and the state dir does not, we assume we are looking at an update
+ * situation. Hence, create a compatibility symlink, so that all expectations are
+ * met.
+ *
+ * (We also do something similar with the log directory, which still doesn't exist in
+ * the xdg basedir spec. We'll make it a subdir of the state dir.) */
+
+ /* this assumes the state dir is always created before the configuration dir */
+ assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
+ assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);
+
+ r = laccess(p, F_OK);
+ if (r == -ENOENT) {
+ _cleanup_free_ char *q = NULL;
+
+ /* OK, we know that the state dir does not exist. Let's see if the dir exists
+ * under the configuration hierarchy. */
+
+ if (type == EXEC_DIRECTORY_STATE)
+ q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], context->directories[type].items[i].path);
+ else if (type == EXEC_DIRECTORY_LOGS)
+ q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", context->directories[type].items[i].path);
+ else
+ assert_not_reached();
+ if (!q) {
+ r = -ENOMEM;
+ goto fail;
+ }
+
+ r = laccess(q, F_OK);
+ if (r >= 0) {
+ /* It does exist! This hence looks like an update. Symlink the
+ * configuration directory into the state directory. */
+
+ r = symlink_idempotent(q, p, /* make_relative= */ true);
+ if (r < 0)
+ goto fail;
+
+ log_exec_notice(context, params, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
+ continue;
+ } else if (r != -ENOENT)
+ log_exec_warning_errno(context, params, r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);
+
+ } else if (r < 0)
+ log_exec_warning_errno(context, params, r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
+ }
+
+ if (exec_directory_is_private(context, type)) {
+ /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
+ * case we want to avoid leaving a directory around fully accessible that is owned by
+ * a dynamic user whose UID is later on reused. To lock this down we use the same
+ * trick used by container managers to prohibit host users to get access to files of
+ * the same UID in containers: we place everything inside a directory that has an
+ * access mode of 0700 and is owned root:root, so that it acts as security boundary
+ * for unprivileged host code. We then use fs namespacing to make this directory
+ * permeable for the service itself.
+ *
+ * Specifically: for a service which wants a special directory "foo/" we first create
+ * a directory "private/" with access mode 0700 owned by root:root. Then we place
+ * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
+ * "private/foo". This way, privileged host users can access "foo/" as usual, but
+ * unprivileged host users can't look into it. Inside of the namespace of the unit
+ * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
+ * "private/foo/" is mounted under the same name, thus disabling the access boundary
+ * for the service and making sure it only gets access to the dirs it needs but no
+ * others. Tricky? Yes, absolutely, but it works!
+ *
+ * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
+ * to be owned by the service itself.
+ *
+ * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
+ * for sharing files or sockets with other services. */
+
+ /* pp = the "private" path of this directory, behind the 0700 "private/" root. */
+ pp = path_join(params->prefix[type], "private");
+ if (!pp) {
+ r = -ENOMEM;
+ goto fail;
+ }
+
+ /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
+ r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
+ if (r < 0)
+ goto fail;
+
+ if (!path_extend(&pp, context->directories[type].items[i].path)) {
+ r = -ENOMEM;
+ goto fail;
+ }
+
+ /* Create all directories between the configured directory and this private root, and mark them 0755 */
+ r = mkdir_parents_label(pp, 0755);
+ if (r < 0)
+ goto fail;
+
+ if (is_dir(p, false) > 0 &&
+ (laccess(pp, F_OK) == -ENOENT)) {
+
+ /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
+ * it over. Most likely the service has been upgraded from one that didn't use
+ * DynamicUser=1, to one that does. */
+
+ log_exec_info(context,
+ params,
+ "Found pre-existing public %s= directory %s, migrating to %s.\n"
+ "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
+ exec_directory_type_to_string(type), p, pp);
+
+ r = RET_NERRNO(rename(p, pp));
+ if (r < 0)
+ goto fail;
+ } else {
+ /* Otherwise, create the actual directory for the service */
+
+ r = mkdir_label(pp, context->directories[type].mode);
+ if (r < 0 && r != -EEXIST)
+ goto fail;
+ }
+
+ if (!context->directories[type].items[i].only_create) {
+ /* And link it up from the original place.
+ * Notes
+ * 1) If a mount namespace is going to be used, then this symlink remains on
+ * the host, and a new one for the child namespace will be created later.
+ * 2) It is not necessary to create this symlink when one of its parent
+ * directories is specified and already created. E.g.
+ * StateDirectory=foo foo/bar
+ * In that case, the inode points to pp and p for "foo/bar" are the same:
+ * pp = "/var/lib/private/foo/bar"
+ * p = "/var/lib/foo/bar"
+ * and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
+ * we do not need to create the symlink, but we cannot create the symlink.
+ * See issue #24783. */
+ r = symlink_idempotent(pp, p, true);
+ if (r < 0)
+ goto fail;
+ }
+
+ } else {
+ _cleanup_free_ char *target = NULL;
+
+ if (type != EXEC_DIRECTORY_CONFIGURATION &&
+ readlink_and_make_absolute(p, &target) >= 0) {
+ _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
+
+ /* This already exists and is a symlink? Interesting. Maybe it's one created
+ * by DynamicUser=1 (see above)?
+ *
+ * We do this for all directory types except for ConfigurationDirectory=,
+ * since they all support the private/ symlink logic at least in some
+ * configurations, see above. */
+
+ r = chase(target, NULL, 0, &target_resolved, NULL);
+ if (r < 0)
+ goto fail;
+
+ q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
+ if (!q) {
+ r = -ENOMEM;
+ goto fail;
+ }
+
+ /* /var/lib or friends may be symlinks. So, let's chase them also. */
+ r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
+ if (r < 0)
+ goto fail;
+
+ if (path_equal(q_resolved, target_resolved)) {
+
+ /* Hmm, apparently DynamicUser= was once turned on for this service,
+ * but is no longer. Let's move the directory back up. */
+
+ log_exec_info(context,
+ params,
+ "Found pre-existing private %s= directory %s, migrating to %s.\n"
+ "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
+ exec_directory_type_to_string(type), q, p);
+
+ r = RET_NERRNO(unlink(p));
+ if (r < 0)
+ goto fail;
+
+ r = RET_NERRNO(rename(q, p));
+ if (r < 0)
+ goto fail;
+ }
+ }
+
+ r = mkdir_label(p, context->directories[type].mode);
+ if (r < 0) {
+ if (r != -EEXIST)
+ goto fail;
+
+ if (type == EXEC_DIRECTORY_CONFIGURATION) {
+ struct stat st;
+
+ /* Don't change the owner/access mode of the configuration directory,
+ * as in the common case it is not written to by a service, and shall
+ * not be writable. */
+
+ r = RET_NERRNO(stat(p, &st));
+ if (r < 0)
+ goto fail;
+
+ /* Still complain if the access mode doesn't match */
+ if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
+ log_exec_warning(context,
+ params,
+ "%s \'%s\' already exists but the mode is different. "
+ "(File system: %o %sMode: %o)",
+ exec_directory_type_to_string(type), context->directories[type].items[i].path,
+ st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
+
+ continue;
+ }
+ }
+ }
+
+ /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
+ * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
+ * current UID/GID ownership.) */
+ r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
+ if (r < 0)
+ goto fail;
+
+ /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
+ * available to user code anyway */
+ if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
+ continue;
+
+ /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
+ * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
+ * assignments to exist. */
+ r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
+ if (r < 0)
+ goto fail;
+ }
+
+ /* If we are not going to run in a namespace, set up the symlinks - otherwise
+ * they are set up later, to allow configuring empty var/run/etc. */
+ if (!needs_mount_namespace)
+ for (size_t i = 0; i < context->directories[type].n_items; i++) {
+ r = create_many_symlinks(params->prefix[type],
+ context->directories[type].items[i].path,
+ context->directories[type].items[i].symlinks);
+ if (r < 0)
+ goto fail;
+ }
+
+ return 0;
+
+fail:
+ *exit_status = exit_status_table[type];
+ return r;
+}
+
+#if ENABLE_SMACK
+/* Apply a SMACK process label to the current process: either the explicitly
+ * configured one, or — as a fallback — the executable's SMACK exec label (the
+ * configured fallback label if the executable carries none). */
+static int setup_smack(
+                const ExecParameters *params,
+                const ExecContext *context,
+                int executable_fd) {
+        int r;
+
+        assert(params);
+        assert(executable_fd >= 0);
+
+        if (context->smack_process_label)
+                /* An explicit label was configured, apply it as-is. */
+                r = mac_smack_apply_pid(0, context->smack_process_label);
+        else if (params->fallback_smack_process_label) {
+                _cleanup_free_ char *exec_label = NULL;
+
+                /* Prefer the label recorded on the executable itself, if any. */
+                r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
+                if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
+                        return r;
+
+                r = mac_smack_apply_pid(0, exec_label ?: params->fallback_smack_process_label);
+        } else
+                return 0;
+
+        return r < 0 ? r : 0;
+}
+#endif
+
+/* Compute the BindMount[] array needed inside the service's mount namespace:
+ * the user-configured bind mounts plus one bind mount per exec directory item
+ * (so services can reach their dirs through the "private/" boundary), and the
+ * list of "private/" roots that must be overmounted with an empty tmpfs for
+ * DynamicUser= services. Returns the number of bind mounts (>= 0). */
+static int compile_bind_mounts(
+ const ExecContext *context,
+ const ExecParameters *params,
+ BindMount **ret_bind_mounts,
+ size_t *ret_n_bind_mounts,
+ char ***ret_empty_directories) {
+
+ _cleanup_strv_free_ char **empty_directories = NULL;
+ BindMount *bind_mounts = NULL;
+ size_t n, h = 0;
+ int r;
+
+ assert(context);
+ assert(params);
+ assert(ret_bind_mounts);
+ assert(ret_n_bind_mounts);
+ assert(ret_empty_directories);
+
+ CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
+
+ /* First pass: count how many entries the array will need. */
+ n = context->n_bind_mounts;
+ for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
+ if (!params->prefix[t])
+ continue;
+
+ for (size_t i = 0; i < context->directories[t].n_items; i++)
+ n += !context->directories[t].items[i].only_create;
+ }
+
+ if (n <= 0) {
+ *ret_bind_mounts = NULL;
+ *ret_n_bind_mounts = 0;
+ *ret_empty_directories = NULL;
+ return 0;
+ }
+
+ bind_mounts = new(BindMount, n);
+ if (!bind_mounts)
+ return -ENOMEM;
+
+ /* Second pass: copy the user-configured bind mounts... */
+ for (size_t i = 0; i < context->n_bind_mounts; i++) {
+ BindMount *item = context->bind_mounts + i;
+ _cleanup_free_ char *s = NULL, *d = NULL;
+
+ s = strdup(item->source);
+ if (!s)
+ return -ENOMEM;
+
+ d = strdup(item->destination);
+ if (!d)
+ return -ENOMEM;
+
+ bind_mounts[h++] = (BindMount) {
+ .source = TAKE_PTR(s),
+ .destination = TAKE_PTR(d),
+ .read_only = item->read_only,
+ .recursive = item->recursive,
+ .ignore_enoent = item->ignore_enoent,
+ };
+ }
+
+ /* ...then append one bind mount per exec directory item. */
+ for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
+ if (!params->prefix[t])
+ continue;
+
+ if (context->directories[t].n_items == 0)
+ continue;
+
+ if (exec_directory_is_private(context, t) &&
+ !exec_context_with_rootfs(context)) {
+ char *private_root;
+
+ /* So this is for a dynamic user, and we need to make sure the process can access its own
+ * directory. For that we overmount the usually inaccessible "private" subdirectory with a
+ * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
+
+ private_root = path_join(params->prefix[t], "private");
+ if (!private_root)
+ return -ENOMEM;
+
+ r = strv_consume(&empty_directories, private_root);
+ if (r < 0)
+ return r;
+ }
+
+ for (size_t i = 0; i < context->directories[t].n_items; i++) {
+ _cleanup_free_ char *s = NULL, *d = NULL;
+
+ /* When one of the parent directories is in the list, we cannot create the symlink
+ * for the child directory. See also the comments in setup_exec_directory(). */
+ if (context->directories[t].items[i].only_create)
+ continue;
+
+ if (exec_directory_is_private(context, t))
+ s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
+ else
+ s = path_join(params->prefix[t], context->directories[t].items[i].path);
+ if (!s)
+ return -ENOMEM;
+
+ if (exec_directory_is_private(context, t) &&
+ exec_context_with_rootfs(context))
+ /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
+ * directory is not created on the root directory. So, let's bind-mount the directory
+ * on the 'non-private' place. */
+ d = path_join(params->prefix[t], context->directories[t].items[i].path);
+ else
+ d = strdup(s);
+ if (!d)
+ return -ENOMEM;
+
+ bind_mounts[h++] = (BindMount) {
+ .source = TAKE_PTR(s),
+ .destination = TAKE_PTR(d),
+ .read_only = false,
+ .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
+ .recursive = true,
+ .ignore_enoent = false,
+ };
+ }
+ }
+
+ /* Both passes must agree on the element count. */
+ assert(h == n);
+
+ *ret_bind_mounts = TAKE_PTR(bind_mounts);
+ *ret_n_bind_mounts = n;
+ *ret_empty_directories = TAKE_PTR(empty_directories);
+
+ return (int) n;
+}
+
+/* ret_symlinks receives a strv of src:dst pairs describing the symlinks to be
+ * created later on — e.g. the links that safely expose "private/" directories
+ * to DynamicUser=1 services, plus the os-release staging link. */
+static int compile_symlinks(
+                const ExecContext *context,
+                const ExecParameters *params,
+                bool setup_os_release_symlink,
+                char ***ret_symlinks) {
+
+        _cleanup_strv_free_ char **result = NULL;
+        int r;
+
+        assert(context);
+        assert(params);
+        assert(ret_symlinks);
+
+        for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++)
+                for (size_t i = 0; i < context->directories[dt].n_items; i++) {
+                        _cleanup_free_ char *priv = NULL, *plain = NULL;
+
+                        /* First, the symlinks explicitly configured for this directory item. */
+                        STRV_FOREACH(sl, context->directories[dt].items[i].symlinks) {
+                                _cleanup_free_ char *link_source = NULL, *link_target = NULL;
+
+                                link_source = path_join(params->prefix[dt], context->directories[dt].items[i].path);
+                                link_target = path_join(params->prefix[dt], *sl);
+                                if (!link_source || !link_target)
+                                        return -ENOMEM;
+
+                                r = strv_consume_pair(&result, TAKE_PTR(link_source), TAKE_PTR(link_target));
+                                if (r < 0)
+                                        return r;
+                        }
+
+                        /* Then, for private (DynamicUser=) dirs without RootDirectory=/RootImage=,
+                         * the "private/<dir>" -> "<dir>" link — skipped when a parent dir already
+                         * provides it (only_create set). */
+                        if (!exec_directory_is_private(context, dt) ||
+                            exec_context_with_rootfs(context) ||
+                            context->directories[dt].items[i].only_create)
+                                continue;
+
+                        priv = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
+                        plain = path_join(params->prefix[dt], context->directories[dt].items[i].path);
+                        if (!priv || !plain)
+                                return -ENOMEM;
+
+                        r = strv_consume_pair(&result, TAKE_PTR(priv), TAKE_PTR(plain));
+                        if (r < 0)
+                                return r;
+                }
+
+        /* We make the host's os-release available via a symlink, so that we can copy it atomically
+         * and readers will never get a half-written version. Note that, while the paths specified
+         * here are absolute, when they are processed in namespace.c they will be made relative
+         * automatically, i.e.: 'os-release -> .os-release-stage/os-release' is what will be created. */
+        if (setup_os_release_symlink) {
+                r = strv_extend(&result, "/run/host/.os-release-stage/os-release");
+                if (r < 0)
+                        return r;
+
+                r = strv_extend(&result, "/run/host/os-release");
+                if (r < 0)
+                        return r;
+        }
+
+        *ret_symlinks = TAKE_PTR(result);
+
+        return 0;
+}
+
+/* Checks whether we need to insist on fs namespacing, i.e. whether settings are
+ * configured that would alter the view on the file system beyond making things
+ * read-only or invisible — i.e. would rearrange stuff in a way we cannot ignore
+ * gracefully. */
+static bool insist_on_sandboxing(
+                const ExecContext *context,
+                const char *root_dir,
+                const char *root_image,
+                const BindMount *bind_mounts,
+                size_t n_bind_mounts) {
+
+        assert(context);
+        assert(n_bind_mounts == 0 || bind_mounts);
+
+        /* Any of these rearranges the file system tree rather than merely restricting it. */
+        if (context->n_temporary_filesystems > 0 ||
+            root_dir || root_image ||
+            context->n_mount_images > 0 ||
+            context->dynamic_user ||
+            context->n_extension_images > 0 ||
+            !strv_isempty(context->extension_directories) ||
+            context->log_namespace)
+                return true;
+
+        /* If there are any bind mounts set that don't map back onto themselves, fs namespacing
+         * becomes essential. */
+        for (size_t i = 0; i < n_bind_mounts; i++)
+                if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
+                        return true;
+
+        return false;
+}
+
+/* If the service runs off an ephemeral copy of its root directory/image, create that copy
+ * (once — a queued fd on the ephemeral storage socket means it already exists) and queue a
+ * BSD-locked fd for it on the socket, so that concurrent invocations serialize on it.
+ * Returns 0 if nothing needed to be done, 1 if a snapshot was created, < 0 on error. */
+static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) {
+        _cleanup_close_ int fd = -EBADF;
+        int r;
+
+        if (!runtime || !runtime->ephemeral_copy)
+                return 0;
+
+        r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
+
+        CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
+
+        fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
+        if (fd >= 0)
+                /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
+                return 0;
+
+        if (fd != -EAGAIN)
+                return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
+
+        log_debug("Making ephemeral snapshot of %s to %s",
+                  context->root_image ?: context->root_directory, runtime->ephemeral_copy);
+
+        if (context->root_image)
+                fd = copy_file(context->root_image, runtime->ephemeral_copy, O_EXCL, 0600,
+                               COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME);
+        else
+                fd = btrfs_subvol_snapshot_at(AT_FDCWD, context->root_directory,
+                                              AT_FDCWD, runtime->ephemeral_copy,
+                                              BTRFS_SNAPSHOT_FALLBACK_COPY |
+                                              BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
+                                              BTRFS_SNAPSHOT_RECURSIVE |
+                                              BTRFS_SNAPSHOT_LOCK_BSD);
+        if (fd < 0)
+                return log_debug_errno(fd, "Failed to snapshot %s to %s: %m",
+                                       context->root_image ?: context->root_directory, runtime->ephemeral_copy);
+
+        if (context->root_image) {
+                /* A root image might be subject to lots of random writes so let's try to disable COW on it
+                 * which tends to not perform well in combination with lots of random writes.
+                 *
+                 * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
+                 * copy, but we at least want to make the intention clear.
+                 */
+                r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
+                if (r < 0)
+                        /* Fix: pass the chattr_fd() error 'r' here, not 'fd' — at this point 'fd' is a
+                         * valid (non-negative) descriptor, so using it would log a bogus error code. */
+                        log_debug_errno(r, "Failed to disable copy-on-write for %s, ignoring: %m", runtime->ephemeral_copy);
+        }
+
+        r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
+
+        return 1;
+}
+
+/* Populate "verity" from explicitly configured root-hash/signature data (taking
+ * copies), then let verity_settings_load() fill in whatever is still missing
+ * from the image or side-car files. */
+static int verity_settings_prepare(
+                VeritySettings *verity,
+                const char *root_image,
+                const void *root_hash,
+                size_t root_hash_size,
+                const char *root_hash_path,
+                const void *root_hash_sig,
+                size_t root_hash_sig_size,
+                const char *root_hash_sig_path,
+                const char *verity_data_path) {
+
+        int r;
+
+        assert(verity);
+
+        /* An explicitly configured root hash takes a private copy. */
+        if (root_hash) {
+                void *copy = memdup(root_hash, root_hash_size);
+                if (!copy)
+                        return -ENOMEM;
+
+                free_and_replace(verity->root_hash, copy);
+                verity->root_hash_size = root_hash_size;
+                verity->designator = PARTITION_ROOT;
+        }
+
+        /* Same for an explicitly configured signature of the root hash. */
+        if (root_hash_sig) {
+                void *copy = memdup(root_hash_sig, root_hash_sig_size);
+                if (!copy)
+                        return -ENOMEM;
+
+                free_and_replace(verity->root_hash_sig, copy);
+                verity->root_hash_sig_size = root_hash_sig_size;
+                verity->designator = PARTITION_ROOT;
+        }
+
+        if (verity_data_path) {
+                r = free_and_strdup(&verity->data_path, verity_data_path);
+                if (r < 0)
+                        return r;
+        }
+
+        r = verity_settings_load(verity, root_image, root_hash_path, root_hash_sig_path);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to load root hash: %m");
+
+        return 0;
+}
+
+/* Set up the mount namespace for the command about to be executed: root
+ * directory/image (possibly an ephemeral copy), bind mounts, tmp dirs,
+ * exec-directory symlinks and the various sandboxing options. Returns < 0 on
+ * fatal error; if namespace setup is unsupported (setup_namespace() returns
+ * ENOANO, e.g. missing capability in a container) and only trivial sandboxing
+ * was requested, degrades gracefully and returns 0. */
+static int apply_mount_namespace(
+ ExecCommandFlags command_flags,
+ const ExecContext *context,
+ const ExecParameters *params,
+ ExecRuntime *runtime,
+ const char *memory_pressure_path,
+ char **error_path) {
+
+ _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
+ _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
+ **read_write_paths_cleanup = NULL;
+ _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
+ *extension_dir = NULL, *host_os_release_stage = NULL;
+ const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
+ char **read_write_paths;
+ bool needs_sandboxing, setup_os_release_symlink;
+ BindMount *bind_mounts = NULL;
+ size_t n_bind_mounts = 0;
+ int r;
+
+ assert(context);
+
+ CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
+
+ if (params->flags & EXEC_APPLY_CHROOT) {
+ r = setup_ephemeral(context, runtime);
+ if (r < 0)
+ return r;
+
+ /* Prefer the ephemeral copy (if one was made) over the configured root. */
+ if (context->root_image)
+ root_image = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_image;
+ else
+ root_dir = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory;
+ }
+
+ r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
+ if (r < 0)
+ return r;
+
+ /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
+ * service will need to write to it in order to start the notifications. */
+ if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
+ read_write_paths_cleanup = strv_copy(context->read_write_paths);
+ if (!read_write_paths_cleanup)
+ return -ENOMEM;
+
+ r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
+ if (r < 0)
+ return r;
+
+ read_write_paths = read_write_paths_cleanup;
+ } else
+ read_write_paths = context->read_write_paths;
+
+ needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
+ if (needs_sandboxing) {
+ /* The runtime struct only contains the parent of the private /tmp, which is non-accessible
+ * to world users. Inside of it there's a /tmp that is sticky, and that's the one we want to
+ * use here. This does not apply when we are using /run/systemd/empty as fallback. */
+
+ if (context->private_tmp && runtime && runtime->shared) {
+ if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
+ tmp_dir = runtime->shared->tmp_dir;
+ else if (runtime->shared->tmp_dir)
+ tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");
+
+ if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
+ var_tmp_dir = runtime->shared->var_tmp_dir;
+ else if (runtime->shared->var_tmp_dir)
+ var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
+ }
+ }
+
+ /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
+ setup_os_release_symlink = needs_sandboxing && exec_context_get_effective_mount_apivfs(context) && (root_dir || root_image);
+ r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
+ if (r < 0)
+ return r;
+
+ if (context->mount_propagation_flag == MS_SHARED)
+ log_exec_debug(context,
+ params,
+ "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
+
+ if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
+ r = exec_context_get_credential_directory(context, params, params->unit_id, &creds_path);
+ if (r < 0)
+ return r;
+ }
+
+ if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
+ propagate_dir = path_join("/run/systemd/propagate/", params->unit_id);
+ if (!propagate_dir)
+ return -ENOMEM;
+
+ incoming_dir = strdup("/run/systemd/incoming");
+ if (!incoming_dir)
+ return -ENOMEM;
+
+ extension_dir = strdup("/run/systemd/unit-extensions");
+ if (!extension_dir)
+ return -ENOMEM;
+
+ /* If running under a different root filesystem, propagate the host's os-release. We make a
+ * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
+ if (setup_os_release_symlink) {
+ host_os_release_stage = strdup("/run/systemd/propagate/.os-release-stage");
+ if (!host_os_release_stage)
+ return -ENOMEM;
+ }
+ } else {
+ assert(params->runtime_scope == RUNTIME_SCOPE_USER);
+
+ if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
+ return -ENOMEM;
+
+ if (setup_os_release_symlink) {
+ if (asprintf(&host_os_release_stage,
+ "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage",
+ geteuid()) < 0)
+ return -ENOMEM;
+ }
+ }
+
+ if (root_image) {
+ r = verity_settings_prepare(
+ &verity,
+ root_image,
+ context->root_hash, context->root_hash_size, context->root_hash_path,
+ context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
+ context->root_verity);
+ if (r < 0)
+ return r;
+ }
+
+ /* Collect everything namespace.c needs into one parameter structure. */
+ NamespaceParameters parameters = {
+ .runtime_scope = params->runtime_scope,
+
+ .root_directory = root_dir,
+ .root_image = root_image,
+ .root_image_options = context->root_image_options,
+ .root_image_policy = context->root_image_policy ?: &image_policy_service,
+
+ .read_write_paths = read_write_paths,
+ .read_only_paths = needs_sandboxing ? context->read_only_paths : NULL,
+ .inaccessible_paths = needs_sandboxing ? context->inaccessible_paths : NULL,
+
+ .exec_paths = needs_sandboxing ? context->exec_paths : NULL,
+ .no_exec_paths = needs_sandboxing ? context->no_exec_paths : NULL,
+
+ .empty_directories = empty_directories,
+ .symlinks = symlinks,
+
+ .bind_mounts = bind_mounts,
+ .n_bind_mounts = n_bind_mounts,
+
+ .temporary_filesystems = context->temporary_filesystems,
+ .n_temporary_filesystems = context->n_temporary_filesystems,
+
+ .mount_images = context->mount_images,
+ .n_mount_images = context->n_mount_images,
+ .mount_image_policy = context->mount_image_policy ?: &image_policy_service,
+
+ .tmp_dir = tmp_dir,
+ .var_tmp_dir = var_tmp_dir,
+
+ .creds_path = creds_path,
+ .log_namespace = context->log_namespace,
+ .mount_propagation_flag = context->mount_propagation_flag,
+
+ .verity = &verity,
+
+ .extension_images = context->extension_images,
+ .n_extension_images = context->n_extension_images,
+ .extension_image_policy = context->extension_image_policy ?: &image_policy_sysext,
+ .extension_directories = context->extension_directories,
+
+ .propagate_dir = propagate_dir,
+ .incoming_dir = incoming_dir,
+ .extension_dir = extension_dir,
+ .notify_socket = root_dir || root_image ? params->notify_socket : NULL,
+ .host_os_release_stage = host_os_release_stage,
+
+ /* If DynamicUser=no and RootDirectory= is set then let's pass a relaxed sandbox info,
+ * otherwise enforce it, don't ignore protected paths and fail if we are unable to apply the
+ * sandbox inside the mount namespace. */
+ .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
+
+ .protect_control_groups = needs_sandboxing && context->protect_control_groups,
+ .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
+ .protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
+ .protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
+ .protect_hostname = needs_sandboxing && context->protect_hostname,
+
+ .private_dev = needs_sandboxing && context->private_devices,
+ .private_network = needs_sandboxing && exec_needs_network_namespace(context),
+ .private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context),
+
+ .mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context),
+
+ /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
+ .mount_nosuid = needs_sandboxing && context->no_new_privileges && !mac_selinux_use(),
+
+ .protect_home = needs_sandboxing ? context->protect_home : false,
+ .protect_system = needs_sandboxing ? context->protect_system : false,
+ .protect_proc = needs_sandboxing ? context->protect_proc : false,
+ .proc_subset = needs_sandboxing ? context->proc_subset : false,
+ };
+
+ r = setup_namespace(&parameters, error_path);
+ /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
+ * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
+ * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
+ * completely different execution environment. */
+ if (r == -ENOANO) {
+ if (insist_on_sandboxing(
+ context,
+ root_dir, root_image,
+ bind_mounts,
+ n_bind_mounts))
+ return log_exec_debug_errno(context,
+ params,
+ SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "Failed to set up namespace, and refusing to continue since "
+ "the selected namespacing options alter mount environment non-trivially.\n"
+ "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
+ n_bind_mounts,
+ context->n_temporary_filesystems,
+ yes_no(root_dir),
+ yes_no(root_image),
+ yes_no(context->dynamic_user));
+
+ log_exec_debug(context, params, "Failed to set up namespace, assuming containerized execution and ignoring.");
+ return 0;
+ }
+
+ return r;
+}
+
+/* chdir() to the configured working directory (or $HOME if WorkingDirectory=~).
+ * When no chroot will be applied, the path is prefixed with the root directory
+ * first. On failure *exit_status is set to EXIT_CHDIR, unless the directory is
+ * allowed to be missing. */
+static int apply_working_directory(
+                const ExecContext *context,
+                const ExecParameters *params,
+                ExecRuntime *runtime,
+                const char *home,
+                int *exit_status) {
+
+        const char *target;
+
+        assert(context);
+        assert(exit_status);
+
+        if (context->working_directory_home) {
+                /* WorkingDirectory=~ but no home directory known → cannot comply. */
+                if (!home) {
+                        *exit_status = EXIT_CHDIR;
+                        return -ENXIO;
+                }
+
+                target = home;
+        } else
+                target = empty_to_root(context->working_directory);
+
+        /* If no chroot is going to be applied, resolve the path against the root
+         * directory (preferring an ephemeral copy, if one exists) ourselves. */
+        if (!(params->flags & EXEC_APPLY_CHROOT))
+                target = prefix_roota((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory, target);
+
+        if (chdir(target) < 0 && !context->working_directory_missing_ok) {
+                *exit_status = EXIT_CHDIR;
+                return -errno;
+        }
+
+        return 0;
+}
+
+/* chroot() into the configured root directory (or its ephemeral copy), unless a
+ * mount namespace already takes care of the root switch or chroot application
+ * is not requested. On failure *exit_status is set to EXIT_CHROOT. */
+static int apply_root_directory(
+                const ExecContext *context,
+                const ExecParameters *params,
+                ExecRuntime *runtime,
+                const bool needs_mount_ns,
+                int *exit_status) {
+
+        assert(context);
+        assert(exit_status);
+
+        if (!(params->flags & EXEC_APPLY_CHROOT))
+                return 0;
+
+        /* A mount namespace handles the root switch itself; without RootDirectory= there is nothing to do. */
+        if (needs_mount_ns || !context->root_directory)
+                return 0;
+
+        if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) {
+                *exit_status = EXIT_CHROOT;
+                return -errno;
+        }
+
+        return 0;
+}
+
+/* Set up a fresh per-service session kernel keyring according to KeyringMode=, owned by
+ * the given uid/gid, and seed it with the invocation ID. Temporarily switches the real
+ * UID/GID (restored before returning). Returns 0 on success and also when keyrings are
+ * unsupported, inaccessible or exhausted (those cases are only logged at debug level);
+ * a negative errno otherwise. */
+static int setup_keyring(
+ const ExecContext *context,
+ const ExecParameters *p,
+ uid_t uid, gid_t gid) {
+
+ key_serial_t keyring;
+ int r = 0;
+ uid_t saved_uid;
+ gid_t saved_gid;
+
+ assert(context);
+ assert(p);
+
+ /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
+ * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
+ * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
+ * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
+ * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
+ * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
+
+ if (context->keyring_mode == EXEC_KEYRING_INHERIT)
+ return 0;
+
+ /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
+ * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
+ * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
+ * & group is just as nasty as acquiring a reference to the user keyring. */
+
+ saved_uid = getuid();
+ saved_gid = getgid();
+
+ /* Temporarily adopt the target GID (then UID) so the kernel creates the keyring
+ * with the right ownership. */
+ if (gid_is_valid(gid) && gid != saved_gid) {
+ if (setregid(gid, -1) < 0)
+ return log_exec_error_errno(context,
+ p,
+ errno,
+ "Failed to change GID for user keyring: %m");
+ }
+
+ if (uid_is_valid(uid) && uid != saved_uid) {
+ if (setreuid(uid, -1) < 0) {
+ r = log_exec_error_errno(context,
+ p,
+ errno,
+ "Failed to change UID for user keyring: %m");
+ goto out;
+ }
+ }
+
+ keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
+ if (keyring == -1) {
+ /* Unsupported, prohibited or exhausted keyrings are not fatal */
+ if (errno == ENOSYS)
+ log_exec_debug_errno(context,
+ p,
+ errno,
+ "Kernel keyring not supported, ignoring.");
+ else if (ERRNO_IS_PRIVILEGE(errno))
+ log_exec_debug_errno(context,
+ p,
+ errno,
+ "Kernel keyring access prohibited, ignoring.");
+ else if (errno == EDQUOT)
+ log_exec_debug_errno(context,
+ p,
+ errno,
+ "Out of kernel keyrings to allocate, ignoring.");
+ else
+ r = log_exec_error_errno(context,
+ p,
+ errno,
+ "Setting up kernel keyring failed: %m");
+
+ goto out;
+ }
+
+ /* When requested link the user keyring into the session keyring. */
+ if (context->keyring_mode == EXEC_KEYRING_SHARED) {
+
+ if (keyctl(KEYCTL_LINK,
+ KEY_SPEC_USER_KEYRING,
+ KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
+ r = log_exec_error_errno(context,
+ p,
+ errno,
+ "Failed to link user keyring into session keyring: %m");
+ goto out;
+ }
+ }
+
+ /* Restore uid/gid back */
+ if (uid_is_valid(uid) && uid != saved_uid) {
+ if (setreuid(saved_uid, -1) < 0) {
+ r = log_exec_error_errno(context,
+ p,
+ errno,
+ "Failed to change UID back for user keyring: %m");
+ goto out;
+ }
+ }
+
+ if (gid_is_valid(gid) && gid != saved_gid) {
+ if (setregid(saved_gid, -1) < 0)
+ return log_exec_error_errno(context,
+ p,
+ errno,
+ "Failed to change GID back for user keyring: %m");
+ }
+
+ /* Populate the keyring with the invocation ID by default, as the original saved_uid. */
+ if (!sd_id128_is_null(p->invocation_id)) {
+ key_serial_t key;
+
+ key = add_key("user",
+ "invocation_id",
+ &p->invocation_id,
+ sizeof(p->invocation_id),
+ KEY_SPEC_SESSION_KEYRING);
+ if (key == -1)
+ log_exec_debug_errno(context,
+ p,
+ errno,
+ "Failed to add invocation ID to keyring, ignoring: %m");
+ else {
+ /* Restrict the key to read-only access for possessor and user */
+ if (keyctl(KEYCTL_SETPERM, key,
+ KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
+ KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
+ r = log_exec_error_errno(context,
+ p,
+ errno,
+ "Failed to restrict invocation ID permission: %m");
+ }
+ }
+
+out:
+ /* Revert back uid & gid for the last time, and exit */
+ /* no extra logging, as only the first already reported error matters */
+ if (getuid() != saved_uid)
+ (void) setreuid(saved_uid, -1);
+
+ if (getgid() != saved_gid)
+ (void) setregid(saved_gid, -1);
+
+ return r;
+}
+
+/* Append the valid (non-negative) fds of a socket pair to array, advancing *n
+ * by the number of fds copied. The caller guarantees sufficient room. */
+static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
+ assert(array);
+ assert(n);
+ assert(pair);
+
+ for (size_t i = 0; i < 2; i++)
+ if (pair[i] >= 0)
+ array[(*n)++] = pair[i];
+}
+
+/* Close all file descriptors except the ones we must keep open: the stdio fds from
+ * params, the activation socket fd, the n_fds passed fds, the runtime's storage
+ * socket pairs and the user lookup fd. Returns the result of close_all_fds(). */
+static int close_remaining_fds(
+ const ExecParameters *params,
+ const ExecRuntime *runtime,
+ int socket_fd,
+ const int *fds, size_t n_fds) {
+
+ size_t n_dont_close = 0;
+ /* Worst-case number of entries appended below beyond the n_fds passed fds:
+ * 3 stdio fds + 1 socket_fd + 2 ephemeral storage socket fds + 2 netns +
+ * 2 ipcns storage socket fds + 2 dynamic user + 2 dynamic group storage
+ * socket fds + 1 user_lookup_fd = 15. The previous bound of n_fds + 14 was
+ * one element short when all of these were set at once, overflowing the
+ * stack array. */
+ int dont_close[n_fds + 15];
+
+ assert(params);
+
+ if (params->stdin_fd >= 0)
+ dont_close[n_dont_close++] = params->stdin_fd;
+ if (params->stdout_fd >= 0)
+ dont_close[n_dont_close++] = params->stdout_fd;
+ if (params->stderr_fd >= 0)
+ dont_close[n_dont_close++] = params->stderr_fd;
+
+ if (socket_fd >= 0)
+ dont_close[n_dont_close++] = socket_fd;
+ if (n_fds > 0) {
+ memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
+ n_dont_close += n_fds;
+ }
+
+ if (runtime)
+ append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
+
+ if (runtime && runtime->shared) {
+ append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
+ append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
+ }
+
+ if (runtime && runtime->dynamic_creds) {
+ if (runtime->dynamic_creds->user)
+ append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
+ if (runtime->dynamic_creds->group)
+ append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
+ }
+
+ if (params->user_lookup_fd >= 0)
+ dont_close[n_dont_close++] = params->user_lookup_fd;
+
+ return close_all_fds(dont_close, n_dont_close);
+}
+
+/* Report the resolved UID/GID for unit_id to PID 1 over user_lookup_fd, as a single
+ * writev() of uid, gid and the unit name. A no-op if the fd is unset or neither the
+ * uid nor the gid is valid. Returns 0 on success or when suppressed, -errno on write
+ * failure. NOTE(review): the single writev() presumably relies on the fd being a
+ * datagram socket so the three iovecs arrive as one message — confirm on the PID 1
+ * receive side. */
+static int send_user_lookup(
+ const char *unit_id,
+ int user_lookup_fd,
+ uid_t uid,
+ gid_t gid) {
+
+ assert(unit_id);
+
+ /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
+ * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
+ * specified. */
+
+ if (user_lookup_fd < 0)
+ return 0;
+
+ if (!uid_is_valid(uid) && !gid_is_valid(gid))
+ return 0;
+
+ if (writev(user_lookup_fd,
+ (struct iovec[]) {
+ IOVEC_MAKE(&uid, sizeof(uid)),
+ IOVEC_MAKE(&gid, sizeof(gid)),
+ IOVEC_MAKE_STRING(unit_id) }, 3) < 0)
+ return -errno;
+
+ return 0;
+}
+
+/* If WorkingDirectory=~ is configured and no home directory is known yet, look one up.
+ * On success stores the (buf-backed) path in *home and returns 1; returns 0 when there
+ * is nothing to do, negative errno on lookup failure. (uid is currently unused.) */
+static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
+ int r;
+
+ assert(c);
+ assert(home);
+ assert(buf);
+
+ /* Nothing to do if a home directory is already known, or none is needed. */
+ if (*home || !c->working_directory_home)
+ return 0;
+
+ r = get_home_dir(buf);
+ if (r < 0)
+ return r;
+
+ *home = *buf;
+ return 1;
+}
+
+/* Build the list of per-service directory paths (runtime/state/cache/logs) whose
+ * owning UID can serve as the initial candidate for dynamic UID allocation. Only
+ * meaningful with DynamicUser=. Stores the strv in *ret; returns 0 or -ENOMEM. */
+static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
+ _cleanup_strv_free_ char ** list = NULL;
+ int r;
+
+ assert(c);
+ assert(p);
+ assert(ret);
+
+ assert(c->dynamic_user);
+
+ /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
+ * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
+ * directories. */
+
+ for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
+ /* Configuration directories are skipped (presumably because they are not
+ * owned by the service user — confirm against exec_directory handling). */
+ if (t == EXEC_DIRECTORY_CONFIGURATION)
+ continue;
+
+ if (!p->prefix[t])
+ continue;
+
+ for (size_t i = 0; i < c->directories[t].n_items; i++) {
+ char *e;
+
+ /* Private directories live under an extra "private/" subdirectory */
+ if (exec_directory_is_private(c, t))
+ e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
+ else
+ e = path_join(p->prefix[t], c->directories[t].items[i].path);
+ if (!e)
+ return -ENOMEM;
+
+ r = strv_consume(&list, e);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ *ret = TAKE_PTR(list);
+
+ return 0;
+}
+
+/* Derive a CPU affinity mask from the configured NUMA node mask. If no NUMA mask is
+ * set this is a debug-logged no-op returning 0 and *ret is left untouched; otherwise
+ * *ret is reset and filled with the CPUs of the configured nodes. */
+static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
+ _cleanup_(cpu_set_reset) CPUSet s = {};
+ int r;
+
+ assert(c);
+ assert(ret);
+
+ if (!c->numa_policy.nodes.set) {
+ log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
+ return 0;
+ }
+
+ r = numa_to_cpu_set(&c->numa_policy, &s);
+ if (r < 0)
+ return r;
+
+ /* Replace any previous content of *ret with the derived set */
+ cpu_set_reset(ret);
+
+ return cpu_set_add_all(ret, &s);
+}
+
+/* Append *fd to the fds array, first duplicating it above the low range
+ * [3, 3 + *n_fds) that is reserved for the fds handed to the child process.
+ * Updates *fd in place if the fd was moved. Returns 1 if the fd was added,
+ * 0 if *fd was negative, -errno on fcntl() failure. */
+static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int *fd) {
+ int r;
+
+ assert(fds);
+ assert(n_fds);
+ assert(*n_fds < fds_size);
+ assert(fd);
+
+ if (*fd < 0)
+ return 0;
+
+ if (*fd < 3 + (int) *n_fds) {
+ /* Let's move the fd up, so that it's outside of the fd range we will use to store
+ * the fds we pass to the process (or which are closed only during execve). */
+
+ r = fcntl(*fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
+ if (r < 0)
+ return -errno;
+
+ /* Closes the old fd and stores the duplicate back into *fd */
+ close_and_replace(*fd, r);
+ }
+
+ fds[(*n_fds)++] = *fd;
+ return 1;
+}
+
+/* Connect to the AF_UNIX socket behind the O_PATH fd ofd (addressed via /proc/self/fd),
+ * trying SOCK_DGRAM, SOCK_STREAM and SOCK_SEQPACKET in turn, since the type of the
+ * bound socket is not known in advance. Returns the connected fd, or a negative
+ * errno (EPROTOTYPE if no socket type matched). */
+static int connect_unix_harder(const ExecContext *c, const ExecParameters *p, const OpenFile *of, int ofd) {
+ union sockaddr_union addr = {
+ .un.sun_family = AF_UNIX,
+ };
+ socklen_t sa_len;
+ static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
+ int r;
+
+ assert(c);
+ assert(p);
+ assert(of);
+ assert(ofd >= 0);
+
+ r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
+ if (r < 0)
+ return log_exec_error_errno(c, p, r, "Failed to set sockaddr for %s: %m", of->path);
+
+ /* On success sockaddr_un_set_path() returns the address length */
+ sa_len = r;
+
+ for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
+ _cleanup_close_ int fd = -EBADF;
+
+ fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
+ if (fd < 0)
+ return log_exec_error_errno(c,
+ p,
+ errno,
+ "Failed to create socket for %s: %m",
+ of->path);
+
+ /* EPROTOTYPE means the peer socket is of a different type: try the next one */
+ r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
+ if (r == -EPROTOTYPE)
+ continue;
+ if (r < 0)
+ return log_exec_error_errno(c,
+ p,
+ r,
+ "Failed to connect socket for %s: %m",
+ of->path);
+
+ return TAKE_FD(fd);
+ }
+
+ return log_exec_error_errno(c,
+ p,
+ SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".",
+ of->path);
+}
+
+/* Acquire an fd for one OpenFile= entry: connect to it if it is a socket, otherwise
+ * reopen it with flags derived from the entry (read-only/append/truncate). Returns
+ * the fd on success, a negative errno (already logged) on failure. */
+static int get_open_file_fd(const ExecContext *c, const ExecParameters *p, const OpenFile *of) {
+ struct stat st;
+ _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
+
+ /* Open via O_PATH first so we can stat() before deciding how to really open it */
+ assert(c);
+ assert(p);
+ assert(of);
+
+ ofd = open(of->path, O_PATH | O_CLOEXEC);
+ if (ofd < 0)
+ return log_exec_error_errno(c, p, errno, "Could not open \"%s\": %m", of->path);
+
+ if (fstat(ofd, &st) < 0)
+ return log_exec_error_errno(c, p, errno, "Failed to stat %s: %m", of->path);
+
+ if (S_ISSOCK(st.st_mode)) {
+ fd = connect_unix_harder(c, p, of, ofd);
+ if (fd < 0)
+ return fd;
+
+ /* For read-only sockets, close the write side */
+ if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
+ return log_exec_error_errno(c, p, errno, "Failed to shutdown send for socket %s: %m",
+ of->path);
+
+ log_exec_debug(c, p, "socket %s opened (fd=%d)", of->path, fd);
+ } else {
+ int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
+ if (FLAGS_SET(of->flags, OPENFILE_APPEND))
+ flags |= O_APPEND;
+ else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
+ flags |= O_TRUNC;
+
+ fd = fd_reopen(ofd, flags | O_CLOEXEC);
+ if (fd < 0)
+ return log_exec_error_errno(c, p, fd, "Failed to open file %s: %m", of->path);
+
+ log_exec_debug(c, p, "file %s opened (fd=%d)", of->path, fd);
+ }
+
+ return TAKE_FD(fd);
+}
+
+/* Open every OpenFile= entry and append the resulting fd and name to p->fds and
+ * p->fd_names, incrementing *n_fds per entry. Entries flagged OPENFILE_GRACEFUL are
+ * skipped on failure; otherwise the first error is returned. */
+static int collect_open_file_fds(const ExecContext *c, ExecParameters *p, size_t *n_fds) {
+ int r;
+
+ assert(c);
+ assert(p);
+ assert(n_fds);
+
+ LIST_FOREACH(open_files, of, p->open_files) {
+ _cleanup_close_ int fd = -EBADF;
+
+ fd = get_open_file_fd(c, p, of);
+ if (fd < 0) {
+ /* Graceful entries are best-effort: log and move on */
+ if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
+ log_exec_debug_errno(c, p, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
+ continue;
+ }
+
+ return fd;
+ }
+
+ /* Grow the fd array before storing, so the name and fd stay in sync */
+ if (!GREEDY_REALLOC(p->fds, *n_fds + 1))
+ return -ENOMEM;
+
+ r = strv_extend(&p->fd_names, of->fdname);
+ if (r < 0)
+ return r;
+
+ p->fds[*n_fds] = TAKE_FD(fd);
+
+ (*n_fds)++;
+ }
+
+ return 0;
+}
+
+/* Log the executable and quoted command line at debug level, prefixed with msg.
+ * A no-op unless debug logging is enabled. */
+static void log_command_line(
+ const ExecContext *context,
+ const ExecParameters *params,
+ const char *msg,
+ const char *executable,
+ char **argv) {
+
+ assert(context);
+ assert(params);
+ assert(msg);
+ assert(executable);
+
+ if (!DEBUG_LOGGING)
+ return;
+
+ /* quote_command_line() may return NULL on OOM; strnull() below copes with that */
+ _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
+
+ log_exec_struct(context, params, LOG_DEBUG,
+ "EXECUTABLE=%s", executable,
+ LOG_EXEC_MESSAGE(params, "%s: %s", msg, strnull(cmdline)),
+ LOG_EXEC_INVOCATION_ID(params));
+}
+
+/* Returns true if this context, when run by the per-user service manager, needs an
+ * unprivileged user namespace (PrivateUsers=) to be able to apply its sandboxing
+ * options. Always false in the system manager's scope. */
+static bool exec_context_need_unprivileged_private_users(
+ const ExecContext *context,
+ const ExecParameters *params) {
+
+ assert(context);
+ assert(params);
+
+ /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
+ * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
+ * (system manager) then we have privileges and don't need this. */
+ if (params->runtime_scope != RUNTIME_SCOPE_USER)
+ return false;
+
+ /* Any namespacing, mount-altering or path-protection option triggers the need */
+ return context->private_users ||
+ context->private_tmp ||
+ context->private_devices ||
+ context->private_network ||
+ context->network_namespace_path ||
+ context->private_ipc ||
+ context->ipc_namespace_path ||
+ context->private_mounts > 0 ||
+ context->mount_apivfs ||
+ context->n_bind_mounts > 0 ||
+ context->n_temporary_filesystems > 0 ||
+ context->root_directory ||
+ !strv_isempty(context->extension_directories) ||
+ context->protect_system != PROTECT_SYSTEM_NO ||
+ context->protect_home != PROTECT_HOME_NO ||
+ context->protect_kernel_tunables ||
+ context->protect_kernel_modules ||
+ context->protect_kernel_logs ||
+ context->protect_control_groups ||
+ context->protect_clock ||
+ context->protect_hostname ||
+ !strv_isempty(context->read_write_paths) ||
+ !strv_isempty(context->read_only_paths) ||
+ !strv_isempty(context->inaccessible_paths) ||
+ !strv_isempty(context->exec_paths) ||
+ !strv_isempty(context->no_exec_paths);
+}
+
+/* Whether to interactively ask for confirmation before spawning this command. */
+static bool exec_context_shall_confirm_spawn(const ExecContext *context) {
+ assert(context);
+
+ /* Skip the question when confirmation is globally disabled. Also, units that
+ * remain in the same process group as PID 1 fail to acquire the console even
+ * when it is unused, so never ask for those either. */
+ return !confirm_spawn_disabled() && !context->same_pgrp;
+}
+
+/* Resolve StandardInput/Output/Error=fd:name settings to concrete fds by matching the
+ * configured names against p->fd_names. Fills named_iofds (entries are only written
+ * when still negative, so the caller is expected to pre-initialize them to -EBADF —
+ * see the EBADF_TRIPLET initialization at the call site). Returns 0 when every
+ * requested stream was matched, -ENOENT otherwise. */
+static int exec_context_named_iofds(
+ const ExecContext *c,
+ const ExecParameters *p,
+ int named_iofds[static 3]) {
+
+ size_t targets;
+ const char* stdio_fdname[3];
+ size_t n_fds;
+
+ assert(c);
+ assert(p);
+ assert(named_iofds);
+
+ /* Number of streams that still need a matching fd */
+ targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
+ (c->std_output == EXEC_OUTPUT_NAMED_FD) +
+ (c->std_error == EXEC_OUTPUT_NAMED_FD);
+
+ for (size_t i = 0; i < 3; i++)
+ stdio_fdname[i] = exec_context_fdname(c, i);
+
+ n_fds = p->n_storage_fds + p->n_socket_fds;
+
+ /* Each passed fd is assigned to at most one stream per iteration; the loop stops
+ * early once all requested streams are satisfied. */
+ for (size_t i = 0; i < n_fds && targets > 0; i++)
+ if (named_iofds[STDIN_FILENO] < 0 &&
+ c->std_input == EXEC_INPUT_NAMED_FD &&
+ stdio_fdname[STDIN_FILENO] &&
+ streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
+
+ named_iofds[STDIN_FILENO] = p->fds[i];
+ targets--;
+
+ } else if (named_iofds[STDOUT_FILENO] < 0 &&
+ c->std_output == EXEC_OUTPUT_NAMED_FD &&
+ stdio_fdname[STDOUT_FILENO] &&
+ streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
+
+ named_iofds[STDOUT_FILENO] = p->fds[i];
+ targets--;
+
+ } else if (named_iofds[STDERR_FILENO] < 0 &&
+ c->std_error == EXEC_OUTPUT_NAMED_FD &&
+ stdio_fdname[STDERR_FILENO] &&
+ streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
+
+ named_iofds[STDERR_FILENO] = p->fds[i];
+ targets--;
+ }
+
+ return targets == 0 ? 0 : -ENOENT;
+}
+
+/* Close the namespace storage socket pairs of a shared runtime. NULL-safe. */
+static void exec_shared_runtime_close(ExecSharedRuntime *shared) {
+ if (!shared)
+ return;
+
+ safe_close_pair(shared->netns_storage_socket);
+ safe_close_pair(shared->ipcns_storage_socket);
+}
+
+/* Close all fds held by the runtime: the ephemeral storage socket pair, the shared
+ * runtime's sockets and the dynamic credentials' sockets. NULL-safe. */
+static void exec_runtime_close(ExecRuntime *rt) {
+ if (!rt)
+ return;
+
+ safe_close_pair(rt->ephemeral_storage_socket);
+
+ exec_shared_runtime_close(rt->shared);
+ dynamic_creds_close(rt->dynamic_creds);
+}
+
+/* Close and invalidate the stdio fds carried in the exec parameters. NULL-safe. */
+static void exec_params_close(ExecParameters *p) {
+ if (!p)
+ return;
+
+ p->stdin_fd = safe_close(p->stdin_fd);
+ p->stdout_fd = safe_close(p->stdout_fd);
+ p->stderr_fd = safe_close(p->stderr_fd);
+}
+
+int exec_invoke(
+ const ExecCommand *command,
+ const ExecContext *context,
+ ExecParameters *params,
+ ExecRuntime *runtime,
+ const CGroupContext *cgroup_context,
+ int *exit_status) {
+
+ _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
+ int r, ngids = 0;
+ _cleanup_free_ gid_t *supplementary_gids = NULL;
+ const char *username = NULL, *groupname = NULL;
+ _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
+ const char *home = NULL, *shell = NULL;
+ char **final_argv = NULL;
+ dev_t journal_stream_dev = 0;
+ ino_t journal_stream_ino = 0;
+ bool userns_set_up = false;
+ bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
+ needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
+ needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
+ needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
+ bool keep_seccomp_privileges = false;
+#if HAVE_SELINUX
+ _cleanup_free_ char *mac_selinux_context_net = NULL;
+ bool use_selinux = false;
+#endif
+#if ENABLE_SMACK
+ bool use_smack = false;
+#endif
+#if HAVE_APPARMOR
+ bool use_apparmor = false;
+#endif
+#if HAVE_SECCOMP
+ uint64_t saved_bset = 0;
+#endif
+ uid_t saved_uid = getuid();
+ gid_t saved_gid = getgid();
+ uid_t uid = UID_INVALID;
+ gid_t gid = GID_INVALID;
+ size_t n_fds, /* fds to pass to the child */
+ n_keep_fds; /* total number of fds not to close */
+ int secure_bits;
+ _cleanup_free_ gid_t *gids_after_pam = NULL;
+ int ngids_after_pam = 0;
+
+ int socket_fd = -EBADF, named_iofds[3] = EBADF_TRIPLET;
+ size_t n_storage_fds, n_socket_fds;
+
+ assert(command);
+ assert(context);
+ assert(params);
+ assert(exit_status);
+
+ if (context->log_level_max >= 0)
+ log_set_max_level(context->log_level_max);
+
+ /* Explicitly test for CVE-2021-4034 inspired invocations */
+ if (!command->path || strv_isempty(command->argv)) {
+ *exit_status = EXIT_EXEC;
+ return log_exec_error_errno(
+ context,
+ params,
+ SYNTHETIC_ERRNO(EINVAL),
+ "Invalid command line arguments.");
+ }
+
+ LOG_CONTEXT_PUSH_EXEC(context, params);
+
+ if (context->std_input == EXEC_INPUT_SOCKET ||
+ context->std_output == EXEC_OUTPUT_SOCKET ||
+ context->std_error == EXEC_OUTPUT_SOCKET) {
+
+ if (params->n_socket_fds > 1)
+ return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
+
+ if (params->n_socket_fds == 0)
+ return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
+
+ socket_fd = params->fds[0];
+ n_storage_fds = n_socket_fds = 0;
+ } else {
+ n_socket_fds = params->n_socket_fds;
+ n_storage_fds = params->n_storage_fds;
+ }
+ n_fds = n_socket_fds + n_storage_fds;
+
+ r = exec_context_named_iofds(context, params, named_iofds);
+ if (r < 0)
+ return log_exec_error_errno(context, params, r, "Failed to load a named file descriptor: %m");
+
+ rename_process_from_path(command->path);
+
+ /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
+ * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
+ * both of which will be demoted to SIG_DFL. */
+ (void) default_signals(SIGNALS_CRASH_HANDLER,
+ SIGNALS_IGNORE);
+
+ if (context->ignore_sigpipe)
+ (void) ignore_signals(SIGPIPE);
+
+ r = reset_signal_mask();
+ if (r < 0) {
+ *exit_status = EXIT_SIGNAL_MASK;
+ return log_exec_error_errno(context, params, r, "Failed to set process signal mask: %m");
+ }
+
+ if (params->idle_pipe)
+ do_idle_pipe_dance(params->idle_pipe);
+
+ /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
+ * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
+ * any fds open we don't really want open during the transition. In order to make logging work, we switch the
+ * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
+
+ log_forget_fds();
+ log_set_open_when_needed(true);
+ log_settle_target();
+
+ /* In case anything used libc syslog(), close this here, too */
+ closelog();
+
+ r = collect_open_file_fds(context, params, &n_fds);
+ if (r < 0) {
+ *exit_status = EXIT_FDS;
+ return log_exec_error_errno(context, params, r, "Failed to get OpenFile= file descriptors: %m");
+ }
+
+ int keep_fds[n_fds + 3];
+ memcpy_safe(keep_fds, params->fds, n_fds * sizeof(int));
+ n_keep_fds = n_fds;
+
+ r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->exec_fd);
+ if (r < 0) {
+ *exit_status = EXIT_FDS;
+ return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
+ }
+
+#if HAVE_LIBBPF
+ r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->bpf_outer_map_fd);
+ if (r < 0) {
+ *exit_status = EXIT_FDS;
+ return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
+ }
+#endif
+
+ r = close_remaining_fds(params, runtime, socket_fd, keep_fds, n_keep_fds);
+ if (r < 0) {
+ *exit_status = EXIT_FDS;
+ return log_exec_error_errno(context, params, r, "Failed to close unwanted file descriptors: %m");
+ }
+
+ if (!context->same_pgrp &&
+ setsid() < 0) {
+ *exit_status = EXIT_SETSID;
+ return log_exec_error_errno(context, params, errno, "Failed to create new process session: %m");
+ }
+
+ exec_context_tty_reset(context, params);
+
+ if (params->shall_confirm_spawn && exec_context_shall_confirm_spawn(context)) {
+ _cleanup_free_ char *cmdline = NULL;
+
+ cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
+ if (!cmdline) {
+ *exit_status = EXIT_MEMORY;
+ return log_oom();
+ }
+
+ r = ask_for_confirmation(context, params, cmdline);
+ if (r != CONFIRM_EXECUTE) {
+ if (r == CONFIRM_PRETEND_SUCCESS) {
+ *exit_status = EXIT_SUCCESS;
+ return 0;
+ }
+
+ *exit_status = EXIT_CONFIRM;
+ return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ECANCELED),
+ "Execution cancelled by the user");
+ }
+ }
+
+ /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
+ * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
+ * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
+ * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
+ * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
+ if (setenv("SYSTEMD_ACTIVATION_UNIT", params->unit_id, true) != 0 ||
+ setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
+ *exit_status = EXIT_MEMORY;
+ return log_exec_error_errno(context, params, errno, "Failed to update environment: %m");
+ }
+
+ if (context->dynamic_user && runtime && runtime->dynamic_creds) {
+ _cleanup_strv_free_ char **suggested_paths = NULL;
+
+ /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
+ * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
+ if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
+ *exit_status = EXIT_USER;
+ return log_exec_error_errno(context, params, errno, "Failed to update environment: %m");
+ }
+
+ r = compile_suggested_paths(context, params, &suggested_paths);
+ if (r < 0) {
+ *exit_status = EXIT_MEMORY;
+ return log_oom();
+ }
+
+ r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
+ if (r < 0) {
+ *exit_status = EXIT_USER;
+ if (r == -EILSEQ)
+ return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "Failed to update dynamic user credentials: User or group with specified name already exists.");
+ return log_exec_error_errno(context, params, r, "Failed to update dynamic user credentials: %m");
+ }
+
+ if (!uid_is_valid(uid)) {
+ *exit_status = EXIT_USER;
+ return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
+ }
+
+ if (!gid_is_valid(gid)) {
+ *exit_status = EXIT_USER;
+ return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
+ }
+
+ if (runtime->dynamic_creds->user)
+ username = runtime->dynamic_creds->user->name;
+
+ } else {
+ if (context->user) {
+ r = get_fixed_user(context->user, &username, &uid, &gid, &home, &shell);
+ if (r < 0) {
+ *exit_status = EXIT_USER;
+ return log_exec_error_errno(context, params, r, "Failed to determine user credentials: %m");
+ }
+ }
+
+ if (context->group) {
+ r = get_fixed_group(context->group, &groupname, &gid);
+ if (r < 0) {
+ *exit_status = EXIT_GROUP;
+ return log_exec_error_errno(context, params, r, "Failed to determine group credentials: %m");
+ }
+ }
+ }
+
+ /* Initialize user supplementary groups and get SupplementaryGroups= ones */
+ r = get_supplementary_groups(context, username, groupname, gid,
+ &supplementary_gids, &ngids);
+ if (r < 0) {
+ *exit_status = EXIT_GROUP;
+ return log_exec_error_errno(context, params, r, "Failed to determine supplementary groups: %m");
+ }
+
+ r = send_user_lookup(params->unit_id, params->user_lookup_fd, uid, gid);
+ if (r < 0) {
+ *exit_status = EXIT_USER;
+ return log_exec_error_errno(context, params, r, "Failed to send user credentials to PID1: %m");
+ }
+
+ params->user_lookup_fd = safe_close(params->user_lookup_fd);
+
+ r = acquire_home(context, uid, &home, &home_buffer);
+ if (r < 0) {
+ *exit_status = EXIT_CHDIR;
+ return log_exec_error_errno(context, params, r, "Failed to determine $HOME for user: %m");
+ }
+
+ /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
+ if (socket_fd >= 0)
+ (void) fd_nonblock(socket_fd, false);
+
+ /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
+ * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
+ if (params->cgroup_path) {
+ _cleanup_free_ char *p = NULL;
+
+ r = exec_params_get_cgroup_path(params, cgroup_context, &p);
+ if (r < 0) {
+ *exit_status = EXIT_CGROUP;
+ return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
+ }
+
+ r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
+ if (r == -EUCLEAN) {
+ *exit_status = EXIT_CGROUP;
+ return log_exec_error_errno(context, params, r, "Failed to attach process to cgroup %s "
+ "because the cgroup or one of its parents or "
+ "siblings is in the threaded mode: %m", p);
+ }
+ if (r < 0) {
+ *exit_status = EXIT_CGROUP;
+ return log_exec_error_errno(context, params, r, "Failed to attach to cgroup %s: %m", p);
+ }
+ }
+
+ if (context->network_namespace_path && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
+ r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
+ if (r < 0) {
+ *exit_status = EXIT_NETWORK;
+ return log_exec_error_errno(context, params, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
+ }
+ }
+
+ if (context->ipc_namespace_path && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
+ r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
+ if (r < 0) {
+ *exit_status = EXIT_NAMESPACE;
+ return log_exec_error_errno(context, params, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
+ }
+ }
+
+ r = setup_input(context, params, socket_fd, named_iofds);
+ if (r < 0) {
+ *exit_status = EXIT_STDIN;
+ return log_exec_error_errno(context, params, r, "Failed to set up standard input: %m");
+ }
+
+ r = setup_output(context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
+ if (r < 0) {
+ *exit_status = EXIT_STDOUT;
+ return log_exec_error_errno(context, params, r, "Failed to set up standard output: %m");
+ }
+
+ r = setup_output(context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
+ if (r < 0) {
+ *exit_status = EXIT_STDERR;
+ return log_exec_error_errno(context, params, r, "Failed to set up standard error output: %m");
+ }
+
+ if (context->oom_score_adjust_set) {
+ /* When we can't make this change due to EPERM, then let's silently skip over it. User
+ * namespaces prohibit write access to this file, and we shouldn't trip up over that. */
+ r = set_oom_score_adjust(context->oom_score_adjust);
+ if (ERRNO_IS_NEG_PRIVILEGE(r))
+ log_exec_debug_errno(context, params, r,
+ "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
+ else if (r < 0) {
+ *exit_status = EXIT_OOM_ADJUST;
+ return log_exec_error_errno(context, params, r, "Failed to adjust OOM setting: %m");
+ }
+ }
+
+ if (context->coredump_filter_set) {
+ r = set_coredump_filter(context->coredump_filter);
+ if (ERRNO_IS_NEG_PRIVILEGE(r))
+ log_exec_debug_errno(context, params, r, "Failed to adjust coredump_filter, ignoring: %m");
+ else if (r < 0) {
+ *exit_status = EXIT_LIMITS;
+ return log_exec_error_errno(context, params, r, "Failed to adjust coredump_filter: %m");
+ }
+ }
+
+ if (context->nice_set) {
+ r = setpriority_closest(context->nice);
+ if (r < 0) {
+ *exit_status = EXIT_NICE;
+ return log_exec_error_errno(context, params, r, "Failed to set up process scheduling priority (nice level): %m");
+ }
+ }
+
+ if (context->cpu_sched_set) {
+ struct sched_param param = {
+ .sched_priority = context->cpu_sched_priority,
+ };
+
+ r = sched_setscheduler(0,
+ context->cpu_sched_policy |
+ (context->cpu_sched_reset_on_fork ?
+ SCHED_RESET_ON_FORK : 0),
+ &param);
+ if (r < 0) {
+ *exit_status = EXIT_SETSCHEDULER;
+ return log_exec_error_errno(context, params, errno, "Failed to set up CPU scheduling: %m");
+ }
+ }
+
+ if (context->cpu_affinity_from_numa || context->cpu_set.set) {
+ _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
+ const CPUSet *cpu_set;
+
+ if (context->cpu_affinity_from_numa) {
+ r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
+ if (r < 0) {
+ *exit_status = EXIT_CPUAFFINITY;
+ return log_exec_error_errno(context, params, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
+ }
+
+ cpu_set = &converted_cpu_set;
+ } else
+ cpu_set = &context->cpu_set;
+
+ if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
+ *exit_status = EXIT_CPUAFFINITY;
+ return log_exec_error_errno(context, params, errno, "Failed to set up CPU affinity: %m");
+ }
+ }
+
+ if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
+ r = apply_numa_policy(&context->numa_policy);
+ if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
+ log_exec_debug_errno(context, params, r, "NUMA support not available, ignoring.");
+ else if (r < 0) {
+ *exit_status = EXIT_NUMA_POLICY;
+ return log_exec_error_errno(context, params, r, "Failed to set NUMA memory policy: %m");
+ }
+ }
+
+ if (context->ioprio_set)
+ if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
+ *exit_status = EXIT_IOPRIO;
+ return log_exec_error_errno(context, params, errno, "Failed to set up IO scheduling priority: %m");
+ }
+
+ if (context->timer_slack_nsec != NSEC_INFINITY)
+ if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
+ *exit_status = EXIT_TIMERSLACK;
+ return log_exec_error_errno(context, params, errno, "Failed to set up timer slack: %m");
+ }
+
+ if (context->personality != PERSONALITY_INVALID) {
+ r = safe_personality(context->personality);
+ if (r < 0) {
+ *exit_status = EXIT_PERSONALITY;
+ return log_exec_error_errno(context, params, r, "Failed to set up execution domain (personality): %m");
+ }
+ }
+
+#if ENABLE_UTMP
+ if (context->utmp_id) {
+ _cleanup_free_ char *username_alloc = NULL;
+
+ if (!username && context->utmp_mode == EXEC_UTMP_USER) {
+ username_alloc = uid_to_name(uid_is_valid(uid) ? uid : saved_uid);
+ if (!username_alloc) {
+ *exit_status = EXIT_USER;
+ return log_oom();
+ }
+ }
+
+ const char *line = context->tty_path ?
+ (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
+ NULL;
+ utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
+ line,
+ context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
+ context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
+ USER_PROCESS,
+ username ?: username_alloc);
+ }
+#endif
+
+ if (uid_is_valid(uid)) {
+ r = chown_terminal(STDIN_FILENO, uid);
+ if (r < 0) {
+ *exit_status = EXIT_STDIN;
+ return log_exec_error_errno(context, params, r, "Failed to change ownership of terminal: %m");
+ }
+ }
+
+ if (params->cgroup_path) {
+ /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
+ * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
+ * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
+ * touch a single hierarchy too. */
+
+ if (params->flags & EXEC_CGROUP_DELEGATE) {
+ _cleanup_free_ char *p = NULL;
+
+ r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
+ if (r < 0) {
+ *exit_status = EXIT_CGROUP;
+ return log_exec_error_errno(context, params, r, "Failed to adjust control group access: %m");
+ }
+
+ r = exec_params_get_cgroup_path(params, cgroup_context, &p);
+ if (r < 0) {
+ *exit_status = EXIT_CGROUP;
+ return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
+ }
+ if (r > 0) {
+ r = cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
+ if (r < 0) {
+ *exit_status = EXIT_CGROUP;
+ return log_exec_error_errno(context, params, r, "Failed to adjust control subgroup access: %m");
+ }
+ }
+ }
+
+ if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
+ if (cgroup_context_want_memory_pressure(cgroup_context)) {
+ r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
+ if (r < 0) {
+ *exit_status = EXIT_MEMORY;
+ return log_oom();
+ }
+
+ r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
+ if (r < 0) {
+ log_exec_full_errno(context, params, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
+ memory_pressure_path = mfree(memory_pressure_path);
+ }
+ } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
+                                        memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning off memory pressure watch */
+ if (!memory_pressure_path) {
+ *exit_status = EXIT_MEMORY;
+ return log_oom();
+ }
+ }
+ }
+ }
+
+ needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
+
+ for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
+ r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
+ if (r < 0)
+ return log_exec_error_errno(context, params, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
+ }
+
+ if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
+ r = exec_setup_credentials(context, params, params->unit_id, uid, gid);
+ if (r < 0) {
+ *exit_status = EXIT_CREDENTIALS;
+ return log_exec_error_errno(context, params, r, "Failed to set up credentials: %m");
+ }
+ }
+
+ r = build_environment(
+ context,
+ params,
+ cgroup_context,
+ n_fds,
+ home,
+ username,
+ shell,
+ journal_stream_dev,
+ journal_stream_ino,
+ memory_pressure_path,
+ &our_env);
+ if (r < 0) {
+ *exit_status = EXIT_MEMORY;
+ return log_oom();
+ }
+
+ r = build_pass_environment(context, &pass_env);
+ if (r < 0) {
+ *exit_status = EXIT_MEMORY;
+ return log_oom();
+ }
+
+ /* The $PATH variable is set to the default path in params->environment. However, this is overridden
+ * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
+ * not specify PATH but the unit has ExecSearchPath. */
+ if (!strv_isempty(context->exec_search_path)) {
+ _cleanup_free_ char *joined = NULL;
+
+ joined = strv_join(context->exec_search_path, ":");
+ if (!joined) {
+ *exit_status = EXIT_MEMORY;
+ return log_oom();
+ }
+
+ r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
+ if (r < 0) {
+ *exit_status = EXIT_MEMORY;
+ return log_oom();
+ }
+ }
+
+ accum_env = strv_env_merge(params->environment,
+ our_env,
+ joined_exec_search_path,
+ pass_env,
+ context->environment,
+ params->files_env);
+ if (!accum_env) {
+ *exit_status = EXIT_MEMORY;
+ return log_oom();
+ }
+ accum_env = strv_env_clean(accum_env);
+
+ (void) umask(context->umask);
+
+ r = setup_keyring(context, params, uid, gid);
+ if (r < 0) {
+ *exit_status = EXIT_KEYRING;
+ return log_exec_error_errno(context, params, r, "Failed to set up kernel keyring: %m");
+ }
+
+ /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
+ * from it. */
+ needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
+
+ /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
+ * for it, and the kernel doesn't actually support ambient caps. */
+ needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
+
+ /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
+ * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
+ * desired. */
+ if (needs_ambient_hack)
+ needs_setuid = false;
+ else
+ needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
+
+ uint64_t capability_ambient_set = context->capability_ambient_set;
+
+ if (needs_sandboxing) {
+ /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
+ * /sys being present. The actual MAC context application will happen later, as late as
+ * possible, to avoid impacting our own code paths. */
+
+#if HAVE_SELINUX
+ use_selinux = mac_selinux_use();
+#endif
+#if ENABLE_SMACK
+ use_smack = mac_smack_use();
+#endif
+#if HAVE_APPARMOR
+ use_apparmor = mac_apparmor_use();
+#endif
+ }
+
+ if (needs_sandboxing) {
+ int which_failed;
+
+ /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
+ * is set here. (See below.) */
+
+ r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
+ if (r < 0) {
+ *exit_status = EXIT_LIMITS;
+ return log_exec_error_errno(context, params, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
+ }
+ }
+
+ if (needs_setuid && context->pam_name && username) {
+ /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
+ * wins here. (See above.) */
+
+ /* All fds passed in the fds array will be closed in the pam child process. */
+ r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, params->fds, n_fds);
+ if (r < 0) {
+ *exit_status = EXIT_PAM;
+ return log_exec_error_errno(context, params, r, "Failed to set up PAM session: %m");
+ }
+
+ if (ambient_capabilities_supported()) {
+ uint64_t ambient_after_pam;
+
+ /* PAM modules might have set some ambient caps. Query them here and merge them into
+ * the caps we want to set in the end, so that we don't end up unsetting them. */
+ r = capability_get_ambient(&ambient_after_pam);
+ if (r < 0) {
+ *exit_status = EXIT_CAPABILITIES;
+ return log_exec_error_errno(context, params, r, "Failed to query ambient caps: %m");
+ }
+
+ capability_ambient_set |= ambient_after_pam;
+ }
+
+ ngids_after_pam = getgroups_alloc(&gids_after_pam);
+ if (ngids_after_pam < 0) {
+ *exit_status = EXIT_GROUP;
+ return log_exec_error_errno(context, params, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
+ }
+ }
+
+ if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) {
+ /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
+ * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
+ * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
+
+ r = setup_private_users(saved_uid, saved_gid, uid, gid);
+ /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
+ * the actual requested operations fail (or silently continue). */
+ if (r < 0 && context->private_users) {
+ *exit_status = EXIT_USER;
+ return log_exec_error_errno(context, params, r, "Failed to set up user namespacing for unprivileged user: %m");
+ }
+ if (r < 0)
+ log_exec_info_errno(context, params, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
+ else
+ userns_set_up = true;
+ }
+
+ if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
+
+ /* Try to enable network namespacing if network namespacing is available and we have
+ * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
+ * new network namespace. And if we don't have that, then we could only create a network
+ * namespace without the ability to set up "lo". Hence gracefully skip things then. */
+ if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
+ r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
+ if (ERRNO_IS_NEG_PRIVILEGE(r))
+ log_exec_notice_errno(context, params, r,
+ "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
+ else if (r < 0) {
+ *exit_status = EXIT_NETWORK;
+ return log_exec_error_errno(context, params, r, "Failed to set up network namespacing: %m");
+ }
+ } else if (context->network_namespace_path) {
+ *exit_status = EXIT_NETWORK;
+ return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "NetworkNamespacePath= is not supported, refusing.");
+ } else
+ log_exec_notice(context, params, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
+ }
+
+ if (exec_needs_ipc_namespace(context) && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
+
+ if (ns_type_supported(NAMESPACE_IPC)) {
+ r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
+ if (r == -EPERM)
+ log_exec_warning_errno(context, params, r,
+ "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
+ else if (r < 0) {
+ *exit_status = EXIT_NAMESPACE;
+ return log_exec_error_errno(context, params, r, "Failed to set up IPC namespacing: %m");
+ }
+ } else if (context->ipc_namespace_path) {
+ *exit_status = EXIT_NAMESPACE;
+ return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "IPCNamespacePath= is not supported, refusing.");
+ } else
+ log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
+ }
+
+ if (needs_mount_namespace) {
+ _cleanup_free_ char *error_path = NULL;
+
+ r = apply_mount_namespace(command->flags, context, params, runtime, memory_pressure_path, &error_path);
+ if (r < 0) {
+ *exit_status = EXIT_NAMESPACE;
+ return log_exec_error_errno(context, params, r, "Failed to set up mount namespacing%s%s: %m",
+ error_path ? ": " : "", strempty(error_path));
+ }
+ }
+
+ if (needs_sandboxing) {
+ r = apply_protect_hostname(context, params, exit_status);
+ if (r < 0)
+ return r;
+ }
+
+ if (context->memory_ksm >= 0)
+ if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm) < 0) {
+ if (ERRNO_IS_NOT_SUPPORTED(errno))
+ log_exec_debug_errno(context,
+ params,
+ errno,
+ "KSM support not available, ignoring.");
+ else {
+ *exit_status = EXIT_KSM;
+ return log_exec_error_errno(context, params, errno, "Failed to set KSM: %m");
+ }
+ }
+
+ /* Drop groups as early as possible.
+ * This needs to be done after PrivateDevices=yes setup as device nodes should be owned by the host's root.
+ * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
+ if (needs_setuid) {
+ _cleanup_free_ gid_t *gids_to_enforce = NULL;
+ int ngids_to_enforce = 0;
+
+ ngids_to_enforce = merge_gid_lists(supplementary_gids,
+ ngids,
+ gids_after_pam,
+ ngids_after_pam,
+ &gids_to_enforce);
+ if (ngids_to_enforce < 0) {
+ *exit_status = EXIT_GROUP;
+ return log_exec_error_errno(context, params,
+ ngids_to_enforce,
+ "Failed to merge group lists. Group membership might be incorrect: %m");
+ }
+
+ r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
+ if (r < 0) {
+ *exit_status = EXIT_GROUP;
+ return log_exec_error_errno(context, params, r, "Changing group credentials failed: %m");
+ }
+ }
+
+ /* If the user namespace was not set up above, try to do it now.
+ * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
+ * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
+ * case of mount namespaces being less privileged when the mount point list is copied from a
+ * different user namespace). */
+
+ if (needs_sandboxing && context->private_users && !userns_set_up) {
+ r = setup_private_users(saved_uid, saved_gid, uid, gid);
+ if (r < 0) {
+ *exit_status = EXIT_USER;
+ return log_exec_error_errno(context, params, r, "Failed to set up user namespacing: %m");
+ }
+ }
+
+ /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
+ * shall execute. */
+
+ _cleanup_free_ char *executable = NULL;
+ _cleanup_close_ int executable_fd = -EBADF;
+ r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
+ if (r < 0) {
+ if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
+ log_exec_struct_errno(context, params, LOG_INFO, r,
+ "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
+ LOG_EXEC_INVOCATION_ID(params),
+ LOG_EXEC_MESSAGE(params,
+ "Executable %s missing, skipping: %m",
+ command->path),
+ "EXECUTABLE=%s", command->path);
+ *exit_status = EXIT_SUCCESS;
+ return 0;
+ }
+
+ *exit_status = EXIT_EXEC;
+ return log_exec_struct_errno(context, params, LOG_INFO, r,
+ "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
+ LOG_EXEC_INVOCATION_ID(params),
+ LOG_EXEC_MESSAGE(params,
+ "Failed to locate executable %s: %m",
+ command->path),
+ "EXECUTABLE=%s", command->path);
+ }
+
+ r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &executable_fd);
+ if (r < 0) {
+ *exit_status = EXIT_FDS;
+ return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
+ }
+
+#if HAVE_SELINUX
+ if (needs_sandboxing && use_selinux && params->selinux_context_net) {
+ int fd = -EBADF;
+
+ if (socket_fd >= 0)
+ fd = socket_fd;
+ else if (params->n_socket_fds == 1)
+ /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
+ * use context from that fd to compute the label. */
+ fd = params->fds[0];
+
+ if (fd >= 0) {
+ r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
+ if (r < 0) {
+ if (!context->selinux_context_ignore) {
+ *exit_status = EXIT_SELINUX_CONTEXT;
+ return log_exec_error_errno(context,
+ params,
+ r,
+ "Failed to determine SELinux context: %m");
+ }
+ log_exec_debug_errno(context,
+ params,
+ r,
+ "Failed to determine SELinux context, ignoring: %m");
+ }
+ }
+ }
+#endif
+
+ /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
+ * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
+ * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
+ * execve(). But first, close the remaining sockets in the context objects. */
+
+ exec_runtime_close(runtime);
+ exec_params_close(params);
+
+ r = close_all_fds(keep_fds, n_keep_fds);
+ if (r >= 0)
+ r = shift_fds(params->fds, n_fds);
+ if (r >= 0)
+ r = flag_fds(params->fds, n_socket_fds, n_fds, context->non_blocking);
+ if (r < 0) {
+ *exit_status = EXIT_FDS;
+ return log_exec_error_errno(context, params, r, "Failed to adjust passed file descriptors: %m");
+ }
+
+ /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
+         * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
+ * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
+ * came this far. */
+
+ secure_bits = context->secure_bits;
+
+ if (needs_sandboxing) {
+ uint64_t bset;
+
+ /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
+ * (Note this is placed after the general resource limit initialization, see above, in order
+ * to take precedence.) */
+ if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
+ if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
+ *exit_status = EXIT_LIMITS;
+ return log_exec_error_errno(context, params, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
+ }
+ }
+
+#if ENABLE_SMACK
+ /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
+ * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
+ if (use_smack && context->smack_process_label) {
+ r = setup_smack(params, context, executable_fd);
+ if (r < 0 && !context->smack_process_label_ignore) {
+ *exit_status = EXIT_SMACK_PROCESS_LABEL;
+ return log_exec_error_errno(context, params, r, "Failed to set SMACK process label: %m");
+ }
+ }
+#endif
+
+ bset = context->capability_bounding_set;
+ /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
+ * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
+ * instead of us doing that */
+ if (needs_ambient_hack)
+ bset |= (UINT64_C(1) << CAP_SETPCAP) |
+ (UINT64_C(1) << CAP_SETUID) |
+ (UINT64_C(1) << CAP_SETGID);
+
+#if HAVE_SECCOMP
+ /* If the service has any form of a seccomp filter and it allows dropping privileges, we'll
+ * keep the needed privileges to apply it even if we're not root. */
+ if (needs_setuid &&
+ uid_is_valid(uid) &&
+ context_has_seccomp(context) &&
+ seccomp_allows_drop_privileges(context)) {
+ keep_seccomp_privileges = true;
+
+ if (prctl(PR_SET_KEEPCAPS, 1) < 0) {
+ *exit_status = EXIT_USER;
+ return log_exec_error_errno(context, params, errno, "Failed to enable keep capabilities flag: %m");
+ }
+
+ /* Save the current bounding set so we can restore it after applying the seccomp
+ * filter */
+ saved_bset = bset;
+ bset |= (UINT64_C(1) << CAP_SYS_ADMIN) |
+ (UINT64_C(1) << CAP_SETPCAP);
+ }
+#endif
+
+ if (!cap_test_all(bset)) {
+ r = capability_bounding_set_drop(bset, /* right_now= */ false);
+ if (r < 0) {
+ *exit_status = EXIT_CAPABILITIES;
+ return log_exec_error_errno(context, params, r, "Failed to drop capabilities: %m");
+ }
+ }
+
+ /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
+ * keep-caps set.
+ *
+ * To be able to raise the ambient capabilities after setresuid() they have to be added to
+ * the inherited set and keep caps has to be set (done in enforce_user()). After setresuid()
+ * the ambient capabilities can be raised as they are present in the permitted and
+                 * inheritable set. However it is possible that someone wants to set ambient capabilities
+ * without changing the user, so we also set the ambient capabilities here.
+ *
+ * The requested ambient capabilities are raised in the inheritable set if the second
+ * argument is true. */
+ if (!needs_ambient_hack) {
+ r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
+ if (r < 0) {
+ *exit_status = EXIT_CAPABILITIES;
+ return log_exec_error_errno(context, params, r, "Failed to apply ambient capabilities (before UID change): %m");
+ }
+ }
+ }
+
+ /* chroot to root directory first, before we lose the ability to chroot */
+ r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
+ if (r < 0)
+ return log_exec_error_errno(context, params, r, "Chrooting to the requested root directory failed: %m");
+
+ if (needs_setuid) {
+ if (uid_is_valid(uid)) {
+ r = enforce_user(context, uid, capability_ambient_set);
+ if (r < 0) {
+ *exit_status = EXIT_USER;
+ return log_exec_error_errno(context, params, r, "Failed to change UID to " UID_FMT ": %m", uid);
+ }
+
+ if (keep_seccomp_privileges) {
+ if (!FLAGS_SET(capability_ambient_set, (UINT64_C(1) << CAP_SETUID))) {
+ r = drop_capability(CAP_SETUID);
+ if (r < 0) {
+ *exit_status = EXIT_USER;
+ return log_exec_error_errno(context, params, r, "Failed to drop CAP_SETUID: %m");
+ }
+ }
+
+ r = keep_capability(CAP_SYS_ADMIN);
+ if (r < 0) {
+ *exit_status = EXIT_USER;
+ return log_exec_error_errno(context, params, r, "Failed to keep CAP_SYS_ADMIN: %m");
+ }
+
+ r = keep_capability(CAP_SETPCAP);
+ if (r < 0) {
+ *exit_status = EXIT_USER;
+ return log_exec_error_errno(context, params, r, "Failed to keep CAP_SETPCAP: %m");
+ }
+ }
+
+ if (!needs_ambient_hack && capability_ambient_set != 0) {
+
+ /* Raise the ambient capabilities after user change. */
+ r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
+ if (r < 0) {
+ *exit_status = EXIT_CAPABILITIES;
+ return log_exec_error_errno(context, params, r, "Failed to apply ambient capabilities (after UID change): %m");
+ }
+ }
+ }
+ }
+
+ /* Apply working directory here, because the working directory might be on NFS and only the user running
+ * this service might have the correct privilege to change to the working directory */
+ r = apply_working_directory(context, params, runtime, home, exit_status);
+ if (r < 0)
+ return log_exec_error_errno(context, params, r, "Changing to the requested working directory failed: %m");
+
+ if (needs_sandboxing) {
+ /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
+ * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
+ * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
+ * are restricted. */
+
+#if HAVE_SELINUX
+ if (use_selinux) {
+ char *exec_context = mac_selinux_context_net ?: context->selinux_context;
+
+ if (exec_context) {
+ r = setexeccon(exec_context);
+ if (r < 0) {
+ if (!context->selinux_context_ignore) {
+ *exit_status = EXIT_SELINUX_CONTEXT;
+ return log_exec_error_errno(context, params, r, "Failed to change SELinux context to %s: %m", exec_context);
+ }
+ log_exec_debug_errno(context,
+ params,
+ r,
+ "Failed to change SELinux context to %s, ignoring: %m",
+ exec_context);
+ }
+ }
+ }
+#endif
+
+#if HAVE_APPARMOR
+ if (use_apparmor && context->apparmor_profile) {
+ r = aa_change_onexec(context->apparmor_profile);
+ if (r < 0 && !context->apparmor_profile_ignore) {
+ *exit_status = EXIT_APPARMOR_PROFILE;
+ return log_exec_error_errno(context,
+ params,
+ errno,
+ "Failed to prepare AppArmor profile change to %s: %m",
+ context->apparmor_profile);
+ }
+ }
+#endif
+
+ /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
+ * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
+ * requires CAP_SETPCAP. */
+ if (prctl(PR_GET_SECUREBITS) != secure_bits) {
+ /* CAP_SETPCAP is required to set securebits. This capability is raised into the
+ * effective set here.
+ *
+ * The effective set is overwritten during execve() with the following values:
+ *
+ * - ambient set (for non-root processes)
+ *
+                         * - (inheritable | bounding) set (for root processes)
+ *
+ * Hence there is no security impact to raise it in the effective set before execve
+ */
+ r = capability_gain_cap_setpcap(/* return_caps= */ NULL);
+ if (r < 0) {
+ *exit_status = EXIT_CAPABILITIES;
+ return log_exec_error_errno(context, params, r, "Failed to gain CAP_SETPCAP for setting secure bits");
+ }
+ if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
+ *exit_status = EXIT_SECUREBITS;
+ return log_exec_error_errno(context, params, errno, "Failed to set process secure bits: %m");
+ }
+ }
+
+ if (context_has_no_new_privileges(context))
+ if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
+ *exit_status = EXIT_NO_NEW_PRIVILEGES;
+ return log_exec_error_errno(context, params, errno, "Failed to disable new privileges: %m");
+ }
+
+#if HAVE_SECCOMP
+ r = apply_address_families(context, params);
+ if (r < 0) {
+ *exit_status = EXIT_ADDRESS_FAMILIES;
+ return log_exec_error_errno(context, params, r, "Failed to restrict address families: %m");
+ }
+
+ r = apply_memory_deny_write_execute(context, params);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return log_exec_error_errno(context, params, r, "Failed to disable writing to executable memory: %m");
+ }
+
+ r = apply_restrict_realtime(context, params);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return log_exec_error_errno(context, params, r, "Failed to apply realtime restrictions: %m");
+ }
+
+ r = apply_restrict_suid_sgid(context, params);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return log_exec_error_errno(context, params, r, "Failed to apply SUID/SGID restrictions: %m");
+ }
+
+ r = apply_restrict_namespaces(context, params);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return log_exec_error_errno(context, params, r, "Failed to apply namespace restrictions: %m");
+ }
+
+ r = apply_protect_sysctl(context, params);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return log_exec_error_errno(context, params, r, "Failed to apply sysctl restrictions: %m");
+ }
+
+ r = apply_protect_kernel_modules(context, params);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return log_exec_error_errno(context, params, r, "Failed to apply module loading restrictions: %m");
+ }
+
+ r = apply_protect_kernel_logs(context, params);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return log_exec_error_errno(context, params, r, "Failed to apply kernel log restrictions: %m");
+ }
+
+ r = apply_protect_clock(context, params);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return log_exec_error_errno(context, params, r, "Failed to apply clock restrictions: %m");
+ }
+
+ r = apply_private_devices(context, params);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return log_exec_error_errno(context, params, r, "Failed to set up private devices: %m");
+ }
+
+ r = apply_syscall_archs(context, params);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return log_exec_error_errno(context, params, r, "Failed to apply syscall architecture restrictions: %m");
+ }
+
+ r = apply_lock_personality(context, params);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return log_exec_error_errno(context, params, r, "Failed to lock personalities: %m");
+ }
+
+ r = apply_syscall_log(context, params);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return log_exec_error_errno(context, params, r, "Failed to apply system call log filters: %m");
+ }
+#endif
+
+#if HAVE_LIBBPF
+ r = apply_restrict_filesystems(context, params);
+ if (r < 0) {
+ *exit_status = EXIT_BPF;
+ return log_exec_error_errno(context, params, r, "Failed to restrict filesystems: %m");
+ }
+#endif
+
+#if HAVE_SECCOMP
+ /* This really should remain as close to the execve() as possible, to make sure our own code is affected
+ * by the filter as little as possible. */
+ r = apply_syscall_filter(context, params, needs_ambient_hack);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return log_exec_error_errno(context, params, r, "Failed to apply system call filters: %m");
+ }
+
+ if (keep_seccomp_privileges) {
+ /* Restore the capability bounding set with what's expected from the service + the
+ * ambient capabilities hack */
+ if (!cap_test_all(saved_bset)) {
+ r = capability_bounding_set_drop(saved_bset, /* right_now= */ false);
+ if (r < 0) {
+ *exit_status = EXIT_CAPABILITIES;
+ return log_exec_error_errno(context, params, r, "Failed to drop bset capabilities: %m");
+ }
+ }
+
+ /* Only drop CAP_SYS_ADMIN if it's not in the bounding set, otherwise we'll break
+ * applications that use it. */
+ if (!FLAGS_SET(saved_bset, (UINT64_C(1) << CAP_SYS_ADMIN))) {
+ r = drop_capability(CAP_SYS_ADMIN);
+ if (r < 0) {
+ *exit_status = EXIT_USER;
+ return log_exec_error_errno(context, params, r, "Failed to drop CAP_SYS_ADMIN: %m");
+ }
+ }
+
+ /* Only drop CAP_SETPCAP if it's not in the bounding set, otherwise we'll break
+ * applications that use it. */
+ if (!FLAGS_SET(saved_bset, (UINT64_C(1) << CAP_SETPCAP))) {
+ r = drop_capability(CAP_SETPCAP);
+ if (r < 0) {
+ *exit_status = EXIT_USER;
+ return log_exec_error_errno(context, params, r, "Failed to drop CAP_SETPCAP: %m");
+ }
+ }
+
+ if (prctl(PR_SET_KEEPCAPS, 0) < 0) {
+ *exit_status = EXIT_USER;
+ return log_exec_error_errno(context, params, errno, "Failed to drop keep capabilities flag: %m");
+ }
+ }
+#endif
+
+ }
+
+ if (!strv_isempty(context->unset_environment)) {
+ char **ee = NULL;
+
+ ee = strv_env_delete(accum_env, 1, context->unset_environment);
+ if (!ee) {
+ *exit_status = EXIT_MEMORY;
+ return log_oom();
+ }
+
+ strv_free_and_replace(accum_env, ee);
+ }
+
+ if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
+ _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
+
+ r = replace_env_argv(command->argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
+ if (r < 0) {
+ *exit_status = EXIT_MEMORY;
+ return log_exec_error_errno(context,
+ params,
+ r,
+ "Failed to replace environment variables: %m");
+ }
+ final_argv = replaced_argv;
+
+ if (!strv_isempty(unset_variables)) {
+ _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
+ log_exec_warning(context,
+ params,
+ "Referenced but unset environment variable evaluates to an empty string: %s",
+ strna(ju));
+ }
+
+ if (!strv_isempty(bad_variables)) {
+ _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
+ log_exec_warning(context,
+ params,
+ "Invalid environment variable name evaluates to an empty string: %s",
+ strna(jb));
+ }
+ } else
+ final_argv = command->argv;
+
+ log_command_line(context, params, "Executing", executable, final_argv);
+
+ if (params->exec_fd >= 0) {
+ uint8_t hot = 1;
+
+ /* We have finished with all our initializations. Let's now let the manager know that. From this point
+ * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
+
+ if (write(params->exec_fd, &hot, sizeof(hot)) < 0) {
+ *exit_status = EXIT_EXEC;
+ return log_exec_error_errno(context, params, errno, "Failed to enable exec_fd: %m");
+ }
+ }
+
+ r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
+
+ if (params->exec_fd >= 0) {
+ uint8_t hot = 0;
+
+ /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
+ * that POLLHUP on it no longer means execve() succeeded. */
+
+ if (write(params->exec_fd, &hot, sizeof(hot)) < 0) {
+ *exit_status = EXIT_EXEC;
+ return log_exec_error_errno(context, params, errno, "Failed to disable exec_fd: %m");
+ }
+ }
+
+ *exit_status = EXIT_EXEC;
+ return log_exec_error_errno(context, params, r, "Failed to execute %s: %m", executable);
+}
diff --git a/src/core/exec-invoke.h b/src/core/exec-invoke.h
new file mode 100644
index 0000000..a8a3ac6
--- /dev/null
+++ b/src/core/exec-invoke.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct ExecCommand ExecCommand;
+typedef struct ExecContext ExecContext;
+typedef struct ExecParameters ExecParameters;
+typedef struct ExecRuntime ExecRuntime;
+typedef struct CGroupContext CGroupContext;
+
+int exec_invoke(
+ const ExecCommand *command,
+ const ExecContext *context,
+ ExecParameters *params,
+ ExecRuntime *runtime,
+ const CGroupContext *cgroup_context,
+ int *exit_status);
diff --git a/src/core/execute-serialize.c b/src/core/execute-serialize.c
new file mode 100644
index 0000000..b1e716e
--- /dev/null
+++ b/src/core/execute-serialize.c
@@ -0,0 +1,3896 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "af-list.h"
+#include "capability-util.h"
+#include "cgroup-setup.h"
+#include "escape.h"
+#include "exec-credential.h"
+#include "execute-serialize.h"
+#include "fd-util.h"
+#include "hexdecoct.h"
+#include "fileio.h"
+#include "in-addr-prefix-util.h"
+#include "parse-helpers.h"
+#include "parse-util.h"
+#include "percent-util.h"
+#include "process-util.h"
+#include "rlimit-util.h"
+#include "serialize.h"
+#include "string-util.h"
+#include "strv.h"
+
+static int exec_cgroup_context_serialize(const CGroupContext *c, FILE *f) {
+ _cleanup_free_ char *disable_controllers_str = NULL, *delegate_controllers_str = NULL,
+ *cpuset_cpus = NULL, *cpuset_mems = NULL, *startup_cpuset_cpus = NULL,
+ *startup_cpuset_mems = NULL;
+ char *iface;
+ struct in_addr_prefix *iaai;
+ int r;
+
+ assert(f);
+
+ if (!c)
+ return 0;
+
+ r = serialize_bool_elide(f, "exec-cgroup-context-cpu-accounting", c->cpu_accounting);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-cgroup-context-io-accounting", c->io_accounting);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-cgroup-context-block-io-accounting", c->blockio_accounting);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-cgroup-context-memory-accounting", c->memory_accounting);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-cgroup-context-tasks-accounting", c->tasks_accounting);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-cgroup-context-ip-accounting", c->ip_accounting);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-cgroup-context-memory-oom-group", c->memory_oom_group);
+ if (r < 0)
+ return r;
+
+ if (c->cpu_weight != CGROUP_WEIGHT_INVALID) {
+ r = serialize_item_format(f, "exec-cgroup-context-cpu-weight", "%" PRIu64, c->cpu_weight);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->startup_cpu_weight != CGROUP_WEIGHT_INVALID) {
+ r = serialize_item_format(f, "exec-cgroup-context-startup-cpu-weight", "%" PRIu64, c->startup_cpu_weight);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID) {
+ r = serialize_item_format(f, "exec-cgroup-context-cpu-shares", "%" PRIu64, c->cpu_shares);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID) {
+ r = serialize_item_format(f, "exec-cgroup-context-startup-cpu-shares", "%" PRIu64, c->startup_cpu_shares);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
+ r = serialize_usec(f, "exec-cgroup-context-cpu-quota-per-sec-usec", c->cpu_quota_per_sec_usec);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->cpu_quota_period_usec != USEC_INFINITY) {
+ r = serialize_usec(f, "exec-cgroup-context-cpu-quota-period-usec", c->cpu_quota_period_usec);
+ if (r < 0)
+ return r;
+ }
+
+ cpuset_cpus = cpu_set_to_range_string(&c->cpuset_cpus);
+ if (!cpuset_cpus)
+ return log_oom_debug();
+
+ r = serialize_item(f, "exec-cgroup-context-allowed-cpus", cpuset_cpus);
+ if (r < 0)
+ return r;
+
+ startup_cpuset_cpus = cpu_set_to_range_string(&c->startup_cpuset_cpus);
+ if (!startup_cpuset_cpus)
+ return log_oom_debug();
+
+ r = serialize_item(f, "exec-cgroup-context-startup-allowed-cpus", startup_cpuset_cpus);
+ if (r < 0)
+ return r;
+
+ cpuset_mems = cpu_set_to_range_string(&c->cpuset_mems);
+ if (!cpuset_mems)
+ return log_oom_debug();
+
+ r = serialize_item(f, "exec-cgroup-context-allowed-memory-nodes", cpuset_mems);
+ if (r < 0)
+ return r;
+
+ startup_cpuset_mems = cpu_set_to_range_string(&c->startup_cpuset_mems);
+ if (!startup_cpuset_mems)
+ return log_oom_debug();
+
+ r = serialize_item(f, "exec-cgroup-context-startup-allowed-memory-nodes", startup_cpuset_mems);
+ if (r < 0)
+ return r;
+
+ if (c->io_weight != CGROUP_WEIGHT_INVALID) {
+ r = serialize_item_format(f, "exec-cgroup-context-io-weight", "%" PRIu64, c->io_weight);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->startup_io_weight != CGROUP_WEIGHT_INVALID) {
+ r = serialize_item_format(f, "exec-cgroup-context-startup-io-weight", "%" PRIu64, c->startup_io_weight);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID) {
+ r = serialize_item_format(f, "exec-cgroup-context-block-io-weight", "%" PRIu64, c->blockio_weight);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID) {
+ r = serialize_item_format(f, "exec-cgroup-context-startup-block-io-weight", "%" PRIu64, c->startup_blockio_weight);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->default_memory_min > 0) {
+ r = serialize_item_format(f, "exec-cgroup-context-default-memory-min", "%" PRIu64, c->default_memory_min);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->default_memory_low > 0) {
+ r = serialize_item_format(f, "exec-cgroup-context-default-memory-low", "%" PRIu64, c->default_memory_low);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->memory_min > 0) {
+ r = serialize_item_format(f, "exec-cgroup-context-memory-min", "%" PRIu64, c->memory_min);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->memory_low > 0) {
+ r = serialize_item_format(f, "exec-cgroup-context-memory-low", "%" PRIu64, c->memory_low);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->startup_memory_low > 0) {
+ r = serialize_item_format(f, "exec-cgroup-context-startup-memory-low", "%" PRIu64, c->startup_memory_low);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->memory_high != CGROUP_LIMIT_MAX) {
+ r = serialize_item_format(f, "exec-cgroup-context-memory-high", "%" PRIu64, c->memory_high);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->startup_memory_high != CGROUP_LIMIT_MAX) {
+ r = serialize_item_format(f, "exec-cgroup-context-startup-memory-high", "%" PRIu64, c->startup_memory_high);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->memory_max != CGROUP_LIMIT_MAX) {
+ r = serialize_item_format(f, "exec-cgroup-context-memory-max", "%" PRIu64, c->memory_max);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->startup_memory_max != CGROUP_LIMIT_MAX) {
+ r = serialize_item_format(f, "exec-cgroup-context-startup-memory-max", "%" PRIu64, c->startup_memory_max);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->memory_swap_max != CGROUP_LIMIT_MAX) {
+ r = serialize_item_format(f, "exec-cgroup-context-memory-swap-max", "%" PRIu64, c->memory_swap_max);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->startup_memory_swap_max != CGROUP_LIMIT_MAX) {
+ r = serialize_item_format(f, "exec-cgroup-context-startup-memory-swap-max", "%" PRIu64, c->startup_memory_swap_max);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->memory_zswap_max != CGROUP_LIMIT_MAX) {
+ r = serialize_item_format(f, "exec-cgroup-context-memory-zswap-max", "%" PRIu64, c->memory_zswap_max);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->startup_memory_zswap_max != CGROUP_LIMIT_MAX) {
+ r = serialize_item_format(f, "exec-cgroup-context-startup-memory-zswap-max", "%" PRIu64, c->startup_memory_zswap_max);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->memory_limit != CGROUP_LIMIT_MAX) {
+ r = serialize_item_format(f, "exec-cgroup-context-memory-limit", "%" PRIu64, c->memory_limit);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->tasks_max.value != UINT64_MAX) {
+ r = serialize_item_format(f, "exec-cgroup-context-tasks-max-value", "%" PRIu64, c->tasks_max.value);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->tasks_max.scale > 0) {
+ r = serialize_item_format(f, "exec-cgroup-context-tasks-max-scale", "%" PRIu64, c->tasks_max.scale);
+ if (r < 0)
+ return r;
+ }
+
+ r = serialize_bool_elide(f, "exec-cgroup-context-default-memory-min-set", c->default_memory_min_set);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-cgroup-context-default-memory-low-set", c->default_memory_low_set);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-cgroup-context-default-startup-memory-low-set", c->default_startup_memory_low_set);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-cgroup-context-memory-min-set", c->memory_min_set);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-cgroup-context-memory-low-set", c->memory_low_set);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-cgroup-context-startup-memory-low-set", c->startup_memory_low_set);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-cgroup-context-startup-memory-high-set", c->startup_memory_high_set);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-cgroup-context-startup-memory-max-set", c->startup_memory_max_set);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-cgroup-context-startup-memory-swap-max-set", c->startup_memory_swap_max_set);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-cgroup-context-startup-memory-zswap-max-set", c->startup_memory_zswap_max_set);
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-cgroup-context-device-policy", cgroup_device_policy_to_string(c->device_policy));
+ if (r < 0)
+ return r;
+
+ r = cg_mask_to_string(c->disable_controllers, &disable_controllers_str);
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-cgroup-context-disable-controllers", disable_controllers_str);
+ if (r < 0)
+ return r;
+
+ r = cg_mask_to_string(c->delegate_controllers, &delegate_controllers_str);
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-cgroup-context-delegate-controllers", delegate_controllers_str);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-cgroup-context-delegate", c->delegate);
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-cgroup-context-managed-oom-swap", managed_oom_mode_to_string(c->moom_swap));
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-cgroup-context-managed-oom-memory-pressure", managed_oom_mode_to_string(c->moom_mem_pressure));
+ if (r < 0)
+ return r;
+
+ r = serialize_item_format(f, "exec-cgroup-context-managed-oom-memory-pressure-limit", "%" PRIu32, c->moom_mem_pressure_limit);
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-cgroup-context-managed-oom-preference", managed_oom_preference_to_string(c->moom_preference));
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-cgroup-context-memory-pressure-watch", cgroup_pressure_watch_to_string(c->memory_pressure_watch));
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-cgroup-context-delegate-subgroup", c->delegate_subgroup);
+ if (r < 0)
+ return r;
+
+ if (c->memory_pressure_threshold_usec != USEC_INFINITY) {
+ r = serialize_usec(f, "exec-cgroup-context-memory-pressure-threshold-usec", c->memory_pressure_threshold_usec);
+ if (r < 0)
+ return r;
+ }
+
+ LIST_FOREACH(device_allow, a, c->device_allow) {
+ r = serialize_item_format(f, "exec-cgroup-context-device-allow", "%s %s",
+ a->path,
+ cgroup_device_permissions_to_string(a->permissions));
+ if (r < 0)
+ return r;
+ }
+
+ LIST_FOREACH(device_weights, iw, c->io_device_weights) {
+ r = serialize_item_format(f, "exec-cgroup-context-io-device-weight", "%s %" PRIu64,
+ iw->path,
+ iw->weight);
+ if (r < 0)
+ return r;
+ }
+
+ LIST_FOREACH(device_latencies, l, c->io_device_latencies) {
+ r = serialize_item_format(f, "exec-cgroup-context-io-device-latency-target-usec", "%s " USEC_FMT,
+ l->path,
+ l->target_usec);
+ if (r < 0)
+ return r;
+ }
+
+ LIST_FOREACH(device_limits, il, c->io_device_limits)
+ for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++) {
+ _cleanup_free_ char *key = NULL;
+
+ if (il->limits[type] == cgroup_io_limit_defaults[type])
+ continue;
+
+ key = strjoin("exec-cgroup-context-io-device-limit-",
+ cgroup_io_limit_type_to_string(type));
+ if (!key)
+ return -ENOMEM;
+
+ r = serialize_item_format(f, key, "%s %" PRIu64, il->path, il->limits[type]);
+ if (r < 0)
+ return r;
+ }
+
+ LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
+ r = serialize_item_format(f, "exec-cgroup-context-block-io-device-weight", "%s %" PRIu64,
+ w->path,
+ w->weight);
+ if (r < 0)
+ return r;
+ }
+
+ LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
+ if (b->rbps != CGROUP_LIMIT_MAX) {
+ r = serialize_item_format(f, "exec-cgroup-context-block-io-read-bandwidth", "%s %" PRIu64,
+ b->path,
+ b->rbps);
+ if (r < 0)
+ return r;
+ }
+ if (b->wbps != CGROUP_LIMIT_MAX) {
+ r = serialize_item_format(f, "exec-cgroup-context-block-io-write-bandwidth", "%s %" PRIu64,
+ b->path,
+ b->wbps);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ SET_FOREACH(iaai, c->ip_address_allow) {
+ r = serialize_item(f,
+ "exec-cgroup-context-ip-address-allow",
+ IN_ADDR_PREFIX_TO_STRING(iaai->family, &iaai->address, iaai->prefixlen));
+ if (r < 0)
+ return r;
+ }
+ SET_FOREACH(iaai, c->ip_address_deny) {
+ r = serialize_item(f,
+ "exec-cgroup-context-ip-address-deny",
+ IN_ADDR_PREFIX_TO_STRING(iaai->family, &iaai->address, iaai->prefixlen));
+ if (r < 0)
+ return r;
+ }
+
+ r = serialize_bool_elide(f, "exec-cgroup-context-ip-address-allow-reduced", c->ip_address_allow_reduced);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-cgroup-context-ip-address-deny-reduced", c->ip_address_deny_reduced);
+ if (r < 0)
+ return r;
+
+ r = serialize_strv(f, "exec-cgroup-context-ip-ingress-filter-path", c->ip_filters_ingress);
+ if (r < 0)
+ return r;
+
+ r = serialize_strv(f, "exec-cgroup-context-ip-egress-filter-path", c->ip_filters_egress);
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(programs, p, c->bpf_foreign_programs) {
+ r = serialize_item_format(f, "exec-cgroup-context-bpf-program", "%" PRIu32 " %s",
+ p->attach_type,
+ p->bpffs_path);
+ if (r < 0)
+ return r;
+ }
+
+ LIST_FOREACH(socket_bind_items, bi, c->socket_bind_allow) {
+ fprintf(f, "exec-cgroup-context-socket-bind-allow=");
+ cgroup_context_dump_socket_bind_item(bi, f);
+ fputc('\n', f);
+ }
+
+ LIST_FOREACH(socket_bind_items, bi, c->socket_bind_deny) {
+ fprintf(f, "exec-cgroup-context-socket-bind-deny=");
+ cgroup_context_dump_socket_bind_item(bi, f);
+ fputc('\n', f);
+ }
+
+ SET_FOREACH(iface, c->restrict_network_interfaces) {
+ r = serialize_item(f, "exec-cgroup-context-restrict-network-interfaces", iface);
+ if (r < 0)
+ return r;
+ }
+
+ r = serialize_bool_elide(
+ f,
+ "exec-cgroup-context-restrict-network-interfaces-is-allow-list",
+ c->restrict_network_interfaces_is_allow_list);
+ if (r < 0)
+ return r;
+
+ fputc('\n', f); /* End marker */
+
+ return 0;
+}
+
+static int exec_cgroup_context_deserialize(CGroupContext *c, FILE *f) {
+ int r;
+
+ assert(f);
+
+ if (!c)
+ return 0;
+
+ for (;;) {
+ _cleanup_free_ char *l = NULL;
+ const char *val;
+
+ r = deserialize_read_line(f, &l);
+ if (r < 0)
+ return r;
+ if (r == 0) /* eof or end marker */
+ break;
+
+ if ((val = startswith(l, "exec-cgroup-context-cpu-accounting="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->cpu_accounting = r;
+ } else if ((val = startswith(l, "exec-cgroup-context-io-accounting="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->io_accounting = r;
+ } else if ((val = startswith(l, "exec-cgroup-context-block-io-accounting="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->blockio_accounting = r;
+ } else if ((val = startswith(l, "exec-cgroup-context-memory-accounting="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->memory_accounting = r;
+ } else if ((val = startswith(l, "exec-cgroup-context-tasks-accounting="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->tasks_accounting = r;
+ } else if ((val = startswith(l, "exec-cgroup-context-ip-accounting="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->ip_accounting = r;
+ } else if ((val = startswith(l, "exec-cgroup-context-memory-oom-group="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->memory_oom_group = r;
+ } else if ((val = startswith(l, "exec-cgroup-context-cpu-weight="))) {
+ r = safe_atou64(val, &c->cpu_weight);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-startup-cpu-weight="))) {
+ r = safe_atou64(val, &c->startup_cpu_weight);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-cpu-shares="))) {
+ r = safe_atou64(val, &c->cpu_shares);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-startup-cpu-shares="))) {
+ r = safe_atou64(val, &c->startup_cpu_shares);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-cpu-quota-per-sec-usec="))) {
+ r = deserialize_usec(val, &c->cpu_quota_per_sec_usec);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-cpu-quota-period-usec="))) {
+ r = deserialize_usec(val, &c->cpu_quota_period_usec);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-allowed-cpus="))) {
+ if (c->cpuset_cpus.set)
+ return -EINVAL; /* duplicated */
+
+ r = parse_cpu_set_full(
+ val,
+ &c->cpuset_cpus,
+ /* warn= */ false,
+ /* unit= */ NULL,
+ /* filename= */ NULL,
+ /* line= */ 0,
+ /* lvalue= */ NULL);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-startup-allowed-cpus="))) {
+ if (c->startup_cpuset_cpus.set)
+ return -EINVAL; /* duplicated */
+
+ r = parse_cpu_set_full(
+ val,
+ &c->startup_cpuset_cpus,
+ /* warn= */ false,
+ /* unit= */ NULL,
+ /* filename= */ NULL,
+ /* line= */ 0,
+ /* lvalue= */ NULL);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-allowed-memory-nodes="))) {
+ if (c->cpuset_mems.set)
+ return -EINVAL; /* duplicated */
+
+ r = parse_cpu_set_full(
+ val,
+ &c->cpuset_mems,
+ /* warn= */ false,
+ /* unit= */ NULL,
+ /* filename= */ NULL,
+ /* line= */ 0,
+ /* lvalue= */ NULL);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-startup-allowed-memory-nodes="))) {
+ if (c->startup_cpuset_mems.set)
+ return -EINVAL; /* duplicated */
+
+ r = parse_cpu_set_full(
+ val,
+ &c->startup_cpuset_mems,
+ /* warn= */ false,
+ /* unit= */ NULL,
+ /* filename= */ NULL,
+ /* line= */ 0,
+ /* lvalue= */ NULL);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-io-weight="))) {
+ r = safe_atou64(val, &c->io_weight);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-startup-io-weight="))) {
+ r = safe_atou64(val, &c->startup_io_weight);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-block-io-weight="))) {
+ r = safe_atou64(val, &c->blockio_weight);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-startup-block-io-weight="))) {
+ r = safe_atou64(val, &c->startup_blockio_weight);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-default-memory-min="))) {
+ r = safe_atou64(val, &c->default_memory_min);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-default-memory-low="))) {
+ r = safe_atou64(val, &c->default_memory_low);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-memory-min="))) {
+ r = safe_atou64(val, &c->memory_min);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-memory-low="))) {
+ r = safe_atou64(val, &c->memory_low);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-startup-memory-low="))) {
+ r = safe_atou64(val, &c->startup_memory_low);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-memory-high="))) {
+ r = safe_atou64(val, &c->memory_high);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-startup-memory-high="))) {
+ r = safe_atou64(val, &c->startup_memory_high);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-memory-max="))) {
+ r = safe_atou64(val, &c->memory_max);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-startup-memory-max="))) {
+ r = safe_atou64(val, &c->startup_memory_max);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-memory-swap-max="))) {
+ r = safe_atou64(val, &c->memory_swap_max);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-startup-memory-swap-max="))) {
+ r = safe_atou64(val, &c->startup_memory_swap_max);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-memory-zswap-max="))) {
+ r = safe_atou64(val, &c->memory_zswap_max);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-startup-memory-zswap-max="))) {
+ r = safe_atou64(val, &c->startup_memory_zswap_max);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-memory-limit="))) {
+ r = safe_atou64(val, &c->memory_limit);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-tasks-max-value="))) {
+ r = safe_atou64(val, &c->tasks_max.value);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-tasks-max-scale="))) {
+ r = safe_atou64(val, &c->tasks_max.scale);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-default-memory-min-set="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->default_memory_min_set = r;
+ } else if ((val = startswith(l, "exec-cgroup-context-default-memory-low-set="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->default_memory_low_set = r;
+ } else if ((val = startswith(l, "exec-cgroup-context-default-startup-memory-low-set="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->default_startup_memory_low_set = r;
+ } else if ((val = startswith(l, "exec-cgroup-context-memory-min-set="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->memory_min_set = r;
+ } else if ((val = startswith(l, "exec-cgroup-context-memory-low-set="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->memory_low_set = r;
+ } else if ((val = startswith(l, "exec-cgroup-context-startup-memory-low-set="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->startup_memory_low_set = r;
+ } else if ((val = startswith(l, "exec-cgroup-context-startup-memory-high-set="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->startup_memory_high_set = r;
+ } else if ((val = startswith(l, "exec-cgroup-context-startup-memory-max-set="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->startup_memory_max_set = r;
+ } else if ((val = startswith(l, "exec-cgroup-context-startup-memory-swap-max-set="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->startup_memory_swap_max_set = r;
+ } else if ((val = startswith(l, "exec-cgroup-context-startup-memory-zswap-max-set="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->startup_memory_zswap_max_set = r;
+ } else if ((val = startswith(l, "exec-cgroup-context-device-policy="))) {
+ c->device_policy = cgroup_device_policy_from_string(val);
+ if (c->device_policy < 0)
+ return -EINVAL;
+ } else if ((val = startswith(l, "exec-cgroup-context-disable-controllers="))) {
+ r = cg_mask_from_string(val, &c->disable_controllers);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-delegate-controllers="))) {
+ r = cg_mask_from_string(val, &c->delegate_controllers);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-delegate="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->delegate = r;
+ } else if ((val = startswith(l, "exec-cgroup-context-managed-oom-swap="))) {
+ c->moom_swap = managed_oom_mode_from_string(val);
+ if (c->moom_swap < 0)
+ return -EINVAL;
+ } else if ((val = startswith(l, "exec-cgroup-context-managed-oom-memory-pressure="))) {
+ c->moom_mem_pressure = managed_oom_mode_from_string(val);
+ if (c->moom_mem_pressure < 0)
+ return -EINVAL;
+ } else if ((val = startswith(l, "exec-cgroup-context-managed-oom-memory-pressure-limit="))) {
+ r = safe_atou32(val, &c->moom_mem_pressure_limit);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-managed-oom-preference="))) {
+ c->moom_preference = managed_oom_preference_from_string(val);
+ if (c->moom_preference < 0)
+ return -EINVAL;
+ } else if ((val = startswith(l, "exec-cgroup-context-memory-pressure-watch="))) {
+ c->memory_pressure_watch = cgroup_pressure_watch_from_string(val);
+ if (c->memory_pressure_watch < 0)
+ return -EINVAL;
+ } else if ((val = startswith(l, "exec-cgroup-context-delegate-subgroup="))) {
+ r = free_and_strdup(&c->delegate_subgroup, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-memory-pressure-threshold-usec="))) {
+ r = deserialize_usec(val, &c->memory_pressure_threshold_usec);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-device-allow="))) {
+ _cleanup_free_ char *path = NULL, *rwm = NULL;
+ CGroupDevicePermissions p;
+
+ r = extract_many_words(&val, " ", 0, &path, &rwm, NULL);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return -EINVAL;
+
+ p = isempty(rwm) ? 0 : cgroup_device_permissions_from_string(rwm);
+ if (p < 0)
+ return p;
+
+ r = cgroup_context_add_or_update_device_allow(c, path, p);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-io-device-weight="))) {
+ _cleanup_free_ char *path = NULL, *weight = NULL;
+ CGroupIODeviceWeight *a = NULL;
+
+ r = extract_many_words(&val, " ", 0, &path, &weight, NULL);
+ if (r < 0)
+ return r;
+ if (r != 2)
+ return -EINVAL;
+
+ LIST_FOREACH(device_weights, b, c->io_device_weights)
+ if (path_equal(b->path, path)) {
+ a = b;
+ break;
+ }
+
+ if (!a) {
+ a = new0(CGroupIODeviceWeight, 1);
+ if (!a)
+ return log_oom_debug();
+
+ a->path = TAKE_PTR(path);
+
+ LIST_PREPEND(device_weights, c->io_device_weights, a);
+ }
+
+ r = safe_atou64(weight, &a->weight);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-io-device-latency-target-usec="))) {
+ _cleanup_free_ char *path = NULL, *target = NULL;
+ CGroupIODeviceLatency *a = NULL;
+
+ r = extract_many_words(&val, " ", 0, &path, &target, NULL);
+ if (r < 0)
+ return r;
+ if (r != 2)
+ return -EINVAL;
+
+ LIST_FOREACH(device_latencies, b, c->io_device_latencies)
+ if (path_equal(b->path, path)) {
+ a = b;
+ break;
+ }
+
+ if (!a) {
+ a = new0(CGroupIODeviceLatency, 1);
+ if (!a)
+ return log_oom_debug();
+
+ a->path = TAKE_PTR(path);
+
+ LIST_PREPEND(device_latencies, c->io_device_latencies, a);
+ }
+
+ r = deserialize_usec(target, &a->target_usec);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-io-device-limit-"))) {
+ _cleanup_free_ char *type = NULL, *path = NULL, *limits = NULL;
+ CGroupIODeviceLimit *limit = NULL;
+ CGroupIOLimitType t;
+
+ r = extract_many_words(&val, "= ", 0, &type, &path, &limits, NULL);
+ if (r < 0)
+ return r;
+ if (r != 3)
+ return -EINVAL;
+
+ t = cgroup_io_limit_type_from_string(type);
+ if (t < 0)
+ return t;
+
+ LIST_FOREACH(device_limits, i, c->io_device_limits)
+ if (path_equal(path, i->path)) {
+ limit = i;
+ break;
+ }
+
+ if (!limit) {
+ limit = new0(CGroupIODeviceLimit, 1);
+ if (!limit)
+ return log_oom_debug();
+
+ limit->path = TAKE_PTR(path);
+ for (CGroupIOLimitType i = 0; i < _CGROUP_IO_LIMIT_TYPE_MAX; i++)
+ limit->limits[i] = cgroup_io_limit_defaults[i];
+
+ LIST_PREPEND(device_limits, c->io_device_limits, limit);
+ }
+
+ r = safe_atou64(limits, &limit->limits[t]);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-block-io-device-weight="))) {
+ _cleanup_free_ char *path = NULL, *weight = NULL;
+ CGroupBlockIODeviceWeight *a = NULL;
+
+ r = extract_many_words(&val, " ", 0, &path, &weight, NULL);
+ if (r < 0)
+ return r;
+ if (r != 2)
+ return -EINVAL;
+
+ a = new0(CGroupBlockIODeviceWeight, 1);
+ if (!a)
+ return log_oom_debug();
+
+ a->path = TAKE_PTR(path);
+
+ LIST_PREPEND(device_weights, c->blockio_device_weights, a);
+
+ r = safe_atou64(weight, &a->weight);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-block-io-read-bandwidth="))) {
+ _cleanup_free_ char *path = NULL, *bw = NULL;
+ CGroupBlockIODeviceBandwidth *a = NULL;
+
+ r = extract_many_words(&val, " ", 0, &path, &bw, NULL);
+ if (r < 0)
+ return r;
+ if (r != 2)
+ return -EINVAL;
+
+ LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths)
+ if (path_equal(b->path, path)) {
+ a = b;
+ break;
+ }
+
+ if (!a) {
+ a = new0(CGroupBlockIODeviceBandwidth, 1);
+ if (!a)
+ return log_oom_debug();
+
+ a->path = TAKE_PTR(path);
+ a->wbps = CGROUP_LIMIT_MAX;
+
+ LIST_PREPEND(device_bandwidths, c->blockio_device_bandwidths, a);
+ }
+
+ r = safe_atou64(bw, &a->rbps);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-block-io-write-bandwidth="))) {
+ _cleanup_free_ char *path = NULL, *bw = NULL;
+ CGroupBlockIODeviceBandwidth *a = NULL;
+
+ r = extract_many_words(&val, " ", 0, &path, &bw, NULL);
+ if (r < 0)
+ return r;
+ if (r != 2)
+ return -EINVAL;
+
+ LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths)
+ if (path_equal(b->path, path)) {
+ a = b;
+ break;
+ }
+
+ if (!a) {
+ a = new0(CGroupBlockIODeviceBandwidth, 1);
+ if (!a)
+ return log_oom_debug();
+
+ a->path = TAKE_PTR(path);
+ a->rbps = CGROUP_LIMIT_MAX;
+
+ LIST_PREPEND(device_bandwidths, c->blockio_device_bandwidths, a);
+ }
+
+ r = safe_atou64(bw, &a->wbps);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-ip-address-allow="))) {
+ struct in_addr_prefix a;
+
+ r = in_addr_prefix_from_string_auto(val, &a.family, &a.address, &a.prefixlen);
+ if (r < 0)
+ return r;
+
+ r = in_addr_prefix_add(&c->ip_address_allow, &a);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-ip-address-deny="))) {
+ struct in_addr_prefix a;
+
+ r = in_addr_prefix_from_string_auto(val, &a.family, &a.address, &a.prefixlen);
+ if (r < 0)
+ return r;
+
+ r = in_addr_prefix_add(&c->ip_address_deny, &a);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-ip-address-allow-reduced="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->ip_address_allow_reduced = r;
+ } else if ((val = startswith(l, "exec-cgroup-context-ip-address-deny-reduced="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->ip_address_deny_reduced = r;
+ } else if ((val = startswith(l, "exec-cgroup-context-ip-ingress-filter-path="))) {
+ r = deserialize_strv(val, &c->ip_filters_ingress);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-ip-egress-filter-path="))) {
+ r = deserialize_strv(val, &c->ip_filters_egress);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-bpf-program="))) {
+ _cleanup_free_ char *type = NULL, *path = NULL;
+ uint32_t t;
+
+ r = extract_many_words(&val, " ", 0, &type, &path, NULL);
+ if (r < 0)
+ return r;
+ if (r != 2)
+ return -EINVAL;
+
+ r = safe_atou32(type, &t);
+ if (r < 0)
+ return r;
+
+ r = cgroup_context_add_bpf_foreign_program(c, t, path);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-socket-bind-allow="))) {
+ CGroupSocketBindItem *item;
+ uint16_t nr_ports, port_min;
+ int af, ip_protocol;
+
+ r = parse_socket_bind_item(val, &af, &ip_protocol, &nr_ports, &port_min);
+ if (r < 0)
+ return r;
+
+ item = new(CGroupSocketBindItem, 1);
+ if (!item)
+ return log_oom_debug();
+ *item = (CGroupSocketBindItem) {
+ .address_family = af,
+ .ip_protocol = ip_protocol,
+ .nr_ports = nr_ports,
+ .port_min = port_min,
+ };
+
+ LIST_PREPEND(socket_bind_items, c->socket_bind_allow, item);
+ } else if ((val = startswith(l, "exec-cgroup-context-socket-bind-deny="))) {
+ CGroupSocketBindItem *item;
+ uint16_t nr_ports, port_min;
+ int af, ip_protocol;
+
+ r = parse_socket_bind_item(val, &af, &ip_protocol, &nr_ports, &port_min);
+ if (r < 0)
+ return r;
+
+ item = new(CGroupSocketBindItem, 1);
+ if (!item)
+ return log_oom_debug();
+ *item = (CGroupSocketBindItem) {
+ .address_family = af,
+ .ip_protocol = ip_protocol,
+ .nr_ports = nr_ports,
+ .port_min = port_min,
+ };
+
+ LIST_PREPEND(socket_bind_items, c->socket_bind_deny, item);
+ } else if ((val = startswith(l, "exec-cgroup-context-restrict-network-interfaces="))) {
+ r = set_ensure_allocated(&c->restrict_network_interfaces, &string_hash_ops);
+ if (r < 0)
+ return r;
+
+ r = set_put_strdup(&c->restrict_network_interfaces, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-cgroup-context-restrict-network-interfaces-is-allow-list="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->restrict_network_interfaces_is_allow_list = r;
+ } else
+ log_warning("Failed to parse serialized line, ignoring: %s", l);
+ }
+
+ return 0;
+}
+
+/* Serialize an ExecRuntime object to 'f': the shared runtime state (id, tmp dirs,
+ * namespace storage socket pairs), the dynamic user/group credentials, and the
+ * ephemeral root copy. File descriptors are registered in 'fds' so they survive
+ * re-execution. A NULL 'rt' writes only the end marker. Returns 0 on success,
+ * negative errno on failure. */
+static int exec_runtime_serialize(const ExecRuntime *rt, FILE *f, FDSet *fds) {
+ int r;
+
+ assert(f);
+ assert(fds);
+
+ if (!rt) {
+ fputc('\n', f); /* End marker */
+ return 0;
+ }
+
+ if (rt->shared) {
+ r = serialize_item(f, "exec-runtime-id", rt->shared->id);
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-runtime-tmp-dir", rt->shared->tmp_dir);
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-runtime-var-tmp-dir", rt->shared->var_tmp_dir);
+ if (r < 0)
+ return r;
+
+ /* Only serialize a socket pair when both ends are valid; a half-open pair
+ * would not be meaningful after deserialization. */
+ if (rt->shared->netns_storage_socket[0] >= 0 && rt->shared->netns_storage_socket[1] >= 0) {
+ r = serialize_fd_many(f, fds, "exec-runtime-netns-storage-socket", rt->shared->netns_storage_socket, 2);
+ if (r < 0)
+ return r;
+ }
+
+ if (rt->shared->ipcns_storage_socket[0] >= 0 && rt->shared->ipcns_storage_socket[1] >= 0) {
+ r = serialize_fd_many(f, fds, "exec-runtime-ipcns-storage-socket", rt->shared->ipcns_storage_socket, 2);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ if (rt->dynamic_creds) {
+ r = dynamic_user_serialize_one(rt->dynamic_creds->user, "exec-runtime-dynamic-creds-user", f, fds);
+ if (r < 0)
+ return r;
+ }
+
+ /* When group and user are the same object, serialize a "copy" flag instead of
+ * duplicating the record; the deserializer re-links them via dynamic_user_ref(). */
+ if (rt->dynamic_creds && rt->dynamic_creds->group && rt->dynamic_creds->group == rt->dynamic_creds->user) {
+ r = serialize_bool(f, "exec-runtime-dynamic-creds-group-copy", true);
+ if (r < 0)
+ return r;
+ } else if (rt->dynamic_creds) {
+ r = dynamic_user_serialize_one(rt->dynamic_creds->group, "exec-runtime-dynamic-creds-group", f, fds);
+ if (r < 0)
+ return r;
+ }
+
+ r = serialize_item(f, "exec-runtime-ephemeral-copy", rt->ephemeral_copy);
+ if (r < 0)
+ return r;
+
+ if (rt->ephemeral_storage_socket[0] >= 0 && rt->ephemeral_storage_socket[1] >= 0) {
+ r = serialize_fd_many(f, fds, "exec-runtime-ephemeral-storage-socket", rt->ephemeral_storage_socket, 2);
+ if (r < 0)
+ return r;
+ }
+
+ fputc('\n', f); /* End marker */
+
+ return 0;
+}
+
+/* Deserialize an ExecRuntime previously written by exec_runtime_serialize().
+ * 'rt' must already be allocated with its 'shared' and 'dynamic_creds' members
+ * in place; fields are filled in from the key=value lines read from 'f', with
+ * fds reacquired from 'fds'. Reads until EOF or the empty-line end marker.
+ * Returns 0 on success, negative errno on hard parse failure. */
+static int exec_runtime_deserialize(ExecRuntime *rt, FILE *f, FDSet *fds) {
+ int r;
+
+ assert(rt);
+ assert(rt->shared);
+ assert(rt->dynamic_creds);
+ assert(f);
+ assert(fds);
+
+ for (;;) {
+ _cleanup_free_ char *l = NULL;
+ const char *val;
+
+ r = deserialize_read_line(f, &l);
+ if (r < 0)
+ return r;
+ if (r == 0) /* eof or end marker */
+ break;
+
+ if ((val = startswith(l, "exec-runtime-id="))) {
+ r = free_and_strdup(&rt->shared->id, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-runtime-tmp-dir="))) {
+ r = free_and_strdup(&rt->shared->tmp_dir, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-runtime-var-tmp-dir="))) {
+ r = free_and_strdup(&rt->shared->var_tmp_dir, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-runtime-netns-storage-socket="))) {
+
+ /* Failure to reacquire fds is tolerated: skip the entry and keep
+ * whatever the field already holds, rather than aborting. */
+ r = deserialize_fd_many(fds, val, 2, rt->shared->netns_storage_socket);
+ if (r < 0)
+ continue;
+
+ } else if ((val = startswith(l, "exec-runtime-ipcns-storage-socket="))) {
+
+ r = deserialize_fd_many(fds, val, 2, rt->shared->ipcns_storage_socket);
+ if (r < 0)
+ continue;
+
+ } else if ((val = startswith(l, "exec-runtime-dynamic-creds-user=")))
+ /* Return value deliberately ignored: best-effort restore of creds. */
+ dynamic_user_deserialize_one(/* m= */ NULL, val, fds, &rt->dynamic_creds->user);
+ else if ((val = startswith(l, "exec-runtime-dynamic-creds-group=")))
+ dynamic_user_deserialize_one(/* m= */ NULL, val, fds, &rt->dynamic_creds->group);
+ else if ((val = startswith(l, "exec-runtime-dynamic-creds-group-copy="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ if (!r)
+ continue; /* Nothing to do */
+
+ /* The "copy" flag requires the user record to have been seen first. */
+ if (!rt->dynamic_creds->user)
+ return -EINVAL;
+
+ rt->dynamic_creds->group = dynamic_user_ref(rt->dynamic_creds->user);
+ } else if ((val = startswith(l, "exec-runtime-ephemeral-copy="))) {
+ r = free_and_strdup(&rt->ephemeral_copy, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-runtime-ephemeral-storage-socket="))) {
+
+ r = deserialize_fd_many(fds, val, 2, rt->ephemeral_storage_socket);
+ if (r < 0)
+ continue;
+ } else
+ log_warning("Failed to parse serialized line, ignoring: %s", l);
+ }
+
+ return 0;
+}
+
+/* Returns true only when the idle pipe array is allocated and all four of its
+ * fds are valid; used to decide whether the pipe is worth serializing. */
+static bool exec_parameters_is_idle_pipe_set(const ExecParameters *p) {
+ assert(p);
+
+ return p->idle_pipe &&
+ p->idle_pipe[0] >= 0 &&
+ p->idle_pipe[1] >= 0 &&
+ p->idle_pipe[2] >= 0 &&
+ p->idle_pipe[3] >= 0;
+}
+
+/* Serialize ExecParameters to 'f' as key=value lines, registering passed file
+ * descriptors in 'fds'. 'c' is consulted only to decide whether the BPF outer
+ * map fd is relevant (i.e. filesystem restrictions are configured). A NULL 'p'
+ * serializes nothing (not even an end marker). Returns 0 or negative errno. */
+static int exec_parameters_serialize(const ExecParameters *p, const ExecContext *c, FILE *f, FDSet *fds) {
+ int r;
+
+ assert(f);
+ assert(fds);
+
+ if (!p)
+ return 0;
+
+ r = serialize_item(f, "exec-parameters-runtime-scope", runtime_scope_to_string(p->runtime_scope));
+ if (r < 0)
+ return r;
+
+ r = serialize_strv(f, "exec-parameters-environment", p->environment);
+ if (r < 0)
+ return r;
+
+ if (p->fds) {
+ /* The fd counts are written before the fd array itself, so the
+ * deserializer knows how many entries to expect. */
+ if (p->n_socket_fds > 0) {
+ r = serialize_item_format(f, "exec-parameters-n-socket-fds", "%zu", p->n_socket_fds);
+ if (r < 0)
+ return r;
+ }
+
+ if (p->n_storage_fds > 0) {
+ r = serialize_item_format(f, "exec-parameters-n-storage-fds", "%zu", p->n_storage_fds);
+ if (r < 0)
+ return r;
+ }
+
+ r = serialize_fd_many(f, fds, "exec-parameters-fds", p->fds, p->n_socket_fds + p->n_storage_fds);
+ if (r < 0)
+ return r;
+ }
+
+ r = serialize_strv(f, "exec-parameters-fd-names", p->fd_names);
+ if (r < 0)
+ return r;
+
+ if (p->flags != 0) {
+ r = serialize_item_format(f, "exec-parameters-flags", "%u", (unsigned) p->flags);
+ if (r < 0)
+ return r;
+ }
+
+ r = serialize_bool_elide(f, "exec-parameters-selinux-context-net", p->selinux_context_net);
+ if (r < 0)
+ return r;
+
+ if (p->cgroup_supported != 0) {
+ r = serialize_item_format(f, "exec-parameters-cgroup-supported", "%u", (unsigned) p->cgroup_supported);
+ if (r < 0)
+ return r;
+ }
+
+ r = serialize_item(f, "exec-parameters-cgroup-path", p->cgroup_path);
+ if (r < 0)
+ return r;
+
+ r = serialize_item_format(f, "exec-parameters-cgroup-id", "%" PRIu64, p->cgroup_id);
+ if (r < 0)
+ return r;
+
+ for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
+ _cleanup_free_ char *key = NULL;
+
+ key = strjoin("exec-parameters-prefix-directories-", exec_directory_type_to_string(dt));
+ if (!key)
+ return log_oom_debug();
+
+ /* Always serialize, even an empty prefix, as this is a fixed array and we always expect
+ * to have all elements (unless fuzzing is happening, hence the NULL check). */
+ r = serialize_item(f, key, strempty(p->prefix ? p->prefix[dt] : NULL));
+ if (r < 0)
+ return r;
+ }
+
+ r = serialize_item(f, "exec-parameters-received-credentials-directory", p->received_credentials_directory);
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-parameters-received-encrypted-credentials-directory", p->received_encrypted_credentials_directory);
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-parameters-confirm-spawn", p->confirm_spawn);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-parameters-shall-confirm-spawn", p->shall_confirm_spawn);
+ if (r < 0)
+ return r;
+
+ if (p->watchdog_usec > 0) {
+ r = serialize_usec(f, "exec-parameters-watchdog-usec", p->watchdog_usec);
+ if (r < 0)
+ return r;
+ }
+
+ /* The idle pipe is all-or-nothing: only serialized when all four fds are valid. */
+ if (exec_parameters_is_idle_pipe_set(p)) {
+ r = serialize_fd_many(f, fds, "exec-parameters-idle-pipe", p->idle_pipe, 4);
+ if (r < 0)
+ return r;
+ }
+
+ r = serialize_fd(f, fds, "exec-parameters-stdin-fd", p->stdin_fd);
+ if (r < 0)
+ return r;
+
+ r = serialize_fd(f, fds, "exec-parameters-stdout-fd", p->stdout_fd);
+ if (r < 0)
+ return r;
+
+ r = serialize_fd(f, fds, "exec-parameters-stderr-fd", p->stderr_fd);
+ if (r < 0)
+ return r;
+
+ r = serialize_fd(f, fds, "exec-parameters-exec-fd", p->exec_fd);
+ if (r < 0)
+ return r;
+
+ /* The BPF outer map fd only matters when RestrictFileSystems= is in use. */
+ if (c && exec_context_restrict_filesystems_set(c)) {
+ r = serialize_fd(f, fds, "exec-parameters-bpf-outer-map-fd", p->bpf_outer_map_fd);
+ if (r < 0)
+ return r;
+ }
+
+ r = serialize_item(f, "exec-parameters-notify-socket", p->notify_socket);
+ if (r < 0)
+ return r;
+
+ /* One line per open file; order is preserved via list append on deserialize. */
+ LIST_FOREACH(open_files, file, p->open_files) {
+ _cleanup_free_ char *ofs = NULL;
+
+ r = open_file_to_string(file, &ofs);
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-parameters-open-file", ofs);
+ if (r < 0)
+ return r;
+ }
+
+ r = serialize_item(f, "exec-parameters-fallback-smack-process-label", p->fallback_smack_process_label);
+ if (r < 0)
+ return r;
+
+ r = serialize_fd(f, fds, "exec-parameters-user-lookup-fd", p->user_lookup_fd);
+ if (r < 0)
+ return r;
+
+ r = serialize_strv(f, "exec-parameters-files-env", p->files_env);
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-parameters-unit-id", p->unit_id);
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-parameters-invocation-id-string", p->invocation_id_string);
+ if (r < 0)
+ return r;
+
+ fputc('\n', f); /* End marker */
+
+ return 0;
+}
+
+/* Deserialize ExecParameters written by exec_parameters_serialize(). Reads
+ * key=value lines from 'f' until EOF/end marker, reacquiring fds from 'fds'.
+ * fd counts are sanity-capped against the system's nr_open limit so a
+ * malicious or corrupt serialization cannot trigger a huge allocation.
+ * Returns 0 on success, negative errno on hard failure. */
+static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) {
+ int r, nr_open;
+
+ assert(p);
+ assert(f);
+ assert(fds);
+
+ /* Use the kernel fd limit as an upper bound for serialized fd counts;
+ * fall back to a large-but-sane constant if it cannot be read. */
+ nr_open = read_nr_open();
+ if (nr_open < 3)
+ nr_open = HIGH_RLIMIT_NOFILE;
+ assert(nr_open > 0); /* For compilers/static analyzers */
+
+ for (;;) {
+ _cleanup_free_ char *l = NULL;
+ const char *val;
+
+ r = deserialize_read_line(f, &l);
+ if (r < 0)
+ return r;
+ if (r == 0) /* eof or end marker */
+ break;
+
+ if ((val = startswith(l, "exec-parameters-runtime-scope="))) {
+ p->runtime_scope = runtime_scope_from_string(val);
+ if (p->runtime_scope < 0)
+ return p->runtime_scope; /* negative enum value doubles as errno */
+ } else if ((val = startswith(l, "exec-parameters-environment="))) {
+ r = deserialize_strv(val, &p->environment);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-parameters-n-socket-fds="))) {
+ /* Counts must arrive before the fd array itself. */
+ if (p->fds)
+ return -EINVAL; /* Already received */
+
+ r = safe_atozu(val, &p->n_socket_fds);
+ if (r < 0)
+ return r;
+
+ if (p->n_socket_fds > (size_t) nr_open)
+ return -EINVAL; /* too many, someone is playing games with us */
+ } else if ((val = startswith(l, "exec-parameters-n-storage-fds="))) {
+ if (p->fds)
+ return -EINVAL; /* Already received */
+
+ r = safe_atozu(val, &p->n_storage_fds);
+ if (r < 0)
+ return r;
+
+ if (p->n_storage_fds > (size_t) nr_open)
+ return -EINVAL; /* too many, someone is playing games with us */
+ } else if ((val = startswith(l, "exec-parameters-fds="))) {
+ if (p->n_socket_fds + p->n_storage_fds == 0)
+ return log_warning_errno(
+ SYNTHETIC_ERRNO(EINVAL),
+ "Got exec-parameters-fds= without "
+ "prior exec-parameters-n-socket-fds= or exec-parameters-n-storage-fds=");
+ if (p->n_socket_fds + p->n_storage_fds > (size_t) nr_open)
+ return -EINVAL; /* too many, someone is playing games with us */
+
+ if (p->fds)
+ return -EINVAL; /* duplicated */
+
+ p->fds = new(int, p->n_socket_fds + p->n_storage_fds);
+ if (!p->fds)
+ return log_oom_debug();
+
+ /* Ensure we don't leave any FD uninitialized on error, it makes the fuzzer sad */
+ for (size_t i = 0; i < p->n_socket_fds + p->n_storage_fds; ++i)
+ p->fds[i] = -EBADF;
+
+ /* fd reacquisition failure is tolerated; entries stay -EBADF. */
+ r = deserialize_fd_many(fds, val, p->n_socket_fds + p->n_storage_fds, p->fds);
+ if (r < 0)
+ continue;
+
+ } else if ((val = startswith(l, "exec-parameters-fd-names="))) {
+ r = deserialize_strv(val, &p->fd_names);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-parameters-flags="))) {
+ unsigned flags;
+
+ r = safe_atou(val, &flags);
+ if (r < 0)
+ return r;
+ p->flags = flags;
+ } else if ((val = startswith(l, "exec-parameters-selinux-context-net="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+
+ p->selinux_context_net = r;
+ } else if ((val = startswith(l, "exec-parameters-cgroup-supported="))) {
+ unsigned cgroup_supported;
+
+ r = safe_atou(val, &cgroup_supported);
+ if (r < 0)
+ return r;
+ p->cgroup_supported = cgroup_supported;
+ } else if ((val = startswith(l, "exec-parameters-cgroup-path="))) {
+ r = free_and_strdup(&p->cgroup_path, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-parameters-cgroup-id="))) {
+ r = safe_atou64(val, &p->cgroup_id);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-parameters-prefix-directories-"))) {
+ _cleanup_free_ char *type = NULL, *prefix = NULL;
+ ExecDirectoryType dt;
+
+ /* Key is "...-<type>=<prefix>"; split on '=' and space. An
+ * absent/empty prefix resets the entry (see below). */
+ r = extract_many_words(&val, "= ", 0, &type, &prefix, NULL);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return -EINVAL;
+
+ dt = exec_directory_type_from_string(type);
+ if (dt < 0)
+ return -EINVAL;
+
+ /* Lazily allocate the fixed-size prefix array on first use. */
+ if (!p->prefix) {
+ p->prefix = new0(char*, _EXEC_DIRECTORY_TYPE_MAX+1);
+ if (!p->prefix)
+ return log_oom_debug();
+ }
+
+ if (isempty(prefix))
+ p->prefix[dt] = mfree(p->prefix[dt]);
+ else
+ free_and_replace(p->prefix[dt], prefix);
+ } else if ((val = startswith(l, "exec-parameters-received-credentials-directory="))) {
+ r = free_and_strdup(&p->received_credentials_directory, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-parameters-received-encrypted-credentials-directory="))) {
+ r = free_and_strdup(&p->received_encrypted_credentials_directory, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-parameters-confirm-spawn="))) {
+ r = free_and_strdup(&p->confirm_spawn, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-parameters-shall-confirm-spawn="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+
+ p->shall_confirm_spawn = r;
+ } else if ((val = startswith(l, "exec-parameters-watchdog-usec="))) {
+ r = deserialize_usec(val, &p->watchdog_usec);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-parameters-idle-pipe="))) {
+ if (p->idle_pipe)
+ return -EINVAL; /* duplicated */
+
+ p->idle_pipe = new(int, 4);
+ if (!p->idle_pipe)
+ return log_oom_debug();
+
+ /* Pre-fill so no slot is left uninitialized if reacquisition fails. */
+ p->idle_pipe[0] = p->idle_pipe[1] = p->idle_pipe[2] = p->idle_pipe[3] = -EBADF;
+
+ r = deserialize_fd_many(fds, val, 4, p->idle_pipe);
+ if (r < 0)
+ continue;
+
+ } else if ((val = startswith(l, "exec-parameters-stdin-fd="))) {
+ int fd;
+
+ /* Single-fd entries: skip on failure, keep the field's prior value. */
+ fd = deserialize_fd(fds, val);
+ if (fd < 0)
+ continue;
+
+ p->stdin_fd = fd;
+
+ } else if ((val = startswith(l, "exec-parameters-stdout-fd="))) {
+ int fd;
+
+ fd = deserialize_fd(fds, val);
+ if (fd < 0)
+ continue;
+
+ p->stdout_fd = fd;
+
+ } else if ((val = startswith(l, "exec-parameters-stderr-fd="))) {
+ int fd;
+
+ fd = deserialize_fd(fds, val);
+ if (fd < 0)
+ continue;
+
+ p->stderr_fd = fd;
+ } else if ((val = startswith(l, "exec-parameters-exec-fd="))) {
+ int fd;
+
+ fd = deserialize_fd(fds, val);
+ if (fd < 0)
+ continue;
+
+ p->exec_fd = fd;
+ } else if ((val = startswith(l, "exec-parameters-bpf-outer-map-fd="))) {
+ int fd;
+
+ fd = deserialize_fd(fds, val);
+ if (fd < 0)
+ continue;
+
+ p->bpf_outer_map_fd = fd;
+ } else if ((val = startswith(l, "exec-parameters-notify-socket="))) {
+ r = free_and_strdup(&p->notify_socket, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-parameters-open-file="))) {
+ OpenFile *of = NULL;
+
+ r = open_file_parse(val, &of);
+ if (r < 0)
+ return r;
+
+ /* Append to preserve the order the open files were serialized in. */
+ LIST_APPEND(open_files, p->open_files, of);
+ } else if ((val = startswith(l, "exec-parameters-fallback-smack-process-label="))) {
+ r = free_and_strdup(&p->fallback_smack_process_label, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-parameters-user-lookup-fd="))) {
+ int fd;
+
+ fd = deserialize_fd(fds, val);
+ if (fd < 0)
+ continue;
+
+ p->user_lookup_fd = fd;
+ } else if ((val = startswith(l, "exec-parameters-files-env="))) {
+ r = deserialize_strv(val, &p->files_env);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-parameters-unit-id="))) {
+ r = free_and_strdup(&p->unit_id, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-parameters-invocation-id-string="))) {
+ /* Reject over-long input before handing it to the 128-bit ID parser. */
+ if (strlen(val) > SD_ID128_STRING_MAX - 1)
+ return -EINVAL;
+
+ r = sd_id128_from_string(val, &p->invocation_id);
+ if (r < 0)
+ return r;
+
+ /* Keep the fixed string-form buffer in sync with the parsed binary ID. */
+ sd_id128_to_string(p->invocation_id, p->invocation_id_string);
+ } else
+ log_warning("Failed to parse serialized line, ignoring: %s", l);
+ }
+
+ /* Bail out if we got exec-parameters-n-{socket/storage}-fds= but no corresponding
+ * exec-parameters-fds= */
+ if (p->n_socket_fds + p->n_storage_fds > 0 && !p->fds)
+ return -EINVAL;
+
+ return 0;
+}
+
+/* Serialize the stdout/stderr configuration of 'c' for the stream selected by
+ * 'fileno' (STDOUT_FILENO or STDERR_FILENO). Only the named-fd and file-backed
+ * output modes carry extra data; every other mode serializes nothing and
+ * returns 0. The key is built as "exec-context-std-<output|error>-<mode>". */
+static int serialize_std_out_err(const ExecContext *c, FILE *f, int fileno) {
+ char *key, *value;
+ const char *type;
+
+ assert(c);
+ assert(f);
+ assert(IN_SET(fileno, STDOUT_FILENO, STDERR_FILENO));
+
+ type = fileno == STDOUT_FILENO ? "output" : "error";
+
+ switch (fileno == STDOUT_FILENO ? c->std_output : c->std_error) {
+ case EXEC_OUTPUT_NAMED_FD:
+ /* strjoina builds the key on the stack; value points into 'c'. */
+ key = strjoina("exec-context-std-", type, "-fd-name");
+ value = c->stdio_fdname[fileno];
+
+ break;
+
+ case EXEC_OUTPUT_FILE:
+ key = strjoina("exec-context-std-", type, "-file");
+ value = c->stdio_file[fileno];
+
+ break;
+
+ case EXEC_OUTPUT_FILE_APPEND:
+ key = strjoina("exec-context-std-", type, "-file-append");
+ value = c->stdio_file[fileno];
+
+ break;
+
+ case EXEC_OUTPUT_FILE_TRUNCATE:
+ key = strjoina("exec-context-std-", type, "-file-truncate");
+ value = c->stdio_file[fileno];
+
+ break;
+
+ default:
+ /* All other output modes need no extra serialization. */
+ return 0;
+ }
+
+ return serialize_item(f, key, value);
+}
+
+static int exec_context_serialize(const ExecContext *c, FILE *f) {
+ int r;
+
+ assert(f);
+
+ if (!c)
+ return 0;
+
+ r = serialize_strv(f, "exec-context-environment", c->environment);
+ if (r < 0)
+ return r;
+
+ r = serialize_strv(f, "exec-context-environment-files", c->environment_files);
+ if (r < 0)
+ return r;
+
+ r = serialize_strv(f, "exec-context-pass-environment", c->pass_environment);
+ if (r < 0)
+ return r;
+
+ r = serialize_strv(f, "exec-context-unset-environment", c->unset_environment);
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-context-working-directory", c->working_directory);
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-context-root-directory", c->root_directory);
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-context-root-image", c->root_image);
+ if (r < 0)
+ return r;
+
+ if (c->root_image_options) {
+ _cleanup_free_ char *options = NULL;
+
+ LIST_FOREACH(mount_options, o, c->root_image_options) {
+ if (isempty(o->options))
+ continue;
+
+ _cleanup_free_ char *escaped = NULL;
+ escaped = shell_escape(o->options, ":");
+ if (!escaped)
+ return log_oom_debug();
+
+ if (!strextend(&options,
+ " ",
+ partition_designator_to_string(o->partition_designator),
+ ":",
+ escaped))
+ return log_oom_debug();
+ }
+
+ r = serialize_item(f, "exec-context-root-image-options", options);
+ if (r < 0)
+ return r;
+ }
+
+ r = serialize_item(f, "exec-context-root-verity", c->root_verity);
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-context-root-hash-path", c->root_hash_path);
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-context-root-hash-sig-path", c->root_hash_sig_path);
+ if (r < 0)
+ return r;
+
+ r = serialize_item_hexmem(f, "exec-context-root-hash", c->root_hash, c->root_hash_size);
+ if (r < 0)
+ return r;
+
+ r = serialize_item_base64mem(f, "exec-context-root-hash-sig", c->root_hash_sig, c->root_hash_sig_size);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-context-root-ephemeral", c->root_ephemeral);
+ if (r < 0)
+ return r;
+
+ r = serialize_item_format(f, "exec-context-umask", "%04o", c->umask);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-context-non-blocking", c->non_blocking);
+ if (r < 0)
+ return r;
+
+ r = serialize_item_tristate(f, "exec-context-private-mounts", c->private_mounts);
+ if (r < 0)
+ return r;
+
+ r = serialize_item_tristate(f, "exec-context-memory-ksm", c->memory_ksm);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-context-private-tmp", c->private_tmp);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-context-private-devices", c->private_devices);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-context-protect-kernel-tunables", c->protect_kernel_tunables);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-context-protect-kernel-modules", c->protect_kernel_modules);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-context-protect-kernel-logs", c->protect_kernel_logs);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-context-protect-clock", c->protect_clock);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-context-protect-control-groups", c->protect_control_groups);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-context-private-network", c->private_network);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-context-private-users", c->private_users);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-context-private-ipc", c->private_ipc);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-context-remove-ipc", c->remove_ipc);
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-context-protect-home", protect_home_to_string(c->protect_home));
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-context-protect-system", protect_system_to_string(c->protect_system));
+ if (r < 0)
+ return r;
+
+ if (c->mount_apivfs_set) {
+ r = serialize_bool(f, "exec-context-mount-api-vfs", c->mount_apivfs);
+ if (r < 0)
+ return r;
+ }
+
+ r = serialize_bool_elide(f, "exec-context-same-pgrp", c->same_pgrp);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-context-cpu-sched-reset-on-fork", c->cpu_sched_reset_on_fork);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool(f, "exec-context-ignore-sigpipe", c->ignore_sigpipe);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-context-memory-deny-write-execute", c->memory_deny_write_execute);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-context-restrict-realtime", c->restrict_realtime);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-context-restrict-suid-sgid", c->restrict_suid_sgid);
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-context-keyring-mode", exec_keyring_mode_to_string(c->keyring_mode));
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-context-protect-hostname", c->protect_hostname);
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-context-protect-proc", protect_proc_to_string(c->protect_proc));
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-context-proc-subset", proc_subset_to_string(c->proc_subset));
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-context-runtime-directory-preserve-mode", exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
+ if (r < 0)
+ return r;
+
+ for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
+ _cleanup_free_ char *key = NULL, *value = NULL;
+
+ key = strjoin("exec-context-directories-", exec_directory_type_to_string(dt));
+ if (!key)
+ return log_oom_debug();
+
+ if (asprintf(&value, "%04o", c->directories[dt].mode) < 0)
+ return log_oom_debug();
+
+ FOREACH_ARRAY(i, c->directories[dt].items, c->directories[dt].n_items) {
+ _cleanup_free_ char *path_escaped = NULL;
+
+ path_escaped = shell_escape(i->path, ":" WHITESPACE);
+ if (!path_escaped)
+ return log_oom_debug();
+
+ if (!strextend(&value, " ", path_escaped))
+ return log_oom_debug();
+
+ if (!strextend(&value, ":", yes_no(i->only_create)))
+ return log_oom_debug();
+
+ STRV_FOREACH(d, i->symlinks) {
+ _cleanup_free_ char *link_escaped = NULL;
+
+ link_escaped = shell_escape(*d, ":" WHITESPACE);
+ if (!link_escaped)
+ return log_oom_debug();
+
+ if (!strextend(&value, ":", link_escaped))
+ return log_oom_debug();
+ }
+ }
+
+ r = serialize_item(f, key, value);
+ if (r < 0)
+ return r;
+ }
+
+ r = serialize_usec(f, "exec-context-timeout-clean-usec", c->timeout_clean_usec);
+ if (r < 0)
+ return r;
+
+ if (c->nice_set) {
+ r = serialize_item_format(f, "exec-context-nice", "%i", c->nice);
+ if (r < 0)
+ return r;
+ }
+
+ r = serialize_bool_elide(f, "exec-context-working-directory-missing-ok", c->working_directory_missing_ok);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-context-working-directory-home", c->working_directory_home);
+ if (r < 0)
+ return r;
+
+ if (c->oom_score_adjust_set) {
+ r = serialize_item_format(f, "exec-context-oom-score-adjust", "%i", c->oom_score_adjust);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->coredump_filter_set) {
+ r = serialize_item_format(f, "exec-context-coredump-filter", "%"PRIx64, c->coredump_filter);
+ if (r < 0)
+ return r;
+ }
+
+ for (unsigned i = 0; i < RLIM_NLIMITS; i++) {
+ _cleanup_free_ char *key = NULL, *limit = NULL;
+
+ if (!c->rlimit[i])
+ continue;
+
+ key = strjoin("exec-context-limit-", rlimit_to_string(i));
+ if (!key)
+ return log_oom_debug();
+
+ r = rlimit_format(c->rlimit[i], &limit);
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, key, limit);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->ioprio_set) {
+ r = serialize_item_format(f, "exec-context-ioprio", "%d", c->ioprio);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->cpu_sched_set) {
+ _cleanup_free_ char *policy_str = NULL;
+
+ r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-context-cpu-scheduling-policy", policy_str);
+ if (r < 0)
+ return r;
+
+ r = serialize_item_format(f, "exec-context-cpu-scheduling-priority", "%i", c->cpu_sched_priority);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-context-cpu-scheduling-reset-on-fork", c->cpu_sched_reset_on_fork);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->cpu_set.set) {
+ _cleanup_free_ char *affinity = NULL;
+
+ affinity = cpu_set_to_range_string(&c->cpu_set);
+ if (!affinity)
+ return log_oom_debug();
+
+ r = serialize_item(f, "exec-context-cpu-affinity", affinity);
+ if (r < 0)
+ return r;
+ }
+
+ if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
+ _cleanup_free_ char *nodes = NULL;
+
+ nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
+ if (!nodes)
+ return log_oom_debug();
+
+ if (nodes) {
+ r = serialize_item(f, "exec-context-numa-mask", nodes);
+ if (r < 0)
+ return r;
+ }
+
+ r = serialize_item_format(f, "exec-context-numa-policy", "%d", c->numa_policy.type);
+ if (r < 0)
+ return r;
+ }
+
+ r = serialize_bool_elide(f, "exec-context-cpu-affinity-from-numa", c->cpu_affinity_from_numa);
+ if (r < 0)
+ return r;
+
+ if (c->timer_slack_nsec != NSEC_INFINITY) {
+ r = serialize_item_format(f, "exec-context-timer-slack-nsec", NSEC_FMT, c->timer_slack_nsec);
+ if (r < 0)
+ return r;
+ }
+
+ r = serialize_item(f, "exec-context-std-input", exec_input_to_string(c->std_input));
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-context-std-output", exec_output_to_string(c->std_output));
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-context-std-error", exec_output_to_string(c->std_error));
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-context-stdio-as-fds", c->stdio_as_fds);
+ if (r < 0)
+ return r;
+
+ switch (c->std_input) {
+ case EXEC_INPUT_NAMED_FD:
+ r = serialize_item(f, "exec-context-std-input-fd-name", c->stdio_fdname[STDIN_FILENO]);
+ if (r < 0)
+ return r;
+ break;
+
+ case EXEC_INPUT_FILE:
+ r = serialize_item(f, "exec-context-std-input-file", c->stdio_file[STDIN_FILENO]);
+ if (r < 0)
+ return r;
+ break;
+
+ default:
+ break;
+ }
+
+ r = serialize_std_out_err(c, f, STDOUT_FILENO);
+ if (r < 0)
+ return r;
+
+ r = serialize_std_out_err(c, f, STDERR_FILENO);
+ if (r < 0)
+ return r;
+
+ r = serialize_item_base64mem(f, "exec-context-stdin-data", c->stdin_data, c->stdin_data_size);
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-context-tty-path", c->tty_path);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-context-tty-reset", c->tty_reset);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-context-tty-vhangup", c->tty_vhangup);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-context-tty-vt-disallocate", c->tty_vt_disallocate);
+ if (r < 0)
+ return r;
+
+ r = serialize_item_format(f, "exec-context-tty-rows", "%u", c->tty_rows);
+ if (r < 0)
+ return r;
+
+ r = serialize_item_format(f, "exec-context-tty-columns", "%u", c->tty_cols);
+ if (r < 0)
+ return r;
+
+ r = serialize_item_format(f, "exec-context-syslog-priority", "%i", c->syslog_priority);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool(f, "exec-context-syslog-level-prefix", c->syslog_level_prefix);
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-context-syslog-identifier", c->syslog_identifier);
+ if (r < 0)
+ return r;
+
+ r = serialize_item_format(f, "exec-context-log-level-max", "%d", c->log_level_max);
+ if (r < 0)
+ return r;
+
+ if (c->log_ratelimit_interval_usec > 0) {
+ r = serialize_usec(f, "exec-context-log-ratelimit-interval-usec", c->log_ratelimit_interval_usec);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->log_ratelimit_burst > 0) {
+ r = serialize_item_format(f, "exec-context-log-ratelimit-burst", "%u", c->log_ratelimit_burst);
+ if (r < 0)
+ return r;
+ }
+
+ r = serialize_string_set(f, "exec-context-log-filter-allowed-patterns", c->log_filter_allowed_patterns);
+ if (r < 0)
+ return r;
+
+ r = serialize_string_set(f, "exec-context-log-filter-denied-patterns", c->log_filter_denied_patterns);
+ if (r < 0)
+ return r;
+
+ FOREACH_ARRAY(field, c->log_extra_fields, c->n_log_extra_fields) {
+ r = serialize_item(f, "exec-context-log-extra-fields", field->iov_base);
+ if (r < 0)
+ return r;
+ }
+
+ r = serialize_item(f, "exec-context-log-namespace", c->log_namespace);
+ if (r < 0)
+ return r;
+
+ if (c->secure_bits != 0) {
+ r = serialize_item_format(f, "exec-context-secure-bits", "%d", c->secure_bits);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->capability_bounding_set != CAP_MASK_UNSET) {
+ r = serialize_item_format(f, "exec-context-capability-bounding-set", "%" PRIu64, c->capability_bounding_set);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->capability_ambient_set != 0) {
+ r = serialize_item_format(f, "exec-context-capability-ambient-set", "%" PRIu64, c->capability_ambient_set);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->user) {
+ r = serialize_item(f, "exec-context-user", c->user);
+ if (r < 0)
+ return r;
+ }
+
+ r = serialize_item(f, "exec-context-group", c->group);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-context-dynamic-user", c->dynamic_user);
+ if (r < 0)
+ return r;
+
+ r = serialize_strv(f, "exec-context-supplementary-groups", c->supplementary_groups);
+ if (r < 0)
+ return r;
+
+ r = serialize_item_tristate(f, "exec-context-set-login-environment", c->set_login_environment);
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-context-pam-name", c->pam_name);
+ if (r < 0)
+ return r;
+
+ r = serialize_strv(f, "exec-context-read-write-paths", c->read_write_paths);
+ if (r < 0)
+ return r;
+
+ r = serialize_strv(f, "exec-context-read-only-paths", c->read_only_paths);
+ if (r < 0)
+ return r;
+
+ r = serialize_strv(f, "exec-context-inaccessible-paths", c->inaccessible_paths);
+ if (r < 0)
+ return r;
+
+ r = serialize_strv(f, "exec-context-exec-paths", c->exec_paths);
+ if (r < 0)
+ return r;
+
+ r = serialize_strv(f, "exec-context-no-exec-paths", c->no_exec_paths);
+ if (r < 0)
+ return r;
+
+ r = serialize_strv(f, "exec-context-exec-search-path", c->exec_search_path);
+ if (r < 0)
+ return r;
+
+ r = serialize_item_format(f, "exec-context-mount-propagation-flag", "%lu", c->mount_propagation_flag);
+ if (r < 0)
+ return r;
+
+ FOREACH_ARRAY(mount, c->bind_mounts, c->n_bind_mounts) {
+ _cleanup_free_ char *src_escaped = NULL, *dst_escaped = NULL;
+
+ src_escaped = shell_escape(mount->source, ":" WHITESPACE);
+ if (!src_escaped)
+ return log_oom_debug();
+
+ dst_escaped = shell_escape(mount->destination, ":" WHITESPACE);
+ if (!dst_escaped)
+ return log_oom_debug();
+
+ r = serialize_item_format(f,
+ mount->read_only ? "exec-context-bind-read-only-path" : "exec-context-bind-path",
+ "%s%s:%s:%s",
+ mount->ignore_enoent ? "-" : "",
+ src_escaped,
+ dst_escaped,
+ mount->recursive ? "rbind" : "norbind");
+ if (r < 0)
+ return r;
+ }
+
+ FOREACH_ARRAY(tmpfs, c->temporary_filesystems, c->n_temporary_filesystems) {
+ _cleanup_free_ char *escaped = NULL;
+
+ if (!isempty(tmpfs->options)) {
+ escaped = shell_escape(tmpfs->options, ":");
+ if (!escaped)
+ return log_oom_debug();
+ }
+
+ r = serialize_item_format(f, "exec-context-temporary-filesystems", "%s%s%s",
+ tmpfs->path,
+ isempty(escaped) ? "" : ":",
+ strempty(escaped));
+ if (r < 0)
+ return r;
+ }
+
+ r = serialize_item(f, "exec-context-utmp-id", c->utmp_id);
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-context-utmp-mode", exec_utmp_mode_to_string(c->utmp_mode));
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-context-no-new-privileges", c->no_new_privileges);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-context-selinux-context-ignore", c->selinux_context_ignore);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-context-apparmor-profile-ignore", c->apparmor_profile_ignore);
+ if (r < 0)
+ return r;
+
+ r = serialize_bool_elide(f, "exec-context-smack-process-label-ignore", c->smack_process_label_ignore);
+ if (r < 0)
+ return r;
+
+ if (c->selinux_context) {
+ r = serialize_item_format(f, "exec-context-selinux-context",
+ "%s%s",
+ c->selinux_context_ignore ? "-" : "",
+ c->selinux_context);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->apparmor_profile) {
+ r = serialize_item_format(f, "exec-context-apparmor-profile",
+ "%s%s",
+ c->apparmor_profile_ignore ? "-" : "",
+ c->apparmor_profile);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->smack_process_label) {
+ r = serialize_item_format(f, "exec-context-smack-process-label",
+ "%s%s",
+ c->smack_process_label_ignore ? "-" : "",
+ c->smack_process_label);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->personality != PERSONALITY_INVALID) {
+ r = serialize_item(f, "exec-context-personality", personality_to_string(c->personality));
+ if (r < 0)
+ return r;
+ }
+
+ r = serialize_bool_elide(f, "exec-context-lock-personality", c->lock_personality);
+ if (r < 0)
+ return r;
+
+#if HAVE_SECCOMP
+ if (!hashmap_isempty(c->syscall_filter)) {
+ void *errno_num, *id;
+ HASHMAP_FOREACH_KEY(errno_num, id, c->syscall_filter) {
+ r = serialize_item_format(f, "exec-context-syscall-filter", "%d %d", PTR_TO_INT(id) - 1, PTR_TO_INT(errno_num));
+ if (r < 0)
+ return r;
+ }
+ }
+
+ if (!set_isempty(c->syscall_archs)) {
+ void *id;
+ SET_FOREACH(id, c->syscall_archs) {
+ r = serialize_item_format(f, "exec-context-syscall-archs", "%u", PTR_TO_UINT(id) - 1);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ if (c->syscall_errno > 0) {
+ r = serialize_item_format(f, "exec-context-syscall-errno", "%d", c->syscall_errno);
+ if (r < 0)
+ return r;
+ }
+
+ r = serialize_bool_elide(f, "exec-context-syscall-allow-list", c->syscall_allow_list);
+ if (r < 0)
+ return r;
+
+ if (!hashmap_isempty(c->syscall_log)) {
+ void *errno_num, *id;
+ HASHMAP_FOREACH_KEY(errno_num, id, c->syscall_log) {
+ r = serialize_item_format(f, "exec-context-syscall-log", "%d %d", PTR_TO_INT(id) - 1, PTR_TO_INT(errno_num));
+ if (r < 0)
+ return r;
+ }
+ }
+
+ r = serialize_bool_elide(f, "exec-context-syscall-log-allow-list", c->syscall_log_allow_list);
+ if (r < 0)
+ return r;
+#endif
+
+ if (c->restrict_namespaces != NAMESPACE_FLAGS_INITIAL) {
+ r = serialize_item_format(f, "exec-context-restrict-namespaces", "%lu", c->restrict_namespaces);
+ if (r < 0)
+ return r;
+ }
+
+#if HAVE_LIBBPF
+ if (exec_context_restrict_filesystems_set(c)) {
+ char *fs;
+ SET_FOREACH(fs, c->restrict_filesystems) {
+ r = serialize_item(f, "exec-context-restrict-filesystems", fs);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ r = serialize_bool_elide(f, "exec-context-restrict-filesystems-allow-list", c->restrict_filesystems_allow_list);
+ if (r < 0)
+ return r;
+#endif
+
+ if (!set_isempty(c->address_families)) {
+ void *afp;
+
+ SET_FOREACH(afp, c->address_families) {
+ int af = PTR_TO_INT(afp);
+
+ if (af <= 0 || af >= af_max())
+ continue;
+
+ r = serialize_item_format(f, "exec-context-address-families", "%d", af);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ r = serialize_bool_elide(f, "exec-context-address-families-allow-list", c->address_families_allow_list);
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-context-network-namespace-path", c->network_namespace_path);
+ if (r < 0)
+ return r;
+
+ r = serialize_item(f, "exec-context-ipc-namespace-path", c->ipc_namespace_path);
+ if (r < 0)
+ return r;
+
+ FOREACH_ARRAY(mount, c->mount_images, c->n_mount_images) {
+ _cleanup_free_ char *s = NULL, *source_escaped = NULL, *dest_escaped = NULL;
+
+ source_escaped = shell_escape(mount->source, WHITESPACE);
+ if (!source_escaped)
+ return log_oom_debug();
+
+ dest_escaped = shell_escape(mount->destination, WHITESPACE);
+ if (!dest_escaped)
+ return log_oom_debug();
+
+ s = strjoin(mount->ignore_enoent ? "-" : "",
+ source_escaped,
+ " ",
+ dest_escaped);
+ if (!s)
+ return log_oom_debug();
+
+ LIST_FOREACH(mount_options, o, mount->mount_options) {
+ _cleanup_free_ char *escaped = NULL;
+
+ if (isempty(o->options))
+ continue;
+
+ escaped = shell_escape(o->options, ":");
+ if (!escaped)
+ return log_oom_debug();
+
+ if (!strextend(&s,
+ " ",
+ partition_designator_to_string(o->partition_designator),
+ ":",
+ escaped))
+ return log_oom_debug();
+ }
+
+ r = serialize_item(f, "exec-context-mount-image", s);
+ if (r < 0)
+ return r;
+ }
+
+ FOREACH_ARRAY(mount, c->extension_images, c->n_extension_images) {
+ _cleanup_free_ char *s = NULL, *source_escaped = NULL;
+
+ source_escaped = shell_escape(mount->source, ":" WHITESPACE);
+ if (!source_escaped)
+ return log_oom_debug();
+
+ s = strjoin(mount->ignore_enoent ? "-" : "",
+ source_escaped);
+ if (!s)
+ return log_oom_debug();
+
+ LIST_FOREACH(mount_options, o, mount->mount_options) {
+ _cleanup_free_ char *escaped = NULL;
+
+ if (isempty(o->options))
+ continue;
+
+ escaped = shell_escape(o->options, ":");
+ if (!escaped)
+ return log_oom_debug();
+
+ if (!strextend(&s,
+ " ",
+ partition_designator_to_string(o->partition_designator),
+ ":",
+ escaped))
+ return log_oom_debug();
+ }
+
+ r = serialize_item(f, "exec-context-extension-image", s);
+ if (r < 0)
+ return r;
+ }
+
+ r = serialize_strv(f, "exec-context-extension-directories", c->extension_directories);
+ if (r < 0)
+ return r;
+
+ ExecSetCredential *sc;
+ HASHMAP_FOREACH(sc, c->set_credentials) {
+ _cleanup_free_ char *data = NULL;
+
+ if (base64mem(sc->data, sc->size, &data) < 0)
+ return log_oom_debug();
+
+ r = serialize_item_format(f, "exec-context-set-credentials", "%s %s %s", sc->id, yes_no(sc->encrypted), data);
+ if (r < 0)
+ return r;
+ }
+
+ ExecLoadCredential *lc;
+ HASHMAP_FOREACH(lc, c->load_credentials) {
+ r = serialize_item_format(f, "exec-context-load-credentials", "%s %s %s", lc->id, yes_no(lc->encrypted), lc->path);
+ if (r < 0)
+ return r;
+ }
+
+ if (!set_isempty(c->import_credentials)) {
+ char *ic;
+ SET_FOREACH(ic, c->import_credentials) {
+ r = serialize_item(f, "exec-context-import-credentials", ic);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ r = serialize_image_policy(f, "exec-context-root-image-policy", c->root_image_policy);
+ if (r < 0)
+ return r;
+
+ r = serialize_image_policy(f, "exec-context-mount-image-policy", c->mount_image_policy);
+ if (r < 0)
+ return r;
+
+ r = serialize_image_policy(f, "exec-context-extension-image-policy", c->extension_image_policy);
+ if (r < 0)
+ return r;
+
+ fputc('\n', f); /* End marker */
+
+ return 0;
+}
+
+static int exec_context_deserialize(ExecContext *c, FILE *f) {
+ int r;
+
+ assert(f);
+
+ if (!c)
+ return 0;
+
+ for (;;) {
+ _cleanup_free_ char *l = NULL;
+ const char *val;
+
+ r = deserialize_read_line(f, &l);
+ if (r < 0)
+ return r;
+ if (r == 0) /* eof or end marker */
+ break;
+
+ if ((val = startswith(l, "exec-context-environment="))) {
+ r = deserialize_strv(val, &c->environment);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-environment-files="))) {
+ r = deserialize_strv(val, &c->environment_files);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-pass-environment="))) {
+ r = deserialize_strv(val, &c->pass_environment);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-unset-environment="))) {
+ r = deserialize_strv(val, &c->unset_environment);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-working-directory="))) {
+ r = free_and_strdup(&c->working_directory, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-root-directory="))) {
+ r = free_and_strdup(&c->root_directory, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-root-image="))) {
+ r = free_and_strdup(&c->root_image, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-root-image-options="))) {
+ for (;;) {
+ _cleanup_free_ char *word = NULL, *mount_options = NULL, *partition = NULL;
+ PartitionDesignator partition_designator;
+ MountOptions *o = NULL;
+ const char *p;
+
+ r = extract_first_word(&val, &word, NULL, 0);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ p = word;
+ r = extract_many_words(&p, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &partition, &mount_options, NULL);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ continue;
+
+ partition_designator = partition_designator_from_string(partition);
+ if (partition_designator < 0)
+ return -EINVAL;
+
+ o = new(MountOptions, 1);
+ if (!o)
+ return log_oom_debug();
+ *o = (MountOptions) {
+ .partition_designator = partition_designator,
+ .options = TAKE_PTR(mount_options),
+ };
+ LIST_APPEND(mount_options, c->root_image_options, o);
+ }
+ } else if ((val = startswith(l, "exec-context-root-verity="))) {
+ r = free_and_strdup(&c->root_verity, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-root-hash-path="))) {
+ r = free_and_strdup(&c->root_hash_path, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-root-hash-sig-path="))) {
+ r = free_and_strdup(&c->root_hash_sig_path, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-root-hash="))) {
+ c->root_hash = mfree(c->root_hash);
+ r = unhexmem(val, strlen(val), &c->root_hash, &c->root_hash_size);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-root-hash-sig="))) {
+ c->root_hash_sig = mfree(c->root_hash_sig);
+ r= unbase64mem(val, strlen(val), &c->root_hash_sig, &c->root_hash_sig_size);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-root-ephemeral="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->root_ephemeral = r;
+ } else if ((val = startswith(l, "exec-context-umask="))) {
+ r = parse_mode(val, &c->umask);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-private-non-blocking="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->non_blocking = r;
+ } else if ((val = startswith(l, "exec-context-private-mounts="))) {
+ r = safe_atoi(val, &c->private_mounts);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-memory-ksm="))) {
+ r = safe_atoi(val, &c->memory_ksm);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-private-tmp="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->private_tmp = r;
+ } else if ((val = startswith(l, "exec-context-private-devices="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->private_devices = r;
+ } else if ((val = startswith(l, "exec-context-protect-kernel-tunables="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->protect_kernel_tunables = r;
+ } else if ((val = startswith(l, "exec-context-protect-kernel-modules="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->protect_kernel_modules = r;
+ } else if ((val = startswith(l, "exec-context-protect-kernel-logs="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->protect_kernel_logs = r;
+ } else if ((val = startswith(l, "exec-context-protect-clock="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->protect_clock = r;
+ } else if ((val = startswith(l, "exec-context-protect-control-groups="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->protect_control_groups = r;
+ } else if ((val = startswith(l, "exec-context-private-network="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->private_network = r;
+ } else if ((val = startswith(l, "exec-context-private-users="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->private_users = r;
+ } else if ((val = startswith(l, "exec-context-private-ipc="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->private_ipc = r;
+ } else if ((val = startswith(l, "exec-context-remove-ipc="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->remove_ipc = r;
+ } else if ((val = startswith(l, "exec-context-protect-home="))) {
+ c->protect_home = protect_home_from_string(val);
+ if (c->protect_home < 0)
+ return -EINVAL;
+ } else if ((val = startswith(l, "exec-context-protect-system="))) {
+ c->protect_system = protect_system_from_string(val);
+ if (c->protect_system < 0)
+ return -EINVAL;
+ } else if ((val = startswith(l, "exec-context-mount-api-vfs="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->mount_apivfs = r;
+ c->mount_apivfs_set = true;
+ } else if ((val = startswith(l, "exec-context-same-pgrp="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->same_pgrp = r;
+ } else if ((val = startswith(l, "exec-context-cpu-sched-reset-on-fork="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->cpu_sched_reset_on_fork = r;
+ } else if ((val = startswith(l, "exec-context-non-blocking="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->non_blocking = r;
+ } else if ((val = startswith(l, "exec-context-ignore-sigpipe="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->ignore_sigpipe = r;
+ } else if ((val = startswith(l, "exec-context-memory-deny-write-execute="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->memory_deny_write_execute = r;
+ } else if ((val = startswith(l, "exec-context-restrict-realtime="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->restrict_realtime = r;
+ } else if ((val = startswith(l, "exec-context-restrict-suid-sgid="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->restrict_suid_sgid = r;
+ } else if ((val = startswith(l, "exec-context-keyring-mode="))) {
+ c->keyring_mode = exec_keyring_mode_from_string(val);
+ if (c->keyring_mode < 0)
+ return -EINVAL;
+ } else if ((val = startswith(l, "exec-context-protect-hostname="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->protect_hostname = r;
+ } else if ((val = startswith(l, "exec-context-protect-proc="))) {
+ c->protect_proc = protect_proc_from_string(val);
+ if (c->protect_proc < 0)
+ return -EINVAL;
+ } else if ((val = startswith(l, "exec-context-proc-subset="))) {
+ c->proc_subset = proc_subset_from_string(val);
+ if (c->proc_subset < 0)
+ return -EINVAL;
+ } else if ((val = startswith(l, "exec-context-runtime-directory-preserve-mode="))) {
+ c->runtime_directory_preserve_mode = exec_preserve_mode_from_string(val);
+ if (c->runtime_directory_preserve_mode < 0)
+ return -EINVAL;
+ } else if ((val = startswith(l, "exec-context-directories-"))) {
+ _cleanup_free_ char *type = NULL, *mode = NULL;
+ ExecDirectoryType dt;
+
+ r = extract_many_words(&val, "= ", 0, &type, &mode, NULL);
+ if (r < 0)
+ return r;
+ if (r == 0 || !mode)
+ return -EINVAL;
+
+ dt = exec_directory_type_from_string(type);
+ if (dt < 0)
+ return -EINVAL;
+
+ r = parse_mode(mode, &c->directories[dt].mode);
+ if (r < 0)
+ return r;
+
+ for (;;) {
+ _cleanup_free_ char *tuple = NULL, *path = NULL, *only_create = NULL;
+ const char *p;
+
+ /* Use EXTRACT_UNESCAPE_RELAX here, as we unescape the colons in subsequent calls */
+ r = extract_first_word(&val, &tuple, WHITESPACE, EXTRACT_UNESCAPE_SEPARATORS|EXTRACT_UNESCAPE_RELAX);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ p = tuple;
+ r = extract_many_words(&p, ":", EXTRACT_UNESCAPE_SEPARATORS, &path, &only_create, NULL);
+ if (r < 0)
+ return r;
+ if (r < 2)
+ continue;
+
+ r = exec_directory_add(&c->directories[dt], path, NULL);
+ if (r < 0)
+ return r;
+
+ r = parse_boolean(only_create);
+ if (r < 0)
+ return r;
+ c->directories[dt].items[c->directories[dt].n_items - 1].only_create = r;
+
+ if (isempty(p))
+ continue;
+
+ for (;;) {
+ _cleanup_free_ char *link = NULL;
+
+ r = extract_first_word(&p, &link, ":", EXTRACT_UNESCAPE_SEPARATORS);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ r = strv_consume(&c->directories[dt].items[c->directories[dt].n_items - 1].symlinks, TAKE_PTR(link));
+ if (r < 0)
+ return r;
+ }
+ }
+ } else if ((val = startswith(l, "exec-context-timeout-clean-usec="))) {
+ r = deserialize_usec(val, &c->timeout_clean_usec);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-nice="))) {
+ r = safe_atoi(val, &c->nice);
+ if (r < 0)
+ return r;
+ c->nice_set = true;
+ } else if ((val = startswith(l, "exec-context-working-directory-missing-ok="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->working_directory_missing_ok = r;
+ } else if ((val = startswith(l, "exec-context-working-directory-home="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->working_directory_home = r;
+ } else if ((val = startswith(l, "exec-context-oom-score-adjust="))) {
+ r = safe_atoi(val, &c->oom_score_adjust);
+ if (r < 0)
+ return r;
+ c->oom_score_adjust_set = true;
+ } else if ((val = startswith(l, "exec-context-coredump-filter="))) {
+ r = safe_atoux64(val, &c->coredump_filter);
+ if (r < 0)
+ return r;
+ c->coredump_filter_set = true;
+ } else if ((val = startswith(l, "exec-context-limit-"))) {
+ _cleanup_free_ struct rlimit *rlimit = NULL;
+ _cleanup_free_ char *limit = NULL;
+ int type;
+
+ r = extract_first_word(&val, &limit, "=", 0);
+ if (r < 0)
+ return r;
+ if (r == 0 || !val)
+ return -EINVAL;
+
+ type = rlimit_from_string(limit);
+ if (type < 0)
+ return -EINVAL;
+
+ if (!c->rlimit[type]) {
+ rlimit = new0(struct rlimit, 1);
+ if (!rlimit)
+ return log_oom_debug();
+
+ r = rlimit_parse(type, val, rlimit);
+ if (r < 0)
+ return r;
+
+ c->rlimit[type] = TAKE_PTR(rlimit);
+ } else {
+ r = rlimit_parse(type, val, c->rlimit[type]);
+ if (r < 0)
+ return r;
+ }
+ } else if ((val = startswith(l, "exec-context-ioprio="))) {
+ r = safe_atoi(val, &c->ioprio);
+ if (r < 0)
+ return r;
+ c->ioprio_set = true;
+ } else if ((val = startswith(l, "exec-context-cpu-scheduling-policy="))) {
+ c->cpu_sched_policy = sched_policy_from_string(val);
+ if (c->cpu_sched_policy < 0)
+ return -EINVAL;
+ c->cpu_sched_set = true;
+ } else if ((val = startswith(l, "exec-context-cpu-scheduling-priority="))) {
+ r = safe_atoi(val, &c->cpu_sched_priority);
+ if (r < 0)
+ return r;
+ c->cpu_sched_set = true;
+ } else if ((val = startswith(l, "exec-context-cpu-scheduling-reset-on-fork="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->cpu_sched_reset_on_fork = r;
+ c->cpu_sched_set = true;
+ } else if ((val = startswith(l, "exec-context-cpu-affinity="))) {
+ if (c->cpu_set.set)
+ return -EINVAL; /* duplicated */
+
+ r = parse_cpu_set(val, &c->cpu_set);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-numa-mask="))) {
+ if (c->numa_policy.nodes.set)
+ return -EINVAL; /* duplicated */
+
+ r = parse_cpu_set(val, &c->numa_policy.nodes);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-numa-policy="))) {
+ r = safe_atoi(val, &c->numa_policy.type);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-cpu-affinity-from-numa="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->cpu_affinity_from_numa = r;
+ } else if ((val = startswith(l, "exec-context-timer-slack-nsec="))) {
+ r = deserialize_usec(val, (usec_t *)&c->timer_slack_nsec);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-std-input="))) {
+ c->std_input = exec_input_from_string(val);
+ if (c->std_input < 0)
+ return c->std_input;
+ } else if ((val = startswith(l, "exec-context-std-output="))) {
+ c->std_output = exec_output_from_string(val);
+ if (c->std_output < 0)
+ return c->std_output;
+ } else if ((val = startswith(l, "exec-context-std-error="))) {
+ c->std_error = exec_output_from_string(val);
+ if (c->std_error < 0)
+ return c->std_error;
+ } else if ((val = startswith(l, "exec-context-stdio-as-fds="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->stdio_as_fds = r;
+ } else if ((val = startswith(l, "exec-context-std-input-fd-name="))) {
+ r = free_and_strdup(&c->stdio_fdname[STDIN_FILENO], val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-std-output-fd-name="))) {
+ r = free_and_strdup(&c->stdio_fdname[STDOUT_FILENO], val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-std-error-fd-name="))) {
+ r = free_and_strdup(&c->stdio_fdname[STDERR_FILENO], val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-std-input-file="))) {
+ r = free_and_strdup(&c->stdio_file[STDIN_FILENO], val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-std-output-file="))) {
+ r = free_and_strdup(&c->stdio_file[STDOUT_FILENO], val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-std-output-file-append="))) {
+ r = free_and_strdup(&c->stdio_file[STDOUT_FILENO], val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-std-output-file-truncate="))) {
+ r = free_and_strdup(&c->stdio_file[STDOUT_FILENO], val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-std-error-file="))) {
+ r = free_and_strdup(&c->stdio_file[STDERR_FILENO], val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-std-error-file-append="))) {
+ r = free_and_strdup(&c->stdio_file[STDERR_FILENO], val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-std-error-file-truncate="))) {
+ r = free_and_strdup(&c->stdio_file[STDERR_FILENO], val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-stdin-data="))) {
+ if (c->stdin_data)
+ return -EINVAL; /* duplicated */
+
+ r = unbase64mem(val, strlen(val), &c->stdin_data, &c->stdin_data_size);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-tty-path="))) {
+ r = free_and_strdup(&c->tty_path, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-tty-reset="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->tty_reset = r;
+ } else if ((val = startswith(l, "exec-context-tty-vhangup="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->tty_vhangup = r;
+ } else if ((val = startswith(l, "exec-context-tty-vt-disallocate="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->tty_vt_disallocate = r;
+ } else if ((val = startswith(l, "exec-context-tty-rows="))) {
+ r = safe_atou(val, &c->tty_rows);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-tty-columns="))) {
+ r = safe_atou(val, &c->tty_cols);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-syslog-priority="))) {
+ r = safe_atoi(val, &c->syslog_priority);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-syslog-level-prefix="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->syslog_level_prefix = r;
+ } else if ((val = startswith(l, "exec-context-syslog-identifier="))) {
+ r = free_and_strdup(&c->syslog_identifier, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-log-level-max="))) {
+ r = safe_atoi(val, &c->log_level_max);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-log-ratelimit-interval-usec="))) {
+ r = deserialize_usec(val, &c->log_ratelimit_interval_usec);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-log-ratelimit-burst="))) {
+ r = safe_atou(val, &c->log_ratelimit_burst);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-log-filter-allowed-patterns="))) {
+ r = set_put_strdup(&c->log_filter_allowed_patterns, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-log-filter-denied-patterns="))) {
+ r = set_put_strdup(&c->log_filter_denied_patterns, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-log-extra-fields="))) {
+ if (!GREEDY_REALLOC(c->log_extra_fields, c->n_log_extra_fields + 1))
+ return log_oom_debug();
+
+ c->log_extra_fields[c->n_log_extra_fields++].iov_base = strdup(val);
+ if (!c->log_extra_fields[c->n_log_extra_fields-1].iov_base)
+ return log_oom_debug();
+ } else if ((val = startswith(l, "exec-context-log-namespace="))) {
+ r = free_and_strdup(&c->log_namespace, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-secure-bits="))) {
+ r = safe_atoi(val, &c->secure_bits);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-capability-bounding-set="))) {
+ r = safe_atou64(val, &c->capability_bounding_set);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-capability-ambient-set="))) {
+ r = safe_atou64(val, &c->capability_ambient_set);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-user="))) {
+ r = free_and_strdup(&c->user, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-group="))) {
+ r = free_and_strdup(&c->group, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-dynamic-user="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->dynamic_user = r;
+ } else if ((val = startswith(l, "exec-context-supplementary-groups="))) {
+ r = deserialize_strv(val, &c->supplementary_groups);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-set-login-environment="))) {
+ r = safe_atoi(val, &c->set_login_environment);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-pam-name="))) {
+ r = free_and_strdup(&c->pam_name, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-read-write-paths="))) {
+ r = deserialize_strv(val, &c->read_write_paths);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-read-only-paths="))) {
+ r = deserialize_strv(val, &c->read_only_paths);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-inaccessible-paths="))) {
+ r = deserialize_strv(val, &c->inaccessible_paths);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-exec-paths="))) {
+ r = deserialize_strv(val, &c->exec_paths);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-no-exec-paths="))) {
+ r = deserialize_strv(val, &c->no_exec_paths);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-exec-search-path="))) {
+ r = deserialize_strv(val, &c->exec_search_path);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-mount-propagation-flag="))) {
+ r = safe_atolu(val, &c->mount_propagation_flag);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-bind-read-only-path="))) {
+ _cleanup_free_ char *source = NULL, *destination = NULL;
+ bool rbind = true, ignore_enoent = false;
+ char *s = NULL, *d = NULL;
+
+ r = extract_first_word(&val,
+ &source,
+ ":" WHITESPACE,
+ EXTRACT_UNQUOTE|EXTRACT_DONT_COALESCE_SEPARATORS|EXTRACT_UNESCAPE_SEPARATORS);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return -EINVAL;
+
+ s = source;
+ if (s[0] == '-') {
+ ignore_enoent = true;
+ s++;
+ }
+
+ if (val && val[-1] == ':') {
+ r = extract_first_word(&val,
+ &destination,
+ ":" WHITESPACE,
+ EXTRACT_UNQUOTE|EXTRACT_DONT_COALESCE_SEPARATORS|EXTRACT_UNESCAPE_SEPARATORS);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ continue;
+
+ d = destination;
+
+ if (val && val[-1] == ':') {
+ _cleanup_free_ char *options = NULL;
+
+ r = extract_first_word(&val, &options, NULL, EXTRACT_UNQUOTE);
+ if (r < 0)
+ return -r;
+
+ if (isempty(options) || streq(options, "rbind"))
+ rbind = true;
+ else if (streq(options, "norbind"))
+ rbind = false;
+ else
+ continue;
+ }
+ } else
+ d = s;
+
+ r = bind_mount_add(&c->bind_mounts, &c->n_bind_mounts,
+ &(BindMount) {
+ .source = s,
+ .destination = d,
+ .read_only = true,
+ .recursive = rbind,
+ .ignore_enoent = ignore_enoent,
+ });
+ if (r < 0)
+ return log_oom_debug();
+ } else if ((val = startswith(l, "exec-context-bind-path="))) {
+ _cleanup_free_ char *source = NULL, *destination = NULL;
+ bool rbind = true, ignore_enoent = false;
+ char *s = NULL, *d = NULL;
+
+ r = extract_first_word(&val,
+ &source,
+ ":" WHITESPACE,
+ EXTRACT_UNQUOTE|EXTRACT_DONT_COALESCE_SEPARATORS|EXTRACT_UNESCAPE_SEPARATORS);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return -EINVAL;
+
+ s = source;
+ if (s[0] == '-') {
+ ignore_enoent = true;
+ s++;
+ }
+
+ if (val && val[-1] == ':') {
+ r = extract_first_word(&val,
+ &destination,
+ ":" WHITESPACE,
+ EXTRACT_UNQUOTE|EXTRACT_DONT_COALESCE_SEPARATORS|EXTRACT_UNESCAPE_SEPARATORS);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ continue;
+
+ d = destination;
+
+ if (val && val[-1] == ':') {
+ _cleanup_free_ char *options = NULL;
+
+ r = extract_first_word(&val, &options, NULL, EXTRACT_UNQUOTE);
+ if (r < 0)
+ return -r;
+
+ if (isempty(options) || streq(options, "rbind"))
+ rbind = true;
+ else if (streq(options, "norbind"))
+ rbind = false;
+ else
+ continue;
+ }
+ } else
+ d = s;
+
+ r = bind_mount_add(&c->bind_mounts, &c->n_bind_mounts,
+ &(BindMount) {
+ .source = s,
+ .destination = d,
+ .read_only = false,
+ .recursive = rbind,
+ .ignore_enoent = ignore_enoent,
+ });
+ if (r < 0)
+ return log_oom_debug();
+ } else if ((val = startswith(l, "exec-context-temporary-filesystems="))) {
+ _cleanup_free_ char *path = NULL, *options = NULL;
+
+ r = extract_many_words(&val, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &path, &options, NULL);
+ if (r < 0)
+ return r;
+ if (r < 1)
+ continue;
+
+ r = temporary_filesystem_add(&c->temporary_filesystems, &c->n_temporary_filesystems, path, options);
+ if (r < 0)
+ return log_oom_debug();
+ } else if ((val = startswith(l, "exec-context-utmp-id="))) {
+ r = free_and_strdup(&c->utmp_id, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-utmp-mode="))) {
+ c->utmp_mode = exec_utmp_mode_from_string(val);
+ if (c->utmp_mode < 0)
+ return c->utmp_mode;
+ } else if ((val = startswith(l, "exec-context-no-new-privileges="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->no_new_privileges = r;
+ } else if ((val = startswith(l, "exec-context-selinux-context-ignore="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->selinux_context_ignore = r;
+ } else if ((val = startswith(l, "exec-context-apparmor-profile-ignore="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->apparmor_profile_ignore = r;
+ } else if ((val = startswith(l, "exec-context-smack-process-label-ignore="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->smack_process_label_ignore = r;
+ } else if ((val = startswith(l, "exec-context-selinux-context="))) {
+ if (val[0] == '-') {
+ c->selinux_context_ignore = true;
+ val++;
+ }
+
+ r = free_and_strdup(&c->selinux_context, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-apparmor-profile="))) {
+ if (val[0] == '-') {
+ c->apparmor_profile_ignore = true;
+ val++;
+ }
+
+ r = free_and_strdup(&c->apparmor_profile, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-smack-process-label="))) {
+ if (val[0] == '-') {
+ c->smack_process_label_ignore = true;
+ val++;
+ }
+
+ r = free_and_strdup(&c->smack_process_label, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-personality="))) {
+ c->personality = personality_from_string(val);
+ if (c->personality == PERSONALITY_INVALID)
+ return -EINVAL;
+ } else if ((val = startswith(l, "exec-context-lock-personality="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->lock_personality = r;
+#if HAVE_SECCOMP
+ } else if ((val = startswith(l, "exec-context-syscall-filter="))) {
+ _cleanup_free_ char *s_id = NULL, *s_errno_num = NULL;
+ int id, errno_num;
+
+ r = extract_many_words(&val, NULL, 0, &s_id, &s_errno_num, NULL);
+ if (r < 0)
+ return r;
+ if (r != 2)
+ continue;
+
+ r = safe_atoi(s_id, &id);
+ if (r < 0)
+ return r;
+
+ r = safe_atoi(s_errno_num, &errno_num);
+ if (r < 0)
+ return r;
+
+ r = hashmap_ensure_put(&c->syscall_filter, NULL, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-syscall-archs="))) {
+ unsigned int id;
+
+ r = safe_atou(val, &id);
+ if (r < 0)
+ return r;
+
+ r = set_ensure_put(&c->syscall_archs, NULL, UINT_TO_PTR(id + 1));
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-syscall-errno="))) {
+ r = safe_atoi(val, &c->syscall_errno);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-syscall-allow-list="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->syscall_allow_list = r;
+ } else if ((val = startswith(l, "exec-context-syscall-log="))) {
+ _cleanup_free_ char *s_id = NULL, *s_errno_num = NULL;
+ int id, errno_num;
+
+ r = extract_many_words(&val, " ", 0, &s_id, &s_errno_num, NULL);
+ if (r < 0)
+ return r;
+ if (r != 2)
+ continue;
+
+ r = safe_atoi(s_id, &id);
+ if (r < 0)
+ return r;
+
+ r = safe_atoi(s_errno_num, &errno_num);
+ if (r < 0)
+ return r;
+
+ r = hashmap_ensure_put(&c->syscall_log, NULL, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-syscall-log-allow-list="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->syscall_log_allow_list = r;
+#endif
+ } else if ((val = startswith(l, "exec-context-restrict-namespaces="))) {
+ r = safe_atolu(val, &c->restrict_namespaces);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-restrict-filesystems="))) {
+ r = set_ensure_allocated(&c->restrict_filesystems, &string_hash_ops);
+ if (r < 0)
+ return r;
+
+ r = set_put_strdup(&c->restrict_filesystems, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-restrict-filesystems-allow-list="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->restrict_filesystems_allow_list = r;
+ } else if ((val = startswith(l, "exec-context-address-families="))) {
+ int af;
+
+ r = safe_atoi(val, &af);
+ if (r < 0)
+ return r;
+
+ r = set_ensure_put(&c->address_families, NULL, INT_TO_PTR(af));
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-address-families-allow-list="))) {
+ r = parse_boolean(val);
+ if (r < 0)
+ return r;
+ c->address_families_allow_list = r;
+ } else if ((val = startswith(l, "exec-context-network-namespace-path="))) {
+ r = free_and_strdup(&c->network_namespace_path, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-ipc-namespace-path="))) {
+ r = free_and_strdup(&c->ipc_namespace_path, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-mount-image="))) {
+ _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
+ _cleanup_free_ char *source = NULL, *destination = NULL;
+ bool permissive = false;
+ char *s;
+
+ r = extract_many_words(&val,
+ NULL,
+ EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS,
+ &source,
+ &destination,
+ NULL);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return -EINVAL;
+
+ s = source;
+ if (s[0] == '-') {
+ permissive = true;
+ s++;
+ }
+
+ if (isempty(destination))
+ continue;
+
+ for (;;) {
+ _cleanup_free_ char *tuple = NULL, *partition = NULL, *opts = NULL;
+ PartitionDesignator partition_designator;
+ MountOptions *o = NULL;
+ const char *p;
+
+ r = extract_first_word(&val, &tuple, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ p = tuple;
+ r = extract_many_words(&p,
+ ":",
+ EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS,
+ &partition,
+ &opts,
+ NULL);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ continue;
+ if (r == 1) {
+ o = new(MountOptions, 1);
+ if (!o)
+ return log_oom_debug();
+ *o = (MountOptions) {
+ .partition_designator = PARTITION_ROOT,
+ .options = TAKE_PTR(partition),
+ };
+ LIST_APPEND(mount_options, options, o);
+
+ continue;
+ }
+
+ partition_designator = partition_designator_from_string(partition);
+ if (partition_designator < 0)
+ continue;
+
+ o = new(MountOptions, 1);
+ if (!o)
+ return log_oom_debug();
+ *o = (MountOptions) {
+ .partition_designator = partition_designator,
+ .options = TAKE_PTR(opts),
+ };
+ LIST_APPEND(mount_options, options, o);
+ }
+
+ r = mount_image_add(&c->mount_images, &c->n_mount_images,
+ &(MountImage) {
+ .source = s,
+ .destination = destination,
+ .mount_options = options,
+ .ignore_enoent = permissive,
+ .type = MOUNT_IMAGE_DISCRETE,
+ });
+ if (r < 0)
+ return log_oom_debug();
+ } else if ((val = startswith(l, "exec-context-extension-image="))) {
+ _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
+ _cleanup_free_ char *source = NULL;
+ bool permissive = false;
+ char *s;
+
+ r = extract_first_word(&val,
+ &source,
+ NULL,
+ EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return -EINVAL;
+
+ s = source;
+ if (s[0] == '-') {
+ permissive = true;
+ s++;
+ }
+
+ for (;;) {
+ _cleanup_free_ char *tuple = NULL, *partition = NULL, *opts = NULL;
+ PartitionDesignator partition_designator;
+ MountOptions *o = NULL;
+ const char *p;
+
+ r = extract_first_word(&val, &tuple, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ p = tuple;
+ r = extract_many_words(&p,
+ ":",
+ EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS,
+ &partition,
+ &opts,
+ NULL);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ continue;
+ if (r == 1) {
+ o = new(MountOptions, 1);
+ if (!o)
+ return log_oom_debug();
+ *o = (MountOptions) {
+ .partition_designator = PARTITION_ROOT,
+ .options = TAKE_PTR(partition),
+ };
+ LIST_APPEND(mount_options, options, o);
+
+ continue;
+ }
+
+ partition_designator = partition_designator_from_string(partition);
+ if (partition_designator < 0)
+ continue;
+
+ o = new(MountOptions, 1);
+ if (!o)
+ return log_oom_debug();
+ *o = (MountOptions) {
+ .partition_designator = partition_designator,
+ .options = TAKE_PTR(opts),
+ };
+ LIST_APPEND(mount_options, options, o);
+ }
+
+ r = mount_image_add(&c->extension_images, &c->n_extension_images,
+ &(MountImage) {
+ .source = s,
+ .mount_options = options,
+ .ignore_enoent = permissive,
+ .type = MOUNT_IMAGE_EXTENSION,
+ });
+ if (r < 0)
+ return log_oom_debug();
+ } else if ((val = startswith(l, "exec-context-extension-directories="))) {
+ r = deserialize_strv(val, &c->extension_directories);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-set-credentials="))) {
+ _cleanup_(exec_set_credential_freep) ExecSetCredential *sc = NULL;
+ _cleanup_free_ char *id = NULL, *encrypted = NULL, *data = NULL;
+
+ r = extract_many_words(&val, " ", 0, &id, &encrypted, &data, NULL);
+ if (r < 0)
+ return r;
+ if (r != 3)
+ return -EINVAL;
+
+ r = parse_boolean(encrypted);
+ if (r < 0)
+ return r;
+
+ sc = new(ExecSetCredential, 1);
+ if (!sc)
+ return -ENOMEM;
+
+ *sc = (ExecSetCredential) {
+ .id = TAKE_PTR(id),
+ .encrypted = r,
+ };
+
+ r = unbase64mem(data, strlen(data), &sc->data, &sc->size);
+ if (r < 0)
+ return r;
+
+ r = hashmap_ensure_put(&c->set_credentials, &exec_set_credential_hash_ops, sc->id, sc);
+ if (r < 0)
+ return r;
+
+ TAKE_PTR(sc);
+ } else if ((val = startswith(l, "exec-context-load-credentials="))) {
+ _cleanup_(exec_load_credential_freep) ExecLoadCredential *lc = NULL;
+ _cleanup_free_ char *id = NULL, *encrypted = NULL, *path = NULL;
+
+ r = extract_many_words(&val, " ", 0, &id, &encrypted, &path, NULL);
+ if (r < 0)
+ return r;
+ if (r != 3)
+ return -EINVAL;
+
+ r = parse_boolean(encrypted);
+ if (r < 0)
+ return r;
+
+ lc = new(ExecLoadCredential, 1);
+ if (!lc)
+ return -ENOMEM;
+
+ *lc = (ExecLoadCredential) {
+ .id = TAKE_PTR(id),
+ .path = TAKE_PTR(path),
+ .encrypted = r,
+ };
+
+ r = hashmap_ensure_put(&c->load_credentials, &exec_load_credential_hash_ops, lc->id, lc);
+ if (r < 0)
+ return r;
+
+ TAKE_PTR(lc);
+ } else if ((val = startswith(l, "exec-context-import-credentials="))) {
+ r = set_ensure_allocated(&c->import_credentials, &string_hash_ops);
+ if (r < 0)
+ return r;
+
+ r = set_put_strdup(&c->import_credentials, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-root-image-policy="))) {
+ if (c->root_image_policy)
+ return -EINVAL; /* duplicated */
+
+ r = image_policy_from_string(val, &c->root_image_policy);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-mount-image-policy="))) {
+ if (c->mount_image_policy)
+ return -EINVAL; /* duplicated */
+
+ r = image_policy_from_string(val, &c->mount_image_policy);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-context-extension-image-policy="))) {
+ if (c->extension_image_policy)
+ return -EINVAL; /* duplicated */
+
+ r = image_policy_from_string(val, &c->extension_image_policy);
+ if (r < 0)
+ return r;
+ } else
+ log_warning("Failed to parse serialized line, ignoring: %s", l);
+ }
+
+ return 0;
+}
+
+/* Serializes a single ExecCommand (path, argv and flags) to f, terminated by an
+ * empty line that acts as the end marker consumed by exec_command_deserialize(). */
+static int exec_command_serialize(const ExecCommand *c, FILE *f) {
+ int r;
+
+ assert(c);
+ assert(f);
+
+ r = serialize_item(f, "exec-command-path", c->path);
+ if (r < 0)
+ return r;
+
+ r = serialize_strv(f, "exec-command-argv", c->argv);
+ if (r < 0)
+ return r;
+
+ /* Flags are serialized as a plain decimal integer. */
+ r = serialize_item_format(f, "exec-command-flags", "%d", (int) c->flags);
+ if (r < 0)
+ return r;
+
+ fputc('\n', f); /* End marker */
+
+ return 0;
+}
+
+/* Counterpart of exec_command_serialize(): reads "exec-command-*" lines from f
+ * into c until EOF or the empty end-marker line. Unknown lines are logged and
+ * skipped, so the format is forward-compatible. */
+static int exec_command_deserialize(ExecCommand *c, FILE *f) {
+ int r;
+
+ assert(c);
+ assert(f);
+
+ for (;;) {
+ _cleanup_free_ char *l = NULL;
+ const char *val;
+
+ r = deserialize_read_line(f, &l);
+ if (r < 0)
+ return r;
+ if (r == 0) /* eof or end marker */
+ break;
+
+ if ((val = startswith(l, "exec-command-path="))) {
+ r = free_and_strdup(&c->path, val);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-command-argv="))) {
+ r = deserialize_strv(val, &c->argv);
+ if (r < 0)
+ return r;
+ } else if ((val = startswith(l, "exec-command-flags="))) {
+ /* Parsed back as int, matching the "%d" used when serializing. */
+ r = safe_atoi(val, &c->flags);
+ if (r < 0)
+ return r;
+ } else
+ log_warning("Failed to parse serialized line, ignoring: %s", l);
+
+ }
+
+ return 0;
+}
+
+/* Serializes everything the sd-executor child needs to run one command:
+ * exec context, command, parameters, runtime and cgroup context, in that
+ * fixed order (mirrored by exec_deserialize_invocation()). File descriptors
+ * referenced during serialization are collected into fds. Returns 0 on
+ * success, negative errno on failure (already logged at debug level). */
+int exec_serialize_invocation(
+ FILE *f,
+ FDSet *fds,
+ const ExecContext *ctx,
+ const ExecCommand *cmd,
+ const ExecParameters *p,
+ const ExecRuntime *rt,
+ const CGroupContext *cg) {
+
+ int r;
+
+ assert(f);
+ assert(fds);
+
+ r = exec_context_serialize(ctx, f);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to serialize context: %m");
+
+ r = exec_command_serialize(cmd, f);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to serialize command: %m");
+
+ /* Parameters get the context too, as some parameter serialization depends on it. */
+ r = exec_parameters_serialize(p, ctx, f, fds);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to serialize parameters: %m");
+
+ r = exec_runtime_serialize(rt, f, fds);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to serialize runtime: %m");
+
+ r = exec_cgroup_context_serialize(cg, f);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to serialize cgroup context: %m");
+
+ return 0;
+}
+
+/* Reads back, in the same fixed order, what exec_serialize_invocation() wrote:
+ * context, command, parameters, runtime, cgroup context. Used by sd-executor
+ * after being spawned. Returns 0 on success, negative errno on failure. */
+int exec_deserialize_invocation(
+ FILE *f,
+ FDSet *fds,
+ ExecContext *ctx,
+ ExecCommand *cmd,
+ ExecParameters *p,
+ ExecRuntime *rt,
+ CGroupContext *cg) {
+
+ int r;
+
+ assert(f);
+ assert(fds);
+
+ r = exec_context_deserialize(ctx, f);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to deserialize context: %m");
+
+ r = exec_command_deserialize(cmd, f);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to deserialize command: %m");
+
+ r = exec_parameters_deserialize(p, f, fds);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to deserialize parameters: %m");
+
+ r = exec_runtime_deserialize(rt, f, fds);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to deserialize runtime: %m");
+
+ r = exec_cgroup_context_deserialize(cg, f);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to deserialize cgroup context: %m");
+
+ return 0;
+}
diff --git a/src/core/execute-serialize.h b/src/core/execute-serialize.h
new file mode 100644
index 0000000..89c8e09
--- /dev/null
+++ b/src/core/execute-serialize.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "execute.h"
+
+/* These functions serialize/deserialize for invocation purposes (i.e.: serialized object is passed to a
+ * child process) rather than to save state across reload/reexec. */
+
+/* Writes context, command, parameters, runtime and cgroup context to f;
+ * file descriptors referenced by the state are collected into fds. */
+int exec_serialize_invocation(FILE *f,
+ FDSet *fds,
+ const ExecContext *ctx,
+ const ExecCommand *cmd,
+ const ExecParameters *p,
+ const ExecRuntime *rt,
+ const CGroupContext *cg);
+
+/* Reads the same state back from f, in the same order it was written. */
+int exec_deserialize_invocation(FILE *f,
+ FDSet *fds,
+ ExecContext *ctx,
+ ExecCommand *cmd,
+ ExecParameters *p,
+ ExecRuntime *rt,
+ CGroupContext *cg);
diff --git a/src/core/execute.c b/src/core/execute.c
new file mode 100644
index 0000000..8dbdfcf
--- /dev/null
+++ b/src/core/execute.c
@@ -0,0 +1,2742 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <sys/file.h>
+#include <sys/mman.h>
+#include <sys/personality.h>
+#include <sys/prctl.h>
+#include <sys/shm.h>
+#include <sys/types.h>
+#include <sys/un.h>
+#include <unistd.h>
+#include <utmpx.h>
+
+#include <linux/fs.h> /* Must be included after <sys/mount.h> */
+
+#include "sd-messages.h"
+
+#include "af-list.h"
+#include "alloc-util.h"
+#include "async.h"
+#include "cap-list.h"
+#include "capability-util.h"
+#include "cgroup-setup.h"
+#include "constants.h"
+#include "cpu-set-util.h"
+#include "dev-setup.h"
+#include "env-file.h"
+#include "env-util.h"
+#include "errno-list.h"
+#include "escape.h"
+#include "exec-credential.h"
+#include "execute.h"
+#include "execute-serialize.h"
+#include "exit-status.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "glob-util.h"
+#include "hexdecoct.h"
+#include "ioprio-util.h"
+#include "lock-util.h"
+#include "log.h"
+#include "macro.h"
+#include "manager.h"
+#include "manager-dump.h"
+#include "memory-util.h"
+#include "missing_fs.h"
+#include "missing_prctl.h"
+#include "mkdir-label.h"
+#include "namespace.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "rlimit-util.h"
+#include "rm-rf.h"
+#include "seccomp-util.h"
+#include "securebits-util.h"
+#include "selinux-util.h"
+#include "serialize.h"
+#include "sort-util.h"
+#include "special.h"
+#include "stat-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "syslog-util.h"
+#include "terminal-util.h"
+#include "tmpfile-util.h"
+#include "umask-util.h"
+#include "unit-serialize.h"
+#include "user-util.h"
+#include "utmp-wtmp.h"
+
+/* Returns true if the given stdin type reads from a TTY (plain, forced, or fail-if-absent). */
+static bool is_terminal_input(ExecInput i) {
+ return IN_SET(i,
+ EXEC_INPUT_TTY,
+ EXEC_INPUT_TTY_FORCE,
+ EXEC_INPUT_TTY_FAIL);
+}
+
+/* Returns true if the given output type writes to a TTY/console (directly or in addition to kmsg/journal). */
+static bool is_terminal_output(ExecOutput o) {
+ return IN_SET(o,
+ EXEC_OUTPUT_TTY,
+ EXEC_OUTPUT_KMSG_AND_CONSOLE,
+ EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
+}
+
+/* Returns the TTY path for this context: the configured tty_path if set,
+ * otherwise the /dev/console fallback. Returns NULL when stdio comes from
+ * passed-in fds (stdio_as_fds), since then no TTY path applies. */
+const char *exec_context_tty_path(const ExecContext *context) {
+ assert(context);
+
+ if (context->stdio_as_fds)
+ return NULL;
+
+ if (context->tty_path)
+ return context->tty_path;
+
+ return "/dev/console";
+}
+
+/* Determines the TTY dimensions to apply: explicit tty_rows/tty_cols from the
+ * context win; any dimension left at UINT_MAX is filled in (best-effort) from
+ * the kernel command line via proc_cmdline_tty_size(). tty_path may be NULL,
+ * in which case the context's TTY path is used. */
+static void exec_context_determine_tty_size(
+ const ExecContext *context,
+ const char *tty_path,
+ unsigned *ret_rows,
+ unsigned *ret_cols) {
+
+ unsigned rows, cols;
+
+ assert(context);
+ assert(ret_rows);
+ assert(ret_cols);
+
+ if (!tty_path)
+ tty_path = exec_context_tty_path(context);
+
+ rows = context->tty_rows;
+ cols = context->tty_cols;
+
+ /* Only consult the kernel command line for dimensions not explicitly configured. */
+ if (tty_path && (rows == UINT_MAX || cols == UINT_MAX))
+ (void) proc_cmdline_tty_size(
+ tty_path,
+ rows == UINT_MAX ? &rows : NULL,
+ cols == UINT_MAX ? &cols : NULL);
+
+ *ret_rows = rows;
+ *ret_cols = cols;
+}
+
+/* Applies the configured (or kernel-cmdline-derived) rows/columns to the given
+ * TTY fd/path via terminal_set_size_fd(). Returns its result. */
+int exec_context_apply_tty_size(
+ const ExecContext *context,
+ int tty_fd,
+ const char *tty_path) {
+
+ unsigned rows, cols;
+
+ exec_context_determine_tty_size(context, tty_path, &rows, &cols);
+
+ return terminal_set_size_fd(tty_fd, tty_path, rows, cols);
+ }
+
+/* Resets the TTY associated with this context: optional vhangup, optional
+ * terminal reset, size application, and optional VT deallocation — all
+ * best-effort. Uses the params' stdin fd if it is a TTY, otherwise opens the
+ * context's TTY path; returns silently if there is no TTY involved. */
+void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
+ _cleanup_close_ int _fd = -EBADF, lock_fd = -EBADF;
+ int fd;
+
+ assert(context);
+
+ const char *path = exec_context_tty_path(context);
+
+ if (p && p->stdin_fd >= 0 && isatty(p->stdin_fd))
+ fd = p->stdin_fd;
+ else if (path && (context->tty_path || is_terminal_input(context->std_input) ||
+ is_terminal_output(context->std_output) || is_terminal_output(context->std_error))) {
+ /* _fd is cleanup-closed; fd aliases it (or the caller-owned stdin fd above). */
+ fd = _fd = open_terminal(path, O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
+ if (fd < 0)
+ return (void) log_debug_errno(fd, "Failed to open terminal '%s', ignoring: %m", path);
+ } else
+ return; /* nothing to do */
+
+ /* Take a synchronization lock for the duration of the setup that we do here.
+ * systemd-vconsole-setup.service also takes the lock to avoid being interrupted. We open a new fd
+ * that will be closed automatically, and operate on it for convenience. */
+ lock_fd = lock_dev_console();
+ if (ERRNO_IS_NEG_PRIVILEGE(lock_fd))
+ log_debug_errno(lock_fd, "No privileges to lock /dev/console, proceeding without: %m");
+ else if (lock_fd < 0)
+ return (void) log_debug_errno(lock_fd, "Failed to lock /dev/console: %m");
+
+ if (context->tty_vhangup)
+ (void) terminal_vhangup_fd(fd);
+
+ if (context->tty_reset)
+ (void) reset_terminal_fd(fd, /* switch_to_text= */ true);
+
+ (void) exec_context_apply_tty_size(context, fd, path);
+
+ if (context->tty_vt_disallocate && path)
+ (void) vt_disallocate(path);
+}
+
+/* True if this context requires a network namespace: either PrivateNetwork= or
+ * an explicit NetworkNamespacePath= is set. */
+bool exec_needs_network_namespace(const ExecContext *context) {
+ assert(context);
+
+ return context->private_network || context->network_namespace_path;
+}
+
+/* True if an ephemeral copy of the root is requested: RootEphemeral= combined
+ * with either a root image or a root directory. */
+static bool exec_needs_ephemeral(const ExecContext *context) {
+ return (context->root_image || context->root_directory) && context->root_ephemeral;
+}
+
+/* True if this context requires an IPC namespace: either PrivateIPC= or an
+ * explicit IPCNamespacePath= is set. */
+bool exec_needs_ipc_namespace(const ExecContext *context) {
+ assert(context);
+
+ return context->private_ipc || context->ipc_namespace_path;
+}
+
+/* Decides whether executing this context requires setting up a mount
+ * namespace. Any setting that remaps, restricts or adds mounts — root
+ * image/directory handling, path lists, bind mounts, tmpfs, images,
+ * extensions, mount propagation, private/protect* options, per-unit
+ * directories under a root, dynamic users, or a log namespace — forces one.
+ * params and runtime may be NULL. */
+bool exec_needs_mount_namespace(
+ const ExecContext *context,
+ const ExecParameters *params,
+ const ExecRuntime *runtime) {
+
+ assert(context);
+
+ if (context->root_image)
+ return true;
+
+ if (!strv_isempty(context->read_write_paths) ||
+ !strv_isempty(context->read_only_paths) ||
+ !strv_isempty(context->inaccessible_paths) ||
+ !strv_isempty(context->exec_paths) ||
+ !strv_isempty(context->no_exec_paths))
+ return true;
+
+ if (context->n_bind_mounts > 0)
+ return true;
+
+ if (context->n_temporary_filesystems > 0)
+ return true;
+
+ if (context->n_mount_images > 0)
+ return true;
+
+ if (context->n_extension_images > 0)
+ return true;
+
+ if (!strv_isempty(context->extension_directories))
+ return true;
+
+ /* Anything other than default/shared propagation needs its own namespace. */
+ if (!IN_SET(context->mount_propagation_flag, 0, MS_SHARED))
+ return true;
+
+ /* PrivateTmp= only matters if the shared runtime actually allocated tmp dirs. */
+ if (context->private_tmp && runtime && runtime->shared && (runtime->shared->tmp_dir || runtime->shared->var_tmp_dir))
+ return true;
+
+ if (context->private_devices ||
+ context->private_mounts > 0 ||
+ (context->private_mounts < 0 && exec_needs_network_namespace(context)) ||
+ context->protect_system != PROTECT_SYSTEM_NO ||
+ context->protect_home != PROTECT_HOME_NO ||
+ context->protect_kernel_tunables ||
+ context->protect_kernel_modules ||
+ context->protect_kernel_logs ||
+ context->protect_control_groups ||
+ context->protect_proc != PROTECT_PROC_DEFAULT ||
+ context->proc_subset != PROC_SUBSET_ALL ||
+ exec_needs_ipc_namespace(context))
+ return true;
+
+ if (context->root_directory) {
+ if (exec_context_get_effective_mount_apivfs(context))
+ return true;
+
+ /* Per-unit directories must be mounted into the chroot. */
+ for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
+ if (params && !params->prefix[t])
+ continue;
+
+ if (context->directories[t].n_items > 0)
+ return true;
+ }
+ }
+
+ /* DynamicUser= needs the "private" symlink indirection for persistent dirs. */
+ if (context->dynamic_user &&
+ (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
+ context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
+ context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
+ return true;
+
+ if (context->log_namespace)
+ return true;
+
+ return false;
+}
+
+/* Returns whether directories of the given type live under the "private/"
+ * subdirectory. Only relevant for DynamicUser= units; configuration dirs and
+ * non-preserved runtime dirs are exempt. */
+bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
+ assert(context);
+
+ if (!context->dynamic_user)
+ return false;
+
+ if (type == EXEC_DIRECTORY_CONFIGURATION)
+ return false;
+
+ if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
+ return false;
+
+ return true;
+}
+
+/* Computes the cgroup path this invocation should run in. Returns > 0 (and a
+ * newly allocated path in *ret) when a subgroup was appended, 0 when the main
+ * cgroup path is used as-is, -EINVAL when params carry no cgroup path, -ENOMEM
+ * on allocation failure.
+ * NOTE(review): c is dereferenced (c->delegate_subgroup) when the delegate
+ * flag is set, but there is no assert(c) — callers presumably always pass a
+ * non-NULL context; confirm. */
+int exec_params_get_cgroup_path(
+ const ExecParameters *params,
+ const CGroupContext *c,
+ char **ret) {
+
+ const char *subgroup = NULL;
+ char *p;
+
+ assert(params);
+ assert(ret);
+
+ if (!params->cgroup_path)
+ return -EINVAL;
+
+ /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
+ * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
+ * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
+ * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
+ * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
+ * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
+ * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
+ * flag, which is only passed for the former statements, not for the latter. */
+
+ if (FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) && (FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) || c->delegate_subgroup)) {
+ if (FLAGS_SET(params->flags, EXEC_IS_CONTROL))
+ subgroup = ".control";
+ else
+ subgroup = c->delegate_subgroup;
+ }
+
+ if (subgroup)
+ p = path_join(params->cgroup_path, subgroup);
+ else
+ p = strdup(params->cgroup_path);
+ if (!p)
+ return -ENOMEM;
+
+ *ret = p;
+ /* Positive return signals "a subgroup is in use" to the caller. */
+ return !!subgroup;
+}
+
+/* Trivial accessor: whether CPU affinity should be derived from the NUMA policy. */
+bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
+ assert(c);
+
+ return c->cpu_affinity_from_numa;
+}
+
+/* Debug-logs the command line about to be executed for a unit. No-op unless
+ * debug logging is enabled. cmdline may be NULL on allocation failure, which
+ * strnull() tolerates. */
+static void log_command_line(Unit *unit, const char *msg, const char *executable, char **argv) {
+ assert(unit);
+ assert(msg);
+ assert(executable);
+
+ if (!DEBUG_LOGGING)
+ return;
+
+ _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
+
+ log_unit_struct(unit, LOG_DEBUG,
+ "EXECUTABLE=%s", executable,
+ LOG_UNIT_MESSAGE(unit, "%s: %s", msg, strnull(cmdline)),
+ LOG_UNIT_INVOCATION_ID(unit));
+}
+
+static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
+
+/* Spawns one ExecCommand of a unit via the pinned systemd-executor binary:
+ * loads environment files, optionally creates a delegated subcgroup,
+ * serializes the full invocation state to a temp file, then posix_spawn()s
+ * the executor with "--deserialize <fd>". On success stores the child PID in
+ * *ret and records it in the command's exec_status. Returns 0 or a negative
+ * errno (already logged at error level). */
+int exec_spawn(Unit *unit,
+ ExecCommand *command,
+ const ExecContext *context,
+ ExecParameters *params,
+ ExecRuntime *runtime,
+ const CGroupContext *cgroup_context,
+ pid_t *ret) {
+
+ char serialization_fd_number[DECIMAL_STR_MAX(int) + 1];
+ _cleanup_free_ char *subcgroup_path = NULL, *log_level = NULL, *executor_path = NULL;
+ _cleanup_fdset_free_ FDSet *fdset = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+ pid_t pid;
+ int r;
+
+ assert(unit);
+ assert(unit->manager);
+ assert(unit->manager->executor_fd >= 0);
+ assert(command);
+ assert(context);
+ assert(ret);
+ assert(params);
+ assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
+ assert(!params->files_env); /* We fill this field, ensure it comes NULL-initialized to us */
+
+ LOG_CONTEXT_PUSH_UNIT(unit);
+
+ r = exec_context_load_environment(unit, context, &params->files_env);
+ if (r < 0)
+ return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
+
+ /* We won't know the real executable path until we create the mount namespace in the child, but we
+ want to log from the parent, so we use the possibly inaccurate path here. */
+ log_command_line(unit, "About to execute", command->path, command->argv);
+
+ if (params->cgroup_path) {
+ r = exec_params_get_cgroup_path(params, cgroup_context, &subcgroup_path);
+ if (r < 0)
+ return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
+ if (r > 0) {
+ /* If there's a subcgroup, then let's create it here now (the main cgroup was already
+ * realized by the unit logic) */
+
+ r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
+ if (r < 0)
+ return log_unit_error_errno(unit, r, "Failed to create subcgroup '%s': %m", subcgroup_path);
+ }
+ }
+
+ /* In order to avoid copy-on-write traps and OOM-kills when pid1's memory.current is above the
+ * child's memory.max, serialize all the state needed to start the unit, and pass it to the
+ * systemd-executor binary. clone() with CLONE_VM + CLONE_VFORK will pause the parent until the exec
+ * and ensure all memory is shared. The child immediately execs the new binary so the delay should
+ * be minimal. Once glibc provides a clone3 wrapper we can switch to that, and clone directly in the
+ * target cgroup. */
+
+ r = open_serialization_file("sd-executor-state", &f);
+ if (r < 0)
+ return log_unit_error_errno(unit, r, "Failed to open serialization stream: %m");
+
+ fdset = fdset_new();
+ if (!fdset)
+ return log_oom();
+
+ r = exec_serialize_invocation(f, fdset, context, command, params, runtime, cgroup_context);
+ if (r < 0)
+ return log_unit_error_errno(unit, r, "Failed to serialize parameters: %m");
+
+ /* The executor reads the stream from the start. */
+ if (fseeko(f, 0, SEEK_SET) < 0)
+ return log_unit_error_errno(unit, errno, "Failed to reseek on serialization stream: %m");
+
+ /* Both the serialization fd and all serialized fds must survive the exec. */
+ r = fd_cloexec(fileno(f), false);
+ if (r < 0)
+ return log_unit_error_errno(unit, r, "Failed to set O_CLOEXEC on serialization fd: %m");
+
+ r = fdset_cloexec(fdset, false);
+ if (r < 0)
+ return log_unit_error_errno(unit, r, "Failed to set O_CLOEXEC on serialized fds: %m");
+
+ r = log_level_to_string_alloc(log_get_max_level(), &log_level);
+ if (r < 0)
+ return log_unit_error_errno(unit, r, "Failed to convert log level to string: %m");
+
+ r = fd_get_path(unit->manager->executor_fd, &executor_path);
+ if (r < 0)
+ return log_unit_error_errno(unit, r, "Failed to get executor path from fd: %m");
+
+ xsprintf(serialization_fd_number, "%i", fileno(f));
+
+ /* The executor binary is pinned, to avoid compatibility problems during upgrades. */
+ r = posix_spawn_wrapper(
+ FORMAT_PROC_FD_PATH(unit->manager->executor_fd),
+ STRV_MAKE(executor_path,
+ "--deserialize", serialization_fd_number,
+ "--log-level", log_level,
+ "--log-target", log_target_to_string(manager_get_executor_log_target(unit->manager))),
+ environ,
+ &pid);
+ if (r < 0)
+ return log_unit_error_errno(unit, r, "Failed to spawn executor: %m");
+
+ log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
+
+ /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
+ * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
+ * process will be killed too). */
+ if (subcgroup_path)
+ (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
+
+ exec_status_start(&command->exec_status, pid);
+
+ *ret = pid;
+ return 0;
+}
+
+/* Initializes c to its documented defaults (umask 0022, SCHED_OTHER, infinite
+ * timer slack, tri-state fields at -1, "unset" sentinels like UINT_MAX /
+ * PERSONALITY_INVALID / CAP_MASK_UNSET), then sets each exec directory's mode
+ * to 0755 and resets the NUMA policy. */
+void exec_context_init(ExecContext *c) {
+ assert(c);
+
+ /* When initializing a bool member to 'true', make sure to serialize in execute-serialize.c using
+ * serialize_bool() instead of serialize_bool_elide(). */
+
+ *c = (ExecContext) {
+ .umask = 0022,
+ .ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO,
+ .cpu_sched_policy = SCHED_OTHER,
+ .syslog_priority = LOG_DAEMON|LOG_INFO,
+ .syslog_level_prefix = true,
+ .ignore_sigpipe = true,
+ .timer_slack_nsec = NSEC_INFINITY,
+ .personality = PERSONALITY_INVALID,
+ .timeout_clean_usec = USEC_INFINITY,
+ .capability_bounding_set = CAP_MASK_UNSET,
+ .restrict_namespaces = NAMESPACE_FLAGS_INITIAL,
+ .log_level_max = -1,
+#if HAVE_SECCOMP
+ .syscall_errno = SECCOMP_ERROR_NUMBER_KILL,
+#endif
+ .tty_rows = UINT_MAX,
+ .tty_cols = UINT_MAX,
+ .private_mounts = -1,
+ .memory_ksm = -1,
+ .set_login_environment = -1,
+ };
+
+ FOREACH_ARRAY(d, c->directories, _EXEC_DIRECTORY_TYPE_MAX)
+ d->mode = 0755;
+
+ numa_policy_reset(&c->numa_policy);
+
+ /* The "initial" namespace flag set must be distinguishable from "all", as it doubles as sentinel. */
+ assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
+}
+
+/* Frees all dynamically allocated members of c and resets counters/sentinels.
+ * Does not free c itself; c remains valid (mostly zeroed) afterwards. */
+void exec_context_done(ExecContext *c) {
+ assert(c);
+
+ c->environment = strv_free(c->environment);
+ c->environment_files = strv_free(c->environment_files);
+ c->pass_environment = strv_free(c->pass_environment);
+ c->unset_environment = strv_free(c->unset_environment);
+
+ rlimit_free_all(c->rlimit);
+
+ /* stdin/stdout/stderr per-stream names and file paths. */
+ for (size_t l = 0; l < 3; l++) {
+ c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
+ c->stdio_file[l] = mfree(c->stdio_file[l]);
+ }
+
+ c->working_directory = mfree(c->working_directory);
+ c->root_directory = mfree(c->root_directory);
+ c->root_image = mfree(c->root_image);
+ c->root_image_options = mount_options_free_all(c->root_image_options);
+ c->root_hash = mfree(c->root_hash);
+ c->root_hash_size = 0;
+ c->root_hash_path = mfree(c->root_hash_path);
+ c->root_hash_sig = mfree(c->root_hash_sig);
+ c->root_hash_sig_size = 0;
+ c->root_hash_sig_path = mfree(c->root_hash_sig_path);
+ c->root_verity = mfree(c->root_verity);
+ c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
+ c->extension_directories = strv_free(c->extension_directories);
+ c->tty_path = mfree(c->tty_path);
+ c->syslog_identifier = mfree(c->syslog_identifier);
+ c->user = mfree(c->user);
+ c->group = mfree(c->group);
+
+ c->supplementary_groups = strv_free(c->supplementary_groups);
+
+ c->pam_name = mfree(c->pam_name);
+
+ c->read_only_paths = strv_free(c->read_only_paths);
+ c->read_write_paths = strv_free(c->read_write_paths);
+ c->inaccessible_paths = strv_free(c->inaccessible_paths);
+ c->exec_paths = strv_free(c->exec_paths);
+ c->no_exec_paths = strv_free(c->no_exec_paths);
+ c->exec_search_path = strv_free(c->exec_search_path);
+
+ bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
+ c->bind_mounts = NULL;
+ c->n_bind_mounts = 0;
+ temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
+ c->temporary_filesystems = NULL;
+ c->n_temporary_filesystems = 0;
+ c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
+
+ cpu_set_reset(&c->cpu_set);
+ numa_policy_reset(&c->numa_policy);
+
+ c->utmp_id = mfree(c->utmp_id);
+ c->selinux_context = mfree(c->selinux_context);
+ c->apparmor_profile = mfree(c->apparmor_profile);
+ c->smack_process_label = mfree(c->smack_process_label);
+
+ c->restrict_filesystems = set_free_free(c->restrict_filesystems);
+
+ c->syscall_filter = hashmap_free(c->syscall_filter);
+ c->syscall_archs = set_free(c->syscall_archs);
+ c->address_families = set_free(c->address_families);
+
+ FOREACH_ARRAY(d, c->directories, _EXEC_DIRECTORY_TYPE_MAX)
+ exec_directory_done(d);
+
+ /* -1 is the "unset" sentinel established by exec_context_init(). */
+ c->log_level_max = -1;
+
+ exec_context_free_log_extra_fields(c);
+ c->log_filter_allowed_patterns = set_free_free(c->log_filter_allowed_patterns);
+ c->log_filter_denied_patterns = set_free_free(c->log_filter_denied_patterns);
+
+ c->log_ratelimit_interval_usec = 0;
+ c->log_ratelimit_burst = 0;
+
+ c->stdin_data = mfree(c->stdin_data);
+ c->stdin_data_size = 0;
+
+ c->network_namespace_path = mfree(c->network_namespace_path);
+ c->ipc_namespace_path = mfree(c->ipc_namespace_path);
+
+ c->log_namespace = mfree(c->log_namespace);
+
+ c->load_credentials = hashmap_free(c->load_credentials);
+ c->set_credentials = hashmap_free(c->set_credentials);
+ c->import_credentials = set_free_free(c->import_credentials);
+
+ c->root_image_policy = image_policy_free(c->root_image_policy);
+ c->mount_image_policy = image_policy_free(c->mount_image_policy);
+ c->extension_image_policy = image_policy_free(c->extension_image_policy);
+}
+
/* Synchronously remove a unit's RuntimeDirectory= entries (and their configured
 * symlinks) below the given runtime prefix, so the next service start begins
 * from a clean slate. Entries of private contexts live under an extra
 * "private/" subtree, which is honored here.
 *
 * Returns 0 on success (also when runtime_prefix is NULL), -ENOMEM on
 * allocation failure. Errors from the removal itself are deliberately
 * ignored. */
int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
        assert(c);

        if (!runtime_prefix)
                return 0;

        FOREACH_ARRAY(i, c->directories[EXEC_DIRECTORY_RUNTIME].items, c->directories[EXEC_DIRECTORY_RUNTIME].n_items) {
                _cleanup_free_ char *p = NULL;

                if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
                        p = path_join(runtime_prefix, "private", i->path);
                else
                        p = path_join(runtime_prefix, i->path);
                if (!p)
                        return -ENOMEM;

                /* We execute this synchronously, since we need to be sure this is gone when we start the
                 * service next. */
                (void) rm_rf(p, REMOVE_ROOT);

                /* Symlinks are only ever files, hence a plain unlink() suffices. */
                STRV_FOREACH(symlink, i->symlinks) {
                        _cleanup_free_ char *symlink_abs = NULL;

                        if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
                                symlink_abs = path_join(runtime_prefix, "private", *symlink);
                        else
                                symlink_abs = path_join(runtime_prefix, *symlink);
                        if (!symlink_abs)
                                return -ENOMEM;

                        (void) unlink(symlink_abs);
                }
        }

        return 0;
}
+
/* Remove the per-unit mount propagation directory under /run/systemd/propagate/.
 * Only meaningful for the system manager; for user managers (or u == NULL) this
 * is a no-op. Returns 0, or -ENOMEM on allocation failure; rmdir() errors are
 * merely logged at debug level. */
int exec_context_destroy_mount_ns_dir(Unit *u) {
        _cleanup_free_ char *p = NULL;

        if (!u || !MANAGER_IS_SYSTEM(u->manager))
                return 0;

        p = path_join("/run/systemd/propagate/", u->id);
        if (!p)
                return -ENOMEM;

        /* This is only filled transiently (see mount_in_namespace()), should be empty or even non-existent */
        if (rmdir(p) < 0 && errno != ENOENT)
                log_unit_debug_errno(u, errno, "Unable to remove propagation dir '%s', ignoring: %m", p);

        return 0;
}
+
+void exec_command_done(ExecCommand *c) {
+ assert(c);
+
+ c->path = mfree(c->path);
+ c->argv = strv_free(c->argv);
+}
+
+void exec_command_done_array(ExecCommand *c, size_t n) {
+ FOREACH_ARRAY(i, c, n)
+ exec_command_done(i);
+}
+
/* Deep-free a linked list of ExecCommand entries. Always returns NULL so
 * callers can reset their pointer in one statement. */
ExecCommand* exec_command_free_list(ExecCommand *c) {
        ExecCommand *i;

        /* Pop entries one by one so that each node is unlinked before it is freed. */
        while ((i = LIST_POP(command, c))) {
                exec_command_done(i);
                free(i);
        }

        return NULL;
}
+
+void exec_command_free_array(ExecCommand **c, size_t n) {
+ FOREACH_ARRAY(i, c, n)
+ *i = exec_command_free_list(*i);
+}
+
+void exec_command_reset_status_array(ExecCommand *c, size_t n) {
+ FOREACH_ARRAY(i, c, n)
+ exec_status_reset(&i->exec_status);
+}
+
/* Reset the ExecStatus of every command in each of the n command lists. */
void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
        FOREACH_ARRAY(i, c, n)
                LIST_FOREACH(command, z, *i)
                        exec_status_reset(&z->exec_status);
}
+
/* Context handed to invalid_env() while cleaning environment files: identifies
 * the unit being processed and the file the offending assignment came from. */
typedef struct InvalidEnvInfo {
        const Unit *unit;
        const char *path;
} InvalidEnvInfo;
+
/* Callback for strv_env_clean_with_callback(): logs each invalid assignment
 * that is being dropped, together with the file it was read from. */
static void invalid_env(const char *p, void *userdata) {
        InvalidEnvInfo *info = userdata;

        log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
}
+
/* Return the fd name configured for the given stdio file descriptor
 * (STDIN/STDOUT/STDERR_FILENO), falling back to "stdin"/"stdout"/"stderr"
 * when the stream uses a named fd without an explicit name. Returns NULL when
 * the stream is not set up as EXEC_*_NAMED_FD, or fd_index is out of range. */
const char* exec_context_fdname(const ExecContext *c, int fd_index) {
        assert(c);

        switch (fd_index) {

        case STDIN_FILENO:
                if (c->std_input != EXEC_INPUT_NAMED_FD)
                        return NULL;

                return c->stdio_fdname[STDIN_FILENO] ?: "stdin";

        case STDOUT_FILENO:
                if (c->std_output != EXEC_OUTPUT_NAMED_FD)
                        return NULL;

                return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";

        case STDERR_FILENO:
                if (c->std_error != EXEC_OUTPUT_NAMED_FD)
                        return NULL;

                return c->stdio_fdname[STDERR_FILENO] ?: "stderr";

        default:
                return NULL;
        }
}
+
/* Load all EnvironmentFile= entries of the context into a single environment
 * list, with later files overriding earlier ones. File names support glob
 * expansion; a leading '-' makes all errors for that entry non-fatal. Invalid
 * assignments are dropped (and logged via invalid_env()). On success *ret
 * receives the merged list, which may be NULL when nothing was loaded. */
static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
        _cleanup_strv_free_ char **v = NULL;
        int r;

        assert(c);
        assert(ret);

        STRV_FOREACH(i, c->environment_files) {
                _cleanup_globfree_ glob_t pglob = {};
                bool ignore = false;
                char *fn = *i;

                /* A leading '-' marks the entry as best-effort. */
                if (fn[0] == '-') {
                        ignore = true;
                        fn++;
                }

                if (!path_is_absolute(fn)) {
                        if (ignore)
                                continue;
                        return -EINVAL;
                }

                /* Filename supports globbing, take all matching files */
                r = safe_glob(fn, 0, &pglob);
                if (r < 0) {
                        if (ignore)
                                continue;
                        return r;
                }

                /* When we don't match anything, -ENOENT should be returned */
                assert(pglob.gl_pathc > 0);

                FOREACH_ARRAY(path, pglob.gl_pathv, pglob.gl_pathc) {
                        _cleanup_strv_free_ char **p = NULL;

                        r = load_env_file(NULL, *path, &p);
                        if (r < 0) {
                                if (ignore)
                                        continue;
                                return r;
                        }

                        /* Log invalid environment variables with filename */
                        if (p) {
                                InvalidEnvInfo info = {
                                        .unit = unit,
                                        .path = *path,
                                };

                                p = strv_env_clean_with_callback(p, invalid_env, &info);
                        }

                        /* Merge into the accumulated result; assignments from later
                         * files win over earlier ones. */
                        if (!v)
                                v = TAKE_PTR(p);
                        else {
                                char **m = strv_env_merge(v, p);
                                if (!m)
                                        return -ENOMEM;

                                strv_free_and_replace(v, m);
                        }
                }
        }

        *ret = TAKE_PTR(v);

        return 0;
}
+
/* Check whether the given TTY may be the device behind /dev/console. Errs on
 * the side of returning true when the answer cannot be determined (NULL tty,
 * unresolvable console). */
static bool tty_may_match_dev_console(const char *tty) {
        _cleanup_free_ char *resolved = NULL;

        if (!tty)
                return true;

        tty = skip_dev_prefix(tty);

        /* trivial identity? */
        if (streq(tty, "console"))
                return true;

        if (resolve_dev_console(&resolved) < 0)
                return true; /* if we could not resolve, assume it may */

        /* "tty0" means the active VC, so it may be the same sometimes */
        return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
}
+
+static bool exec_context_may_touch_tty(const ExecContext *ec) {
+ assert(ec);
+
+ return ec->tty_reset ||
+ ec->tty_vhangup ||
+ ec->tty_vt_disallocate ||
+ is_terminal_input(ec->std_input) ||
+ is_terminal_output(ec->std_output) ||
+ is_terminal_output(ec->std_error);
+}
+
/* True if this context might touch the TTY *and* that TTY may actually be
 * /dev/console — used to serialize console access across units. */
bool exec_context_may_touch_console(const ExecContext *ec) {

        return exec_context_may_touch_tty(ec) &&
               tty_may_match_dev_console(exec_context_tty_path(ec));
}
+
/* Print every string of the vector to f, each preceded by a single space. */
static void strv_fprintf(FILE *f, char **l) {
        assert(f);

        STRV_FOREACH(g, l)
                fprintf(f, " %s", *g);
}
+
/* Dump a string vector as "<prefix><name>: a b c\n"; prints nothing at all
 * when the vector is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (!strv_isempty(strv)) {
                fprintf(f, "%s%s:", prefix, name);
                strv_fprintf(f, strv);
                fputs("\n", f);
        }
}
+
+void exec_params_dump(const ExecParameters *p, FILE* f, const char *prefix) {
+ assert(p);
+ assert(f);
+
+ prefix = strempty(prefix);
+
+ fprintf(f,
+ "%sRuntimeScope: %s\n"
+ "%sExecFlags: %u\n"
+ "%sSELinuxContextNetwork: %s\n"
+ "%sCgroupSupportedMask: %u\n"
+ "%sCgroupPath: %s\n"
+ "%sCrededentialsDirectory: %s\n"
+ "%sEncryptedCredentialsDirectory: %s\n"
+ "%sConfirmSpawn: %s\n"
+ "%sShallConfirmSpawn: %s\n"
+ "%sWatchdogUSec: " USEC_FMT "\n"
+ "%sNotifySocket: %s\n"
+ "%sFallbackSmackProcessLabel: %s\n",
+ prefix, runtime_scope_to_string(p->runtime_scope),
+ prefix, p->flags,
+ prefix, yes_no(p->selinux_context_net),
+ prefix, p->cgroup_supported,
+ prefix, p->cgroup_path,
+ prefix, strempty(p->received_credentials_directory),
+ prefix, strempty(p->received_encrypted_credentials_directory),
+ prefix, strempty(p->confirm_spawn),
+ prefix, yes_no(p->shall_confirm_spawn),
+ prefix, p->watchdog_usec,
+ prefix, strempty(p->notify_socket),
+ prefix, strempty(p->fallback_smack_process_label));
+
+ strv_dump(f, prefix, "FdNames", p->fd_names);
+ strv_dump(f, prefix, "Environment", p->environment);
+ strv_dump(f, prefix, "Prefix", p->prefix);
+
+ LIST_FOREACH(open_files, file, p->open_files)
+ fprintf(f, "%sOpenFile: %s %s", prefix, file->path, open_file_flags_to_string(file->flags));
+
+ strv_dump(f, prefix, "FilesEnv", p->files_env);
+}
+
+void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
+ int r;
+
+ assert(c);
+ assert(f);
+
+ prefix = strempty(prefix);
+
+ fprintf(f,
+ "%sUMask: %04o\n"
+ "%sWorkingDirectory: %s\n"
+ "%sRootDirectory: %s\n"
+ "%sRootEphemeral: %s\n"
+ "%sNonBlocking: %s\n"
+ "%sPrivateTmp: %s\n"
+ "%sPrivateDevices: %s\n"
+ "%sProtectKernelTunables: %s\n"
+ "%sProtectKernelModules: %s\n"
+ "%sProtectKernelLogs: %s\n"
+ "%sProtectClock: %s\n"
+ "%sProtectControlGroups: %s\n"
+ "%sPrivateNetwork: %s\n"
+ "%sPrivateUsers: %s\n"
+ "%sProtectHome: %s\n"
+ "%sProtectSystem: %s\n"
+ "%sMountAPIVFS: %s\n"
+ "%sIgnoreSIGPIPE: %s\n"
+ "%sMemoryDenyWriteExecute: %s\n"
+ "%sRestrictRealtime: %s\n"
+ "%sRestrictSUIDSGID: %s\n"
+ "%sKeyringMode: %s\n"
+ "%sProtectHostname: %s\n"
+ "%sProtectProc: %s\n"
+ "%sProcSubset: %s\n",
+ prefix, c->umask,
+ prefix, empty_to_root(c->working_directory),
+ prefix, empty_to_root(c->root_directory),
+ prefix, yes_no(c->root_ephemeral),
+ prefix, yes_no(c->non_blocking),
+ prefix, yes_no(c->private_tmp),
+ prefix, yes_no(c->private_devices),
+ prefix, yes_no(c->protect_kernel_tunables),
+ prefix, yes_no(c->protect_kernel_modules),
+ prefix, yes_no(c->protect_kernel_logs),
+ prefix, yes_no(c->protect_clock),
+ prefix, yes_no(c->protect_control_groups),
+ prefix, yes_no(c->private_network),
+ prefix, yes_no(c->private_users),
+ prefix, protect_home_to_string(c->protect_home),
+ prefix, protect_system_to_string(c->protect_system),
+ prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
+ prefix, yes_no(c->ignore_sigpipe),
+ prefix, yes_no(c->memory_deny_write_execute),
+ prefix, yes_no(c->restrict_realtime),
+ prefix, yes_no(c->restrict_suid_sgid),
+ prefix, exec_keyring_mode_to_string(c->keyring_mode),
+ prefix, yes_no(c->protect_hostname),
+ prefix, protect_proc_to_string(c->protect_proc),
+ prefix, proc_subset_to_string(c->proc_subset));
+
+ if (c->set_login_environment >= 0)
+ fprintf(f, "%sSetLoginEnvironment: %s\n", prefix, yes_no(c->set_login_environment > 0));
+
+ if (c->root_image)
+ fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
+
+ if (c->root_image_options) {
+ fprintf(f, "%sRootImageOptions:", prefix);
+ LIST_FOREACH(mount_options, o, c->root_image_options)
+ if (!isempty(o->options))
+ fprintf(f, " %s:%s",
+ partition_designator_to_string(o->partition_designator),
+ o->options);
+ fprintf(f, "\n");
+ }
+
+ if (c->root_hash) {
+ _cleanup_free_ char *encoded = NULL;
+ encoded = hexmem(c->root_hash, c->root_hash_size);
+ if (encoded)
+ fprintf(f, "%sRootHash: %s\n", prefix, encoded);
+ }
+
+ if (c->root_hash_path)
+ fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
+
+ if (c->root_hash_sig) {
+ _cleanup_free_ char *encoded = NULL;
+ ssize_t len;
+ len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
+ if (len)
+ fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
+ }
+
+ if (c->root_hash_sig_path)
+ fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
+
+ if (c->root_verity)
+ fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
+
+ STRV_FOREACH(e, c->environment)
+ fprintf(f, "%sEnvironment: %s\n", prefix, *e);
+
+ STRV_FOREACH(e, c->environment_files)
+ fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
+
+ STRV_FOREACH(e, c->pass_environment)
+ fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
+
+ STRV_FOREACH(e, c->unset_environment)
+ fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
+
+ fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
+
+ for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
+ fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
+
+ for (size_t i = 0; i < c->directories[dt].n_items; i++) {
+ fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
+
+ STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
+ fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
+ }
+ }
+
+ fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
+
+ if (c->memory_ksm >= 0)
+ fprintf(f, "%sMemoryKSM: %s\n", prefix, yes_no(c->memory_ksm > 0));
+
+ if (c->nice_set)
+ fprintf(f, "%sNice: %i\n", prefix, c->nice);
+
+ if (c->oom_score_adjust_set)
+ fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
+
+ if (c->coredump_filter_set)
+ fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
+
+ for (unsigned i = 0; i < RLIM_NLIMITS; i++)
+ if (c->rlimit[i]) {
+ fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
+ prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
+ fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
+ prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
+ }
+
+ if (c->ioprio_set) {
+ _cleanup_free_ char *class_str = NULL;
+
+ r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
+ if (r >= 0)
+ fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
+
+ fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
+ }
+
+ if (c->cpu_sched_set) {
+ _cleanup_free_ char *policy_str = NULL;
+
+ r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
+ if (r >= 0)
+ fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
+
+ fprintf(f,
+ "%sCPUSchedulingPriority: %i\n"
+ "%sCPUSchedulingResetOnFork: %s\n",
+ prefix, c->cpu_sched_priority,
+ prefix, yes_no(c->cpu_sched_reset_on_fork));
+ }
+
+ if (c->cpu_set.set) {
+ _cleanup_free_ char *affinity = NULL;
+
+ affinity = cpu_set_to_range_string(&c->cpu_set);
+ fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
+ }
+
+ if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
+ _cleanup_free_ char *nodes = NULL;
+
+ nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
+ fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
+ fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
+ }
+
+ if (c->timer_slack_nsec != NSEC_INFINITY)
+ fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
+
+ fprintf(f,
+ "%sStandardInput: %s\n"
+ "%sStandardOutput: %s\n"
+ "%sStandardError: %s\n",
+ prefix, exec_input_to_string(c->std_input),
+ prefix, exec_output_to_string(c->std_output),
+ prefix, exec_output_to_string(c->std_error));
+
+ if (c->std_input == EXEC_INPUT_NAMED_FD)
+ fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
+ if (c->std_output == EXEC_OUTPUT_NAMED_FD)
+ fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
+ if (c->std_error == EXEC_OUTPUT_NAMED_FD)
+ fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
+
+ if (c->std_input == EXEC_INPUT_FILE)
+ fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
+ if (c->std_output == EXEC_OUTPUT_FILE)
+ fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
+ if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
+ fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
+ if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
+ fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
+ if (c->std_error == EXEC_OUTPUT_FILE)
+ fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
+ if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
+ fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
+ if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
+ fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
+
+ if (c->tty_path)
+ fprintf(f,
+ "%sTTYPath: %s\n"
+ "%sTTYReset: %s\n"
+ "%sTTYVHangup: %s\n"
+ "%sTTYVTDisallocate: %s\n"
+ "%sTTYRows: %u\n"
+ "%sTTYColumns: %u\n",
+ prefix, c->tty_path,
+ prefix, yes_no(c->tty_reset),
+ prefix, yes_no(c->tty_vhangup),
+ prefix, yes_no(c->tty_vt_disallocate),
+ prefix, c->tty_rows,
+ prefix, c->tty_cols);
+
+ if (IN_SET(c->std_output,
+ EXEC_OUTPUT_KMSG,
+ EXEC_OUTPUT_JOURNAL,
+ EXEC_OUTPUT_KMSG_AND_CONSOLE,
+ EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
+ IN_SET(c->std_error,
+ EXEC_OUTPUT_KMSG,
+ EXEC_OUTPUT_JOURNAL,
+ EXEC_OUTPUT_KMSG_AND_CONSOLE,
+ EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
+
+ _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
+
+ r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
+ if (r >= 0)
+ fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
+
+ r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
+ if (r >= 0)
+ fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
+ }
+
+ if (c->log_level_max >= 0) {
+ _cleanup_free_ char *t = NULL;
+
+ (void) log_level_to_string_alloc(c->log_level_max, &t);
+
+ fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
+ }
+
+ if (c->log_ratelimit_interval_usec > 0)
+ fprintf(f,
+ "%sLogRateLimitIntervalSec: %s\n",
+ prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
+
+ if (c->log_ratelimit_burst > 0)
+ fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
+
+ if (!set_isempty(c->log_filter_allowed_patterns) || !set_isempty(c->log_filter_denied_patterns)) {
+ fprintf(f, "%sLogFilterPatterns:", prefix);
+
+ char *pattern;
+ SET_FOREACH(pattern, c->log_filter_allowed_patterns)
+ fprintf(f, " %s", pattern);
+ SET_FOREACH(pattern, c->log_filter_denied_patterns)
+ fprintf(f, " ~%s", pattern);
+ fputc('\n', f);
+ }
+
+ FOREACH_ARRAY(field, c->log_extra_fields, c->n_log_extra_fields) {
+ fprintf(f, "%sLogExtraFields: ", prefix);
+ fwrite(field->iov_base, 1, field->iov_len, f);
+ fputc('\n', f);
+ }
+
+ if (c->log_namespace)
+ fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
+
+ if (c->secure_bits) {
+ _cleanup_free_ char *str = NULL;
+
+ r = secure_bits_to_string_alloc(c->secure_bits, &str);
+ if (r >= 0)
+ fprintf(f, "%sSecure Bits: %s\n", prefix, str);
+ }
+
+ if (c->capability_bounding_set != CAP_MASK_UNSET) {
+ _cleanup_free_ char *str = NULL;
+
+ r = capability_set_to_string(c->capability_bounding_set, &str);
+ if (r >= 0)
+ fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
+ }
+
+ if (c->capability_ambient_set != 0) {
+ _cleanup_free_ char *str = NULL;
+
+ r = capability_set_to_string(c->capability_ambient_set, &str);
+ if (r >= 0)
+ fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
+ }
+
+ if (c->user)
+ fprintf(f, "%sUser: %s\n", prefix, c->user);
+ if (c->group)
+ fprintf(f, "%sGroup: %s\n", prefix, c->group);
+
+ fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
+
+ strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
+
+ if (c->pam_name)
+ fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
+
+ strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
+ strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
+ strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
+ strv_dump(f, prefix, "ExecPaths", c->exec_paths);
+ strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
+ strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
+
+ FOREACH_ARRAY(mount, c->bind_mounts, c->n_bind_mounts)
+ fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
+ mount->read_only ? "BindReadOnlyPaths" : "BindPaths",
+ mount->ignore_enoent ? "-": "",
+ mount->source,
+ mount->destination,
+ mount->recursive ? "rbind" : "norbind");
+
+ FOREACH_ARRAY(tmpfs, c->temporary_filesystems, c->n_temporary_filesystems)
+ fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
+ tmpfs->path,
+ isempty(tmpfs->options) ? "" : ":",
+ strempty(tmpfs->options));
+
+ if (c->utmp_id)
+ fprintf(f,
+ "%sUtmpIdentifier: %s\n",
+ prefix, c->utmp_id);
+
+ if (c->selinux_context)
+ fprintf(f,
+ "%sSELinuxContext: %s%s\n",
+ prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
+
+ if (c->apparmor_profile)
+ fprintf(f,
+ "%sAppArmorProfile: %s%s\n",
+ prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
+
+ if (c->smack_process_label)
+ fprintf(f,
+ "%sSmackProcessLabel: %s%s\n",
+ prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
+
+ if (c->personality != PERSONALITY_INVALID)
+ fprintf(f,
+ "%sPersonality: %s\n",
+ prefix, strna(personality_to_string(c->personality)));
+
+ fprintf(f,
+ "%sLockPersonality: %s\n",
+ prefix, yes_no(c->lock_personality));
+
+ if (c->syscall_filter) {
+ fprintf(f,
+ "%sSystemCallFilter: ",
+ prefix);
+
+ if (!c->syscall_allow_list)
+ fputc('~', f);
+
+#if HAVE_SECCOMP
+ void *id, *val;
+ bool first = true;
+ HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
+ _cleanup_free_ char *name = NULL;
+ const char *errno_name = NULL;
+ int num = PTR_TO_INT(val);
+
+ if (first)
+ first = false;
+ else
+ fputc(' ', f);
+
+ name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
+ fputs(strna(name), f);
+
+ if (num >= 0) {
+ errno_name = seccomp_errno_or_action_to_string(num);
+ if (errno_name)
+ fprintf(f, ":%s", errno_name);
+ else
+ fprintf(f, ":%d", num);
+ }
+ }
+#endif
+
+ fputc('\n', f);
+ }
+
+ if (c->syscall_archs) {
+ fprintf(f,
+ "%sSystemCallArchitectures:",
+ prefix);
+
+#if HAVE_SECCOMP
+ void *id;
+ SET_FOREACH(id, c->syscall_archs)
+ fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
+#endif
+ fputc('\n', f);
+ }
+
+ if (exec_context_restrict_namespaces_set(c)) {
+ _cleanup_free_ char *s = NULL;
+
+ r = namespace_flags_to_string(c->restrict_namespaces, &s);
+ if (r >= 0)
+ fprintf(f, "%sRestrictNamespaces: %s\n",
+ prefix, strna(s));
+ }
+
+#if HAVE_LIBBPF
+ if (exec_context_restrict_filesystems_set(c)) {
+ char *fs;
+ SET_FOREACH(fs, c->restrict_filesystems)
+ fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
+ }
+#endif
+
+ if (c->network_namespace_path)
+ fprintf(f,
+ "%sNetworkNamespacePath: %s\n",
+ prefix, c->network_namespace_path);
+
+ if (c->syscall_errno > 0) {
+ fprintf(f, "%sSystemCallErrorNumber: ", prefix);
+
+#if HAVE_SECCOMP
+ const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
+ if (errno_name)
+ fputs(errno_name, f);
+ else
+ fprintf(f, "%d", c->syscall_errno);
+#endif
+ fputc('\n', f);
+ }
+
+ FOREACH_ARRAY(mount, c->mount_images, c->n_mount_images) {
+ fprintf(f, "%sMountImages: %s%s:%s", prefix,
+ mount->ignore_enoent ? "-": "",
+ mount->source,
+ mount->destination);
+ LIST_FOREACH(mount_options, o, mount->mount_options)
+ fprintf(f, ":%s:%s",
+ partition_designator_to_string(o->partition_designator),
+ strempty(o->options));
+ fprintf(f, "\n");
+ }
+
+ FOREACH_ARRAY(mount, c->extension_images, c->n_extension_images) {
+ fprintf(f, "%sExtensionImages: %s%s", prefix,
+ mount->ignore_enoent ? "-": "",
+ mount->source);
+ LIST_FOREACH(mount_options, o, mount->mount_options)
+ fprintf(f, ":%s:%s",
+ partition_designator_to_string(o->partition_designator),
+ strempty(o->options));
+ fprintf(f, "\n");
+ }
+
+ strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
+}
+
+bool exec_context_maintains_privileges(const ExecContext *c) {
+ assert(c);
+
+ /* Returns true if the process forked off would run under
+ * an unchanged UID or as root. */
+
+ if (!c->user)
+ return true;
+
+ if (streq(c->user, "root") || streq(c->user, "0"))
+ return true;
+
+ return false;
+}
+
/* Return the configured I/O priority, or — if unset — the calling process'
 * current one (normalized); on query failure the kernel default class/prio
 * is returned. */
int exec_context_get_effective_ioprio(const ExecContext *c) {
        int p;

        assert(c);

        if (c->ioprio_set)
                return c->ioprio;

        p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
        if (p < 0)
                return IOPRIO_DEFAULT_CLASS_AND_PRIO;

        return ioprio_normalize(p);
}
+
/* Whether the API VFS (/proc, /sys, /dev) shall be mounted for this context. */
bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
        assert(c);

        /* Explicit setting wins */
        if (c->mount_apivfs_set)
                return c->mount_apivfs;

        /* Default to "yes" if root directory or image are specified */
        if (exec_context_with_rootfs(c))
                return true;

        return false;
}
+
+void exec_context_free_log_extra_fields(ExecContext *c) {
+ assert(c);
+
+ FOREACH_ARRAY(field, c->log_extra_fields, c->n_log_extra_fields)
+ free(field->iov_base);
+
+ c->log_extra_fields = mfree(c->log_extra_fields);
+ c->n_log_extra_fields = 0;
+}
+
+void exec_context_revert_tty(ExecContext *c) {
+ _cleanup_close_ int fd = -EBADF;
+ const char *path;
+ struct stat st;
+ int r;
+
+ assert(c);
+
+ /* First, reset the TTY (possibly kicking everybody else from the TTY) */
+ exec_context_tty_reset(c, /* parameters= */ NULL);
+
+ /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
+ * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
+ * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
+ if (!exec_context_may_touch_tty(c))
+ return;
+
+ path = exec_context_tty_path(c);
+ if (!path)
+ return;
+
+ fd = open(path, O_PATH|O_CLOEXEC); /* Pin the inode */
+ if (fd < 0)
+ return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
+ "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
+ path);
+
+ if (fstat(fd, &st) < 0)
+ return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
+
+ /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
+ * if things are a character device, since a proper check either means we'd have to open the TTY and
+ * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
+ * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
+ * with this at all? → https://github.com/systemd/systemd/issues/19213 */
+ if (!S_ISCHR(st.st_mode))
+ return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
+
+ r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
+ if (r < 0)
+ log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s to " UID_FMT ":" GID_FMT ", ignoring: %m", path, (uid_t) 0, (gid_t) TTY_GID);
+}
+
/* Build the list of directory paths to remove for a "clean" operation: for
 * every directory type selected in mask (and with a non-NULL prefix), the
 * configured paths, the parallel "private/" tree and any configured symlinks.
 * On success *ret receives the (possibly NULL) list; returns -ENOMEM on
 * allocation failure. */
int exec_context_get_clean_directories(
                ExecContext *c,
                char **prefix,
                ExecCleanMask mask,
                char ***ret) {

        _cleanup_strv_free_ char **l = NULL;
        int r;

        assert(c);
        assert(prefix);
        assert(ret);

        for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
                if (!FLAGS_SET(mask, 1U << t))
                        continue;

                if (!prefix[t])
                        continue;

                FOREACH_ARRAY(i, c->directories[t].items, c->directories[t].n_items) {
                        char *j;

                        j = path_join(prefix[t], i->path);
                        if (!j)
                                return -ENOMEM;

                        r = strv_consume(&l, j);
                        if (r < 0)
                                return r;

                        /* Also remove private directories unconditionally. */
                        if (t != EXEC_DIRECTORY_CONFIGURATION) {
                                j = path_join(prefix[t], "private", i->path);
                                if (!j)
                                        return -ENOMEM;

                                r = strv_consume(&l, j);
                                if (r < 0)
                                        return r;
                        }

                        STRV_FOREACH(symlink, i->symlinks) {
                                j = path_join(prefix[t], *symlink);
                                if (!j)
                                        return -ENOMEM;

                                r = strv_consume(&l, j);
                                if (r < 0)
                                        return r;
                        }
                }
        }

        *ret = TAKE_PTR(l);
        return 0;
}
+
/* Compute the mask of directory types (runtime, state, cache, …) for which
 * this context actually has entries configured. Always returns 0. */
int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
        ExecCleanMask mask = 0;

        assert(c);
        assert(ret);

        for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
                if (c->directories[t].n_items > 0)
                        mask |= 1U << t;

        *ret = mask;
        return 0;
}
+
/* Return the configured OOM score adjustment, or — if unset — the calling
 * process' current value (0 if that cannot be read). */
int exec_context_get_oom_score_adjust(const ExecContext *c) {
        int n = 0, r;

        assert(c);

        if (c->oom_score_adjust_set)
                return c->oom_score_adjust;

        r = get_oom_score_adjust(&n);
        if (r < 0)
                log_debug_errno(r, "Failed to read /proc/self/oom_score_adj, ignoring: %m");

        return n;
}
+
/* Return the configured coredump filter mask, or — if unset — the calling
 * process' value from /proc/self/coredump_filter, falling back to the
 * built-in default mask on any read/parse failure. */
uint64_t exec_context_get_coredump_filter(const ExecContext *c) {
        _cleanup_free_ char *t = NULL;
        uint64_t n = COREDUMP_FILTER_MASK_DEFAULT;
        int r;

        assert(c);

        if (c->coredump_filter_set)
                return c->coredump_filter;

        r = read_one_line_file("/proc/self/coredump_filter", &t);
        if (r < 0)
                log_debug_errno(r, "Failed to read /proc/self/coredump_filter, ignoring: %m");
        else {
                r = safe_atoux64(t, &n);
                if (r < 0)
                        log_debug_errno(r, "Failed to parse \"%s\" from /proc/self/coredump_filter, ignoring: %m", t);
        }

        return n;
}
+
/* Return the configured nice level, or — if unset — the calling process' one
 * (0 on failure). getpriority() may legitimately return -1 on success, hence
 * errno is cleared beforehand and inspected instead of the return value. */
int exec_context_get_nice(const ExecContext *c) {
        int n;

        assert(c);

        if (c->nice_set)
                return c->nice;

        errno = 0;
        n = getpriority(PRIO_PROCESS, 0);
        if (errno > 0) {
                log_debug_errno(errno, "Failed to get process nice value, ignoring: %m");
                n = 0;
        }

        return n;
}
+
/* Return the configured CPU scheduling policy, falling back to the calling
 * process' policy, and to SCHED_OTHER if that cannot be determined. */
int exec_context_get_cpu_sched_policy(const ExecContext *c) {
        int n;

        assert(c);

        if (c->cpu_sched_set)
                return c->cpu_sched_policy;

        n = sched_getscheduler(0);
        if (n < 0)
                log_debug_errno(errno, "Failed to get scheduler policy, ignoring: %m");

        return n < 0 ? SCHED_OTHER : n;
}
+
/* Return the configured CPU scheduling priority, falling back to the calling
 * process' current one, and to 0 if it cannot be determined. */
int exec_context_get_cpu_sched_priority(const ExecContext *c) {
        struct sched_param p = {};
        int r;

        assert(c);

        if (c->cpu_sched_set)
                return c->cpu_sched_priority;

        r = sched_getparam(0, &p);
        if (r < 0)
                log_debug_errno(errno, "Failed to get scheduler priority, ignoring: %m");

        return r >= 0 ? p.sched_priority : 0;
}
+
+uint64_t exec_context_get_timer_slack_nsec(const ExecContext *c) {
+ int r;
+
+ assert(c);
+
+ if (c->timer_slack_nsec != NSEC_INFINITY)
+ return c->timer_slack_nsec;
+
+ r = prctl(PR_GET_TIMERSLACK);
+ if (r < 0)
+ log_debug_errno(r, "Failed to get timer slack, ignoring: %m");
+
+ return (uint64_t) MAX(r, 0);
+}
+
/* Render the seccomp system call filter as a sorted string list of
 * "name[:errno-or-action]" entries. In allow-list mode, entries carrying an
 * errno (i.e. effectively denied) are suppressed. Returns an empty list when
 * built without seccomp, NULL on allocation failure. */
char** exec_context_get_syscall_filter(const ExecContext *c) {
        _cleanup_strv_free_ char **l = NULL;

        assert(c);

#if HAVE_SECCOMP
        void *id, *val;
        HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
                _cleanup_free_ char *name = NULL;
                const char *e = NULL;
                char *s;
                int num = PTR_TO_INT(val);

                if (c->syscall_allow_list && num >= 0)
                        /* syscall with num >= 0 in allow-list is denied. */
                        continue;

                /* Hashmap keys are stored with +1 bias so that syscall 0 is representable. */
                name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
                if (!name)
                        continue;

                if (num >= 0) {
                        e = seccomp_errno_or_action_to_string(num);
                        if (e) {
                                s = strjoin(name, ":", e);
                                if (!s)
                                        return NULL;
                        } else {
                                if (asprintf(&s, "%s:%d", name, num) < 0)
                                        return NULL;
                        }
                } else
                        s = TAKE_PTR(name);

                if (strv_consume(&l, s) < 0)
                        return NULL;
        }

        strv_sort(l);
#endif

        return l ? TAKE_PTR(l) : strv_new(NULL);
}
+
+char** exec_context_get_syscall_archs(const ExecContext *c) {
+ _cleanup_strv_free_ char **l = NULL;
+
+ assert(c);
+
+#if HAVE_SECCOMP
+ void *id;
+ SET_FOREACH(id, c->syscall_archs) {
+ const char *name;
+
+ name = seccomp_arch_to_string(PTR_TO_UINT32(id) - 1);
+ if (!name)
+ continue;
+
+ if (strv_extend(&l, name) < 0)
+ return NULL;
+ }
+
+ strv_sort(l);
+#endif
+
+ return l ? TAKE_PTR(l) : strv_new(NULL);
+}
+
/* Return the syscalls configured for seccomp logging as a sorted list of
 * names (empty list when built without seccomp, NULL on allocation failure). */
char** exec_context_get_syscall_log(const ExecContext *c) {
        _cleanup_strv_free_ char **l = NULL;

        assert(c);

#if HAVE_SECCOMP
        void *id, *val;
        HASHMAP_FOREACH_KEY(val, id, c->syscall_log) {
                char *name = NULL;

                /* Hashmap keys are stored with +1 bias so that syscall 0 is representable. */
                name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
                if (!name)
                        continue;

                /* strv_consume() takes ownership of 'name' even on failure. */
                if (strv_consume(&l, name) < 0)
                        return NULL;
        }

        strv_sort(l);
#endif

        return l ? TAKE_PTR(l) : strv_new(NULL);
}
+
/* Return the restricted address families as a sorted list of names; families
 * without a known name are silently skipped. NULL on allocation failure. */
char** exec_context_get_address_families(const ExecContext *c) {
        _cleanup_strv_free_ char **l = NULL;
        void *af;

        assert(c);

        SET_FOREACH(af, c->address_families) {
                const char *name;

                name = af_to_name(PTR_TO_INT(af));
                if (!name)
                        continue;

                if (strv_extend(&l, name) < 0)
                        return NULL;
        }

        strv_sort(l);

        return l ? TAKE_PTR(l) : strv_new(NULL);
}
+
/* Return the RestrictFileSystems= entries as a sorted string list (empty list
 * when built without libbpf, NULL on allocation failure). */
char** exec_context_get_restrict_filesystems(const ExecContext *c) {
        _cleanup_strv_free_ char **l = NULL;

        assert(c);

#if HAVE_LIBBPF
        l = set_get_strv(c->restrict_filesystems);
        if (!l)
                return NULL;

        strv_sort(l);
#endif

        return l ? TAKE_PTR(l) : strv_new(NULL);
}
+
+void exec_status_start(ExecStatus *s, pid_t pid) {
+ assert(s);
+
+ *s = (ExecStatus) {
+ .pid = pid,
+ };
+
+ dual_timestamp_now(&s->start_timestamp);
+}
+
/* Record the exit of the watched process: stamps the exit time and stores the
 * SIGCHLD code and wait status. If the exiting PID differs from the one we
 * recorded at start, the status is reinitialized first. Also closes the utmp
 * record if one was registered for the context. */
void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
        assert(s);

        if (s->pid != pid)
                *s = (ExecStatus) {
                        .pid = pid,
                };

        dual_timestamp_now(&s->exit_timestamp);

        s->code = code;
        s->status = status;

        if (context && context->utmp_id)
                (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
}
+
/* Resets the status record to its zero state, dropping PID, timestamps and exit information. */
void exec_status_reset(ExecStatus *s) {
        assert(s);

        *s = (ExecStatus) {};
}
+
/* Dumps a human-readable rendering of the status to f, one field per line, each line starting with
 * the given prefix (NULL is treated as ""). Nothing is printed if no process was recorded. */
void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
        assert(s);
        assert(f);

        /* No PID recorded yet → nothing worth dumping. */
        if (s->pid <= 0)
                return;

        prefix = strempty(prefix);

        fprintf(f,
                "%sPID: "PID_FMT"\n",
                prefix, s->pid);

        if (dual_timestamp_is_set(&s->start_timestamp))
                fprintf(f,
                        "%sStart Timestamp: %s\n",
                        prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));

        if (dual_timestamp_is_set(&s->exit_timestamp))
                fprintf(f,
                        "%sExit Timestamp: %s\n"
                        "%sExit Code: %s\n"
                        "%sExit Status: %i\n",
                        prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
                        prefix, sigchld_code_to_string(s->code),
                        prefix, s->status);
}
+
+static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
+ _cleanup_free_ char *cmd = NULL;
+ const char *prefix2;
+
+ assert(c);
+ assert(f);
+
+ prefix = strempty(prefix);
+ prefix2 = strjoina(prefix, "\t");
+
+ cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
+
+ fprintf(f,
+ "%sCommand Line: %s\n",
+ prefix, strnull(cmd));
+
+ exec_status_dump(&c->exec_status, f, prefix2);
+}
+
+void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
+ assert(f);
+
+ prefix = strempty(prefix);
+
+ LIST_FOREACH(command, i, c)
+ exec_command_dump(i, f, prefix);
+}
+
+void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
+ ExecCommand *end;
+
+ assert(l);
+ assert(e);
+
+ if (*l) {
+ /* It's kind of important, that we keep the order here */
+ end = LIST_FIND_TAIL(command, *l);
+ LIST_INSERT_AFTER(command, *l, end, e);
+ } else
+ *l = e;
+}
+
+int exec_command_set(ExecCommand *c, const char *path, ...) {
+ va_list ap;
+ char **l, *p;
+
+ assert(c);
+ assert(path);
+
+ va_start(ap, path);
+ l = strv_new_ap(path, ap);
+ va_end(ap);
+
+ if (!l)
+ return -ENOMEM;
+
+ p = strdup(path);
+ if (!p) {
+ strv_free(l);
+ return -ENOMEM;
+ }
+
+ free_and_replace(c->path, p);
+
+ return strv_free_and_replace(c->argv, l);
+}
+
+int exec_command_append(ExecCommand *c, const char *path, ...) {
+ _cleanup_strv_free_ char **l = NULL;
+ va_list ap;
+ int r;
+
+ assert(c);
+ assert(path);
+
+ va_start(ap, path);
+ l = strv_new_ap(path, ap);
+ va_end(ap);
+
+ if (!l)
+ return -ENOMEM;
+
+ r = strv_extend_strv(&c->argv, l, false);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+static char *destroy_tree(char *path) {
+ if (!path)
+ return NULL;
+
+ if (!path_equal(path, RUN_SYSTEMD_EMPTY)) {
+ log_debug("Spawning process to nuke '%s'", path);
+
+ (void) asynchronous_rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL);
+ }
+
+ return mfree(path);
+}
+
+void exec_shared_runtime_done(ExecSharedRuntime *rt) {
+ if (!rt)
+ return;
+
+ if (rt->manager)
+ (void) hashmap_remove(rt->manager->exec_shared_runtime_by_id, rt->id);
+
+ rt->id = mfree(rt->id);
+ rt->tmp_dir = mfree(rt->tmp_dir);
+ rt->var_tmp_dir = mfree(rt->var_tmp_dir);
+ safe_close_pair(rt->netns_storage_socket);
+ safe_close_pair(rt->ipcns_storage_socket);
+}
+
/* Releases the runtime's resources and the object itself; returns NULL for chaining. */
static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) {
        exec_shared_runtime_done(rt);

        return mfree(rt);
}

/* Standard unref (refcount decrement, free on zero) and _cleanup_ helpers for ExecSharedRuntime. */
DEFINE_TRIVIAL_UNREF_FUNC(ExecSharedRuntime, exec_shared_runtime, exec_shared_runtime_free);
DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime*, exec_shared_runtime_free);
+
+ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) {
+ if (!rt)
+ return NULL;
+
+ assert(rt->n_ref > 0);
+ rt->n_ref--;
+
+ if (rt->n_ref > 0)
+ return NULL;
+
+ rt->tmp_dir = destroy_tree(rt->tmp_dir);
+ rt->var_tmp_dir = destroy_tree(rt->var_tmp_dir);
+
+ return exec_shared_runtime_free(rt);
+}
+
+static int exec_shared_runtime_allocate(ExecSharedRuntime **ret, const char *id) {
+ _cleanup_free_ char *id_copy = NULL;
+ ExecSharedRuntime *n;
+
+ assert(ret);
+
+ id_copy = strdup(id);
+ if (!id_copy)
+ return -ENOMEM;
+
+ n = new(ExecSharedRuntime, 1);
+ if (!n)
+ return -ENOMEM;
+
+ *n = (ExecSharedRuntime) {
+ .id = TAKE_PTR(id_copy),
+ .netns_storage_socket = EBADF_PAIR,
+ .ipcns_storage_socket = EBADF_PAIR,
+ };
+
+ *ret = n;
+ return 0;
+}
+
+static int exec_shared_runtime_add(
+ Manager *m,
+ const char *id,
+ char **tmp_dir,
+ char **var_tmp_dir,
+ int netns_storage_socket[2],
+ int ipcns_storage_socket[2],
+ ExecSharedRuntime **ret) {
+
+ _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt = NULL;
+ int r;
+
+ assert(m);
+ assert(id);
+
+ /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
+
+ r = exec_shared_runtime_allocate(&rt, id);
+ if (r < 0)
+ return r;
+
+ r = hashmap_ensure_put(&m->exec_shared_runtime_by_id, &string_hash_ops, rt->id, rt);
+ if (r < 0)
+ return r;
+
+ assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
+ rt->tmp_dir = TAKE_PTR(*tmp_dir);
+ rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
+
+ if (netns_storage_socket) {
+ rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
+ rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
+ }
+
+ if (ipcns_storage_socket) {
+ rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
+ rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
+ }
+
+ rt->manager = m;
+
+ if (ret)
+ *ret = rt;
+ /* do not remove created ExecSharedRuntime object when the operation succeeds. */
+ TAKE_PTR(rt);
+ return 0;
+}
+
/* Creates and registers an ExecSharedRuntime for the given unit id, setting up only what the
 * execution context actually needs: private /tmp + /var/tmp trees and/or storage socket pairs for
 * the network and IPC namespaces. Returns 0 (and *ret = NULL) if the context needs none of these,
 * 1 if an object was created, negative errno on failure. */
static int exec_shared_runtime_make(
                Manager *m,
                const ExecContext *c,
                const char *id,
                ExecSharedRuntime **ret) {

        _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
        _cleanup_close_pair_ int netns_storage_socket[2] = EBADF_PAIR, ipcns_storage_socket[2] = EBADF_PAIR;
        int r;

        assert(m);
        assert(c);
        assert(id);

        /* It is not necessary to create ExecSharedRuntime object. */
        if (!exec_needs_network_namespace(c) && !exec_needs_ipc_namespace(c) && !c->private_tmp) {
                *ret = NULL;
                return 0;
        }

        /* Create the private tmp trees, unless both /tmp and /var/tmp (the latter possibly via its
         * /var parent) are configured as inaccessible anyway — then setting them up is pointless. */
        if (c->private_tmp &&
            !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
              (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
               prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
                r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
                if (r < 0)
                        return r;
        }

        /* The socketpairs serve as storage for the namespace fds (see ExecSharedRuntime). */
        if (exec_needs_network_namespace(c)) {
                if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
                        return -errno;
        }

        if (exec_needs_ipc_namespace(c)) {
                if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
                        return -errno;
        }

        /* On success this donates the directories and fds to the new object. */
        r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
        if (r < 0)
                return r;

        return 1;
}
+
+int exec_shared_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecSharedRuntime **ret) {
+ ExecSharedRuntime *rt;
+ int r;
+
+ assert(m);
+ assert(id);
+ assert(ret);
+
+ rt = hashmap_get(m->exec_shared_runtime_by_id, id);
+ if (rt)
+ /* We already have an ExecSharedRuntime object, let's increase the ref count and reuse it */
+ goto ref;
+
+ if (!create) {
+ *ret = NULL;
+ return 0;
+ }
+
+ /* If not found, then create a new object. */
+ r = exec_shared_runtime_make(m, c, id, &rt);
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ /* When r == 0, it is not necessary to create ExecSharedRuntime object. */
+ *ret = NULL;
+ return 0;
+ }
+
+ref:
+ /* increment reference counter. */
+ rt->n_ref++;
+ *ret = rt;
+ return 1;
+}
+
/* Serializes all ExecSharedRuntime objects of the manager to f, one line each, in the format:
 *   exec-runtime=<id> [tmp-dir=<path>] [var-tmp-dir=<path>] [netns-socket-0=<fd#>] … [ipcns-socket-1=<fd#>]
 * Socket fds are duplicated into the FDSet so they survive re-execution; the fd numbers written
 * out refer to the duplicates. Returns 0 on success, negative errno on failure. */
int exec_shared_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
        ExecSharedRuntime *rt;

        assert(m);
        assert(f);
        assert(fds);

        HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
                fprintf(f, "exec-runtime=%s", rt->id);

                if (rt->tmp_dir)
                        fprintf(f, " tmp-dir=%s", rt->tmp_dir);

                if (rt->var_tmp_dir)
                        fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);

                if (rt->netns_storage_socket[0] >= 0) {
                        int copy;

                        copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
                        if (copy < 0)
                                return copy;

                        fprintf(f, " netns-socket-0=%i", copy);
                }

                if (rt->netns_storage_socket[1] >= 0) {
                        int copy;

                        copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
                        if (copy < 0)
                                return copy;

                        fprintf(f, " netns-socket-1=%i", copy);
                }

                if (rt->ipcns_storage_socket[0] >= 0) {
                        int copy;

                        copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
                        if (copy < 0)
                                return copy;

                        fprintf(f, " ipcns-socket-0=%i", copy);
                }

                if (rt->ipcns_storage_socket[1] >= 0) {
                        int copy;

                        copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
                        if (copy < 0)
                                return copy;

                        fprintf(f, " ipcns-socket-1=%i", copy);
                }

                fputc('\n', f);
        }

        return 0;
}
+
/* Deserializes a single old-style (per-unit) runtime key/value pair into the unit's
 * ExecSharedRuntime, creating and registering the object on demand. Returns 1 when the key was
 * consumed, 0 when it was not recognized or had to be dropped, negative errno on hard failure. */
int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
        _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt_create = NULL;
        ExecSharedRuntime *rt = NULL;
        int r;

        /* This handles migration from old (v237 or earlier) serialization text. Due to bug #7790
         * this may not work for units that use JoinsNamespaceOf=: even if the ExecSharedRuntime
         * object was originally created by another unit, we cannot tell from the serialized text,
         * so we always create a new object owned by this unit. */

        assert(u);
        assert(key);
        assert(value);

        /* Manager manages ExecSharedRuntime objects by the unit id, so we have to drop the
         * serialized data when the unit does not have an id (yet?).
         * NOTE(review): the log message below says "Invocation ID" but the check is on the unit
         * id — confirm whether the message is intentional. */
        if (isempty(u->id)) {
                log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
                return 0;
        }

        if (u->manager) {
                if (hashmap_ensure_allocated(&u->manager->exec_shared_runtime_by_id, &string_hash_ops) < 0)
                        return log_oom();

                rt = hashmap_get(u->manager->exec_shared_runtime_by_id, u->id);
        }
        if (!rt) {
                /* Not registered yet — create a fresh object; rt_create keeps ownership until we
                 * successfully hand it over to the manager below. */
                if (exec_shared_runtime_allocate(&rt_create, u->id) < 0)
                        return log_oom();

                rt = rt_create;
        }

        if (streq(key, "tmp-dir")) {
                if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
                        return -ENOMEM;

        } else if (streq(key, "var-tmp-dir")) {
                if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
                        return -ENOMEM;

        } else if (streq(key, "netns-socket-0")) {

                safe_close(rt->netns_storage_socket[0]);
                rt->netns_storage_socket[0] = deserialize_fd(fds, value);
                /* An unparsable/missing fd is not fatal — just drop the pair silently. */
                if (rt->netns_storage_socket[0] < 0)
                        return 0;

        } else if (streq(key, "netns-socket-1")) {

                safe_close(rt->netns_storage_socket[1]);
                rt->netns_storage_socket[1] = deserialize_fd(fds, value);
                if (rt->netns_storage_socket[1] < 0)
                        return 0;
        } else
                return 0;

        /* If the object is newly created, then put it to the hashmap which manages ExecSharedRuntime objects. */
        if (rt_create && u->manager) {
                r = hashmap_put(u->manager->exec_shared_runtime_by_id, rt_create->id, rt_create);
                if (r < 0) {
                        log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
                        return 0;
                }

                rt_create->manager = u->manager;

                /* Avoid cleanup */
                TAKE_PTR(rt_create);
        }

        return 1;
}
+
/* Parses one "exec-runtime=" line written by exec_shared_runtime_serialize() and recreates the
 * corresponding ExecSharedRuntime object. The optional fields must appear in exactly the order
 * they are serialized in; as soon as the line ends, parsing jumps to 'finalize' with whatever was
 * collected so far. Returns 0 (failures to register are only logged), or a negative errno when an
 * fd reference cannot be resolved.
 * NOTE(review): if resolving a later fd fails, fds already taken out of the FDSet earlier in the
 * same line are not closed here — presumably tolerable in this deserialization path; confirm. */
int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
        _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
        char *id = NULL;
        int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
        const char *p, *v = ASSERT_PTR(value);
        size_t n;

        assert(m);
        assert(fds);

        /* First token (up to the first space) is the unit id. */
        n = strcspn(v, " ");
        id = strndupa_safe(v, n);
        if (v[n] != ' ')
                goto finalize;
        p = v + n + 1;

        v = startswith(p, "tmp-dir=");
        if (v) {
                n = strcspn(v, " ");
                tmp_dir = strndup(v, n);
                if (!tmp_dir)
                        return log_oom();
                if (v[n] != ' ')
                        goto finalize;
                p = v + n + 1;
        }

        v = startswith(p, "var-tmp-dir=");
        if (v) {
                n = strcspn(v, " ");
                var_tmp_dir = strndup(v, n);
                if (!var_tmp_dir)
                        return log_oom();
                if (v[n] != ' ')
                        goto finalize;
                p = v + n + 1;
        }

        v = startswith(p, "netns-socket-0=");
        if (v) {
                char *buf;

                n = strcspn(v, " ");
                buf = strndupa_safe(v, n);

                netns_fdpair[0] = deserialize_fd(fds, buf);
                if (netns_fdpair[0] < 0)
                        return netns_fdpair[0];
                if (v[n] != ' ')
                        goto finalize;
                p = v + n + 1;
        }

        v = startswith(p, "netns-socket-1=");
        if (v) {
                char *buf;

                n = strcspn(v, " ");
                buf = strndupa_safe(v, n);

                netns_fdpair[1] = deserialize_fd(fds, buf);
                if (netns_fdpair[1] < 0)
                        return netns_fdpair[1];
                if (v[n] != ' ')
                        goto finalize;
                p = v + n + 1;
        }

        v = startswith(p, "ipcns-socket-0=");
        if (v) {
                char *buf;

                n = strcspn(v, " ");
                buf = strndupa_safe(v, n);

                ipcns_fdpair[0] = deserialize_fd(fds, buf);
                if (ipcns_fdpair[0] < 0)
                        return ipcns_fdpair[0];
                if (v[n] != ' ')
                        goto finalize;
                p = v + n + 1;
        }

        v = startswith(p, "ipcns-socket-1=");
        if (v) {
                char *buf;

                n = strcspn(v, " ");
                buf = strndupa_safe(v, n);

                ipcns_fdpair[1] = deserialize_fd(fds, buf);
                if (ipcns_fdpair[1] < 0)
                        return ipcns_fdpair[1];
        }

finalize:
        /* Donates directories and fds to the newly registered object. */
        r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
        if (r < 0)
                return log_debug_errno(r, "Failed to add exec-runtime: %m");
        return 0;
}
+
+void exec_shared_runtime_vacuum(Manager *m) {
+ ExecSharedRuntime *rt;
+
+ assert(m);
+
+ /* Free unreferenced ExecSharedRuntime objects. This is used after manager deserialization process. */
+
+ HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
+ if (rt->n_ref > 0)
+ continue;
+
+ (void) exec_shared_runtime_free(rt);
+ }
+}
+
/* Allocates the per-invocation ExecRuntime bundling the (possibly shared) ExecSharedRuntime, the
 * dynamic credentials, and — if the context requests an ephemeral root — a fresh snapshot path
 * plus a socketpair used to store the fd locking the snapshot. Returns 0 with *ret = NULL when
 * none of these are needed, 1 on allocation, negative errno on failure. */
int exec_runtime_make(
                const Unit *unit,
                const ExecContext *context,
                ExecSharedRuntime *shared,
                DynamicCreds *creds,
                ExecRuntime **ret) {
        _cleanup_close_pair_ int ephemeral_storage_socket[2] = EBADF_PAIR;
        _cleanup_free_ char *ephemeral = NULL;
        _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
        int r;

        assert(unit);
        assert(context);
        assert(ret);

        /* Nothing to carry around? Then don't allocate an object at all. */
        if (!shared && !creds && !exec_needs_ephemeral(context)) {
                *ret = NULL;
                return 0;
        }

        if (exec_needs_ephemeral(context)) {
                r = mkdir_p("/var/lib/systemd/ephemeral-trees", 0755);
                if (r < 0)
                        return r;

                /* Pick a random, unit-derived name for the snapshot below the ephemeral-trees dir. */
                r = tempfn_random_child("/var/lib/systemd/ephemeral-trees", unit->id, &ephemeral);
                if (r < 0)
                        return r;

                if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ephemeral_storage_socket) < 0)
                        return -errno;
        }

        rt = new(ExecRuntime, 1);
        if (!rt)
                return -ENOMEM;

        *rt = (ExecRuntime) {
                .shared = shared,
                .dynamic_creds = creds,
                .ephemeral_copy = TAKE_PTR(ephemeral),
                .ephemeral_storage_socket[0] = TAKE_FD(ephemeral_storage_socket[0]),
                .ephemeral_storage_socket[1] = TAKE_FD(ephemeral_storage_socket[1]),
        };

        *ret = TAKE_PTR(rt);
        return 1;
}
+
/* Frees the runtime object: drops the references to the shared runtime and the dynamic
 * credentials, queues removal of the ephemeral root copy (if any), then closes the storage
 * socket pair holding the snapshot's lock fd. Always returns NULL. */
ExecRuntime* exec_runtime_free(ExecRuntime *rt) {
        if (!rt)
                return NULL;

        exec_shared_runtime_unref(rt->shared);
        dynamic_creds_unref(rt->dynamic_creds);

        rt->ephemeral_copy = destroy_tree(rt->ephemeral_copy);

        safe_close_pair(rt->ephemeral_storage_socket);
        return mfree(rt);
}
+
/* Like exec_runtime_free(), but tears down the members destructively: the shared runtime's
 * backing trees and the dynamic credentials are destroyed rather than merely unreferenced.
 * Always returns NULL. */
ExecRuntime* exec_runtime_destroy(ExecRuntime *rt) {
        if (!rt)
                return NULL;

        /* Both destroy helpers return NULL, so the unrefs in exec_runtime_free() become no-ops. */
        rt->shared = exec_shared_runtime_destroy(rt->shared);
        rt->dynamic_creds = dynamic_creds_destroy(rt->dynamic_creds);
        return exec_runtime_free(rt);
}
+
/* Releases the ephemeral resources of the runtime without freeing the object itself; the shared
 * runtime and dynamic credentials references are left untouched.
 * NOTE(review): unlike exec_runtime_free() this frees the copy's path without queueing removal of
 * the tree on disk — presumably ownership moved elsewhere by the caller; confirm. */
void exec_runtime_clear(ExecRuntime *rt) {
        if (!rt)
                return;

        safe_close_pair(rt->ephemeral_storage_socket);
        rt->ephemeral_copy = mfree(rt->ephemeral_copy);
}
+
/* Clears only what the manager side owns in the parameters struct; borrowed fds are merely
 * invalidated, never closed. */
void exec_params_shallow_clear(ExecParameters *p) {
        if (!p)
                return;

        /* This is called on the PID1 side, as many of the struct's FDs are only borrowed, and actually
         * owned by the manager or other objects, and reused across multiple units. */

        p->environment = strv_free(p->environment);
        p->fd_names = strv_free(p->fd_names);
        p->files_env = strv_free(p->files_env);
        p->fds = mfree(p->fds);
        p->exec_fd = safe_close(p->exec_fd);
        /* Borrowed fds: invalidate only, do not close. */
        p->user_lookup_fd = -EBADF;
        p->bpf_outer_map_fd = -EBADF;
        p->unit_id = mfree(p->unit_id);
        p->invocation_id = SD_ID128_NULL;
        p->invocation_id_string[0] = '\0';
        p->confirm_spawn = mfree(p->confirm_spawn);
}
+
/* Fully cleans up the parameters struct, including everything the shallow variant leaves alone. */
void exec_params_deep_clear(ExecParameters *p) {
        if (!p)
                return;

        /* This is called on the sd-executor side, where everything received is owned by the process and has
         * to be fully cleaned up to make sanitizers and analyzers happy, as opposed as the shallow clean
         * function above. */

        /* The fds array holds socket fds followed by storage fds, hence the summed count. */
        close_many_unset(p->fds, p->n_socket_fds + p->n_storage_fds);

        p->cgroup_path = mfree(p->cgroup_path);

        if (p->prefix) {
                free_many_charp(p->prefix, _EXEC_DIRECTORY_TYPE_MAX);
                p->prefix = mfree(p->prefix);
        }

        p->received_credentials_directory = mfree(p->received_credentials_directory);
        p->received_encrypted_credentials_directory = mfree(p->received_encrypted_credentials_directory);

        /* idle_pipe is a fixed array of 4 fds — TODO confirm against its declaration. */
        if (p->idle_pipe) {
                close_many_and_free(p->idle_pipe, 4);
                p->idle_pipe = NULL;
        }

        p->stdin_fd = safe_close(p->stdin_fd);
        p->stdout_fd = safe_close(p->stdout_fd);
        p->stderr_fd = safe_close(p->stderr_fd);

        p->notify_socket = mfree(p->notify_socket);

        open_file_free_many(&p->open_files);

        p->fallback_smack_process_label = mfree(p->fallback_smack_process_label);

        /* Finally run the shallow pass for everything shared between the two cleanup paths. */
        exec_params_shallow_clear(p);
}
+
+void exec_directory_done(ExecDirectory *d) {
+ if (!d)
+ return;
+
+ FOREACH_ARRAY(i, d->items, d->n_items) {
+ free(i->path);
+ strv_free(i->symlinks);
+ }
+
+ d->items = mfree(d->items);
+ d->n_items = 0;
+ d->mode = 0755;
+}
+
+static ExecDirectoryItem *exec_directory_find(ExecDirectory *d, const char *path) {
+ assert(d);
+ assert(path);
+
+ FOREACH_ARRAY(i, d->items, d->n_items)
+ if (path_equal(i->path, path))
+ return i;
+
+ return NULL;
+}
+
+int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink) {
+ _cleanup_strv_free_ char **s = NULL;
+ _cleanup_free_ char *p = NULL;
+ ExecDirectoryItem *existing;
+ int r;
+
+ assert(d);
+ assert(path);
+
+ existing = exec_directory_find(d, path);
+ if (existing) {
+ r = strv_extend(&existing->symlinks, symlink);
+ if (r < 0)
+ return r;
+
+ return 0; /* existing item is updated */
+ }
+
+ p = strdup(path);
+ if (!p)
+ return -ENOMEM;
+
+ if (symlink) {
+ s = strv_new(symlink);
+ if (!s)
+ return -ENOMEM;
+ }
+
+ if (!GREEDY_REALLOC(d->items, d->n_items + 1))
+ return -ENOMEM;
+
+ d->items[d->n_items++] = (ExecDirectoryItem) {
+ .path = TAKE_PTR(p),
+ .symlinks = TAKE_PTR(s),
+ };
+
+ return 1; /* new item is added */
+}
+
/* qsort comparison callback: orders items by path, so that parent directories sort before their
 * children (see exec_directory_sort()). */
static int exec_directory_item_compare_func(const ExecDirectoryItem *a, const ExecDirectoryItem *b) {
        assert(a);
        assert(b);

        return path_compare(a->path, b->path);
}
+
void exec_directory_sort(ExecDirectory *d) {
        assert(d);

        /* Sort the exec directories to make always parent directories processed at first in
         * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo at first,
         * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the
         * list. See also comments in setup_exec_directory() and issue #24783. */

        if (d->n_items <= 1)
                return;

        typesafe_qsort(d->items, d->n_items, exec_directory_item_compare_func);

        /* O(n²) scan for ancestors — fine, since the number of configured directories is small.
         * After sorting, any ancestor of items[i] is guaranteed to appear before it. */
        for (size_t i = 1; i < d->n_items; i++)
                for (size_t j = 0; j < i; j++)
                        if (path_startswith(d->items[i].path, d->items[j].path)) {
                                d->items[i].only_create = true;
                                break;
                        }
}
+
+ExecCleanMask exec_clean_mask_from_string(const char *s) {
+ ExecDirectoryType t;
+
+ assert(s);
+
+ if (streq(s, "all"))
+ return EXEC_CLEAN_ALL;
+ if (streq(s, "fdstore"))
+ return EXEC_CLEAN_FDSTORE;
+
+ t = exec_resource_type_from_string(s);
+ if (t < 0)
+ return (ExecCleanMask) t;
+
+ return 1U << t;
+}
+
/* Maps ExecInput values to their string names, used for parsing and formatting. */
static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
        [EXEC_INPUT_NULL] = "null",
        [EXEC_INPUT_TTY] = "tty",
        [EXEC_INPUT_TTY_FORCE] = "tty-force",
        [EXEC_INPUT_TTY_FAIL] = "tty-fail",
        [EXEC_INPUT_SOCKET] = "socket",
        [EXEC_INPUT_NAMED_FD] = "fd",
        [EXEC_INPUT_DATA] = "data",
        [EXEC_INPUT_FILE] = "file",
};

DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);

/* Maps ExecOutput values to their string names. */
static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
        [EXEC_OUTPUT_INHERIT] = "inherit",
        [EXEC_OUTPUT_NULL] = "null",
        [EXEC_OUTPUT_TTY] = "tty",
        [EXEC_OUTPUT_KMSG] = "kmsg",
        [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
        [EXEC_OUTPUT_JOURNAL] = "journal",
        [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
        [EXEC_OUTPUT_SOCKET] = "socket",
        [EXEC_OUTPUT_NAMED_FD] = "fd",
        [EXEC_OUTPUT_FILE] = "file",
        [EXEC_OUTPUT_FILE_APPEND] = "append",
        [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
};

DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);

/* Maps ExecUtmpMode values to their string names. */
static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
        [EXEC_UTMP_INIT] = "init",
        [EXEC_UTMP_LOGIN] = "login",
        [EXEC_UTMP_USER] = "user",
};

DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);

/* Maps ExecPreserveMode values to their string names; also accepts booleans, with "yes"
 * translating to EXEC_PRESERVE_YES. */
static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
        [EXEC_PRESERVE_NO] = "no",
        [EXEC_PRESERVE_YES] = "yes",
        [EXEC_PRESERVE_RESTART] = "restart",
};

DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);

/* This table maps ExecDirectoryType to the setting it is configured with in the unit */
static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
        [EXEC_DIRECTORY_STATE] = "StateDirectory",
        [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
        [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
};

DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);

/* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectorySymlink",
        [EXEC_DIRECTORY_STATE] = "StateDirectorySymlink",
        [EXEC_DIRECTORY_CACHE] = "CacheDirectorySymlink",
        [EXEC_DIRECTORY_LOGS] = "LogsDirectorySymlink",
        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
};

DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);

/* This table maps ExecDirectoryType to the mode setting it is configured with in the unit */
static const char* const exec_directory_type_mode_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectoryMode",
        [EXEC_DIRECTORY_STATE] = "StateDirectoryMode",
        [EXEC_DIRECTORY_CACHE] = "CacheDirectoryMode",
        [EXEC_DIRECTORY_LOGS] = "LogsDirectoryMode",
        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectoryMode",
};

DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_mode, ExecDirectoryType);

/* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
 * directories, specifically .timer units with their timestamp touch file. */
static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "runtime",
        [EXEC_DIRECTORY_STATE] = "state",
        [EXEC_DIRECTORY_CACHE] = "cache",
        [EXEC_DIRECTORY_LOGS] = "logs",
        [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
};

DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);

/* Maps ExecKeyringMode values to their string names. */
static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
        [EXEC_KEYRING_INHERIT] = "inherit",
        [EXEC_KEYRING_PRIVATE] = "private",
        [EXEC_KEYRING_SHARED] = "shared",
};

DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);
diff --git a/src/core/execute.h b/src/core/execute.h
new file mode 100644
index 0000000..5a6927a
--- /dev/null
+++ b/src/core/execute.h
@@ -0,0 +1,701 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
/* Forward declarations, provided up front so the headers included below can refer to these types. */
typedef struct ExecStatus ExecStatus;
typedef struct ExecCommand ExecCommand;
typedef struct ExecContext ExecContext;
typedef struct ExecSharedRuntime ExecSharedRuntime;
typedef struct DynamicCreds DynamicCreds;
typedef struct ExecRuntime ExecRuntime;
typedef struct ExecParameters ExecParameters;
typedef struct Manager Manager;
+
+#include <sched.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <sys/capability.h>
+
+#include "cgroup-util.h"
+#include "coredump-util.h"
+#include "cpu-set-util.h"
+#include "exec-util.h"
+#include "fdset.h"
+#include "list.h"
+#include "missing_resource.h"
+#include "namespace.h"
+#include "nsflags.h"
+#include "numa-util.h"
+#include "open-file.h"
+#include "path-util.h"
+#include "runtime-scope.h"
+#include "set.h"
+#include "time-util.h"
+
/* Upper bound for inline stdin data (cf. EXEC_INPUT_DATA and ExecContext.stdin_data): 64 MiB. */
#define EXEC_STDIN_DATA_MAX (64U*1024U*1024U)

/* How utmp/wtmp records are written for the invoked process (cf. utmp_put_dead_process()). */
typedef enum ExecUtmpMode {
        EXEC_UTMP_INIT,
        EXEC_UTMP_LOGIN,
        EXEC_UTMP_USER,
        _EXEC_UTMP_MODE_MAX,
        _EXEC_UTMP_MODE_INVALID = -EINVAL,
} ExecUtmpMode;

/* Source of an invoked command's standard input. */
typedef enum ExecInput {
        EXEC_INPUT_NULL,
        EXEC_INPUT_TTY,
        EXEC_INPUT_TTY_FORCE,
        EXEC_INPUT_TTY_FAIL,
        EXEC_INPUT_SOCKET,
        EXEC_INPUT_NAMED_FD,
        EXEC_INPUT_DATA,
        EXEC_INPUT_FILE,
        _EXEC_INPUT_MAX,
        _EXEC_INPUT_INVALID = -EINVAL,
} ExecInput;

/* Destination of an invoked command's standard output/error. */
typedef enum ExecOutput {
        EXEC_OUTPUT_INHERIT,
        EXEC_OUTPUT_NULL,
        EXEC_OUTPUT_TTY,
        EXEC_OUTPUT_KMSG,
        EXEC_OUTPUT_KMSG_AND_CONSOLE,
        EXEC_OUTPUT_JOURNAL,
        EXEC_OUTPUT_JOURNAL_AND_CONSOLE,
        EXEC_OUTPUT_SOCKET,
        EXEC_OUTPUT_NAMED_FD,
        EXEC_OUTPUT_FILE,
        EXEC_OUTPUT_FILE_APPEND,
        EXEC_OUTPUT_FILE_TRUNCATE,
        _EXEC_OUTPUT_MAX,
        _EXEC_OUTPUT_INVALID = -EINVAL,
} ExecOutput;

/* Whether per-unit runtime directories are preserved after the unit stops
 * (cf. ExecContext.runtime_directory_preserve_mode). */
typedef enum ExecPreserveMode {
        EXEC_PRESERVE_NO,
        EXEC_PRESERVE_YES,
        EXEC_PRESERVE_RESTART,
        _EXEC_PRESERVE_MODE_MAX,
        _EXEC_PRESERVE_MODE_INVALID = -EINVAL,
} ExecPreserveMode;

/* Kernel session keyring setup mode for the invoked process: inherited, private or shared. */
typedef enum ExecKeyringMode {
        EXEC_KEYRING_INHERIT,
        EXEC_KEYRING_PRIVATE,
        EXEC_KEYRING_SHARED,
        _EXEC_KEYRING_MODE_MAX,
        _EXEC_KEYRING_MODE_INVALID = -EINVAL,
} ExecKeyringMode;
+
/* Contains start and exit information about an executed command. */
struct ExecStatus {
        dual_timestamp start_timestamp;
        dual_timestamp exit_timestamp;
        /* PID of the invoked process; 0/unset until a start is recorded (see exec_status_start()). */
        pid_t pid;
        int code; /* as in siginfo_t::si_code */
        int status; /* as in siginfo_t::si_status */
};

/* Stores information about commands we execute. Covers both configuration settings as well as runtime data. */
struct ExecCommand {
        /* Path to the executable, plus the full argument vector (argv[0] included). */
        char *path;
        char **argv;
        ExecStatus exec_status; /* Note that this is not serialized to sd-executor */
        ExecCommandFlags flags;
        LIST_FIELDS(ExecCommand, command); /* useful for chaining commands */
};
+
/* Encapsulates certain aspects of the runtime environment that is to be shared between multiple otherwise separate
 * invocations of commands. Specifically, this allows sharing of /tmp and /var/tmp data as well as network namespaces
 * between invocations of commands. This is a reference counted object, with one reference taken by each currently
 * active command invocation that wants to share this runtime. */
struct ExecSharedRuntime {
        /* Number of active users; the object is freed/destroyed when this drops to zero. */
        unsigned n_ref;

        /* Back-pointer to the manager that indexes this object by unit id. */
        Manager *manager;

        char *id; /* Unit id of the owner */

        /* Private /tmp and /var/tmp trees; either both set or both NULL. */
        char *tmp_dir;
        char *var_tmp_dir;

        /* An AF_UNIX socket pair, that contains a datagram containing a file descriptor referring to the network
         * namespace. */
        int netns_storage_socket[2];

        /* Like netns_storage_socket, but the file descriptor is referring to the IPC namespace. */
        int ipcns_storage_socket[2];
};

/* Per-invocation runtime data, bundling the shared runtime with invocation-private resources. */
struct ExecRuntime {
        ExecSharedRuntime *shared;
        DynamicCreds *dynamic_creds;

        /* The path to the ephemeral snapshot of the root directory or root image if one was requested. */
        char *ephemeral_copy;

        /* An AF_UNIX socket pair that receives the locked file descriptor referring to the ephemeral copy of
         * the root directory or root image. The lock prevents tmpfiles from removing the ephemeral snapshot
         * until we're done using it. */
        int ephemeral_storage_socket[2];
};
+
/* The kinds of per-unit directories managed for a service (runtime, state, cache, logs, config). */
typedef enum ExecDirectoryType {
        EXEC_DIRECTORY_RUNTIME,
        EXEC_DIRECTORY_STATE,
        EXEC_DIRECTORY_CACHE,
        EXEC_DIRECTORY_LOGS,
        EXEC_DIRECTORY_CONFIGURATION,
        _EXEC_DIRECTORY_TYPE_MAX,
        _EXEC_DIRECTORY_TYPE_INVALID = -EINVAL,
} ExecDirectoryType;

/* One configured directory plus any symlinks that shall point to it. */
typedef struct ExecDirectoryItem {
        char *path;
        char **symlinks;
        /* Set by exec_directory_sort() when a parent of this path is also in the list, meaning the
         * directory only needs to be created, not otherwise processed. */
        bool only_create;
} ExecDirectoryItem;

/* The set of directories configured for one ExecDirectoryType, plus their creation mode. */
typedef struct ExecDirectory {
        mode_t mode;
        size_t n_items;
        ExecDirectoryItem *items;
} ExecDirectory;

typedef enum ExecCleanMask {
        /* In case you wonder why the bitmask below doesn't use "directory" in its name: we want to keep this
         * generic so that .timer timestamp files can nicely be covered by this too, and similar. */
        EXEC_CLEAN_RUNTIME = 1U << EXEC_DIRECTORY_RUNTIME,
        EXEC_CLEAN_STATE = 1U << EXEC_DIRECTORY_STATE,
        EXEC_CLEAN_CACHE = 1U << EXEC_DIRECTORY_CACHE,
        EXEC_CLEAN_LOGS = 1U << EXEC_DIRECTORY_LOGS,
        EXEC_CLEAN_CONFIGURATION = 1U << EXEC_DIRECTORY_CONFIGURATION,
        EXEC_CLEAN_FDSTORE = 1U << _EXEC_DIRECTORY_TYPE_MAX,
        EXEC_CLEAN_NONE = 0,
        EXEC_CLEAN_ALL = (1U << (_EXEC_DIRECTORY_TYPE_MAX+1)) - 1,
        _EXEC_CLEAN_MASK_INVALID = -EINVAL,
} ExecCleanMask;
+
/* Encodes configuration parameters applied to invoked commands. Does not carry runtime data, but only configuration
 * changes sourced from unit files and suchlike. ExecContext objects are usually embedded into Unit objects, and do not
 * change after being loaded. */
struct ExecContext {
        /* Environment variables to set, load from files, pass through, or unset. */
        char **environment;
        char **environment_files;
        char **pass_environment;
        char **unset_environment;

        /* Root/working directory and verity-protected root image configuration. */
        struct rlimit *rlimit[_RLIMIT_MAX];
        char *working_directory, *root_directory, *root_image, *root_verity, *root_hash_path, *root_hash_sig_path;
        void *root_hash, *root_hash_sig;
        size_t root_hash_size, root_hash_sig_size;
        LIST_HEAD(MountOptions, root_image_options);
        bool root_ephemeral;
        bool working_directory_missing_ok:1;
        bool working_directory_home:1;

        /* *_set flags record whether the corresponding value below was explicitly configured. */
        bool oom_score_adjust_set:1;
        bool coredump_filter_set:1;
        bool nice_set:1;
        bool ioprio_set:1;
        bool cpu_sched_set:1;
        bool mount_apivfs_set:1;

        /* This is not exposed to the user but available internally. We need it to make sure that whenever we
         * spawn /usr/bin/mount it is run in the same process group as us so that the autofs logic detects
         * that it belongs to us and we don't enter a trigger loop. */
        bool same_pgrp;

        bool cpu_sched_reset_on_fork;
        bool non_blocking;

        /* Process priority, scheduling and resource accounting knobs. */
        mode_t umask;
        int oom_score_adjust;
        int nice;
        int ioprio;
        int cpu_sched_policy;
        int cpu_sched_priority;
        uint64_t coredump_filter;

        CPUSet cpu_set;
        NUMAPolicy numa_policy;
        bool cpu_affinity_from_numa;

        /* Standard input/output/error wiring. */
        ExecInput std_input;
        ExecOutput std_output;
        ExecOutput std_error;

        /* At least one of stdin/stdout/stderr was initialized from an fd passed in. This boolean survives
         * the fds being closed. This only makes sense for transient units. */
        bool stdio_as_fds;

        char *stdio_fdname[3];
        char *stdio_file[3];

        /* Literal data fed to stdin when std_input == EXEC_INPUT_DATA (bounded by EXEC_STDIN_DATA_MAX). */
        void *stdin_data;
        size_t stdin_data_size;

        nsec_t timer_slack_nsec;

        /* TTY configuration. */
        char *tty_path;

        bool tty_reset;
        bool tty_vhangup;
        bool tty_vt_disallocate;

        unsigned tty_rows;
        unsigned tty_cols;

        bool ignore_sigpipe;

        ExecKeyringMode keyring_mode;

        /* Since resolving these names might involve socket
         * connections and we don't want to deadlock ourselves these
         * names are resolved on execution only and in the child
         * process. */
        char *user;
        char *group;
        char **supplementary_groups;

        int set_login_environment;

        char *pam_name;

        char *utmp_id;
        ExecUtmpMode utmp_mode;

        bool no_new_privileges;

        /* MAC label configuration (SELinux/AppArmor/SMACK); *_ignore suppresses failures. */
        bool selinux_context_ignore;
        bool apparmor_profile_ignore;
        bool smack_process_label_ignore;

        char *selinux_context;
        char *apparmor_profile;
        char *smack_process_label;

        /* Filesystem namespace configuration: path visibility, mounts and image extensions. */
        char **read_write_paths, **read_only_paths, **inaccessible_paths, **exec_paths, **no_exec_paths;
        char **exec_search_path;
        unsigned long mount_propagation_flag;
        BindMount *bind_mounts;
        size_t n_bind_mounts;
        TemporaryFileSystem *temporary_filesystems;
        size_t n_temporary_filesystems;
        MountImage *mount_images;
        size_t n_mount_images;
        MountImage *extension_images;
        size_t n_extension_images;
        char **extension_directories;

        uint64_t capability_bounding_set;
        uint64_t capability_ambient_set;
        int secure_bits;

        /* Logging configuration: syslog metadata, extra journal fields, filters and rate limits. */
        int syslog_priority;
        bool syslog_level_prefix;
        char *syslog_identifier;

        struct iovec* log_extra_fields;
        size_t n_log_extra_fields;
        Set *log_filter_allowed_patterns;
        Set *log_filter_denied_patterns;

        usec_t log_ratelimit_interval_usec;
        unsigned log_ratelimit_burst;

        int log_level_max;

        char *log_namespace;

        ProtectProc protect_proc; /* hidepid= */
        ProcSubset proc_subset; /* subset= */

        /* Sandboxing toggles: private namespaces and kernel-facing protections. */
        int private_mounts;
        int memory_ksm;
        bool private_tmp;
        bool private_network;
        bool private_devices;
        bool private_users;
        bool private_ipc;
        bool protect_kernel_tunables;
        bool protect_kernel_modules;
        bool protect_kernel_logs;
        bool protect_clock;
        bool protect_control_groups;
        ProtectSystem protect_system;
        ProtectHome protect_home;
        bool protect_hostname;
        bool mount_apivfs;

        bool dynamic_user;
        bool remove_ipc;

        bool memory_deny_write_execute;
        bool restrict_realtime;
        bool restrict_suid_sgid;

        bool lock_personality;
        unsigned long personality;

        unsigned long restrict_namespaces; /* The CLONE_NEWxyz flags permitted to the unit's processes */

        /* Filesystem access restriction (cf. exec_context_get_restrict_filesystems()). */
        Set *restrict_filesystems;
        bool restrict_filesystems_allow_list:1;

        /* Seccomp system call filtering, logging and architecture restriction. */
        Hashmap *syscall_filter;
        Set *syscall_archs;
        int syscall_errno;
        bool syscall_allow_list:1;

        Hashmap *syscall_log;
        bool syscall_log_allow_list:1; /* Log listed system calls */

        bool address_families_allow_list:1;
        Set *address_families;

        char *network_namespace_path;
        char *ipc_namespace_path;

        /* Per-unit directories (runtime/state/cache/logs/configuration) and their cleanup policy. */
        ExecDirectory directories[_EXEC_DIRECTORY_TYPE_MAX];
        ExecPreserveMode runtime_directory_preserve_mode;
        usec_t timeout_clean_usec;

        Hashmap *set_credentials; /* output id → ExecSetCredential */
        Hashmap *load_credentials; /* output id → ExecLoadCredential */
        Set *import_credentials;

        ImagePolicy *root_image_policy, *mount_image_policy, *extension_image_policy;
};
+
+/* Returns true if RestrictNamespaces= narrows the permitted CLONE_NEWxyz set, i.e. not every namespace
+ * flag in NAMESPACE_FLAGS_ALL is allowed. */
+static inline bool exec_context_restrict_namespaces_set(const ExecContext *c) {
+ assert(c);
+
+ return (c->restrict_namespaces & NAMESPACE_FLAGS_ALL) != NAMESPACE_FLAGS_ALL;
+}
+
+/* Returns true if RestrictFileSystems= is in effect: either the set is an allow-list (which restricts
+ * even when empty) or a non-empty deny-list was configured. */
+static inline bool exec_context_restrict_filesystems_set(const ExecContext *c) {
+ assert(c);
+
+ return c->restrict_filesystems_allow_list ||
+ !set_isempty(c->restrict_filesystems);
+}
+
+static inline bool exec_context_with_rootfs(const ExecContext *c) {
+ assert(c);
+
+ /* Checks if RootDirectory= or RootImage= are used, i.e. whether the invocation runs off a root
+ * file system different from the host's. An unset or "/" RootDirectory= does not count. */
+
+ return !empty_or_root(c->root_directory) || c->root_image;
+}
+
+/* Flags controlling a single command invocation; combined into ExecParameters.flags. */
+typedef enum ExecFlags {
+ EXEC_APPLY_SANDBOXING = 1 << 0,
+ EXEC_APPLY_CHROOT = 1 << 1,
+ EXEC_APPLY_TTY_STDIN = 1 << 2,
+ EXEC_PASS_LOG_UNIT = 1 << 3, /* Whether to pass the unit name to the service's journal stream connection */
+ EXEC_CHOWN_DIRECTORIES = 1 << 4, /* chown() the runtime/state/cache/log directories to the user we run as, under all conditions */
+ EXEC_NSS_DYNAMIC_BYPASS = 1 << 5, /* Set the SYSTEMD_NSS_DYNAMIC_BYPASS environment variable, to disable nss-systemd blocking on PID 1, for use by dbus-daemon */
+ EXEC_CGROUP_DELEGATE = 1 << 6,
+ EXEC_IS_CONTROL = 1 << 7,
+ EXEC_CONTROL_CGROUP = 1 << 8, /* Place the process not in the indicated cgroup but in a subcgroup '/.control', but only if EXEC_CGROUP_DELEGATE and EXEC_IS_CONTROL are set, too */
+ EXEC_WRITE_CREDENTIALS = 1 << 9, /* Set up the credential store logic */
+
+ /* The following are not used by execute.c, but by consumers internally */
+ EXEC_PASS_FDS = 1 << 10,
+ EXEC_SETENV_RESULT = 1 << 11,
+ EXEC_SET_WATCHDOG = 1 << 12,
+ EXEC_SETENV_MONITOR_RESULT = 1 << 13, /* Pass exit status to OnFailure= and OnSuccess= dependencies. */
+} ExecFlags;
+
+/* Parameters for a specific invocation of a command. This structure is put together right before a command is
+ * executed. */
+struct ExecParameters {
+ /* Whether we act for the system or a per-user service manager (see the LOG_EXEC_* macros below,
+ * which pick USER_-prefixed journal fields for RUNTIME_SCOPE_USER) */
+ RuntimeScope runtime_scope;
+
+ char **environment;
+
+ /* fds to pass along to the spawned process; the array holds n_socket_fds + n_storage_fds entries,
+ * with matching human-readable names in fd_names */
+ int *fds;
+ char **fd_names;
+ size_t n_socket_fds;
+ size_t n_storage_fds;
+
+ ExecFlags flags;
+ bool selinux_context_net:1;
+
+ CGroupMask cgroup_supported;
+ char *cgroup_path;
+ uint64_t cgroup_id;
+
+ char **prefix;
+ char *received_credentials_directory;
+ char *received_encrypted_credentials_directory;
+
+ char *confirm_spawn;
+ bool shall_confirm_spawn;
+
+ usec_t watchdog_usec;
+
+ int *idle_pipe;
+
+ /* fds to use for stdio of the spawned process; -EBADF (see EXEC_PARAMETERS_INIT) means unset */
+ int stdin_fd;
+ int stdout_fd;
+ int stderr_fd;
+
+ /* An fd that is closed by the execve(), and thus will result in EOF when the execve() is done */
+ int exec_fd;
+
+ char *notify_socket;
+
+ LIST_HEAD(OpenFile, open_files);
+
+ char *fallback_smack_process_label;
+
+ char **files_env;
+ int user_lookup_fd;
+ int bpf_outer_map_fd;
+
+ /* Used for logging in the executor functions */
+ char *unit_id;
+ sd_id128_t invocation_id;
+ char invocation_id_string[SD_ID128_STRING_MAX];
+};
+
+/* Initializer for ExecParameters: stores the given flags and marks all fd fields as unset (-EBADF).
+ * Note: deliberately no trailing semicolon — the expansion is a compound-literal expression, so the
+ * macro can be used in initializer and expression contexts alike; callers supply their own ';'.
+ * (The previous trailing ';' made every 'x = EXEC_PARAMETERS_INIT(f);' expand to a stray empty
+ * statement and broke use in expression position.) */
+#define EXEC_PARAMETERS_INIT(_flags) \
+ (ExecParameters) { \
+ .flags = (_flags), \
+ .stdin_fd = -EBADF, \
+ .stdout_fd = -EBADF, \
+ .stderr_fd = -EBADF, \
+ .exec_fd = -EBADF, \
+ .bpf_outer_map_fd = -EBADF, \
+ .user_lookup_fd = -EBADF, \
+ }
+
+#include "unit.h"
+#include "dynamic-user.h"
+
+int exec_spawn(Unit *unit,
+ ExecCommand *command,
+ const ExecContext *context,
+ ExecParameters *exec_params,
+ ExecRuntime *runtime,
+ const CGroupContext *cgroup_context,
+ pid_t *ret);
+
+void exec_command_done(ExecCommand *c);
+void exec_command_done_array(ExecCommand *c, size_t n);
+ExecCommand* exec_command_free_list(ExecCommand *c);
+void exec_command_free_array(ExecCommand **c, size_t n);
+void exec_command_reset_status_array(ExecCommand *c, size_t n);
+void exec_command_reset_status_list_array(ExecCommand **c, size_t n);
+void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix);
+void exec_command_append_list(ExecCommand **l, ExecCommand *e);
+int exec_command_set(ExecCommand *c, const char *path, ...) _sentinel_;
+int exec_command_append(ExecCommand *c, const char *path, ...) _sentinel_;
+
+void exec_context_init(ExecContext *c);
+void exec_context_done(ExecContext *c);
+void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix);
+
+int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_root);
+int exec_context_destroy_mount_ns_dir(Unit *u);
+
+const char* exec_context_fdname(const ExecContext *c, int fd_index);
+
+bool exec_context_may_touch_console(const ExecContext *c);
+bool exec_context_maintains_privileges(const ExecContext *c);
+
+int exec_context_get_effective_ioprio(const ExecContext *c);
+bool exec_context_get_effective_mount_apivfs(const ExecContext *c);
+
+void exec_context_free_log_extra_fields(ExecContext *c);
+
+void exec_context_revert_tty(ExecContext *c);
+
+int exec_context_get_clean_directories(ExecContext *c, char **prefix, ExecCleanMask mask, char ***ret);
+int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret);
+
+const char *exec_context_tty_path(const ExecContext *context);
+int exec_context_apply_tty_size(const ExecContext *context, int tty_fd, const char *tty_path);
+void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p);
+
+uint64_t exec_context_get_rlimit(const ExecContext *c, const char *name);
+int exec_context_get_oom_score_adjust(const ExecContext *c);
+uint64_t exec_context_get_coredump_filter(const ExecContext *c);
+int exec_context_get_nice(const ExecContext *c);
+int exec_context_get_cpu_sched_policy(const ExecContext *c);
+int exec_context_get_cpu_sched_priority(const ExecContext *c);
+uint64_t exec_context_get_timer_slack_nsec(const ExecContext *c);
+char** exec_context_get_syscall_filter(const ExecContext *c);
+char** exec_context_get_syscall_archs(const ExecContext *c);
+char** exec_context_get_syscall_log(const ExecContext *c);
+char** exec_context_get_address_families(const ExecContext *c);
+char** exec_context_get_restrict_filesystems(const ExecContext *c);
+
+void exec_status_start(ExecStatus *s, pid_t pid);
+void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status);
+void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix);
+void exec_status_reset(ExecStatus *s);
+
+int exec_shared_runtime_acquire(Manager *m, const ExecContext *c, const char *name, bool create, ExecSharedRuntime **ret);
+ExecSharedRuntime *exec_shared_runtime_destroy(ExecSharedRuntime *r);
+ExecSharedRuntime *exec_shared_runtime_unref(ExecSharedRuntime *r);
+DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime*, exec_shared_runtime_unref);
+
+int exec_shared_runtime_serialize(const Manager *m, FILE *f, FDSet *fds);
+int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds);
+int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds);
+void exec_shared_runtime_done(ExecSharedRuntime *rt);
+void exec_shared_runtime_vacuum(Manager *m);
+
+int exec_runtime_make(const Unit *unit, const ExecContext *context, ExecSharedRuntime *shared, DynamicCreds *creds, ExecRuntime **ret);
+ExecRuntime* exec_runtime_free(ExecRuntime *rt);
+DEFINE_TRIVIAL_CLEANUP_FUNC(ExecRuntime*, exec_runtime_free);
+ExecRuntime* exec_runtime_destroy(ExecRuntime *rt);
+void exec_runtime_clear(ExecRuntime *rt);
+
+int exec_params_get_cgroup_path(const ExecParameters *params, const CGroupContext *c, char **ret);
+void exec_params_shallow_clear(ExecParameters *p);
+void exec_params_dump(const ExecParameters *p, FILE* f, const char *prefix);
+void exec_params_deep_clear(ExecParameters *p);
+
+bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c);
+
+void exec_directory_done(ExecDirectory *d);
+int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink);
+void exec_directory_sort(ExecDirectory *d);
+bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type);
+
+ExecCleanMask exec_clean_mask_from_string(const char *s);
+
+const char* exec_output_to_string(ExecOutput i) _const_;
+ExecOutput exec_output_from_string(const char *s) _pure_;
+
+const char* exec_input_to_string(ExecInput i) _const_;
+ExecInput exec_input_from_string(const char *s) _pure_;
+
+const char* exec_utmp_mode_to_string(ExecUtmpMode i) _const_;
+ExecUtmpMode exec_utmp_mode_from_string(const char *s) _pure_;
+
+const char* exec_preserve_mode_to_string(ExecPreserveMode i) _const_;
+ExecPreserveMode exec_preserve_mode_from_string(const char *s) _pure_;
+
+const char* exec_keyring_mode_to_string(ExecKeyringMode i) _const_;
+ExecKeyringMode exec_keyring_mode_from_string(const char *s) _pure_;
+
+const char* exec_directory_type_to_string(ExecDirectoryType i) _const_;
+ExecDirectoryType exec_directory_type_from_string(const char *s) _pure_;
+
+const char* exec_directory_type_symlink_to_string(ExecDirectoryType i) _const_;
+ExecDirectoryType exec_directory_type_symlink_from_string(const char *s) _pure_;
+
+const char* exec_directory_type_mode_to_string(ExecDirectoryType i) _const_;
+ExecDirectoryType exec_directory_type_mode_from_string(const char *s) _pure_;
+
+const char* exec_resource_type_to_string(ExecDirectoryType i) _const_;
+ExecDirectoryType exec_resource_type_from_string(const char *s) _pure_;
+
+bool exec_needs_mount_namespace(const ExecContext *context, const ExecParameters *params, const ExecRuntime *runtime);
+bool exec_needs_network_namespace(const ExecContext *context);
+bool exec_needs_ipc_namespace(const ExecContext *context);
+
+/* These logging macros do the same logging as those in unit.h, but using ExecContext and ExecParameters
+ * instead of the unit object, so that it can be used in the sd-executor context (where the unit object is
+ * not available). */
+
+/* Journal field names (and printf-format variants) identifying the unit and its invocation; the
+ * per-user service manager uses the USER_-prefixed variants. */
+#define LOG_EXEC_ID_FIELD(ep) \
+ ((ep)->runtime_scope == RUNTIME_SCOPE_USER ? "USER_UNIT=" : "UNIT=")
+#define LOG_EXEC_ID_FIELD_FORMAT(ep) \
+ ((ep)->runtime_scope == RUNTIME_SCOPE_USER ? "USER_UNIT=%s" : "UNIT=%s")
+#define LOG_EXEC_INVOCATION_ID_FIELD(ep) \
+ ((ep)->runtime_scope == RUNTIME_SCOPE_USER ? "USER_INVOCATION_ID=" : "INVOCATION_ID=")
+#define LOG_EXEC_INVOCATION_ID_FIELD_FORMAT(ep) \
+ ((ep)->runtime_scope == RUNTIME_SCOPE_USER ? "USER_INVOCATION_ID=%s" : "INVOCATION_ID=%s")
+
+/* Logs on behalf of an exec invocation; 'error' may be 0 (hence "zerook"). Evaluates to the negative
+ * errno value of 'error', so it supports the 'return log_exec_full_errno_zerook(...)' idiom. A message
+ * is emitted only if it passes both the global maximum log level and the per-context LogLevelMax=
+ * setting (log_level_max < 0 means "no per-unit limit"); the context's extra journal fields are pushed
+ * for the duration of the call. */
+#define log_exec_full_errno_zerook(ec, ep, level, error, ...) \
+ ({ \
+ const ExecContext *_c = (ec); \
+ const ExecParameters *_p = (ep); \
+ const int _l = (level); \
+ bool _do_log = !(log_get_max_level() < LOG_PRI(_l) || \
+ !(_c->log_level_max < 0 || \
+ _c->log_level_max >= LOG_PRI(_l))); \
+ LOG_CONTEXT_PUSH_IOV(_c->log_extra_fields, \
+ _c->n_log_extra_fields); \
+ !_do_log ? -ERRNO_VALUE(error) : \
+ log_object_internal(_l, error, PROJECT_FILE, \
+ __LINE__, __func__, \
+ LOG_EXEC_ID_FIELD(_p), \
+ _p->unit_id, \
+ LOG_EXEC_INVOCATION_ID_FIELD(_p), \
+ _p->invocation_id_string, ##__VA_ARGS__); \
+ })
+
+/* Like log_exec_full_errno_zerook(), but additionally asserts that 'error' is non-zero (see
+ * ASSERT_NON_ZERO()). */
+#define log_exec_full_errno(ec, ep, level, error, ...) \
+ ({ \
+ int _error = (error); \
+ ASSERT_NON_ZERO(_error); \
+ log_exec_full_errno_zerook(ec, ep, level, _error, ##__VA_ARGS__); \
+ })
+
+#define log_exec_full(ec, ep, level, ...) (void) log_exec_full_errno_zerook(ec, ep, level, 0, __VA_ARGS__)
+
+/* Convenience wrappers for the common log levels, with and without an errno argument. */
+#define log_exec_debug(ec, ep, ...) log_exec_full(ec, ep, LOG_DEBUG, __VA_ARGS__)
+#define log_exec_info(ec, ep, ...) log_exec_full(ec, ep, LOG_INFO, __VA_ARGS__)
+#define log_exec_notice(ec, ep, ...) log_exec_full(ec, ep, LOG_NOTICE, __VA_ARGS__)
+#define log_exec_warning(ec, ep, ...) log_exec_full(ec, ep, LOG_WARNING, __VA_ARGS__)
+#define log_exec_error(ec, ep, ...) log_exec_full(ec, ep, LOG_ERR, __VA_ARGS__)
+
+#define log_exec_debug_errno(ec, ep, error, ...) log_exec_full_errno(ec, ep, LOG_DEBUG, error, __VA_ARGS__)
+#define log_exec_info_errno(ec, ep, error, ...) log_exec_full_errno(ec, ep, LOG_INFO, error, __VA_ARGS__)
+#define log_exec_notice_errno(ec, ep, error, ...) log_exec_full_errno(ec, ep, LOG_NOTICE, error, __VA_ARGS__)
+#define log_exec_warning_errno(ec, ep, error, ...) log_exec_full_errno(ec, ep, LOG_WARNING, error, __VA_ARGS__)
+#define log_exec_error_errno(ec, ep, error, ...) log_exec_full_errno(ec, ep, LOG_ERR, error, __VA_ARGS__)
+
+/* Structured-logging counterpart of log_exec_full_errno(); evaluates to the negative errno value of
+ * 'error'. Logging is suppressed only when LogLevelMax= is set (log_level_max >= 0) and excludes this
+ * priority — log_level_max < 0 means "no per-unit limit", matching log_exec_full_errno_zerook() above.
+ * (The condition was previously negated, which suppressed all structured logging in the default,
+ * unset-LogLevelMax= case and logged only when the limit excluded the message.) */
+#define log_exec_struct_errno(ec, ep, level, error, ...) \
+ ({ \
+ const ExecContext *_c = (ec); \
+ const ExecParameters *_p = (ep); \
+ const int _l = (level); \
+ bool _do_log = _c->log_level_max < 0 || \
+ _c->log_level_max >= LOG_PRI(_l); \
+ LOG_CONTEXT_PUSH_IOV(_c->log_extra_fields, \
+ _c->n_log_extra_fields); \
+ _do_log ? \
+ log_struct_errno(_l, error, __VA_ARGS__, LOG_EXEC_ID_FIELD_FORMAT(_p), _p->unit_id) : \
+ -ERRNO_VALUE(error); \
+ })
+
+#define log_exec_struct(ec, ep, level, ...) log_exec_struct_errno(ec, ep, level, 0, __VA_ARGS__)
+
+/* Like log_exec_struct_errno(), but takes the structured fields as an iovec array. Logging is
+ * suppressed only when LogLevelMax= is set (log_level_max >= 0) and excludes this priority, matching
+ * log_exec_full_errno_zerook() above. (The condition was previously negated, which suppressed all
+ * logging in the default, unset-LogLevelMax= case.) */
+#define log_exec_struct_iovec_errno(ec, ep, level, error, iovec, n_iovec) \
+ ({ \
+ const ExecContext *_c = (ec); \
+ const ExecParameters *_p = (ep); \
+ const int _l = (level); \
+ bool _do_log = _c->log_level_max < 0 || \
+ _c->log_level_max >= LOG_PRI(_l); \
+ LOG_CONTEXT_PUSH_IOV(_c->log_extra_fields, \
+ _c->n_log_extra_fields); \
+ _do_log ? \
+ log_struct_iovec_errno(_l, error, iovec, n_iovec) : \
+ -ERRNO_VALUE(error); \
+ })
+
+#define log_exec_struct_iovec(ec, ep, level, iovec, n_iovec) log_exec_struct_iovec_errno(ec, ep, level, 0, iovec, n_iovec)
+
+/* Like LOG_MESSAGE(), but with the unit name prefixed. */
+#define LOG_EXEC_MESSAGE(ep, fmt, ...) LOG_MESSAGE("%s: " fmt, (ep)->unit_id, ##__VA_ARGS__)
+/* Expand to a format/value pair identifying the unit resp. its invocation ID, for log_struct-style calls. */
+#define LOG_EXEC_ID(ep) LOG_EXEC_ID_FIELD_FORMAT(ep), (ep)->unit_id
+#define LOG_EXEC_INVOCATION_ID(ep) LOG_EXEC_INVOCATION_ID_FIELD_FORMAT(ep), (ep)->invocation_id_string
+
+/* Pushes the unit identity, invocation ID and extra journal fields onto the log context for the current
+ * scope. The p/c arguments only name the local temporaries; the public wrapper below generates unique
+ * names via UNIQ_T() so nested uses don't shadow each other. */
+#define _LOG_CONTEXT_PUSH_EXEC(ec, ep, p, c) \
+ const ExecContext *c = (ec); \
+ const ExecParameters *p = (ep); \
+ LOG_CONTEXT_PUSH_KEY_VALUE(LOG_EXEC_ID_FIELD(p), p->unit_id); \
+ LOG_CONTEXT_PUSH_KEY_VALUE(LOG_EXEC_INVOCATION_ID_FIELD(p), p->invocation_id_string); \
+ LOG_CONTEXT_PUSH_IOV(c->log_extra_fields, c->n_log_extra_fields)
+
+#define LOG_CONTEXT_PUSH_EXEC(ec, ep) \
+ _LOG_CONTEXT_PUSH_EXEC(ec, ep, UNIQ_T(p, UNIQ), UNIQ_T(c, UNIQ))
diff --git a/src/core/executor.c b/src/core/executor.c
new file mode 100644
index 0000000..b2716ef
--- /dev/null
+++ b/src/core/executor.c
@@ -0,0 +1,272 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <getopt.h>
+#include <unistd.h>
+
+#include "sd-messages.h"
+
+#include "alloc-util.h"
+#include "argv-util.h"
+#include "build.h"
+#include "exec-invoke.h"
+#include "execute-serialize.h"
+#include "execute.h"
+#include "exit-status.h"
+#include "fdset.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "getopt-defs.h"
+#include "label-util.h"
+#include "parse-util.h"
+#include "pretty-print.h"
+#include "selinux-util.h"
+#include "static-destruct.h"
+
+/* Stream handed to us via --deserialize=, carrying the serialized invocation; closed automatically at
+ * static_destruct() time. */
+static FILE *arg_serialization = NULL;
+
+STATIC_DESTRUCTOR_REGISTER(arg_serialization, fclosep);
+
+/* Prints the sd-executor help text to stdout. Returns 0, or -ENOMEM (via log_oom()) if the clickable
+ * man-page link could not be generated. */
+static int help(void) {
+ _cleanup_free_ char *man_link = NULL;
+
+ if (terminal_urlify_man("systemd", "1", &man_link) < 0)
+ return log_oom();
+
+ printf("%s [OPTIONS...]\n\n"
+ "%sSandbox and execute processes.%s\n\n"
+ " -h --help Show this help and exit\n"
+ " --version Print version string and exit\n"
+ " --log-target=TARGET Set log target (console, journal,\n"
+ " journal-or-kmsg,\n"
+ " kmsg, null)\n"
+ " --log-level=LEVEL Set log level (debug, info, notice,\n"
+ " warning, err, crit,\n"
+ " alert, emerg)\n"
+ " --log-color=BOOL Highlight important messages\n"
+ " --log-location=BOOL Include code location in messages\n"
+ " --log-time=BOOL Prefix messages with current time\n"
+ " --deserialize=FD Deserialize process config from FD\n"
+ "\nSee the %s for details.\n",
+ program_invocation_short_name,
+ ansi_highlight(),
+ ansi_normal(),
+ man_link);
+
+ return 0;
+}
+
+/* Parses the sd-executor command line. Returns 1 when there is work to do, 0 after printing
+ * --help/--version output (caller should exit cleanly), and a negative errno-style value on error.
+ * A --deserialize= fd is mandatory; it is marked close-on-exec and wrapped into arg_serialization. */
+static int parse_argv(int argc, char *argv[]) {
+ enum {
+ COMMON_GETOPT_ARGS,
+ ARG_VERSION,
+ ARG_DESERIALIZE,
+ };
+
+ static const struct option options[] = {
+ { "log-level", required_argument, NULL, ARG_LOG_LEVEL },
+ { "log-target", required_argument, NULL, ARG_LOG_TARGET },
+ { "log-color", required_argument, NULL, ARG_LOG_COLOR },
+ { "log-location", required_argument, NULL, ARG_LOG_LOCATION },
+ { "log-time", required_argument, NULL, ARG_LOG_TIME },
+ { "help", no_argument, NULL, 'h' },
+ { "version", no_argument, NULL, ARG_VERSION },
+ { "deserialize", required_argument, NULL, ARG_DESERIALIZE },
+ {}
+ };
+
+ int c, r;
+
+ assert(argc >= 0);
+ assert(argv);
+
+ while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0)
+ switch (c) {
+ case 'h':
+ return help();
+
+ case ARG_VERSION:
+ return version();
+
+ case ARG_LOG_LEVEL:
+ r = log_set_max_level_from_string(optarg);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse log level \"%s\": %m", optarg);
+
+ break;
+
+ case ARG_LOG_TARGET:
+ r = log_set_target_from_string(optarg);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse log target \"%s\": %m", optarg);
+
+ break;
+
+ case ARG_LOG_COLOR:
+ r = log_show_color_from_string(optarg);
+ if (r < 0)
+ return log_error_errno(
+ r,
+ "Failed to parse log color setting \"%s\": %m",
+ optarg);
+
+ break;
+
+ case ARG_LOG_LOCATION:
+ r = log_show_location_from_string(optarg);
+ if (r < 0)
+ return log_error_errno(
+ r,
+ "Failed to parse log location setting \"%s\": %m",
+ optarg);
+
+ break;
+
+ case ARG_LOG_TIME:
+ r = log_show_time_from_string(optarg);
+ if (r < 0)
+ return log_error_errno(
+ r,
+ "Failed to parse log time setting \"%s\": %m",
+ optarg);
+
+ break;
+
+ case ARG_DESERIALIZE: {
+ _cleanup_close_ int fd = -EBADF;
+ FILE *f;
+
+ fd = parse_fd(optarg);
+ if (fd < 0)
+ return log_error_errno(fd,
+ "Failed to parse serialization fd \"%s\": %m",
+ optarg);
+
+ /* Mark close-on-exec so that the fdset_new_fill() call in run() filters this fd
+ * out and it is not leaked to the spawned process. */
+ r = fd_cloexec(fd, /* cloexec= */ true);
+ if (r < 0)
+ return log_error_errno(r,
+ "Failed to set serialization fd %d to close-on-exec: %m",
+ fd);
+
+ /* NOTE(review): on success take_fdopen() takes ownership of fd (disarming the
+ * _cleanup_close_); on failure fd is untouched and still valid for the log below. */
+ f = take_fdopen(&fd, "r");
+ if (!f)
+ return log_error_errno(errno, "Failed to open serialization fd %d: %m", fd);
+
+ /* --deserialize= may be given multiple times; the last one wins. */
+ safe_fclose(arg_serialization);
+ arg_serialization = f;
+
+ break;
+ }
+
+ case '?':
+ return -EINVAL;
+
+ default:
+ assert_not_reached();
+ }
+
+ if (!arg_serialization)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No serialization fd specified.");
+
+ return 1 /* work to do */;
+}
+
+/* Deserializes the execution request from --deserialize= and hands control to exec_invoke(). Returns
+ * the intended exit status (EXIT_SUCCESS also on the confirm-spawn 'skip' path), or a negative
+ * errno-style value on failure, which main() maps to EXIT_FAILURE. */
+static int run(int argc, char *argv[]) {
+ _cleanup_fdset_free_ FDSet *fdset = NULL;
+ _cleanup_(cgroup_context_done) CGroupContext cgroup_context = {};
+ _cleanup_(exec_context_done) ExecContext context = {};
+ _cleanup_(exec_command_done) ExecCommand command = {};
+ _cleanup_(exec_params_deep_clear) ExecParameters params = EXEC_PARAMETERS_INIT(/* flags= */ 0);
+ _cleanup_(exec_shared_runtime_done) ExecSharedRuntime shared = {
+ .netns_storage_socket = EBADF_PAIR,
+ .ipcns_storage_socket = EBADF_PAIR,
+ };
+ _cleanup_(dynamic_creds_done) DynamicCreds dynamic_creds = {};
+ _cleanup_(exec_runtime_clear) ExecRuntime runtime = {
+ .ephemeral_storage_socket = EBADF_PAIR,
+ .shared = &shared,
+ .dynamic_creds = &dynamic_creds,
+ };
+ int exit_status = EXIT_SUCCESS, r;
+
+ exec_context_init(&context);
+ cgroup_context_init(&cgroup_context);
+
+ /* We might be starting the journal itself, we'll be told by the caller what to do */
+ log_set_always_reopen_console(true);
+ log_set_prohibit_ipc(true);
+ log_setup();
+
+ r = parse_argv(argc, argv);
+ if (r <= 0)
+ return r;
+
+ /* Now that we know the intended log target, allow IPC and open the final log target. */
+ log_set_prohibit_ipc(false);
+ log_open();
+
+ /* This call would collect all passed fds and enable CLOEXEC. We'll unset it in exec_invoke (flag_fds)
+ * for fds that shall be passed to the child.
+ * The serialization fd is set to CLOEXEC in parse_argv, so it's also filtered. */
+ r = fdset_new_fill(/* filter_cloexec= */ 0, &fdset);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create fd set: %m");
+
+ /* Initialize lazily. SMACK is just a few operations, but the SELinux is very slow as it requires
+ * loading the entire database in memory, so we will do it lazily only if it is actually needed, to
+ * avoid wasting 2ms-10ms for each sd-executor that gets spawned. */
+ r = mac_init_lazy();
+ if (r < 0)
+ return log_error_errno(r, "Failed to initialize MAC layer: %m");
+
+ r = exec_deserialize_invocation(arg_serialization,
+ fdset,
+ &context,
+ &command,
+ &params,
+ &runtime,
+ &cgroup_context);
+ if (r < 0)
+ return log_error_errno(r, "Failed to deserialize: %m");
+
+ /* Deserialization is complete; release the stream and the collected fd set before invoking
+ * the command. */
+ arg_serialization = safe_fclose(arg_serialization);
+ fdset = fdset_free(fdset);
+
+ r = exec_invoke(&command,
+ &context,
+ &params,
+ &runtime,
+ &cgroup_context,
+ &exit_status);
+ if (r < 0) {
+ const char *status = ASSERT_PTR(
+ exit_status_to_string(exit_status, EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD));
+
+ log_exec_struct_errno(&context, &params, LOG_ERR, r,
+ "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
+ LOG_EXEC_INVOCATION_ID(&params),
+ LOG_EXEC_MESSAGE(&params, "Failed at step %s spawning %s: %m",
+ status, command.path),
+ "EXECUTABLE=%s", command.path);
+ } else
+ assert(exit_status == EXIT_SUCCESS); /* When 'skip' is chosen in the confirm spawn prompt */
+
+ return exit_status;
+}
+
+/* Entry point of sd-executor: stashes argv, runs the deserialized invocation, then tears down MAC and
+ * static state. Negative results from run() are mapped to EXIT_FAILURE; otherwise the computed exit
+ * status is propagated. */
+int main(int argc, char *argv[]) {
+ int r;
+
+ /* We use safe_fork() for spawning sd-pam helper process, which internally calls rename_process().
+ * As the last step of renaming, all saved argvs are memzero()-ed. Hence, we need to save the argv
+ * first to prevent showing "intense" cmdline. See #30352. */
+ save_argc_argv(argc, argv);
+
+ r = run(argc, argv);
+
+ mac_selinux_finish();
+ static_destruct();
+
+ return r < 0 ? EXIT_FAILURE : r;
+}
diff --git a/src/core/fuzz-execute-serialize.c b/src/core/fuzz-execute-serialize.c
new file mode 100644
index 0000000..6069efd
--- /dev/null
+++ b/src/core/fuzz-execute-serialize.c
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+/* Notes on how to run the fuzzer manually:
+ * 1) Build the fuzzers with LLVM's libFuzzer and ASan+UBSan:
+ * $ CC=clang CXX=clang++ meson build-libfuzz -Db_sanitize=address,undefined -Dllvm-fuzz=true -Db_lundef=false
+ *
+ * 2) Collect some valid inputs:
+ *
+ * OUT=test/fuzz/fuzz-execute-serialize/initial
+ * for section in context command parameters runtime cgroup; do
+ * awk "match(\$0, /startswith\\(.+, \"(exec-${section}-[^\"]+=)\"/, m) { print m[1]; }" \
+ * src/core/execute-serialize.c >>"$OUT"
+ * # Each "section" is delimited by an empty line
+ * echo >>"$OUT"
+ * done
+ *
+ * 3) Run the fuzzer:
+ * $ build-libfuzz/fuzz-execute-serialize test/fuzz/fuzz-execute-serialize
+ */
+
+#include <stdio.h>
+
+#include "alloc-util.h"
+#include "execute-serialize.h"
+#include "fd-util.h"
+#include "fuzz.h"
+#include "service.h"
+
+/* Round-trips one fuzzer input: deserialize → serialize → deserialize, then manually releases all
+ * state not covered by the _cleanup_ attributes (command, shared runtime, dynamic creds, ephemeral
+ * runtime fields). */
+static void exec_fuzz_one(FILE *f, FDSet *fdset) {
+ _cleanup_(exec_params_deep_clear) ExecParameters params = EXEC_PARAMETERS_INIT(/* flags= */ 0);
+ _cleanup_(exec_context_done) ExecContext exec_context = {};
+ _cleanup_(cgroup_context_done) CGroupContext cgroup_context = {};
+ DynamicCreds dynamic_creds = {};
+ ExecCommand command = {};
+ ExecSharedRuntime shared = {
+ .netns_storage_socket = EBADF_PAIR,
+ .ipcns_storage_socket = EBADF_PAIR,
+ };
+ ExecRuntime runtime = {
+ .ephemeral_storage_socket = EBADF_PAIR,
+ .shared = &shared,
+ .dynamic_creds = &dynamic_creds,
+ };
+
+ exec_context_init(&exec_context);
+ cgroup_context_init(&cgroup_context);
+
+ (void) exec_deserialize_invocation(f, fdset, &exec_context, &command, &params, &runtime, &cgroup_context);
+ (void) exec_serialize_invocation(f, fdset, &exec_context, &command, &params, &runtime, &cgroup_context);
+ (void) exec_deserialize_invocation(f, fdset, &exec_context, &command, &params, &runtime, &cgroup_context);
+
+ /* We definitely didn't provide valid FDs during deserialization, so
+ * wipe the FDs before exec_params_serialized_clear() kicks in, otherwise
+ * we'll hit the assert in safe_close() */
+ params.stdin_fd = -EBADF;
+ params.stdout_fd = -EBADF;
+ params.stderr_fd = -EBADF;
+ params.exec_fd = -EBADF;
+ params.user_lookup_fd = -EBADF;
+ params.bpf_outer_map_fd = -EBADF;
+ if (!params.fds)
+ params.n_socket_fds = params.n_storage_fds = 0;
+ for (size_t i = 0; params.fds && i < params.n_socket_fds + params.n_storage_fds; i++)
+ params.fds[i] = -EBADF;
+
+ exec_command_done_array(&command, /* n= */ 1);
+ exec_shared_runtime_done(&shared);
+ /* Free the group creds only when distinct, since user and group may alias the same object. */
+ if (dynamic_creds.group != dynamic_creds.user)
+ dynamic_user_free(dynamic_creds.group);
+ dynamic_user_free(dynamic_creds.user);
+ free(runtime.ephemeral_copy);
+ safe_close_pair(runtime.ephemeral_storage_socket);
+}
+
+/* libFuzzer entry point: wraps the raw input in a FILE and feeds it to exec_fuzz_one(). */
+int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+ _cleanup_fdset_free_ FDSet *fds = NULL;
+ _cleanup_fclose_ FILE *input = NULL;
+
+ /* Cap inputs at 128K; larger ones only slow the fuzzer down. */
+ if (outside_size_range(size, 0, 128 * 1024))
+ return 0;
+
+ fuzz_setup_logging();
+
+ assert_se(fds = fdset_new());
+ assert_se(input = data_to_file(data, size));
+
+ exec_fuzz_one(input, fds);
+
+ return 0;
+}
diff --git a/src/core/fuzz-manager-serialize.c b/src/core/fuzz-manager-serialize.c
new file mode 100644
index 0000000..57083ca
--- /dev/null
+++ b/src/core/fuzz-manager-serialize.c
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <stdio.h>
+
+#include "alloc-util.h"
+#include "fd-util.h"
+#include "fuzz.h"
+#include "manager-serialize.h"
+#include "manager.h"
+#include "service.h"
+
+/* libFuzzer entry point: feeds the input to manager_deserialize() on a minimal test-mode manager, then
+ * serializes the result twice (discarding the output to /dev/null). */
+int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+ _cleanup_(manager_freep) Manager *m = NULL;
+ _cleanup_fclose_ FILE *f = NULL, *null = NULL;
+ _cleanup_fdset_free_ FDSet *fdset = NULL;
+
+ if (outside_size_range(size, 0, 65536))
+ return 0;
+
+ fuzz_setup_logging();
+
+ assert_se(manager_new(RUNTIME_SCOPE_SYSTEM, MANAGER_TEST_RUN_MINIMAL|MANAGER_TEST_DONT_OPEN_EXECUTOR, &m) >= 0);
+ /* Set log overrides as well to make it harder for a serialization file
+ * to switch log levels/targets during fuzzing */
+ manager_override_log_level(m, log_get_max_level());
+ manager_override_log_target(m, log_get_target());
+ assert_se(null = fopen("/dev/null", "we"));
+ assert_se(fdset = fdset_new());
+ assert_se(f = data_to_file(data, size));
+
+ (void) manager_deserialize(m, f, fdset);
+ /* Serialize with the final boolean both ways — presumably manager_serialize()'s 'switching_root'
+ * flag; confirm against manager-serialize.h */
+ (void) manager_serialize(m, null, fdset, true);
+ (void) manager_serialize(m, null, fdset, false);
+
+ return 0;
+}
diff --git a/src/core/fuzz-manager-serialize.options b/src/core/fuzz-manager-serialize.options
new file mode 100644
index 0000000..678d526
--- /dev/null
+++ b/src/core/fuzz-manager-serialize.options
@@ -0,0 +1,2 @@
+[libfuzzer]
+max_len = 65536
diff --git a/src/core/fuzz-unit-file.c b/src/core/fuzz-unit-file.c
new file mode 100644
index 0000000..57480cf
--- /dev/null
+++ b/src/core/fuzz-unit-file.c
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "conf-parser.h"
+#include "fd-util.h"
+#include "fuzz.h"
+#include "install.h"
+#include "load-fragment.h"
+#include "manager-dump.h"
+#include "memstream-util.h"
+#include "string-util.h"
+#include "unit-serialize.h"
+#include "utf8.h"
+
+/* libFuzzer entry point for the unit-file fragment parser: the first input line selects the unit type,
+ * the rest is parsed as a unit file for a synthetic unit "a.<type>", which is then dumped together
+ * with the manager state. */
+int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+ _cleanup_fclose_ FILE *f = NULL;
+ _cleanup_free_ char *p = NULL;
+ UnitType t;
+ _cleanup_(manager_freep) Manager *m = NULL;
+ Unit *u;
+ const char *name;
+ long offset;
+
+ if (outside_size_range(size, 0, 65536))
+ return 0;
+
+ f = data_to_file(data, size);
+
+ assert_se(f);
+
+ /* First line: the unit type (e.g. "service"); bail out on unknown types or types without a
+ * load routine. */
+ if (read_line(f, LINE_MAX, &p) < 0)
+ return 0;
+
+ t = unit_type_from_string(p);
+ if (t < 0)
+ return 0;
+
+ if (!unit_vtable[t]->load)
+ return 0;
+
+ /* Remember where the unit-file body starts, so we can rewind after the pre-scan below. */
+ offset = ftell(f);
+ assert_se(offset >= 0);
+
+ for (;;) {
+ _cleanup_free_ char *l = NULL;
+ const char *ll;
+
+ if (read_line(f, LONG_LINE_MAX, &l) <= 0)
+ break;
+
+ ll = startswith(l, UTF8_BYTE_ORDER_MARK) ?: l;
+ ll = ll + strspn(ll, WHITESPACE);
+
+ if (HAS_FEATURE_MEMORY_SANITIZER && startswith(ll, "ListenNetlink")) {
+ /* ListenNetlink causes a false positive in msan,
+ * let's skip this for now. */
+ log_notice("Skipping test because ListenNetlink= is present");
+ return 0;
+ }
+ }
+
+ assert_se(fseek(f, offset, SEEK_SET) == 0);
+
+ fuzz_setup_logging();
+
+ assert_se(manager_new(RUNTIME_SCOPE_SYSTEM, MANAGER_TEST_RUN_MINIMAL|MANAGER_TEST_DONT_OPEN_EXECUTOR, &m) >= 0);
+
+ name = strjoina("a.", unit_type_to_string(t));
+ assert_se(unit_new_for_name(m, unit_vtable[t]->object_size, name, &u) >= 0);
+
+ /* Parse errors are the interesting part — ignore the return value. */
+ (void) config_parse(
+ name, name, f,
+ UNIT_VTABLE(u)->sections,
+ config_item_perf_lookup, load_fragment_gperf_lookup,
+ 0,
+ u,
+ NULL);
+
+ _cleanup_(memstream_done) MemStream ms = {};
+ FILE *g;
+
+ assert_se(g = memstream_init(&ms));
+ unit_dump(u, g, "");
+ manager_dump(m, g, /* patterns= */ NULL, ">>>");
+
+ return 0;
+}
diff --git a/src/core/fuzz-unit-file.options b/src/core/fuzz-unit-file.options
new file mode 100644
index 0000000..678d526
--- /dev/null
+++ b/src/core/fuzz-unit-file.options
@@ -0,0 +1,2 @@
+[libfuzzer]
+max_len = 65536
diff --git a/src/core/generator-setup.c b/src/core/generator-setup.c
new file mode 100644
index 0000000..00d6ad6
--- /dev/null
+++ b/src/core/generator-setup.c
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <unistd.h>
+
+#include "generator-setup.h"
+#include "macro.h"
+#include "mkdir-label.h"
+#include "rm-rf.h"
+
+/* Creates the three generator directories (normal, early, late) with mode 0755 and the proper MAC
+ * label. All three are always attempted; the first error encountered is returned (later successes do
+ * not mask it). Returns -EINVAL if any of the paths is unset. */
+int lookup_paths_mkdir_generator(LookupPaths *p) {
+ int r, q;
+
+ assert(p);
+
+ if (!p->generator || !p->generator_early || !p->generator_late)
+ return -EINVAL;
+
+ r = mkdir_p_label(p->generator, 0755);
+
+ q = mkdir_p_label(p->generator_early, 0755);
+ if (q < 0 && r >= 0)
+ r = q;
+
+ q = mkdir_p_label(p->generator_late, 0755);
+ if (q < 0 && r >= 0)
+ r = q;
+
+ return r;
+}
+
+/* Removes each generator directory if it turned out empty; non-empty ones are left in place (rmdir()
+ * of a non-empty directory simply fails, and failures are ignored). */
+void lookup_paths_trim_generator(LookupPaths *p) {
+ assert(p);
+
+ const char *dirs[] = { p->generator, p->generator_early, p->generator_late };
+
+ for (size_t i = 0; i < sizeof(dirs) / sizeof(dirs[0]); i++)
+ if (dirs[i])
+ (void) rmdir(dirs[i]);
+}
+
+/* Removes the generated unit files in full — all three generator directories plus the temporary
+ * directory — ignoring errors. Unset paths are skipped. */
+void lookup_paths_flush_generator(LookupPaths *p) {
+ assert(p);
+
+ const char *dirs[] = { p->generator, p->generator_early, p->generator_late, p->temporary_dir };
+
+ for (size_t i = 0; i < sizeof(dirs) / sizeof(dirs[0]); i++)
+ if (dirs[i])
+ (void) rm_rf(dirs[i], REMOVE_ROOT|REMOVE_PHYSICAL);
+}
diff --git a/src/core/generator-setup.h b/src/core/generator-setup.h
new file mode 100644
index 0000000..1cc816b
--- /dev/null
+++ b/src/core/generator-setup.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "path-lookup.h"
+
+int lookup_paths_mkdir_generator(LookupPaths *p);
+void lookup_paths_trim_generator(LookupPaths *p);
+void lookup_paths_flush_generator(LookupPaths *p);
diff --git a/src/core/ima-setup.c b/src/core/ima-setup.c
new file mode 100644
index 0000000..37916bb
--- /dev/null
+++ b/src/core/ima-setup.c
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+/***
+ Copyright © 2012 Roberto Sassu - Politecnico di Torino, Italy
+ TORSEC group — http://security.polito.it
+***/
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "ima-setup.h"
+#include "log.h"
+
+#define IMA_SECFS_DIR "/sys/kernel/security/ima"
+#define IMA_SECFS_POLICY IMA_SECFS_DIR "/policy"
+#define IMA_POLICY_PATH "/etc/ima/ima-policy"
+
+/* Load the custom IMA policy /etc/ima/ima-policy into the kernel via
+ * securityfs. Best-effort: most failures are logged and swallowed (return 0).
+ * No-op when IMA is disabled in the kernel, when another custom policy was
+ * already loaded, or when no policy file exists. Only read/write errors while
+ * streaming the policy line-by-line are propagated. */
+int ima_setup(void) {
+#if ENABLE_IMA
+ _cleanup_fclose_ FILE *input = NULL;
+ _cleanup_close_ int imafd = -EBADF;
+ unsigned lineno = 0;
+ int r;
+
+ if (access(IMA_SECFS_DIR, F_OK) < 0) {
+ log_debug_errno(errno, "IMA support is disabled in the kernel, ignoring: %m");
+ return 0;
+ }
+
+ /* The securityfs policy file stops being writable once a policy has been loaded */
+ if (access(IMA_SECFS_POLICY, W_OK) < 0) {
+ log_warning_errno(errno, "Another IMA custom policy has already been loaded, ignoring: %m");
+ return 0;
+ }
+
+ if (access(IMA_POLICY_PATH, F_OK) < 0) {
+ log_debug_errno(errno, "No IMA custom policy file "IMA_POLICY_PATH", ignoring: %m");
+ return 0;
+ }
+
+ imafd = open(IMA_SECFS_POLICY, O_WRONLY|O_CLOEXEC);
+ if (imafd < 0) {
+ log_error_errno(errno, "Failed to open the IMA kernel interface "IMA_SECFS_POLICY", ignoring: %m");
+ return 0;
+ }
+
+ /* attempt to write the name of the policy file into sysfs file */
+ if (write(imafd, IMA_POLICY_PATH, STRLEN(IMA_POLICY_PATH)) > 0)
+ goto done;
+
+ /* fall back to copying the policy line-by-line */
+ input = fopen(IMA_POLICY_PATH, "re");
+ if (!input) {
+ log_warning_errno(errno, "Failed to open the IMA custom policy file "IMA_POLICY_PATH", ignoring: %m");
+ return 0;
+ }
+
+ /* Reopen the kernel interface after the failed path write above.
+ * NOTE(review): presumably the failed write leaves the open fd in a state
+ * unsuitable for streaming policy data — confirm against the kernel IMA
+ * securityfs documentation. */
+ safe_close(imafd);
+
+ imafd = open(IMA_SECFS_POLICY, O_WRONLY|O_CLOEXEC);
+ if (imafd < 0) {
+ log_error_errno(errno, "Failed to open the IMA kernel interface "IMA_SECFS_POLICY", ignoring: %m");
+ return 0;
+ }
+
+ for (;;) {
+ _cleanup_free_ char *line = NULL;
+ size_t len;
+
+ r = read_line(input, LONG_LINE_MAX, &line);
+ if (r < 0)
+ return log_error_errno(r, "Failed to read the IMA custom policy file "IMA_POLICY_PATH": %m");
+ if (r == 0)
+ break;
+
+ len = strlen(line);
+ lineno++;
+
+ /* Fix: the original message fused the path and line number
+ * ("…/ima-policy17"); separate them with " at line ". */
+ if (len > 0 && write(imafd, line, len) < 0)
+ return log_error_errno(errno, "Failed to load the IMA custom policy file "IMA_POLICY_PATH" at line %u: %m",
+ lineno);
+ }
+
+done:
+ log_info("Successfully loaded the IMA custom policy "IMA_POLICY_PATH".");
+#endif /* ENABLE_IMA */
+ return 0;
+}
diff --git a/src/core/ima-setup.h b/src/core/ima-setup.h
new file mode 100644
index 0000000..f964c7b
--- /dev/null
+++ b/src/core/ima-setup.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+/***
+ Copyright © 2012 Roberto Sassu - Politecnico di Torino, Italy
+ TORSEC group — http://security.polito.it
+***/
+
+int ima_setup(void);
diff --git a/src/core/import-creds.c b/src/core/import-creds.c
new file mode 100644
index 0000000..48f3160
--- /dev/null
+++ b/src/core/import-creds.c
@@ -0,0 +1,938 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/mount.h>
+
+#include "confidential-virt.h"
+#include "copy.h"
+#include "creds-util.h"
+#include "escape.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "fs-util.h"
+#include "hexdecoct.h"
+#include "import-creds.h"
+#include "initrd-util.h"
+#include "io-util.h"
+#include "mkdir-label.h"
+#include "mount-util.h"
+#include "mountpoint-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "proc-cmdline.h"
+#include "recurse-dir.h"
+#include "strv.h"
+#include "virt.h"
+
+/* This imports credentials passed in from environments higher up (VM manager, boot loader, …) and rearranges
+ * them so that later code can access them using our regular credential protocol
+ * (i.e. $CREDENTIALS_DIRECTORY). It's supposed to be minimal glue to unify behaviour how PID 1 (and
+ * generators invoked by it) can acquire credentials from outside, to mimic how we support it for containers,
+ * but on VM/physical environments.
+ *
+ * This does four things:
+ *
+ * 1. It imports credentials picked up by sd-boot (and placed in the /.extra/credentials/ dir in the initrd)
+ * and puts them in /run/credentials/@encrypted/. Note that during the initrd→host transition the initrd root
+ * file system is cleaned out, thus it is essential we pick up these files before they are deleted. Note
+ * that these credentials originate from an untrusted source, i.e. the ESP and are not
+ * pre-authenticated. They still have to be authenticated before use.
+ *
+ * 2. It imports credentials from /proc/cmdline and puts them in /run/credentials/@system/. These come from a
+ * trusted environment (i.e. the boot loader), and are typically authenticated (if authentication is done
+ * at all). However, they are world-readable, which might be less than ideal. Hence only use this for data
+ * that doesn't require trust.
+ *
+ * 3. It imports credentials passed in through qemu's fw_cfg logic. Specifically, credential data passed in
+ * /sys/firmware/qemu_fw_cfg/by_name/opt/io.systemd.credentials/ is picked up and also placed in
+ * /run/credentials/@system/.
+ *
+ * 4. It imports credentials passed in via the DMI/SMBIOS OEM string tables, quite similar to fw_cfg. It
+ * looks for strings starting with "io.systemd.credential:" and "io.systemd.credential.binary:". Both
+ * expect a key=value assignment, but in the latter case the value is Base64 decoded, allowing binary
+ * credentials to be passed in.
+ *
+ * If it picked up any credentials it will set the $CREDENTIALS_DIRECTORY and
+ * $ENCRYPTED_CREDENTIALS_DIRECTORY environment variables to point to these directories, so that processes
+ * can find them there later on. If "ramfs" is available $CREDENTIALS_DIRECTORY will be backed by it (but
+ * $ENCRYPTED_CREDENTIALS_DIRECTORY is just a regular tmpfs).
+ *
+ * Net result: the service manager can pick up trusted credentials from $CREDENTIALS_DIRECTORY afterwards,
+ * and untrusted ones from $ENCRYPTED_CREDENTIALS_DIRECTORY. */
+
+/* Shared bookkeeping for all credential-import passes. */
+typedef struct ImportCredentialContext {
+ int target_dir_fd; /* O_DIRECTORY fd of the target credentials dir, or -EBADF until acquired */
+ size_t size_sum; /* cumulative bytes imported, checked against CREDENTIALS_TOTAL_SIZE_MAX */
+ unsigned n_credentials; /* number of credentials successfully imported */
+} ImportCredentialContext;
+
+/* Release the cached target dir fd; safe on a context that never acquired one. */
+static void import_credentials_context_free(ImportCredentialContext *c) {
+ assert(c);
+
+ c->target_dir_fd = safe_close(c->target_dir_fd);
+}
+
+/* Open (creating and optionally mounting as needed) the credentials directory
+ * 'path', caching the resulting O_DIRECTORY fd in the context. Returns the fd
+ * (>= 0) or a negative errno. 'with_mount' requests a no-swap fs backing for
+ * directories holding plaintext credentials. */
+static int acquire_credential_directory(ImportCredentialContext *c, const char *path, bool with_mount) {
+ int r;
+
+ assert(c);
+ assert(path);
+
+ /* Already acquired earlier? Reuse the cached fd. */
+ if (c->target_dir_fd >= 0)
+ return c->target_dir_fd;
+
+ r = path_is_mount_point(path, NULL, 0);
+ if (r < 0) {
+ if (r != -ENOENT)
+ return log_error_errno(r, "Failed to determine if %s is a mount point: %m", path);
+
+ r = mkdir_safe_label(path, 0700, 0, 0, MKDIR_WARN_MODE);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create %s mount point: %m", path);
+
+ r = 0; /* Now it exists and is not a mount point */
+ }
+ if (r > 0)
+ /* If already a mount point, then remount writable */
+ (void) mount_nofollow_verbose(LOG_WARNING, NULL, path, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
+ else if (with_mount)
+ /* If not a mount point yet, and the credentials are not encrypted, then let's try to mount a no-swap fs there */
+ (void) mount_credentials_fs(path, CREDENTIALS_TOTAL_SIZE_MAX, /* ro= */ false);
+
+ c->target_dir_fd = open(path, O_RDONLY|O_DIRECTORY|O_CLOEXEC);
+ if (c->target_dir_fd < 0)
+ return log_error_errno(errno, "Failed to open %s: %m", path);
+
+ return c->target_dir_fd;
+}
+
+/* Create credential file 'n' (mode 0400) in the target dir, refusing to
+ * overwrite. Returns the fd, -EEXIST (logged at debug only, so callers can
+ * silently skip duplicates; log_debug_errno() returns -errno), or another
+ * negative errno on error. */
+static int open_credential_file_for_write(int target_dir_fd, const char *dir_name, const char *n) {
+ int fd;
+
+ assert(target_dir_fd >= 0);
+ assert(dir_name);
+ assert(n);
+
+ fd = openat(target_dir_fd, n, O_WRONLY|O_CLOEXEC|O_CREAT|O_EXCL|O_NOFOLLOW, 0400);
+ if (fd < 0) {
+ if (errno == EEXIST) /* In case of EEXIST we'll only debug log! */
+ return log_debug_errno(errno, "Credential '%s' set twice, ignoring.", n);
+
+ return log_error_errno(errno, "Failed to create %s/%s: %m", dir_name, n);
+ }
+
+ return fd;
+}
+
+/* Check a credential of the given size against both the per-credential limit
+ * and the remaining cumulative budget; logs a warning and returns false when
+ * either limit would be exceeded. */
+static bool credential_size_ok(ImportCredentialContext *c, const char *name, uint64_t size) {
+ assert(c);
+ assert(name);
+
+ if (size > CREDENTIAL_SIZE_MAX) {
+ log_warning("Credential '%s' is larger than allowed limit (%s > %s), skipping.", name, FORMAT_BYTES(size), FORMAT_BYTES(CREDENTIAL_SIZE_MAX));
+ return false;
+ }
+
+ if (size > CREDENTIALS_TOTAL_SIZE_MAX - c->size_sum) {
+ log_warning("Accumulated credential size would be above allowed limit (%s+%s > %s), skipping '%s'.",
+ FORMAT_BYTES(c->size_sum), FORMAT_BYTES(size), FORMAT_BYTES(CREDENTIALS_TOTAL_SIZE_MAX), name);
+ return false;
+ }
+
+ return true;
+}
+
+/* Seal a populated credentials directory: remount it read-only (best-effort)
+ * and publish its path in the given environment variable so later code can
+ * find it. Returns 0 on success, negative errno only if setenv() fails. */
+static int finalize_credentials_dir(const char *dir, const char *envvar) {
+ int r;
+
+ assert(dir);
+ assert(envvar);
+
+ /* Try to make the credentials directory read-only now */
+
+ r = make_mount_point(dir);
+ if (r < 0)
+ log_warning_errno(r, "Failed to make '%s' a mount point, ignoring: %m", dir);
+ else
+ (void) mount_nofollow_verbose(LOG_WARNING, NULL, dir, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ true), NULL);
+
+ if (setenv(envvar, dir, /* overwrite= */ true) < 0)
+ return log_error_errno(errno, "Failed to set $%s environment variable: %m", envvar);
+
+ return 0;
+}
+
+/* Import sidecar *.cred files unpacked by systemd-stub into /.extra/ in the
+ * initrd, copying them into the @encrypted credentials dir. Per-file failures
+ * are logged and skipped; only OOM and copy/write failures propagate. */
+static int import_credentials_boot(void) {
+ _cleanup_(import_credentials_context_free) ImportCredentialContext context = {
+ .target_dir_fd = -EBADF,
+ };
+ int r;
+
+ /* systemd-stub will wrap sidecar *.cred files from the UEFI kernel image directory into initrd
+ * cpios, so that they unpack into /.extra/. We'll pick them up from there and copy them into /run/
+ * so that we can access them during the entire runtime (note that the initrd file system is erased
+ * during the initrd → host transition). Note that these credentials originate from an untrusted
+ * source (i.e. the ESP typically) and thus need to be authenticated later. We thus put them in a
+ * directory separate from the usual credentials which are from a trusted source. */
+
+ if (!in_initrd())
+ return 0;
+
+ FOREACH_STRING(p,
+ "/.extra/credentials/", /* specific to this boot menu */
+ "/.extra/global_credentials/") { /* boot partition wide */
+
+ _cleanup_free_ DirectoryEntries *de = NULL;
+ _cleanup_close_ int source_dir_fd = -EBADF;
+
+ source_dir_fd = open(p, O_RDONLY|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW);
+ if (source_dir_fd < 0) {
+ if (errno == ENOENT) {
+ log_debug("No credentials passed via %s.", p);
+ continue;
+ }
+
+ log_warning_errno(errno, "Failed to open '%s', ignoring: %m", p);
+ continue;
+ }
+
+ r = readdir_all(source_dir_fd, RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT, &de);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to read '%s' contents, ignoring: %m", p);
+ continue;
+ }
+
+ for (size_t i = 0; i < de->n_entries; i++) {
+ const struct dirent *d = de->entries[i];
+ _cleanup_close_ int cfd = -EBADF, nfd = -EBADF;
+ _cleanup_free_ char *n = NULL;
+ const char *e;
+ struct stat st;
+
+ /* Only *.cred files are considered */
+ e = endswith(d->d_name, ".cred");
+ if (!e)
+ continue;
+
+ /* drop .cred suffix (which we want in the ESP sidecar dir, but not for our internal
+ * processing) */
+ n = strndup(d->d_name, e - d->d_name);
+ if (!n)
+ return log_oom();
+
+ if (!credential_name_valid(n)) {
+ log_warning("Credential '%s' has invalid name, ignoring.", d->d_name);
+ continue;
+ }
+
+ cfd = openat(source_dir_fd, d->d_name, O_RDONLY|O_CLOEXEC);
+ if (cfd < 0) {
+ log_warning_errno(errno, "Failed to open %s, ignoring: %m", d->d_name);
+ continue;
+ }
+
+ if (fstat(cfd, &st) < 0) {
+ log_warning_errno(errno, "Failed to stat %s, ignoring: %m", d->d_name);
+ continue;
+ }
+
+ r = stat_verify_regular(&st);
+ if (r < 0) {
+ log_warning_errno(r, "Credential file %s is not a regular file, ignoring: %m", d->d_name);
+ continue;
+ }
+
+ if (!credential_size_ok(&context, n, st.st_size))
+ continue;
+
+ /* Untrusted creds go to the @encrypted dir, not @system */
+ r = acquire_credential_directory(&context, ENCRYPTED_SYSTEM_CREDENTIALS_DIRECTORY, /* with_mount= */ false);
+ if (r < 0)
+ return r;
+
+ nfd = open_credential_file_for_write(context.target_dir_fd, ENCRYPTED_SYSTEM_CREDENTIALS_DIRECTORY, n);
+ if (nfd == -EEXIST)
+ continue;
+ if (nfd < 0)
+ return nfd;
+
+ r = copy_bytes(cfd, nfd, st.st_size, 0);
+ if (r < 0) {
+ /* Don't leave a partially-written credential behind */
+ (void) unlinkat(context.target_dir_fd, n, 0);
+ return log_error_errno(r, "Failed to create credential '%s': %m", n);
+ }
+
+ context.size_sum += st.st_size;
+ context.n_credentials++;
+
+ log_debug("Successfully copied boot credential '%s'.", n);
+ }
+ }
+
+ if (context.n_credentials > 0) {
+ log_debug("Imported %u credentials from boot loader.", context.n_credentials);
+
+ r = finalize_credentials_dir(ENCRYPTED_SYSTEM_CREDENTIALS_DIRECTORY, "ENCRYPTED_CREDENTIALS_DIRECTORY");
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+/* proc_cmdline_parse() callback: handles "systemd.set_credential=name:value"
+ * (text) and "systemd.set_credential_binary=name:base64" kernel command line
+ * options, writing each credential into the trusted @system directory.
+ * Malformed assignments are logged and skipped (return 0 so parsing
+ * continues); only OOM and write failures abort. */
+static int proc_cmdline_callback(const char *key, const char *value, void *data) {
+ ImportCredentialContext *c = ASSERT_PTR(data);
+ _cleanup_free_ void *binary = NULL;
+ _cleanup_free_ char *n = NULL;
+ _cleanup_close_ int nfd = -EBADF;
+ const char *colon, *d;
+ bool base64;
+ size_t l;
+ int r;
+
+ assert(key);
+
+ if (proc_cmdline_key_streq(key, "systemd.set_credential"))
+ base64 = false;
+ else if (proc_cmdline_key_streq(key, "systemd.set_credential_binary"))
+ base64 = true;
+ else
+ return 0;
+
+ colon = value ? strchr(value, ':') : NULL;
+ if (!colon) {
+ log_warning("Credential assignment through kernel command line lacks ':' character, ignoring: %s", value);
+ return 0;
+ }
+
+ n = strndup(value, colon - value);
+ if (!n)
+ return log_oom();
+
+ if (!credential_name_valid(n)) {
+ log_warning("Credential name '%s' is invalid, ignoring.", n);
+ return 0;
+ }
+
+ colon++;
+
+ if (base64) {
+ r = unbase64mem(colon, SIZE_MAX, &binary, &l);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to decode binary credential '%s' data, ignoring: %m", n);
+ return 0;
+ }
+
+ d = binary;
+ } else {
+ d = colon;
+ l = strlen(colon);
+ }
+
+ if (!credential_size_ok(c, n, l))
+ return 0;
+
+ r = acquire_credential_directory(c, SYSTEM_CREDENTIALS_DIRECTORY, /* with_mount= */ true);
+ if (r < 0)
+ return r;
+
+ nfd = open_credential_file_for_write(c->target_dir_fd, SYSTEM_CREDENTIALS_DIRECTORY, n);
+ if (nfd == -EEXIST)
+ return 0;
+ if (nfd < 0)
+ return nfd;
+
+ r = loop_write(nfd, d, l);
+ if (r < 0) {
+ /* Don't leave a partially-written credential behind */
+ (void) unlinkat(c->target_dir_fd, n, 0);
+ return log_error_errno(r, "Failed to write credential: %m");
+ }
+
+ c->size_sum += l;
+ c->n_credentials++;
+
+ log_debug("Successfully processed kernel command line credential '%s'.", n);
+
+ return 0;
+}
+
+/* Scan /proc/cmdline for systemd.set_credential[_binary]= options, importing
+ * each via proc_cmdline_callback() above. */
+static int import_credentials_proc_cmdline(ImportCredentialContext *c) {
+ int r;
+
+ assert(c);
+
+ r = proc_cmdline_parse(proc_cmdline_callback, c, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse /proc/cmdline: %m");
+
+ return 0;
+}
+
+#define QEMU_FWCFG_PATH "/sys/firmware/qemu_fw_cfg/by_name/opt/io.systemd.credentials"
+
+/* Import credentials passed via qemu fw_cfg, i.e. the per-credential subdirs
+ * of /sys/firmware/qemu_fw_cfg/by_name/opt/io.systemd.credentials/. Skipped
+ * in containers and in confidential VMs (firmware is untrusted there).
+ * Per-credential failures are logged and skipped; OOM/write errors propagate. */
+static int import_credentials_qemu(ImportCredentialContext *c) {
+ _cleanup_free_ DirectoryEntries *de = NULL;
+ _cleanup_close_ int source_dir_fd = -EBADF;
+ int r;
+
+ assert(c);
+
+ if (detect_container() > 0) /* don't access /sys/ in a container */
+ return 0;
+
+ if (detect_confidential_virtualization() > 0) /* don't trust firmware if confidential VMs */
+ return 0;
+
+ source_dir_fd = open(QEMU_FWCFG_PATH, O_RDONLY|O_DIRECTORY|O_CLOEXEC);
+ if (source_dir_fd < 0) {
+ if (errno == ENOENT) {
+ log_debug("No credentials passed via fw_cfg.");
+ return 0;
+ }
+
+ log_warning_errno(errno, "Failed to open '" QEMU_FWCFG_PATH "', ignoring: %m");
+ return 0;
+ }
+
+ r = readdir_all(source_dir_fd, RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT, &de);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to read '" QEMU_FWCFG_PATH "' contents, ignoring: %m");
+ return 0;
+ }
+
+ for (size_t i = 0; i < de->n_entries; i++) {
+ const struct dirent *d = de->entries[i];
+ _cleanup_close_ int vfd = -EBADF, rfd = -EBADF, nfd = -EBADF;
+ _cleanup_free_ char *szs = NULL;
+ uint64_t sz;
+
+ if (!credential_name_valid(d->d_name)) {
+ log_warning("Credential '%s' has invalid name, ignoring.", d->d_name);
+ continue;
+ }
+
+ vfd = openat(source_dir_fd, d->d_name, O_RDONLY|O_DIRECTORY|O_CLOEXEC);
+ if (vfd < 0) {
+ log_warning_errno(errno, "Failed to open '" QEMU_FWCFG_PATH "'/%s/, ignoring: %m", d->d_name);
+ continue;
+ }
+
+ /* fw_cfg exposes each entry as a dir with "size" and "raw" files */
+ r = read_virtual_file_at(vfd, "size", LINE_MAX, &szs, NULL);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to read '" QEMU_FWCFG_PATH "'/%s/size, ignoring: %m", d->d_name);
+ continue;
+ }
+
+ r = safe_atou64(strstrip(szs), &sz);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to parse size of credential '%s', ignoring: %s", d->d_name, szs);
+ continue;
+ }
+
+ if (!credential_size_ok(c, d->d_name, sz))
+ continue;
+
+ /* Ideally we'd just symlink the data here. Alas the kernel driver exports the raw file as
+ * having size zero, and we'd rather not have applications support such credential
+ * files. Let's hence copy the files to make them regular. */
+
+ rfd = openat(vfd, "raw", O_RDONLY|O_CLOEXEC);
+ if (rfd < 0) {
+ log_warning_errno(errno, "Failed to open '" QEMU_FWCFG_PATH "'/%s/raw, ignoring: %m", d->d_name);
+ continue;
+ }
+
+ r = acquire_credential_directory(c, SYSTEM_CREDENTIALS_DIRECTORY, /* with_mount= */ true);
+ if (r < 0)
+ return r;
+
+ nfd = open_credential_file_for_write(c->target_dir_fd, SYSTEM_CREDENTIALS_DIRECTORY, d->d_name);
+ if (nfd == -EEXIST)
+ continue;
+ if (nfd < 0)
+ return nfd;
+
+ r = copy_bytes(rfd, nfd, sz, 0);
+ if (r < 0) {
+ /* Don't leave a partially-written credential behind */
+ (void) unlinkat(c->target_dir_fd, d->d_name, 0);
+ return log_error_errno(r, "Failed to create credential '%s': %m", d->d_name);
+ }
+
+ c->size_sum += sz;
+ c->n_credentials++;
+
+ log_debug("Successfully copied qemu fw_cfg credential '%s'.", d->d_name);
+ }
+
+ return 0;
+}
+
+/* Parse a packed sequence of NUL-separated SMBIOS OEM vendor strings of total
+ * length 'size', importing every "io.systemd.credential:<name>=<value>" and
+ * "io.systemd.credential.binary:<name>=<base64>" entry into the trusted
+ * @system credentials dir. Malformed entries are logged and skipped; OOM and
+ * write failures propagate. */
+static int parse_smbios_strings(ImportCredentialContext *c, const char *data, size_t size) {
+ size_t left, skip;
+ const char *p;
+ int r;
+
+ assert(c);
+ assert(data || size == 0);
+
+ /* Unpacks a packed series of SMBIOS OEM vendor strings. These are a series of NUL terminated
+ * strings, one after the other. */
+
+ for (p = data, left = size; left > 0; p += skip, left -= skip) {
+ _cleanup_free_ void *buf = NULL;
+ _cleanup_free_ char *cn = NULL;
+ _cleanup_close_ int nfd = -EBADF;
+ const char *nul, *n, *eq;
+ const void *cdata;
+ size_t buflen, cdata_len;
+ bool unbase64;
+
+ /* The final string may lack a terminating NUL; treat end-of-buffer as one */
+ nul = memchr(p, 0, left);
+ if (nul)
+ skip = (nul - p) + 1;
+ else {
+ nul = p + left;
+ skip = left;
+ }
+
+ if (nul - p == 0) /* Skip empty strings */
+ continue;
+
+ /* Only care about strings starting with either of these two prefixes */
+ if ((n = memory_startswith(p, nul - p, "io.systemd.credential:")))
+ unbase64 = false;
+ else if ((n = memory_startswith(p, nul - p, "io.systemd.credential.binary:")))
+ unbase64 = true;
+ else {
+ _cleanup_free_ char *escaped = NULL;
+
+ escaped = cescape_length(p, nul - p);
+ log_debug("Ignoring OEM string: %s", strnull(escaped));
+ continue;
+ }
+
+ eq = memchr(n, '=', nul - n);
+ if (!eq) {
+ log_warning("SMBIOS OEM string lacks '=' character, ignoring.");
+ continue;
+ }
+
+ cn = memdup_suffix0(n, eq - n);
+ if (!cn)
+ return log_oom();
+
+ if (!credential_name_valid(cn)) {
+ /* Fix: dropped stray "%m" — no errno is set here, so it
+ * would print a stale/unrelated error string. */
+ log_warning("SMBIOS credential name '%s' is not valid, ignoring.", cn);
+ continue;
+ }
+
+ /* Optionally base64 decode the data, if requested, to allow binary credentials */
+ if (unbase64) {
+ r = unbase64mem(eq + 1, nul - (eq + 1), &buf, &buflen);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to base64 decode credential '%s', ignoring: %m", cn);
+ continue;
+ }
+
+ cdata = buf;
+ cdata_len = buflen;
+ } else {
+ cdata = eq + 1;
+ cdata_len = nul - (eq + 1);
+ }
+
+ if (!credential_size_ok(c, cn, cdata_len))
+ continue;
+
+ r = acquire_credential_directory(c, SYSTEM_CREDENTIALS_DIRECTORY, /* with_mount= */ true);
+ if (r < 0)
+ return r;
+
+ nfd = open_credential_file_for_write(c->target_dir_fd, SYSTEM_CREDENTIALS_DIRECTORY, cn);
+ if (nfd == -EEXIST)
+ continue;
+ if (nfd < 0)
+ return nfd;
+
+ r = loop_write(nfd, cdata, cdata_len);
+ if (r < 0) {
+ (void) unlinkat(c->target_dir_fd, cn, 0);
+ return log_error_errno(r, "Failed to write credential: %m");
+ }
+
+ c->size_sum += cdata_len;
+ c->n_credentials++;
+
+ log_debug("Successfully processed SMBIOS credential '%s'.", cn);
+ }
+
+ return 0;
+}
+
+/* Import credentials from SMBIOS Type 11 (OEM Strings) tables, iterating
+ * /sys/firmware/dmi/entries/11-<i>/raw until ENOENT. Skipped in containers
+ * and in confidential VMs (firmware is untrusted there). */
+static int import_credentials_smbios(ImportCredentialContext *c) {
+ int r;
+
+ /* Fix: assert the context like every sibling importer does */
+ assert(c);
+
+ /* Parses DMI OEM strings fields (SMBIOS type 11), as settable with qemu's -smbios type=11,value=… switch. */
+
+ if (detect_container() > 0) /* don't access /sys/ in a container */
+ return 0;
+
+ if (detect_confidential_virtualization() > 0) /* don't trust firmware if confidential VMs */
+ return 0;
+
+ for (unsigned i = 0;; i++) {
+ struct dmi_field_header {
+ uint8_t type;
+ uint8_t length;
+ uint16_t handle;
+ uint8_t count;
+ char contents[];
+ } _packed_ *dmi_field_header;
+ _cleanup_free_ char *p = NULL;
+ _cleanup_free_ void *data = NULL;
+ size_t size;
+
+ assert_cc(offsetof(struct dmi_field_header, contents) == 5);
+
+ if (asprintf(&p, "/sys/firmware/dmi/entries/11-%u/raw", i) < 0)
+ return log_oom();
+
+ /* NOTE(review): sizeof(dmi_field_header) is the size of the *pointer*, not
+ * of the struct — the read limit is merely a few bytes larger than intended,
+ * which is harmless, but offsetof(struct dmi_field_header, contents) would
+ * express the intent; confirm before changing. */
+ r = read_virtual_file(p, sizeof(dmi_field_header) + CREDENTIALS_TOTAL_SIZE_MAX, (char**) &data, &size);
+ if (r < 0) {
+ /* Once we reach ENOENT there are no more DMI Type 11 fields around. */
+ log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r, "Failed to open '%s', ignoring: %m", p);
+ break;
+ }
+
+ if (size < offsetof(struct dmi_field_header, contents))
+ return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "DMI field header of '%s' too short.", p);
+
+ dmi_field_header = data;
+ if (dmi_field_header->type != 11 ||
+ dmi_field_header->length != offsetof(struct dmi_field_header, contents))
+ return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Invalid DMI field header.");
+
+ r = parse_smbios_strings(c, dmi_field_header->contents, size - offsetof(struct dmi_field_header, contents));
+ if (r < 0)
+ return r;
+
+ if (i == UINT_MAX) /* Prevent overflow */
+ break;
+ }
+
+ return 0;
+}
+
+/* Move credentials left in /run/credentials/@initrd by initrd-based
+ * configurators into the trusted @system dir, deleting the source afterwards.
+ * Runs only after the initrd → host transition. */
+static int import_credentials_initrd(ImportCredentialContext *c) {
+ _cleanup_free_ DirectoryEntries *de = NULL;
+ _cleanup_close_ int source_dir_fd = -EBADF;
+ int r;
+
+ assert(c);
+
+ /* This imports credentials from /run/credentials/@initrd/ into our credentials directory and deletes
+ * the source directory afterwards. This is run once after the initrd → host transition. This is
+ * supposed to establish a well-defined avenue for initrd-based host configurators to pass
+ * credentials into the main system. */
+
+ if (in_initrd())
+ return 0;
+
+ source_dir_fd = open("/run/credentials/@initrd", O_RDONLY|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW);
+ if (source_dir_fd < 0) {
+ if (errno == ENOENT)
+ log_debug_errno(errno, "No credentials passed from initrd.");
+ else
+ log_warning_errno(errno, "Failed to open '/run/credentials/@initrd', ignoring: %m");
+ return 0;
+ }
+
+ r = readdir_all(source_dir_fd, RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT, &de);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to read '/run/credentials/@initrd' contents, ignoring: %m");
+ return 0;
+ }
+
+ FOREACH_ARRAY(entry, de->entries, de->n_entries) {
+ _cleanup_close_ int cfd = -EBADF, nfd = -EBADF;
+ const struct dirent *d = *entry;
+ struct stat st;
+
+ if (!credential_name_valid(d->d_name)) {
+ log_warning("Credential '%s' has invalid name, ignoring.", d->d_name);
+ continue;
+ }
+
+ cfd = openat(source_dir_fd, d->d_name, O_RDONLY|O_CLOEXEC);
+ if (cfd < 0) {
+ log_warning_errno(errno, "Failed to open %s, ignoring: %m", d->d_name);
+ continue;
+ }
+
+ if (fstat(cfd, &st) < 0) {
+ log_warning_errno(errno, "Failed to stat %s, ignoring: %m", d->d_name);
+ continue;
+ }
+
+ r = stat_verify_regular(&st);
+ if (r < 0) {
+ log_warning_errno(r, "Credential file %s is not a regular file, ignoring: %m", d->d_name);
+ continue;
+ }
+
+ if (!credential_size_ok(c, d->d_name, st.st_size))
+ continue;
+
+ r = acquire_credential_directory(c, SYSTEM_CREDENTIALS_DIRECTORY, /* with_mount= */ true);
+ if (r < 0)
+ return r;
+
+ nfd = open_credential_file_for_write(c->target_dir_fd, SYSTEM_CREDENTIALS_DIRECTORY, d->d_name);
+ if (nfd == -EEXIST)
+ continue;
+ if (nfd < 0)
+ return nfd;
+
+ r = copy_bytes(cfd, nfd, st.st_size, 0);
+ if (r < 0) {
+ /* Don't leave a partially-written credential behind */
+ (void) unlinkat(c->target_dir_fd, d->d_name, 0);
+ return log_error_errno(r, "Failed to create credential '%s': %m", d->d_name);
+ }
+
+ c->size_sum += st.st_size;
+ c->n_credentials++;
+
+ log_debug("Successfully copied initrd credential '%s'.", d->d_name);
+
+ /* Delete the source as we go, so the @initrd dir ends up empty */
+ (void) unlinkat(source_dir_fd, d->d_name, 0);
+ }
+
+ source_dir_fd = safe_close(source_dir_fd);
+
+ if (rmdir("/run/credentials/@initrd") < 0)
+ log_warning_errno(errno, "Failed to remove /run/credentials/@initrd after import, ignoring: %m");
+
+ return 0;
+}
+
+/* Run all trusted-source importers (fw_cfg, SMBIOS, kernel cmdline, initrd)
+ * into one shared context, then seal the @system dir if anything was
+ * imported. Returns the first importer error (in importer order), else 0. */
+static int import_credentials_trusted(void) {
+ _cleanup_(import_credentials_context_free) ImportCredentialContext c = {
+ .target_dir_fd = -EBADF,
+ };
+ int q, w, r, y;
+
+ /* This is invoked during early boot when no credentials have been imported so far. (Specifically, if
+ * the $CREDENTIALS_DIRECTORY or $ENCRYPTED_CREDENTIALS_DIRECTORY environment variables are not set
+ * yet.) */
+
+ r = import_credentials_qemu(&c);
+ w = import_credentials_smbios(&c);
+ q = import_credentials_proc_cmdline(&c);
+ y = import_credentials_initrd(&c);
+
+ if (c.n_credentials > 0) {
+ int z;
+
+ log_debug("Imported %u credentials from kernel command line/smbios/fw_cfg/initrd.", c.n_credentials);
+
+ z = finalize_credentials_dir(SYSTEM_CREDENTIALS_DIRECTORY, "CREDENTIALS_DIRECTORY");
+ if (z < 0)
+ return z;
+ }
+
+ /* Report the first failure, in the order the importers ran */
+ return r < 0 ? r : w < 0 ? w : q < 0 ? q : y;
+}
+
+/* After the initrd → host transition, merge any credentials the initrd left
+ * behind into an already-established credentials directory — but only if that
+ * directory is our own @system dir, never a foreign one. */
+static int merge_credentials_trusted(const char *creds_dir) {
+ _cleanup_(import_credentials_context_free) ImportCredentialContext c = {
+ .target_dir_fd = -EBADF,
+ };
+ int r;
+
+ /* This is invoked after the initrd → host transitions, when credentials already have been imported,
+ * but we might want to import some more from the initrd. */
+
+ if (in_initrd())
+ return 0;
+
+ /* Do not try to merge initrd credentials into foreign credentials directories */
+ if (!path_equal_ptr(creds_dir, SYSTEM_CREDENTIALS_DIRECTORY)) {
+ log_debug("Not importing initrd credentials, as foreign $CREDENTIALS_DIRECTORY has been set.");
+ return 0;
+ }
+
+ r = import_credentials_initrd(&c);
+
+ if (c.n_credentials > 0) {
+ int z;
+
+ log_debug("Merged %u credentials from initrd.", c.n_credentials);
+
+ z = finalize_credentials_dir(SYSTEM_CREDENTIALS_DIRECTORY, "CREDENTIALS_DIRECTORY");
+ if (z < 0)
+ return z;
+ }
+
+ return r;
+}
+
+/* Ensure the directory announced in 'envvar' (value 'path') is also reachable
+ * at our canonical location 'where', by creating a relative symlink. No-op if
+ * the env var already points at 'where'. */
+static int symlink_credential_dir(const char *envvar, const char *path, const char *where) {
+ int r;
+
+ assert(envvar);
+ assert(path);
+ assert(where);
+
+ if (!path_is_valid(path) || !path_is_absolute(path))
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "String specified via $%s is not a valid absolute path, refusing: %s", envvar, path);
+
+ /* If the env var already points to where we intend to create the symlink, then most likely we
+ * already imported some creds earlier, and thus set the env var, and hence don't need to do
+ * anything. */
+ if (path_equal(path, where))
+ return 0;
+
+ r = symlink_idempotent(path, where, /* make_relative= */ true);
+ if (r < 0)
+ return log_error_errno(r, "Failed to link $%s to %s: %m", envvar, where);
+
+ return 0;
+}
+
+/* If a "vmm.notify_socket" credential was supplied, export its value as
+ * $NOTIFY_SOCKET so sd_notify() status reaches the VMM. Returns 1 if the
+ * variable was set, 0 if the credential is absent/empty, negative errno
+ * (logged as a warning) on failure. */
+static int setenv_notify_socket(void) {
+ _cleanup_free_ char *address = NULL;
+ int r;
+
+ r = read_credential_with_decryption("vmm.notify_socket", (void **)&address, /* ret_size= */ NULL);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to read 'vmm.notify_socket' credential, ignoring: %m");
+
+ if (isempty(address))
+ return 0;
+
+ if (setenv("NOTIFY_SOCKET", address, /* replace= */ 1) < 0)
+ return log_warning_errno(errno, "Failed to set $NOTIFY_SOCKET environment variable, ignoring: %m");
+
+ return 1;
+}
+
+/* Log the names of all validly-named credentials in the directory reported by
+ * get_directory_func(). Returns the number of credentials found, 0 when the
+ * relevant env var is unset (-ENXIO from the getter), or a negative errno
+ * (already logged as a warning). */
+static int report_credentials_per_func(const char *title, int (*get_directory_func)(const char **ret)) {
+ _cleanup_free_ DirectoryEntries *de = NULL;
+ _cleanup_close_ int dir_fd = -EBADF;
+ _cleanup_free_ char *ll = NULL;
+ const char *d = NULL;
+ int r, c = 0;
+
+ assert(title);
+ assert(get_directory_func);
+
+ r = get_directory_func(&d);
+ if (r < 0) {
+ if (r == -ENXIO) /* Env var not set */
+ return 0;
+
+ return log_warning_errno(r, "Failed to determine %s directory: %m", title);
+ }
+
+ dir_fd = open(d, O_RDONLY|O_DIRECTORY|O_CLOEXEC);
+ if (dir_fd < 0)
+ return log_warning_errno(errno, "Failed to open credentials directory %s: %m", d);
+
+ r = readdir_all(dir_fd, RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT, &de);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to enumerate credentials directory %s: %m", d);
+
+ FOREACH_ARRAY(entry, de->entries, de->n_entries) {
+ const struct dirent *e = *entry;
+
+ if (!credential_name_valid(e->d_name))
+ continue;
+
+ /* Build one comma-separated list for a single log line */
+ if (!strextend_with_separator(&ll, ", ", e->d_name))
+ return log_oom();
+
+ c++;
+ }
+
+ if (ll)
+ log_info("Received %s: %s", title, ll);
+
+ return c;
+}
+
+/* Log a summary of the regular (trusted) and untrusted (encrypted) credentials
+ * that were acquired; counts < 0 (errors) are reported as 0. */
+static void report_credentials(void) {
+ int p, q;
+
+ p = report_credentials_per_func("regular credentials", get_credentials_dir);
+ q = report_credentials_per_func("untrusted credentials", get_encrypted_credentials_dir);
+
+ /* Only use INFO level if anything was actually acquired */
+ log_full(p > 0 || q > 0 ? LOG_INFO : LOG_DEBUG,
+ "Acquired %i regular credentials, %i untrusted credentials.",
+ p > 0 ? p : 0,
+ q > 0 ? q : 0);
+}
+
+/* Main entry point: import credentials from all supported sources (boot
+ * loader, fw_cfg, SMBIOS, kernel cmdline, initrd), or — if an earlier stage
+ * already set $CREDENTIALS_DIRECTORY / $ENCRYPTED_CREDENTIALS_DIRECTORY —
+ * just link those dirs to our canonical paths and merge initrd creds.
+ * Honors systemd.import_credentials=no on the kernel command line. */
+int import_credentials(void) {
+ const char *received_creds_dir = NULL, *received_encrypted_creds_dir = NULL;
+ bool envvar_set = false;
+ int r, q;
+
+ r = get_credentials_dir(&received_creds_dir);
+ if (r < 0 && r != -ENXIO) /* ENXIO → env var not set yet */
+ log_warning_errno(r, "Failed to determine credentials directory, ignoring: %m");
+
+ envvar_set = r >= 0;
+
+ r = get_encrypted_credentials_dir(&received_encrypted_creds_dir);
+ if (r < 0 && r != -ENXIO) /* ENXIO → env var not set yet */
+ log_warning_errno(r, "Failed to determine encrypted credentials directory, ignoring: %m");
+
+ envvar_set = envvar_set || r >= 0;
+
+ if (envvar_set) {
+ /* Maybe an earlier stage initrd already set this up? If so, don't try to import anything again. */
+ log_debug("Not importing credentials, $CREDENTIALS_DIRECTORY or $ENCRYPTED_CREDENTIALS_DIRECTORY already set.");
+
+ /* But, let's make sure the creds are available from our regular paths. */
+ if (received_creds_dir)
+ r = symlink_credential_dir("CREDENTIALS_DIRECTORY", received_creds_dir, SYSTEM_CREDENTIALS_DIRECTORY);
+ else
+ r = 0;
+
+ if (received_encrypted_creds_dir) {
+ q = symlink_credential_dir("ENCRYPTED_CREDENTIALS_DIRECTORY", received_encrypted_creds_dir, ENCRYPTED_SYSTEM_CREDENTIALS_DIRECTORY);
+ if (r >= 0)
+ r = q;
+ }
+
+ q = merge_credentials_trusted(received_creds_dir);
+ if (r >= 0)
+ r = q;
+
+ } else {
+ _cleanup_free_ char *v = NULL;
+
+ /* systemd.import_credentials=no disables all importing */
+ r = proc_cmdline_get_key("systemd.import_credentials", PROC_CMDLINE_STRIP_RD_PREFIX, &v);
+ if (r < 0)
+ log_debug_errno(r, "Failed to check if 'systemd.import_credentials=' kernel command line option is set, ignoring: %m");
+ else if (r > 0) {
+ r = parse_boolean(v);
+ if (r < 0)
+ log_debug_errno(r, "Failed to parse 'systemd.import_credentials=' parameter, ignoring: %m");
+ else if (r == 0) {
+ log_notice("systemd.import_credentials=no is set, skipping importing of credentials.");
+ return 0;
+ }
+ }
+
+ r = import_credentials_boot();
+
+ q = import_credentials_trusted();
+ if (r >= 0)
+ r = q;
+ }
+
+ report_credentials();
+
+ /* Propagate vmm_notify_socket credential → $NOTIFY_SOCKET env var */
+ (void) setenv_notify_socket();
+
+ return r;
+}
diff --git a/src/core/import-creds.h b/src/core/import-creds.h
new file mode 100644
index 0000000..a87865c
--- /dev/null
+++ b/src/core/import-creds.h
@@ -0,0 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+int import_credentials(void);
diff --git a/src/core/job.c b/src/core/job.c
new file mode 100644
index 0000000..e78c2a7
--- /dev/null
+++ b/src/core/job.c
@@ -0,0 +1,1712 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+
+#include "sd-id128.h"
+#include "sd-messages.h"
+
+#include "alloc-util.h"
+#include "async.h"
+#include "cgroup.h"
+#include "dbus-job.h"
+#include "dbus.h"
+#include "escape.h"
+#include "fileio.h"
+#include "job.h"
+#include "log.h"
+#include "macro.h"
+#include "parse-util.h"
+#include "serialize.h"
+#include "set.h"
+#include "sort-util.h"
+#include "special.h"
+#include "stdio-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "terminal-util.h"
+#include "unit.h"
+#include "virt.h"
+
+/* Allocate a bare Job for 'unit': no ID assigned, type left invalid, not
+ * linked anywhere. Returns NULL on OOM. */
+Job* job_new_raw(Unit *unit) {
+        Job *j;
+
+        /* used for deserialization */
+
+        assert(unit);
+
+        j = new(Job, 1);
+        if (!j)
+                return NULL;
+
+        *j = (Job) {
+                .manager = unit->manager,
+                .unit = unit,
+                .type = _JOB_TYPE_INVALID,
+        };
+
+        return j;
+}
+
+/* Hand out the next free job ID, skipping IDs still present in m->jobs.
+ * Wraps around from UINT32_MAX back to 1; asserts if a second wrap occurs
+ * within one call (i.e. all ~4 billion IDs are in use). */
+static uint32_t manager_get_new_job_id(Manager *m) {
+        bool overflow = false;
+
+        assert(m);
+
+        for (;;) {
+                uint32_t id = m->current_job_id;
+
+                if (_unlikely_(id == UINT32_MAX)) {
+                        /* Wrap around, but only once per call */
+                        assert_se(!overflow);
+                        m->current_job_id = 1;
+                        overflow = true;
+                } else
+                        m->current_job_id++;
+
+                /* Skip IDs already taken by installed/deserialized jobs */
+                if (hashmap_contains(m->jobs, UINT32_TO_PTR(id)))
+                        continue;
+
+                return id;
+        }
+}
+
+/* Allocate a new Job of 'type' for 'unit' with a fresh ID. The job is not
+ * yet installed or linked into the transaction. Returns NULL on OOM. */
+Job* job_new(Unit *unit, JobType type) {
+        Job *j;
+
+        assert(type < _JOB_TYPE_MAX);
+
+        j = job_new_raw(unit);
+        if (!j)
+                return NULL;
+
+        j->id = manager_get_new_job_id(j->manager);
+        j->type = type;
+
+        /* We don't link it here, that's what job_dependency() is for */
+
+        return j;
+}
+
+/* Detach the job from the manager's run/dbus/gc queues and drop its timeout
+ * timer. The job must already be uninstalled and out of any transaction. */
+void job_unlink(Job *j) {
+        assert(j);
+        assert(!j->installed);
+        assert(!j->transaction_prev);
+        assert(!j->transaction_next);
+        assert(!j->subject_list);
+        assert(!j->object_list);
+
+        if (j->in_run_queue) {
+                prioq_remove(j->manager->run_queue, j, &j->run_queue_idx);
+                j->in_run_queue = false;
+        }
+
+        if (j->in_dbus_queue) {
+                LIST_REMOVE(dbus_queue, j->manager->dbus_job_queue, j);
+                j->in_dbus_queue = false;
+        }
+
+        if (j->in_gc_queue) {
+                LIST_REMOVE(gc_queue, j->manager->gc_job_queue, j);
+                j->in_gc_queue = false;
+        }
+
+        j->timer_event_source = sd_event_source_disable_unref(j->timer_event_source);
+}
+
+/* Unlink and free a job. Must not be installed or linked into a transaction.
+ * Releases the bus track, deserialized client list and activation details.
+ * Always returns NULL, for the "x = job_free(x)" idiom. */
+Job* job_free(Job *j) {
+        assert(j);
+        assert(!j->installed);
+        assert(!j->transaction_prev);
+        assert(!j->transaction_next);
+        assert(!j->subject_list);
+        assert(!j->object_list);
+
+        job_unlink(j);
+
+        sd_bus_track_unref(j->bus_track);
+        strv_free(j->deserialized_clients);
+
+        activation_details_unref(j->activation_details);
+
+        return mfree(j);
+}
+
+/* Transition the job between JOB_WAITING and JOB_RUNNING, keeping the
+ * manager's n_running_jobs counter in sync for installed jobs. When the last
+ * running job goes back to waiting, the jobs-in-progress display timer is
+ * disabled. */
+static void job_set_state(Job *j, JobState state) {
+        assert(j);
+        assert(state >= 0);
+        assert(state < _JOB_STATE_MAX);
+
+        if (j->state == state)
+                return;
+
+        j->state = state;
+
+        /* Only installed jobs participate in the running-jobs accounting */
+        if (!j->installed)
+                return;
+
+        if (j->state == JOB_RUNNING)
+                j->unit->manager->n_running_jobs++;
+        else {
+                /* Only two states exist, so leaving RUNNING means entering WAITING */
+                assert(j->state == JOB_WAITING);
+                assert(j->unit->manager->n_running_jobs > 0);
+
+                j->unit->manager->n_running_jobs--;
+
+                if (j->unit->manager->n_running_jobs <= 0)
+                        j->unit->manager->jobs_in_progress_event_source = sd_event_source_disable_unref(j->unit->manager->jobs_in_progress_event_source);
+        }
+}
+
+/* Remove an installed job from its unit slot (job or nop_job) and from the
+ * manager's jobs hashmap, emitting the JobRemoved D-Bus signal unless the
+ * manager is reloading. The job itself is not freed here. */
+void job_uninstall(Job *j) {
+        Job **pj;
+
+        assert(j->installed);
+
+        /* Drop out of RUNNING first so n_running_jobs stays balanced */
+        job_set_state(j, JOB_WAITING);
+
+        pj = j->type == JOB_NOP ? &j->unit->nop_job : &j->unit->job;
+        assert(*pj == j);
+
+        /* Detach from next 'bigger' objects */
+
+        /* daemon-reload should be transparent to job observers */
+        if (!MANAGER_IS_RELOADING(j->manager))
+                bus_job_send_removed_signal(j);
+
+        *pj = NULL;
+
+        unit_add_to_gc_queue(j->unit);
+
+        unit_add_to_dbus_queue(j->unit); /* The Job property of the unit has changed now */
+
+        hashmap_remove_value(j->manager->jobs, UINT32_TO_PTR(j->id), j);
+        j->installed = false;
+}
+
+/* Returns true unless 't' is JOB_RELOAD; see rationale below. */
+static bool job_type_allows_late_merge(JobType t) {
+        /* Tells whether it is OK to merge a job of type 't' with an already
+         * running job.
+         * Reloads cannot be merged this way. Think of the sequence:
+         * 1. Reload of a daemon is in progress; the daemon has already loaded
+         *    its config file, but hasn't completed the reload operation yet.
+         * 2. Edit foo's config file.
+         * 3. Trigger another reload to have the daemon use the new config.
+         * Should the second reload job be merged into the first one, the daemon
+         * would not know about the new config.
+         * JOB_RESTART jobs on the other hand can be merged, because they get
+         * patched into JOB_START after stopping the unit. So if we see a
+         * JOB_RESTART running, it means the unit hasn't stopped yet and at
+         * this time the merge is still allowed. */
+        return t != JOB_RELOAD;
+}
+
+/* Fold 'other' into the already-installed job 'j' for the same unit: the
+ * types are merged-and-collapsed (must succeed — caller guarantees
+ * mergeability), and the irreversible/ignore_order flags are OR-ed. 'other'
+ * is left for the caller to dispose of. */
+static void job_merge_into_installed(Job *j, Job *other) {
+        assert(j->installed);
+        assert(j->unit == other->unit);
+
+        if (j->type != JOB_NOP) {
+                assert_se(job_type_merge_and_collapse(&j->type, other->type, j->unit) == 0);
+
+                /* Keep the oldest ActivationDetails, if any */
+                if (!j->activation_details)
+                        j->activation_details = TAKE_PTR(other->activation_details);
+        } else
+                assert(other->type == JOB_NOP);
+
+        j->irreversible = j->irreversible || other->irreversible;
+        j->ignore_order = j->ignore_order || other->ignore_order;
+}
+
+/* Install job 'j' on its unit. If a job is already installed there:
+ *  - conflicting type → the old job is finished as JOB_CANCELED and 'j'
+ *    takes its place;
+ *  - mergeable and the old job is still waiting (or a late merge is allowed
+ *    and safe) → 'j' is merged into the old job, which is returned instead.
+ * Returns the job that ends up installed (either 'j' or the merged-into
+ * pre-existing job); if it is not 'j', the caller owns freeing 'j'. */
+Job* job_install(Job *j, bool refuse_late_merge) {
+        Job **pj;
+        Job *uj;
+
+        assert(j);
+        assert(!j->installed);
+        assert(j->type < _JOB_TYPE_MAX_IN_TRANSACTION);
+        assert(j->state == JOB_WAITING);
+
+        pj = j->type == JOB_NOP ? &j->unit->nop_job : &j->unit->job;
+        uj = *pj;
+
+        if (uj) {
+                if (job_type_is_conflicting(uj->type, j->type))
+                        job_finish_and_invalidate(uj, JOB_CANCELED, false, false);
+                else {
+                        /* not conflicting, i.e. mergeable */
+
+                        if (uj->state == JOB_WAITING ||
+                            (!refuse_late_merge && job_type_allows_late_merge(j->type) && job_type_is_superset(uj->type, j->type))) {
+                                job_merge_into_installed(uj, j);
+                                log_unit_debug(uj->unit,
+                                               "Merged %s/%s into installed job %s/%s as %"PRIu32,
+                                               j->unit->id, job_type_to_string(j->type), uj->unit->id,
+                                               job_type_to_string(uj->type), uj->id);
+                                return uj;
+                        } else {
+                                /* already running and not safe to merge into */
+                                /* Patch uj to become a merged job and re-run it. */
+                                /* XXX It should be safer to queue j to run after uj finishes, but it is
+                                 * not currently possible to have more than one installed job per unit. */
+                                job_merge_into_installed(uj, j);
+                                log_unit_debug(uj->unit,
+                                               "Merged into running job, re-running: %s/%s as %"PRIu32,
+                                               uj->unit->id, job_type_to_string(uj->type), uj->id);
+
+                                job_set_state(uj, JOB_WAITING);
+                                return uj;
+                        }
+                }
+        }
+
+        /* Install the job */
+        assert(!*pj);
+        *pj = j;
+        j->installed = true;
+
+        j->manager->n_installed_jobs++;
+        log_unit_debug(j->unit,
+                       "Installed new job %s/%s as %u",
+                       j->unit->id, job_type_to_string(j->type), (unsigned) j->id);
+
+        job_add_to_gc_queue(j);
+
+        job_add_to_dbus_queue(j); /* announce this job to clients */
+        unit_add_to_dbus_queue(j->unit); /* The Job property of the unit has changed now */
+
+        return j;
+}
+
+/* Install a job recreated from serialized state. Validates the type, refuses
+ * to install if the unit's slot is already taken or the job ID collides, and
+ * re-registers the job with the manager. Returns 0 on success, negative
+ * errno-style on failure (EINVAL, EEXIST, or hashmap errors). */
+int job_install_deserialized(Job *j) {
+        Job **pj;
+        int r;
+
+        assert(!j->installed);
+
+        if (j->type < 0 || j->type >= _JOB_TYPE_MAX_IN_TRANSACTION)
+                return log_unit_debug_errno(j->unit, SYNTHETIC_ERRNO(EINVAL),
+                                            "Invalid job type %s in deserialization.",
+                                            strna(job_type_to_string(j->type)));
+
+        pj = j->type == JOB_NOP ? &j->unit->nop_job : &j->unit->job;
+        if (*pj)
+                return log_unit_debug_errno(j->unit, SYNTHETIC_ERRNO(EEXIST),
+                                            "Unit already has a job installed. Not installing deserialized job.");
+
+        /* When the job does not have ID, or we failed to deserialize the job ID, then use a new ID. */
+        if (j->id <= 0)
+                j->id = manager_get_new_job_id(j->manager);
+
+        r = hashmap_ensure_put(&j->manager->jobs, NULL, UINT32_TO_PTR(j->id), j);
+        if (r == -EEXIST)
+                return log_unit_debug_errno(j->unit, r, "Job ID %" PRIu32 " already used, cannot deserialize job.", j->id);
+        if (r < 0)
+                return log_unit_debug_errno(j->unit, r, "Failed to insert job into jobs hash table: %m");
+
+        *pj = j;
+        j->installed = true;
+
+        /* Re-account running jobs that were serialized mid-flight */
+        if (j->state == JOB_RUNNING)
+                j->unit->manager->n_running_jobs++;
+
+        log_unit_debug(j->unit,
+                       "Reinstalled deserialized job %s/%s as %u",
+                       j->unit->id, job_type_to_string(j->type), (unsigned) j->id);
+        return 0;
+}
+
+/* Create a dependency link "subject needs object" and prepend it to both
+ * jobs' link lists. Returns NULL on OOM. */
+JobDependency* job_dependency_new(Job *subject, Job *object, bool matters, bool conflicts) {
+        JobDependency *l;
+
+        assert(object);
+
+        /* Adds a new job link, which encodes that the 'subject' job
+         * needs the 'object' job in some way. If 'subject' is NULL
+         * this means the 'anchor' job (i.e. the one the user
+         * explicitly asked for) is the requester. */
+
+        l = new0(JobDependency, 1);
+        if (!l)
+                return NULL;
+
+        l->subject = subject;
+        l->object = object;
+        l->matters = matters;
+        l->conflicts = conflicts;
+
+        if (subject)
+                LIST_PREPEND(subject, subject->subject_list, l);
+
+        LIST_PREPEND(object, object->object_list, l);
+
+        return l;
+}
+
+/* Unlink a dependency from both jobs' link lists and free it. */
+void job_dependency_free(JobDependency *l) {
+        assert(l);
+
+        /* A NULL subject means the anchor job, which keeps no list */
+        if (l->subject)
+                LIST_REMOVE(subject, l->subject->subject_list, l);
+
+        LIST_REMOVE(object, l->object->object_list, l);
+
+        free(l);
+}
+
+/* Write a human-readable multi-line description of the job to 'f', each line
+ * prefixed with 'prefix' (NULL is treated as ""). Used for debug dumps. */
+void job_dump(Job *j, FILE *f, const char *prefix) {
+        assert(j);
+        assert(f);
+
+        prefix = strempty(prefix);
+
+        fprintf(f,
+                "%s-> Job %u:\n"
+                "%s\tAction: %s -> %s\n"
+                "%s\tState: %s\n"
+                "%s\tIrreversible: %s\n"
+                "%s\tMay GC: %s\n",
+                prefix, j->id,
+                prefix, j->unit->id, job_type_to_string(j->type),
+                prefix, job_state_to_string(j->state),
+                prefix, yes_no(j->irreversible),
+                prefix, yes_no(job_may_gc(j)));
+}
+
+/*
+ * Merging is commutative, so imagine the matrix as symmetric. We store only
+ * its lower triangle to avoid duplication. We don't store the main diagonal,
+ * because A merged with A is simply A.
+ *
+ * If the resulting type is collapsed immediately afterwards (to get rid of
+ * the JOB_RELOAD_OR_START, which lies outside the lookup function's domain),
+ * the following properties hold:
+ *
+ * Merging is associative! A merged with B, and then merged with C is the same
+ * as A merged with the result of B merged with C.
+ *
+ * Mergeability is transitive! If A can be merged with B and B with C then
+ * A also with C.
+ *
+ * Also, if A merged with B cannot be merged with C, then either A or B cannot
+ * be merged with C either.
+ */
+/* Entries of -1 mean the pair cannot be merged; job_type_lookup_merge()
+ * indexes this triangle as [(a-1)*a/2 + b] with a > b. */
+static const JobType job_merging_table[] = {
+/* What \ With       *  JOB_START       JOB_VERIFY_ACTIVE  JOB_STOP        JOB_RELOAD */
+/*********************************************************************************/
+/*JOB_START          */
+/*JOB_VERIFY_ACTIVE  */ JOB_START,
+/*JOB_STOP           */ -1,             -1,
+/*JOB_RELOAD         */ JOB_RELOAD_OR_START, JOB_RELOAD,   -1,
+/*JOB_RESTART        */ JOB_RESTART,    JOB_RESTART,       -1,             JOB_RESTART,
+};
+
+/* Look up the merge result of two job types in job_merging_table. Returns
+ * the merged type, or -1 if the pair cannot be merged. Both inputs must be
+ * below _JOB_TYPE_MAX_MERGING. */
+JobType job_type_lookup_merge(JobType a, JobType b) {
+        /* Table size must equal the lower triangle of an NxN matrix, sans diagonal */
+        assert_cc(ELEMENTSOF(job_merging_table) == _JOB_TYPE_MAX_MERGING * (_JOB_TYPE_MAX_MERGING - 1) / 2);
+        assert(a >= 0 && a < _JOB_TYPE_MAX_MERGING);
+        assert(b >= 0 && b < _JOB_TYPE_MAX_MERGING);
+
+        if (a == b)
+                return a;
+
+        /* Only the lower triangle is stored, so order the pair as a > b */
+        if (a < b) {
+                JobType tmp = a;
+                a = b;
+                b = tmp;
+        }
+
+        return job_merging_table[(a - 1) * a / 2 + b];
+}
+
+/* Returns true if running a job of type 'a' on a unit in state 'b' would be
+ * a no-op (e.g. starting an already-active unit). Restart jobs are never
+ * redundant; NOP jobs always are. */
+bool job_type_is_redundant(JobType a, UnitActiveState b) {
+        switch (a) {
+
+        case JOB_START:
+                return IN_SET(b, UNIT_ACTIVE, UNIT_RELOADING);
+
+        case JOB_STOP:
+                return IN_SET(b, UNIT_INACTIVE, UNIT_FAILED);
+
+        case JOB_VERIFY_ACTIVE:
+                return IN_SET(b, UNIT_ACTIVE, UNIT_RELOADING);
+
+        case JOB_RELOAD:
+                return
+                        b == UNIT_RELOADING;
+
+        case JOB_RESTART:
+                /* Restart jobs must always be kept.
+                 *
+                 * For ACTIVE/RELOADING units, this is obvious.
+                 *
+                 * For ACTIVATING units, it's more subtle:
+                 *
+                 * Generally, if a service Requires= another unit, restarts of
+                 * the unit must be propagated to the service. If the service is
+                 * ACTIVATING, it must still be restarted since it might have
+                 * stale information regarding the other unit.
+                 *
+                 * For example, consider a service that Requires= a socket: if
+                 * the socket is restarted, but the service is still ACTIVATING,
+                 * it's necessary to restart the service so that it gets the new
+                 * socket. */
+                return false;
+
+        case JOB_NOP:
+                return true;
+
+        default:
+                assert_not_reached();
+        }
+}
+
+/* Resolve the conditional job types (TRY_RESTART, TRY_RELOAD,
+ * RELOAD_OR_START) into a concrete type based on the unit's current active
+ * state. Other types are returned unchanged. */
+JobType job_type_collapse(JobType t, Unit *u) {
+        UnitActiveState s;
+
+        switch (t) {
+
+        case JOB_TRY_RESTART:
+                /* Be sure to keep the restart job even if the unit is
+                 * ACTIVATING.
+                 *
+                 * See the job_type_is_redundant(JOB_RESTART) for more info */
+                s = unit_active_state(u);
+                if (!UNIT_IS_ACTIVE_OR_ACTIVATING(s))
+                        return JOB_NOP;
+
+                return JOB_RESTART;
+
+        case JOB_TRY_RELOAD:
+                s = unit_active_state(u);
+                if (!UNIT_IS_ACTIVE_OR_RELOADING(s))
+                        return JOB_NOP;
+
+                return JOB_RELOAD;
+
+        case JOB_RELOAD_OR_START:
+                s = unit_active_state(u);
+                if (!UNIT_IS_ACTIVE_OR_RELOADING(s))
+                        return JOB_START;
+
+                return JOB_RELOAD;
+
+        default:
+                return t;
+        }
+}
+
+/* Merge '*a' with 'b' and collapse the result for unit 'u', storing it back
+ * into '*a'. Returns 0 on success, -EEXIST if the types cannot be merged
+ * (in which case '*a' is left untouched). */
+int job_type_merge_and_collapse(JobType *a, JobType b, Unit *u) {
+        JobType t;
+
+        t = job_type_lookup_merge(*a, b);
+        if (t < 0)
+                return -EEXIST;
+
+        *a = job_type_collapse(t, u);
+        return 0;
+}
+
+/* Returns true if the installed job may run now, i.e. no ordered-before/
+ * after unit has a job that must complete first (per job_compare()). Jobs
+ * with ignore_order set and NOP jobs are always runnable. */
+static bool job_is_runnable(Job *j) {
+        Unit *other;
+
+        assert(j);
+        assert(j->installed);
+
+        /* Checks whether there is any job running for the units this
+         * job needs to be running after (in the case of a 'positive'
+         * job type) or before (in the case of a 'negative' job
+         * type. */
+
+        /* Note that unit types have a say in what is runnable,
+         * too. For example, if they return -EAGAIN from
+         * unit_start() they can indicate they are not
+         * runnable yet. */
+
+        /* First check if there is an override */
+        if (j->ignore_order)
+                return true;
+
+        if (j->type == JOB_NOP)
+                return true;
+
+        UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_AFTER)
+                if (other->job && job_compare(j, other->job, UNIT_ATOM_AFTER) > 0) {
+                        log_unit_debug(j->unit,
+                                       "starting held back, waiting for: %s",
+                                       other->id);
+                        return false;
+                }
+
+        UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_BEFORE)
+                if (other->job && job_compare(j, other->job, UNIT_ATOM_BEFORE) > 0) {
+                        log_unit_debug(j->unit,
+                                       "stopping held back, waiting for: %s",
+                                       other->id);
+                        return false;
+                }
+
+        return true;
+}
+
+/* Change the job's type in place, logging the conversion at debug level. */
+static void job_change_type(Job *j, JobType newtype) {
+        assert(j);
+
+        log_unit_debug(j->unit,
+                       "Converting job %s/%s -> %s/%s",
+                       j->unit->id, job_type_to_string(j->type),
+                       j->unit->id, job_type_to_string(newtype));
+
+        j->type = newtype;
+}
+
+/* Pick the printf-style status message format ("Starting %s...", etc.) for
+ * a start/stop/reload job, preferring the unit type's own format strings
+ * where available. 't' must be JOB_START, JOB_STOP or JOB_RELOAD. */
+static const char* job_start_message_format(Unit *u, JobType t) {
+        assert(u);
+        assert(IN_SET(t, JOB_START, JOB_STOP, JOB_RELOAD));
+
+        if (t == JOB_RELOAD)
+                return "Reloading %s...";
+        else if (t == JOB_START)
+                return UNIT_VTABLE(u)->status_message_formats.starting_stopping[0] ?: "Starting %s...";
+        else
+                return UNIT_VTABLE(u)->status_message_formats.starting_stopping[1] ?: "Stopping %s...";
+}
+
+/* Emit the "Starting/Stopping/Reloading ..." message for a job, to the
+ * journal and (except for reloads) to the console. No-op for other job
+ * types or when the unit's log level filters out LOG_INFO. */
+static void job_emit_start_message(Unit *u, uint32_t job_id, JobType t) {
+        _cleanup_free_ char *free_ident = NULL;
+        const char *ident, *format;
+
+        assert(u);
+        assert(t >= 0);
+        assert(t < _JOB_TYPE_MAX);
+        assert(u->id); /* We better don't try to run a unit that doesn't even have an id. */
+
+        if (!IN_SET(t, JOB_START, JOB_STOP, JOB_RELOAD))
+                return;
+
+        if (!unit_log_level_test(u, LOG_INFO))
+                return;
+
+        format = job_start_message_format(u, t);
+        ident = unit_status_string(u, &free_ident);
+
+        bool do_console = t != JOB_RELOAD;
+        bool console_only = do_console && log_on_console(); /* Reload status messages have traditionally
+                                                             * not been printed to the console. */
+
+        /* Print to the log first. */
+        if (!console_only) {  /* Skip this if it would only go on the console anyway */
+
+                const char *mid =
+                        t == JOB_START ? "MESSAGE_ID=" SD_MESSAGE_UNIT_STARTING_STR :
+                        t == JOB_STOP  ? "MESSAGE_ID=" SD_MESSAGE_UNIT_STOPPING_STR :
+                                         "MESSAGE_ID=" SD_MESSAGE_UNIT_RELOADING_STR;
+                const char *msg_fmt = strjoina("MESSAGE=", format);
+
+                /* Note that we deliberately use LOG_MESSAGE() instead of LOG_UNIT_MESSAGE() here, since this
+                 * is supposed to mimic closely what is written to screen using the status output, which is
+                 * supposed to be high level friendly output. */
+
+                DISABLE_WARNING_FORMAT_NONLITERAL;
+                log_unit_struct(u, LOG_INFO,
+                                msg_fmt, ident,
+                                "JOB_ID=%" PRIu32, job_id,
+                                "JOB_TYPE=%s", job_type_to_string(t),
+                                LOG_UNIT_INVOCATION_ID(u),
+                                mid);
+                REENABLE_WARNING;
+        }
+
+        /* Log to the console second. */
+        if (do_console) {
+                DISABLE_WARNING_FORMAT_NONLITERAL;
+                unit_status_printf(u, STATUS_TYPE_NORMAL, "", format, ident);
+                REENABLE_WARNING;
+        }
+}
+
+/* Pick the completion message format for a finished job, given its type and
+ * result. Unit-type-specific formats take precedence over the generic
+ * tables; returns NULL when nothing should be printed for this combination. */
+static const char* job_done_message_format(Unit *u, JobType t, JobResult result) {
+        static const char* const generic_finished_start_job[_JOB_RESULT_MAX] = {
+                [JOB_DONE]        = "Started %s.",
+                [JOB_TIMEOUT]     = "Timed out starting %s.",
+                [JOB_FAILED]      = "Failed to start %s.",
+                [JOB_DEPENDENCY]  = "Dependency failed for %s.",
+                [JOB_ASSERT]      = "Assertion failed for %s.",
+                [JOB_UNSUPPORTED] = "Starting of %s unsupported.",
+                [JOB_COLLECTED]   = "Unnecessary job was removed for %s.",
+                [JOB_ONCE]        = "Unit %s has been started before and cannot be started again.",
+        };
+        static const char* const generic_finished_stop_job[_JOB_RESULT_MAX] = {
+                [JOB_DONE]        = "Stopped %s.",
+                [JOB_FAILED]      = "Stopped %s with error.",
+                [JOB_TIMEOUT]     = "Timed out stopping %s.",
+        };
+        static const char* const generic_finished_reload_job[_JOB_RESULT_MAX] = {
+                [JOB_DONE]        = "Reloaded %s.",
+                [JOB_FAILED]      = "Reload failed for %s.",
+                [JOB_TIMEOUT]     = "Timed out reloading %s.",
+        };
+        /* When verify-active detects the unit is inactive, report it.
+         * Most likely a DEPEND warning from a requisiting unit will
+         * occur next and it's nice to see what was requisited. */
+        static const char* const generic_finished_verify_active_job[_JOB_RESULT_MAX] = {
+                [JOB_SKIPPED]     = "%s is inactive.",
+        };
+        const char *format;
+
+        assert(u);
+        assert(t >= 0);
+        assert(t < _JOB_TYPE_MAX);
+
+        /* Show condition check message if the job did not actually do anything due to unmet condition. */
+        if (t == JOB_START && result == JOB_DONE && !u->condition_result)
+                return "Condition check resulted in %s being skipped.";
+
+        if (IN_SET(t, JOB_START, JOB_STOP, JOB_RESTART)) {
+                const UnitStatusMessageFormats *formats = &UNIT_VTABLE(u)->status_message_formats;
+                if (formats->finished_job) {
+                        format = formats->finished_job(u, t, result);
+                        if (format)
+                                return format;
+                }
+
+                /* Restart jobs are patched into start jobs, so a finishing restart reports via the stop table */
+                format = (t == JOB_START ? formats->finished_start_job : formats->finished_stop_job)[result];
+                if (format)
+                        return format;
+        }
+
+        /* Return generic strings */
+        switch (t) {
+        case JOB_START:
+                return generic_finished_start_job[result];
+        case JOB_STOP:
+        case JOB_RESTART:
+                return generic_finished_stop_job[result];
+        case JOB_RELOAD:
+                return generic_finished_reload_job[result];
+        case JOB_VERIFY_ACTIVE:
+                return generic_finished_verify_active_job[result];
+        default:
+                return NULL;
+        }
+}
+
+/* Per-result presentation: journal log level, console status color and the
+ * bracketed status word ("  OK  ", "FAILED", ...). A NULL word suppresses
+ * console output for that result. */
+static const struct {
+        int log_level;
+        const char *color, *word;
+} job_done_messages[_JOB_RESULT_MAX] = {
+        [JOB_DONE]        = { LOG_INFO,    ANSI_OK_COLOR,         "  OK  " },
+        [JOB_CANCELED]    = { LOG_INFO,                                    },
+        [JOB_TIMEOUT]     = { LOG_ERR,     ANSI_HIGHLIGHT_RED,    " TIME " },
+        [JOB_FAILED]      = { LOG_ERR,     ANSI_HIGHLIGHT_RED,    "FAILED" },
+        [JOB_DEPENDENCY]  = { LOG_WARNING, ANSI_HIGHLIGHT_YELLOW, "DEPEND" },
+        [JOB_SKIPPED]     = { LOG_NOTICE,  ANSI_HIGHLIGHT,        " INFO " },
+        [JOB_INVALID]     = { LOG_INFO,                                    },
+        [JOB_ASSERT]      = { LOG_WARNING, ANSI_HIGHLIGHT_YELLOW, "ASSERT" },
+        [JOB_UNSUPPORTED] = { LOG_WARNING, ANSI_HIGHLIGHT_YELLOW, "UNSUPP" },
+        [JOB_COLLECTED]   = { LOG_INFO,                                    },
+        [JOB_ONCE]        = { LOG_ERR,     ANSI_HIGHLIGHT_RED,    " ONCE " },
+};
+
+/* Map a finished job's type/result to the journal MESSAGE_ID= field, or
+ * NULL if no catalog message applies. */
+static const char* job_done_mid(JobType type, JobResult result) {
+        switch (type) {
+        case JOB_START:
+                if (result == JOB_DONE)
+                        return "MESSAGE_ID=" SD_MESSAGE_UNIT_STARTED_STR;
+                else
+                        return "MESSAGE_ID=" SD_MESSAGE_UNIT_FAILED_STR;
+
+        case JOB_RELOAD:
+                return "MESSAGE_ID=" SD_MESSAGE_UNIT_RELOADED_STR;
+
+        case JOB_STOP:
+        case JOB_RESTART:
+                return "MESSAGE_ID=" SD_MESSAGE_UNIT_STOPPED_STR;
+
+        default:
+                return NULL;
+        }
+}
+
+/* Emit the completion message for a finished job to the journal and, where
+ * appropriate, the console. Handles the unmet-condition special case (extra
+ * detail in the journal, nothing on the console) and appends a "See
+ * 'systemctl status ...'" hint after failed start jobs. */
+static void job_emit_done_message(Unit *u, uint32_t job_id, JobType t, JobResult result) {
+        _cleanup_free_ char *free_ident = NULL;
+        const char *ident, *format;
+
+        assert(u);
+        assert(t >= 0);
+        assert(t < _JOB_TYPE_MAX);
+
+        if (!unit_log_level_test(u, job_done_messages[result].log_level))
+                return;
+
+        format = job_done_message_format(u, t, result);
+        if (!format)
+                return;
+
+        ident = unit_status_string(u, &free_ident);
+
+        const char *status = job_done_messages[result].word;
+        bool do_console = t != JOB_RELOAD && status;
+        bool console_only = do_console && log_on_console();
+
+        if (t == JOB_START && result == JOB_DONE && !u->condition_result) {
+                /* No message on the console if the job did not actually do anything due to unmet condition. */
+                if (console_only)
+                        return;
+                else
+                        do_console = false;
+        }
+
+        if (!console_only) {  /* Skip printing if output goes to the console, and job_print_status_message()
+                               * will actually print something to the console. */
+                Condition *c;
+                const char *mid = job_done_mid(t, result);  /* mid may be NULL. log_unit_struct() will ignore it. */
+
+                c = t == JOB_START && result == JOB_DONE ? unit_find_failed_condition(u) : NULL;
+                if (c) {
+                        /* Special case units that were skipped because of a unmet condition check so that
+                         * we can add more information to the message. */
+                        if (c->trigger)
+                                log_unit_struct(
+                                        u,
+                                        job_done_messages[result].log_level,
+                                        LOG_MESSAGE("%s was skipped because no trigger condition checks were met.",
+                                                    ident),
+                                        "JOB_ID=%" PRIu32, job_id,
+                                        "JOB_TYPE=%s", job_type_to_string(t),
+                                        "JOB_RESULT=%s", job_result_to_string(result),
+                                        LOG_UNIT_INVOCATION_ID(u),
+                                        mid);
+                        else
+                                log_unit_struct(
+                                        u,
+                                        job_done_messages[result].log_level,
+                                        LOG_MESSAGE("%s was skipped because of an unmet condition check (%s=%s%s).",
+                                                    ident,
+                                                    condition_type_to_string(c->type),
+                                                    c->negate ? "!" : "",
+                                                    c->parameter),
+                                        "JOB_ID=%" PRIu32, job_id,
+                                        "JOB_TYPE=%s", job_type_to_string(t),
+                                        "JOB_RESULT=%s", job_result_to_string(result),
+                                        LOG_UNIT_INVOCATION_ID(u),
+                                        mid);
+                } else {
+                        const char *msg_fmt = strjoina("MESSAGE=", format);
+
+                        DISABLE_WARNING_FORMAT_NONLITERAL;
+                        log_unit_struct(u, job_done_messages[result].log_level,
+                                        msg_fmt, ident,
+                                        "JOB_ID=%" PRIu32, job_id,
+                                        "JOB_TYPE=%s", job_type_to_string(t),
+                                        "JOB_RESULT=%s", job_result_to_string(result),
+                                        LOG_UNIT_INVOCATION_ID(u),
+                                        mid);
+                        REENABLE_WARNING;
+                }
+        }
+
+        if (do_console) {
+                if (log_get_show_color())
+                        status = strjoina(job_done_messages[result].color,
+                                          status,
+                                          ANSI_NORMAL);
+
+                DISABLE_WARNING_FORMAT_NONLITERAL;
+                unit_status_printf(u,
+                                   result == JOB_DONE ? STATUS_TYPE_NORMAL : STATUS_TYPE_NOTICE,
+                                   status, format, ident);
+                REENABLE_WARNING;
+
+                if (t == JOB_START && result == JOB_FAILED) {
+                        _cleanup_free_ char *quoted = NULL;
+
+                        quoted = shell_maybe_quote(u->id, 0);
+                        if (quoted)
+                                manager_status_printf(u->manager, STATUS_TYPE_NORMAL, NULL,
+                                                      "See 'systemctl status %s' for details.", quoted);
+                }
+        }
+}
+
+/* Invoke the unit operation (start/stop/reload) for job '*j'. Because the
+ * operation may finish or replace the job synchronously, '*j' is re-looked
+ * up by ID afterwards and updated in place (possibly to NULL). Returns the
+ * operation's result, with -EBADR ("operation not supported, just wait")
+ * mapped to 0. */
+static int job_perform_on_unit(Job **j) {
+        ActivationDetails *a;
+        uint32_t id;
+        Manager *m;
+        JobType t;
+        Unit *u;
+        bool wait_only;
+        int r;
+
+        /* While we execute this operation the job might go away (for example: because it finishes immediately
+         * or is replaced by a new, conflicting job). To make sure we don't access a freed job later on we
+         * store the id here, so that we can verify the job is still valid. */
+
+        assert(j);
+        assert(*j);
+
+        m = (*j)->manager;
+        u = (*j)->unit;
+        t = (*j)->type;
+        id = (*j)->id;
+        a = (*j)->activation_details;
+
+        switch (t) {
+        case JOB_START:
+                r = unit_start(u, a);
+                wait_only = r == -EBADR; /* If the unit type does not support starting, then simply wait. */
+                break;
+
+        case JOB_RESTART:
+                /* Restarts begin with a stop; the job is patched to START when the stop completes */
+                t = JOB_STOP;
+                _fallthrough_;
+        case JOB_STOP:
+                r = unit_stop(u);
+                wait_only = r == -EBADR; /* If the unit type does not support stopping, then simply wait. */
+                break;
+
+        case JOB_RELOAD:
+                r = unit_reload(u);
+                wait_only = false; /* A clear error is generated if reload is not supported. */
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        /* Log if the job still exists and the start/stop/reload function actually did something or we're
+         * only waiting for unit status change (common for device units). The latter ensures that job start
+         * messages for device units are correctly shown. Note that if the job disappears too quickly, e.g.
+         * for units for which there's no 'activating' phase (i.e. because we transition directly from
+         * 'inactive' to 'active'), we'll possibly skip the "Starting..." message. */
+        *j = manager_get_job(m, id);
+        if (*j && (r > 0 || wait_only))
+                job_emit_start_message(u, id, t);
+
+        return wait_only ? 0 : r;
+}
+
+/* Pop job 'j' from the run queue and attempt to execute it. If ordering
+ * dependencies hold it back, -EAGAIN is returned and the job stays WAITING.
+ * Otherwise the unit operation is performed and well-known error codes are
+ * translated into job results via job_finish_and_invalidate() — after which
+ * 'j' may be freed, so callers must not touch it again. */
+int job_run_and_invalidate(Job *j) {
+        int r;
+
+        assert(j);
+        assert(j->installed);
+        assert(j->type < _JOB_TYPE_MAX_IN_TRANSACTION);
+        assert(j->in_run_queue);
+
+        prioq_remove(j->manager->run_queue, j, &j->run_queue_idx);
+        j->in_run_queue = false;
+
+        if (j->state != JOB_WAITING)
+                return 0;
+
+        if (!job_is_runnable(j))
+                return -EAGAIN;
+
+        /* Arm the JobRunningTimeoutSec= timer and mark the job as running */
+        job_start_timer(j, true);
+        job_set_state(j, JOB_RUNNING);
+        job_add_to_dbus_queue(j);
+
+        switch (j->type) {
+
+        case JOB_VERIFY_ACTIVE: {
+                UnitActiveState t;
+
+                t = unit_active_state(j->unit);
+                if (UNIT_IS_ACTIVE_OR_RELOADING(t))
+                        r = -EALREADY;
+                else if (t == UNIT_ACTIVATING)
+                        r = -EAGAIN;
+                else
+                        r = -EBADR;
+                break;
+        }
+
+        case JOB_START:
+        case JOB_STOP:
+        case JOB_RESTART:
+        case JOB_RELOAD:
+                r = job_perform_on_unit(&j);
+                break;
+
+        case JOB_NOP:
+                r = -EALREADY;
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        /* j may have been invalidated by job_perform_on_unit(); only proceed if it survived */
+        if (j) {
+                if (r == -EAGAIN)
+                        job_set_state(j, JOB_WAITING); /* Hmm, not ready after all, let's return to JOB_WAITING state */
+                else if (r == -EALREADY) /* already being executed */
+                        r = job_finish_and_invalidate(j, JOB_DONE, true, true);
+                else if (r == -ECOMM)
+                        r = job_finish_and_invalidate(j, JOB_DONE, true, false);
+                else if (r == -EBADR)
+                        r = job_finish_and_invalidate(j, JOB_SKIPPED, true, false);
+                else if (r == -ENOEXEC)
+                        r = job_finish_and_invalidate(j, JOB_INVALID, true, false);
+                else if (r == -EPROTO)
+                        r = job_finish_and_invalidate(j, JOB_ASSERT, true, false);
+                else if (r == -EOPNOTSUPP)
+                        r = job_finish_and_invalidate(j, JOB_UNSUPPORTED, true, false);
+                else if (r == -ENOLINK)
+                        r = job_finish_and_invalidate(j, JOB_DEPENDENCY, true, false);
+                else if (r == -ESTALE)
+                        r = job_finish_and_invalidate(j, JOB_ONCE, true, false);
+                else if (r < 0)
+                        r = job_finish_and_invalidate(j, JOB_FAILED, true, false);
+        }
+
+        return r;
+}
+
+/* Finish every pending JOB_START/JOB_VERIFY_ACTIVE job on units reachable
+ * from 'u' via 'match_atom' with result JOB_DEPENDENCY (recursively). */
+static void job_fail_dependencies(Unit *u, UnitDependencyAtom match_atom) {
+        Unit *other;
+
+        assert(u);
+
+        UNIT_FOREACH_DEPENDENCY(other, u, match_atom) {
+                Job *j = other->job;
+
+                if (!j)
+                        continue;
+                if (!IN_SET(j->type, JOB_START, JOB_VERIFY_ACTIVE))
+                        continue;
+
+                job_finish_and_invalidate(j, JOB_DEPENDENCY, true, false);
+        }
+}
+
+/* Complete job 'j' with 'result': emit the done message (unless 'already'
+ * says the unit did nothing), uninstall and free the job, propagate failures
+ * to dependent jobs when 'recursive', fire OnFailure= for timeout/dependency
+ * results, and wake ordering neighbours. Special case: a successful restart
+ * job is not freed but patched into a start job and requeued. After this
+ * call 'j' must be considered invalid. Always returns 0. */
+int job_finish_and_invalidate(Job *j, JobResult result, bool recursive, bool already) {
+        Unit *u, *other;
+        JobType t;
+
+        assert(j);
+        assert(j->installed);
+        assert(j->type < _JOB_TYPE_MAX_IN_TRANSACTION);
+
+        u = j->unit;
+        t = j->type;
+
+        j->result = result;
+
+        log_unit_debug(u, "Job %" PRIu32 " %s/%s finished, result=%s",
+                       j->id, u->id, job_type_to_string(t), job_result_to_string(result));
+
+        /* If this job did nothing to the respective unit we don't log the status message */
+        if (!already)
+                job_emit_done_message(u, j->id, t, result);
+
+        /* Patch restart jobs so that they become normal start jobs */
+        if (result == JOB_DONE && t == JOB_RESTART) {
+
+                job_change_type(j, JOB_START);
+                job_set_state(j, JOB_WAITING);
+
+                job_add_to_dbus_queue(j);
+                job_add_to_run_queue(j);
+                job_add_to_gc_queue(j);
+
+                goto finish;
+        }
+
+        if (IN_SET(result, JOB_FAILED, JOB_INVALID))
+                j->manager->n_failed_jobs++;
+
+        job_uninstall(j);
+        job_free(j);
+
+        /* Fail depending jobs on failure */
+        if (result != JOB_DONE && recursive) {
+                if (IN_SET(t, JOB_START, JOB_VERIFY_ACTIVE))
+                        job_fail_dependencies(u, UNIT_ATOM_PROPAGATE_START_FAILURE);
+                else if (t == JOB_STOP)
+                        job_fail_dependencies(u, UNIT_ATOM_PROPAGATE_STOP_FAILURE);
+        }
+
+        /* A special check to make sure we take down anything RequisiteOf= if we aren't active. This is when
+         * the verify-active job merges with a satisfying job type, and then loses its invalidation effect,
+         * as the result there is JOB_DONE for the start job we merged into, while we should be failing the
+         * depending job if the said unit isn't in fact active. Oneshots are an example of this, where going
+         * directly from activating to inactive is success.
+         *
+         * This happens when you use ConditionXYZ= in a unit too, since in that case the job completes with
+         * the JOB_DONE result, but the unit never really becomes active. Note that such a case still
+         * involves merging:
+         *
+         * A start job waits for something else, and a verify-active comes in and merges in the installed
+         * job. Then, later, when it becomes runnable, it finishes with JOB_DONE result as execution on
+         * conditions not being met is skipped, breaking our dependency semantics.
+         *
+         * Also, depending on if start job waits or not, the merging may or may not happen (the verify-active
+         * job may trigger after it finishes), so you get undeterministic results without this check.
+         */
+        if (result == JOB_DONE && recursive &&
+            IN_SET(t, JOB_START, JOB_RELOAD) &&
+            !UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u)))
+                job_fail_dependencies(u, UNIT_ATOM_PROPAGATE_INACTIVE_START_AS_FAILURE);
+
+        /* Trigger OnFailure= dependencies that are not generated by the unit itself. We don't treat
+         * JOB_CANCELED as failure in this context. And JOB_FAILURE is already handled by the unit itself. */
+        if (IN_SET(result, JOB_TIMEOUT, JOB_DEPENDENCY)) {
+                log_unit_struct(u, LOG_NOTICE,
+                                "JOB_TYPE=%s", job_type_to_string(t),
+                                "JOB_RESULT=%s", job_result_to_string(result),
+                                LOG_UNIT_MESSAGE(u, "Job %s/%s failed with result '%s'.",
+                                                 u->id,
+                                                 job_type_to_string(t),
+                                                 job_result_to_string(result)));
+
+                unit_start_on_failure(u, "OnFailure=", UNIT_ATOM_ON_FAILURE, u->on_failure_job_mode);
+        }
+
+        unit_trigger_notify(u);
+
+finish:
+        /* Try to start the next jobs that can be started */
+        UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_AFTER)
+                if (other->job) {
+                        job_add_to_run_queue(other->job);
+                        job_add_to_gc_queue(other->job);
+                }
+        UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_BEFORE)
+                if (other->job) {
+                        job_add_to_run_queue(other->job);
+                        job_add_to_gc_queue(other->job);
+                }
+
+        /* Ensure that when an upheld/unneeded/bound unit activation job fails we requeue it, if it still
+         * necessary. If there are no state changes in the triggerer, it would not be retried otherwise. */
+        unit_submit_to_start_when_upheld_queue(u);
+        unit_submit_to_stop_when_bound_queue(u);
+        unit_submit_to_stop_when_unneeded_queue(u);
+
+        manager_check_finished(u->manager);
+
+        return 0;
+}
+
+/* sd-event callback fired when a job's timeout expires: finish the job with
+ * JOB_TIMEOUT and then run the unit's configured JobTimeoutAction=. The unit
+ * pointer is saved first because the job is freed by the finish call. */
+static int job_dispatch_timer(sd_event_source *s, uint64_t monotonic, void *userdata) {
+        Job *j = ASSERT_PTR(userdata);
+        Unit *u;
+
+        assert(s == j->timer_event_source);
+
+        log_unit_warning(j->unit, "Job %s/%s timed out.", j->unit->id, job_type_to_string(j->type));
+
+        u = j->unit;
+        job_finish_and_invalidate(j, JOB_TIMEOUT, true, false);
+
+        emergency_action(u->manager, u->job_timeout_action,
+                         EMERGENCY_ACTION_IS_WATCHDOG|EMERGENCY_ACTION_WARN,
+                         u->job_timeout_reboot_arg, -1, "job timed out");
+
+        return 0;
+}
+
+/* Arm the job's timeout timer. With job_running=false this is the overall
+ * JobTimeoutSec= started when the job is created; with job_running=true it
+ * is the JobRunningTimeoutSec= started when the job begins executing, which
+ * only replaces an existing timer if it would fire earlier. USEC_INFINITY
+ * disables the respective timeout. Returns 0 or a negative sd-event error. */
+int job_start_timer(Job *j, bool job_running) {
+        int r;
+        usec_t timeout_time, old_timeout_time;
+
+        if (job_running) {
+                j->begin_running_usec = now(CLOCK_MONOTONIC);
+
+                if (j->unit->job_running_timeout == USEC_INFINITY)
+                        return 0;
+
+                timeout_time = usec_add(j->begin_running_usec, j->unit->job_running_timeout);
+
+                if (j->timer_event_source) {
+                        /* Update only if JobRunningTimeoutSec= results in earlier timeout */
+                        r = sd_event_source_get_time(j->timer_event_source, &old_timeout_time);
+                        if (r < 0)
+                                return r;
+
+                        if (old_timeout_time <= timeout_time)
+                                return 0;
+
+                        return sd_event_source_set_time(j->timer_event_source, timeout_time);
+                }
+        } else {
+                /* The overall job timeout is armed only once */
+                if (j->timer_event_source)
+                        return 0;
+
+                j->begin_usec = now(CLOCK_MONOTONIC);
+
+                if (j->unit->job_timeout == USEC_INFINITY)
+                        return 0;
+
+                timeout_time = usec_add(j->begin_usec, j->unit->job_timeout);
+        }
+
+        r = sd_event_add_time(
+                        j->manager->event,
+                        &j->timer_event_source,
+                        CLOCK_MONOTONIC,
+                        timeout_time, 0,
+                        job_dispatch_timer, j);
+        if (r < 0)
+                return r;
+
+        (void) sd_event_source_set_description(j->timer_event_source, "job-start");
+
+        return 0;
+}
+
+/* Inserts the job into the manager's run queue (a priority queue, position
+ * remembered in j->run_queue_idx) and pokes the manager so the queue gets
+ * dispatched. No-op if the job is already queued; a failed insertion is
+ * logged and otherwise ignored (best effort). */
+void job_add_to_run_queue(Job *j) {
+        int r;
+
+        assert(j);
+        assert(j->installed);
+
+        if (j->in_run_queue)
+                return;
+
+        r = prioq_put(j->manager->run_queue, j, &j->run_queue_idx);
+        if (r < 0)
+                /* Fixed log message grammar: was "Failed put job in run queue" */
+                log_warning_errno(r, "Failed to put job in run queue, ignoring: %m");
+        else
+                j->in_run_queue = true;
+
+        manager_trigger_run_queue(j->manager);
+}
+
+/* Queues the job for a D-Bus change notification (processed later by the
+ * manager's dbus job queue). Idempotent: a job already in the queue is not
+ * queued twice. */
+void job_add_to_dbus_queue(Job *j) {
+        assert(j);
+        assert(j->installed);
+
+        if (j->in_dbus_queue)
+                return;
+
+        /* We don't check if anybody is subscribed here, since this
+         * job might just have been created and not yet assigned to a
+         * connection/client. */
+
+        LIST_PREPEND(dbus_queue, j->manager->dbus_job_queue, j);
+        j->in_dbus_queue = true;
+}
+
+/* Returns the job's D-Bus object path, built from its numeric id
+ * ("/org/freedesktop/systemd1/job/<id>"). Returns NULL on allocation
+ * failure; the caller owns the returned string. */
+char *job_dbus_path(Job *j) {
+        char *p;
+
+        assert(j);
+
+        if (asprintf(&p, "/org/freedesktop/systemd1/job/%"PRIu32, j->id) < 0)
+                return NULL;
+
+        return p;
+}
+
+/* Writes the job's state as "key=value" lines to f (for daemon-reexec /
+ * daemon-reload state preservation), terminated by an empty line as end
+ * marker. Individual serialization failures are deliberately ignored;
+ * job_deserialize() is the matching reader. Always returns 0. */
+int job_serialize(Job *j, FILE *f) {
+        assert(j);
+        assert(f);
+
+        (void) serialize_item_format(f, "job-id", "%u", j->id);
+        (void) serialize_item(f, "job-type", job_type_to_string(j->type));
+        (void) serialize_item(f, "job-state", job_state_to_string(j->state));
+        (void) serialize_bool(f, "job-irreversible", j->irreversible);
+        (void) serialize_bool(f, "job-sent-dbus-new-signal", j->sent_dbus_new_signal);
+        (void) serialize_bool(f, "job-ignore-order", j->ignore_order);
+
+        /* Timestamps are only written once set, so the reader can tell
+         * "never began" apart from "began at time 0". */
+        if (j->begin_usec > 0)
+                (void) serialize_usec(f, "job-begin", j->begin_usec);
+        if (j->begin_running_usec > 0)
+                (void) serialize_usec(f, "job-begin-running", j->begin_running_usec);
+
+        bus_track_serialize(j->bus_track, f, "subscribed");
+
+        activation_details_serialize(j->activation_details, f);
+
+        /* End marker */
+        fputc('\n', f);
+        return 0;
+}
+
+/* Reads "key=value" lines previously written by job_serialize() until the
+ * empty-line end marker (or EOF). Unknown keys and unparsable values are
+ * logged at debug level and skipped, so state written by older or newer
+ * versions degrades gracefully instead of failing deserialization.
+ * Returns 0 on success, negative errno on read error or OOM. */
+int job_deserialize(Job *j, FILE *f) {
+        int r;
+
+        assert(j);
+        assert(f);
+
+        for (;;) {
+                _cleanup_free_ char *l = NULL;
+                size_t k;
+                char *v;
+
+                r = deserialize_read_line(f, &l);
+                if (r < 0)
+                        return r;
+                if (r == 0) /* eof or end marker */
+                        break;
+
+                /* Split at the first '=' into key (l) and value (v); a line
+                 * without '=' yields an empty value string. */
+                k = strcspn(l, "=");
+
+                if (l[k] == '=') {
+                        l[k] = 0;
+                        v = l+k+1;
+                } else
+                        v = l+k;
+
+                if (streq(l, "job-id")) {
+
+                        if (safe_atou32(v, &j->id) < 0)
+                                log_debug("Failed to parse job id value: %s", v);
+
+                } else if (streq(l, "job-type")) {
+                        JobType t;
+
+                        t = job_type_from_string(v);
+                        if (t < 0)
+                                log_debug("Failed to parse job type: %s", v);
+                        else if (t >= _JOB_TYPE_MAX_IN_TRANSACTION)
+                                log_debug("Cannot deserialize job of type: %s", v);
+                        else
+                                j->type = t;
+
+                } else if (streq(l, "job-state")) {
+                        JobState s;
+
+                        s = job_state_from_string(v);
+                        if (s < 0)
+                                log_debug("Failed to parse job state: %s", v);
+                        else
+                                job_set_state(j, s);
+
+                } else if (streq(l, "job-irreversible")) {
+                        int b;
+
+                        b = parse_boolean(v);
+                        if (b < 0)
+                                log_debug("Failed to parse job irreversible flag: %s", v);
+                        else
+                                /* OR the flag in: deserialization may only set it, never clear it */
+                                j->irreversible = j->irreversible || b;
+
+                } else if (streq(l, "job-sent-dbus-new-signal")) {
+                        int b;
+
+                        b = parse_boolean(v);
+                        if (b < 0)
+                                log_debug("Failed to parse job sent_dbus_new_signal flag: %s", v);
+                        else
+                                j->sent_dbus_new_signal = j->sent_dbus_new_signal || b;
+
+                } else if (streq(l, "job-ignore-order")) {
+                        int b;
+
+                        b = parse_boolean(v);
+                        if (b < 0)
+                                log_debug("Failed to parse job ignore_order flag: %s", v);
+                        else
+                                j->ignore_order = j->ignore_order || b;
+
+                } else if (streq(l, "job-begin"))
+                        (void) deserialize_usec(v, &j->begin_usec);
+
+                else if (streq(l, "job-begin-running"))
+                        (void) deserialize_usec(v, &j->begin_running_usec);
+
+                else if (streq(l, "subscribed")) {
+                        /* Bus clients subscribed to this job; re-attached later by
+                         * bus_job_coldplug_bus_track() via job_coldplug(). */
+                        if (strv_extend(&j->deserialized_clients, v) < 0)
+                                return log_oom();
+
+                } else if (startswith(l, "activation-details")) {
+                        if (activation_details_deserialize(l, v, &j->activation_details) < 0)
+                                log_debug("Failed to parse job ActivationDetails element: %s", v);
+
+                } else
+                        log_debug("Unknown job serialization key: %s", l);
+        }
+
+        return 0;
+}
+
+/* Re-establishes runtime state after deserialization: bus subscriptions,
+ * run/GC queue membership, and the timeout timer. The timer deadline is
+ * re-derived from the serialized begin timestamps rather than restarted,
+ * so timeouts survive daemon-reexec unchanged. */
+int job_coldplug(Job *j) {
+        int r;
+        usec_t timeout_time = USEC_INFINITY;
+
+        assert(j);
+
+        /* After deserialization is complete and the bus connection
+         * set up again, let's start watching our subscribers again */
+        (void) bus_job_coldplug_bus_track(j);
+
+        if (j->state == JOB_WAITING)
+                job_add_to_run_queue(j);
+
+        /* Maybe due to new dependencies we don't actually need this job anymore? */
+        job_add_to_gc_queue(j);
+
+        /* Create timer only when job began or began running and the respective timeout is finite.
+         * Follow logic of job_start_timer() if both timeouts are finite */
+        if (j->begin_usec == 0)
+                return 0;
+
+        if (j->unit->job_timeout != USEC_INFINITY)
+                timeout_time = usec_add(j->begin_usec, j->unit->job_timeout);
+
+        /* If the job was already running, the earlier of the two deadlines wins. */
+        if (timestamp_is_set(j->begin_running_usec))
+                timeout_time = MIN(timeout_time, usec_add(j->begin_running_usec, j->unit->job_running_timeout));
+
+        if (timeout_time == USEC_INFINITY)
+                return 0;
+
+        /* Drop any stale timer before arming a fresh one. */
+        j->timer_event_source = sd_event_source_disable_unref(j->timer_event_source);
+
+        r = sd_event_add_time(
+                        j->manager->event,
+                        &j->timer_event_source,
+                        CLOCK_MONOTONIC,
+                        timeout_time, 0,
+                        job_dispatch_timer, j);
+        if (r < 0)
+                log_debug_errno(r, "Failed to restart timeout for job: %m");
+
+        (void) sd_event_source_set_description(j->timer_event_source, "job-timeout");
+
+        return r;
+}
+
+/* Special handling when the start job for shutdown.target is run on the
+ * system manager: re-enables console output, invalidates startup units and
+ * kicks off an asynchronous disk sync (skipped in containers). */
+void job_shutdown_magic(Job *j) {
+        assert(j);
+
+        /* The shutdown target gets some special treatment here: we
+         * tell the kernel to begin with flushing its disk caches, to
+         * optimize shutdown time a bit. Ideally we wouldn't hardcode
+         * this magic into PID 1. However all other processes aren't
+         * options either since they'd exit much sooner than PID 1 and
+         * asynchronous sync() would cause their exit to be
+         * delayed. */
+
+        if (j->type != JOB_START)
+                return;
+
+        if (!MANAGER_IS_SYSTEM(j->unit->manager))
+                return;
+
+        if (!unit_has_name(j->unit, SPECIAL_SHUTDOWN_TARGET))
+                return;
+
+        /* In case messages on console has been disabled on boot */
+        j->unit->manager->no_console_output = false;
+
+        manager_invalidate_startup_units(j->unit->manager);
+
+        /* In containers we skip the sync; the host owns the disks. */
+        if (detect_container() > 0)
+                return;
+
+        (void) asynchronous_sync(NULL);
+}
+
+/* Determines the earliest pending absolute timeout for this job: the job's
+ * own timer deadline and/or the unit-type specific timeout (if the unit's
+ * vtable provides a get_timeout() hook). Returns 1 with *ret set to the
+ * earlier deadline, 0 with *ret == 0 if neither applies, negative errno on
+ * failure. */
+int job_get_timeout(Job *j, usec_t *ret) {
+        usec_t x = USEC_INFINITY, y = USEC_INFINITY;
+        Unit *u = ASSERT_PTR(ASSERT_PTR(j)->unit);
+        int r;
+
+        assert(ret);
+
+        if (j->timer_event_source) {
+                r = sd_event_source_get_time(j->timer_event_source, &x);
+                if (r < 0)
+                        return r;
+        }
+
+        if (UNIT_VTABLE(u)->get_timeout) {
+                r = UNIT_VTABLE(u)->get_timeout(u, &y);
+                if (r < 0)
+                        return r;
+        }
+
+        if (x == USEC_INFINITY && y == USEC_INFINITY) {
+                *ret = 0;
+                return 0;
+        }
+
+        *ret = MIN(x, y);
+        return 1;
+}
+
+/* Decides whether this job may be garbage-collected. */
+bool job_may_gc(Job *j) {
+        Unit *other;
+
+        assert(j);
+
+        /* Checks whether this job should be GC'ed away. We only do this for jobs of units that have no effect on their
+         * own and just track external state. For now the only unit type that qualifies for this are .device units.
+         * Returns true if the job can be collected. */
+
+        if (!UNIT_VTABLE(j->unit)->gc_jobs)
+                return false;
+
+        /* Make sure to send out pending D-Bus events before we unload the unit */
+        if (j->in_dbus_queue)
+                return false;
+
+        /* Keep the job alive while bus clients still reference it. */
+        if (sd_bus_track_count(j->bus_track) > 0)
+                return false;
+
+        /* FIXME: So this is a bit ugly: for now we don't properly track references made via private bus connections
+         * (because it's nasty, as sd_bus_track doesn't apply to it). We simply remember that the job was once
+         * referenced by one, and reset this whenever we notice that no private bus connections are around. This means
+         * the GC is a bit too conservative when it comes to jobs created by private bus connections. */
+        if (j->ref_by_private_bus) {
+                if (set_isempty(j->unit->manager->private_buses))
+                        j->ref_by_private_bus = false;
+                else
+                        return false;
+        }
+
+        if (j->type == JOB_NOP)
+                return false;
+
+        /* The logic is inverse to job_is_runnable, we cannot GC as long as we block any job. */
+        UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_BEFORE)
+                if (other->job && job_compare(j, other->job, UNIT_ATOM_BEFORE) < 0)
+                        return false;
+
+        UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_AFTER)
+                if (other->job && job_compare(j, other->job, UNIT_ATOM_AFTER) < 0)
+                        return false;
+
+        return true;
+}
+
+/* Queues the job for garbage collection, but only if job_may_gc() agrees.
+ * Idempotent: already-queued jobs are not queued twice. */
+void job_add_to_gc_queue(Job *j) {
+        assert(j);
+
+        if (j->in_gc_queue)
+                return;
+
+        if (!job_may_gc(j))
+                return;
+
+        LIST_PREPEND(gc_queue, j->unit->manager->gc_job_queue, j);
+        j->in_gc_queue = true;
+}
+
+/* typesafe_qsort() comparator: orders jobs by ascending numeric id. */
+static int job_compare_id(Job * const *a, Job * const *b) {
+        return CMP((*a)->id, (*b)->id);
+}
+
+/* Sorts the first n entries of list by job id and compacts away adjacent
+ * duplicate pointers in place. Returns the deduplicated length. */
+static size_t sort_job_list(Job **list, size_t n) {
+        Job *previous = NULL;
+        size_t a, b;
+
+        /* Order by numeric IDs */
+        typesafe_qsort(list, n, job_compare_id);
+
+        /* Filter out duplicates */
+        for (a = 0, b = 0; a < n; a++) {
+
+                if (previous == list[a])
+                        continue;
+
+                previous = list[b++] = list[a];
+        }
+
+        return b;
+}
+
+/* Collects all pending jobs that must finish before this one may start,
+ * derived from the unit's After=/Before= ordering atoms via job_compare().
+ * On success *ret points to a caller-owned, id-sorted, deduplicated array
+ * and the (non-negative) count is returned. An ignore_order job returns an
+ * empty result. Returns -ENOMEM on allocation failure. */
+int job_get_before(Job *j, Job*** ret) {
+        _cleanup_free_ Job** list = NULL;
+        Unit *other = NULL;
+        size_t n = 0;
+
+        /* Returns a list of all pending jobs that need to finish before this job may be started. */
+
+        assert(j);
+        assert(ret);
+
+        if (j->ignore_order) {
+                *ret = NULL;
+                return 0;
+        }
+
+        UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_AFTER) {
+                if (!other->job)
+                        continue;
+                /* job_compare() > 0 means j runs after other->job, i.e. other->job blocks us */
+                if (job_compare(j, other->job, UNIT_ATOM_AFTER) <= 0)
+                        continue;
+
+                if (!GREEDY_REALLOC(list, n+1))
+                        return -ENOMEM;
+                list[n++] = other->job;
+        }
+
+        UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_BEFORE) {
+                if (!other->job)
+                        continue;
+                if (job_compare(j, other->job, UNIT_ATOM_BEFORE) <= 0)
+                        continue;
+
+                if (!GREEDY_REALLOC(list, n+1))
+                        return -ENOMEM;
+                list[n++] = other->job;
+        }
+
+        n = sort_job_list(list, n);
+
+        *ret = TAKE_PTR(list);
+
+        return (int) n;
+}
+
+/* Inverse of job_get_before(): collects all pending jobs that wait for this
+ * job to finish (skipping jobs with ignore_order set). On success *ret points
+ * to a caller-owned, id-sorted, deduplicated array and the (non-negative)
+ * count is returned; -ENOMEM on allocation failure. */
+int job_get_after(Job *j, Job*** ret) {
+        _cleanup_free_ Job** list = NULL;
+        Unit *other = NULL;
+        size_t n = 0;
+
+        assert(j);
+        assert(ret);
+
+        /* Returns a list of all pending jobs that are waiting for this job to finish. */
+
+        UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_BEFORE) {
+                if (!other->job)
+                        continue;
+
+                if (other->job->ignore_order)
+                        continue;
+
+                /* job_compare() < 0 means j runs before other->job, i.e. other->job waits on us */
+                if (job_compare(j, other->job, UNIT_ATOM_BEFORE) >= 0)
+                        continue;
+
+                if (!GREEDY_REALLOC(list, n+1))
+                        return -ENOMEM;
+                list[n++] = other->job;
+        }
+
+        UNIT_FOREACH_DEPENDENCY(other, j->unit, UNIT_ATOM_AFTER) {
+                if (!other->job)
+                        continue;
+
+                if (other->job->ignore_order)
+                        continue;
+
+                if (job_compare(j, other->job, UNIT_ATOM_AFTER) >= 0)
+                        continue;
+
+                if (!GREEDY_REALLOC(list, n+1))
+                        return -ENOMEM;
+                list[n++] = other->job;
+        }
+
+        n = sort_job_list(list, n);
+
+        *ret = TAKE_PTR(list);
+
+        return (int) n;
+}
+
+/* String tables mapping the job enums declared in job.h to their external
+ * names; DEFINE_STRING_TABLE_LOOKUP generates the matching
+ * job_*_to_string()/job_*_from_string() pairs declared there. Each table
+ * must stay in sync with its enum. */
+static const char* const job_state_table[_JOB_STATE_MAX] = {
+        [JOB_WAITING] = "waiting",
+        [JOB_RUNNING] = "running",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(job_state, JobState);
+
+static const char* const job_type_table[_JOB_TYPE_MAX] = {
+        [JOB_START] = "start",
+        [JOB_VERIFY_ACTIVE] = "verify-active",
+        [JOB_STOP] = "stop",
+        [JOB_RELOAD] = "reload",
+        [JOB_RELOAD_OR_START] = "reload-or-start",
+        [JOB_RESTART] = "restart",
+        [JOB_TRY_RESTART] = "try-restart",
+        [JOB_TRY_RELOAD] = "try-reload",
+        [JOB_NOP] = "nop",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(job_type, JobType);
+
+static const char* const job_mode_table[_JOB_MODE_MAX] = {
+        [JOB_FAIL] = "fail",
+        [JOB_REPLACE] = "replace",
+        [JOB_REPLACE_IRREVERSIBLY] = "replace-irreversibly",
+        [JOB_ISOLATE] = "isolate",
+        [JOB_FLUSH] = "flush",
+        [JOB_IGNORE_DEPENDENCIES] = "ignore-dependencies",
+        [JOB_IGNORE_REQUIREMENTS] = "ignore-requirements",
+        [JOB_TRIGGERING] = "triggering",
+        [JOB_RESTART_DEPENDENCIES] = "restart-dependencies",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(job_mode, JobMode);
+
+static const char* const job_result_table[_JOB_RESULT_MAX] = {
+        [JOB_DONE] = "done",
+        [JOB_CANCELED] = "canceled",
+        [JOB_TIMEOUT] = "timeout",
+        [JOB_FAILED] = "failed",
+        [JOB_DEPENDENCY] = "dependency",
+        [JOB_SKIPPED] = "skipped",
+        [JOB_INVALID] = "invalid",
+        [JOB_ASSERT] = "assert",
+        [JOB_UNSUPPORTED] = "unsupported",
+        [JOB_COLLECTED] = "collected",
+        [JOB_ONCE] = "once",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(job_result, JobResult);
+
+/* Maps a job type to the coarse access verb ("start"/"stop"/"reload") used
+ * when checking whether a client is allowed to enqueue such a job. */
+const char* job_type_to_access_method(JobType t) {
+        assert(t >= 0);
+        assert(t < _JOB_TYPE_MAX);
+
+        if (IN_SET(t, JOB_START, JOB_RESTART, JOB_TRY_RESTART))
+                return "start";
+        else if (t == JOB_STOP)
+                return "stop";
+        else
+                return "reload";
+}
+
+/*
+ * assume_dep assumed dependency between units (a is before/after b)
+ *
+ * Returns
+ * 0 jobs are independent,
+ * >0 a should run after b,
+ * <0 a should run before b,
+ *
+ * The logic means that for a service a and a service b where b.After=a:
+ *
+ * start a + start b → 1st step start a, 2nd step start b
+ * start a + stop b → 1st step stop b, 2nd step start a
+ * stop a + start b → 1st step stop a, 2nd step start b
+ * stop a + stop b → 1st step stop b, 2nd step stop a
+ *
+ * This has the side effect that restarts are properly synchronized too.
+ */
+/* See the contract comment above for the full semantics of the return value. */
+int job_compare(Job *a, Job *b, UnitDependencyAtom assume_dep) {
+        assert(a);
+        assert(b);
+        assert(a->type < _JOB_TYPE_MAX_IN_TRANSACTION);
+        assert(b->type < _JOB_TYPE_MAX_IN_TRANSACTION);
+        assert(IN_SET(assume_dep, UNIT_ATOM_AFTER, UNIT_ATOM_BEFORE));
+
+        /* Trivial cases first */
+        if (a->type == JOB_NOP || b->type == JOB_NOP)
+                return 0;
+
+        if (a->ignore_order || b->ignore_order)
+                return 0;
+
+        /* Normalize to the BEFORE direction by swapping the arguments and
+         * negating the result. */
+        if (assume_dep == UNIT_ATOM_AFTER)
+                return -job_compare(b, a, UNIT_ATOM_BEFORE);
+
+        /* Let's make it simple, JOB_STOP goes always first (in case both ua and ub stop, then ub's stop goes
+         * first anyway). JOB_RESTART is JOB_STOP in disguise (before it is patched to JOB_START). */
+        if (IN_SET(b->type, JOB_STOP, JOB_RESTART))
+                return 1;
+        else
+                return -1;
+}
+
+/* Attaches (a reference to) the given ActivationDetails to the job, unless
+ * the job already carries some. Accepts NULL job or NULL info as no-ops. */
+void job_set_activation_details(Job *j, ActivationDetails *info) {
+        /* Existing (older) ActivationDetails win, newer ones are discarded. */
+        if (!j || j->activation_details || !info)
+                return; /* Nothing to do. */
+
+        j->activation_details = activation_details_ref(info);
+}
diff --git a/src/core/job.h b/src/core/job.h
new file mode 100644
index 0000000..891d87a
--- /dev/null
+++ b/src/core/job.h
@@ -0,0 +1,250 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "sd-event.h"
+
+#include "list.h"
+#include "unit-dependency-atom.h"
+#include "unit-name.h"
+#include "unit.h"
+
+typedef struct ActivationDetails ActivationDetails;
+typedef struct Job Job;
+typedef struct JobDependency JobDependency;
+typedef enum JobType JobType;
+typedef enum JobState JobState;
+typedef enum JobMode JobMode;
+typedef enum JobResult JobResult;
+
+/* Be careful when changing the job types! Adjust job_merging_table[] accordingly! */
+enum JobType {
+ JOB_START, /* if a unit does not support being started, we'll just wait until it becomes active */
+ JOB_VERIFY_ACTIVE,
+
+ JOB_STOP,
+
+ JOB_RELOAD, /* if running, reload */
+
+ /* Note that restarts are first treated like JOB_STOP, but
+ * then instead of finishing are patched to become
+ * JOB_START. */
+ JOB_RESTART, /* If running, stop. Then start unconditionally. */
+
+ _JOB_TYPE_MAX_MERGING,
+
+ /* JOB_NOP can enter into a transaction, but as it won't pull in
+ * any dependencies and it uses the special 'nop_job' slot in Unit,
+ * it won't have to merge with anything (except possibly into another
+ * JOB_NOP, previously installed). JOB_NOP is special-cased in
+ * job_type_is_*() functions so that the transaction can be
+ * activated. */
+ JOB_NOP = _JOB_TYPE_MAX_MERGING, /* do nothing */
+
+ _JOB_TYPE_MAX_IN_TRANSACTION,
+
+ /* JOB_TRY_RESTART can never appear in a transaction, because
+ * it always collapses into JOB_RESTART or JOB_NOP before entering.
+ * Thus we never need to merge it with anything. */
+ JOB_TRY_RESTART = _JOB_TYPE_MAX_IN_TRANSACTION, /* if running, stop and then start */
+
+ /* Similar to JOB_TRY_RESTART but collapses to JOB_RELOAD or JOB_NOP */
+ JOB_TRY_RELOAD,
+
+ /* JOB_RELOAD_OR_START won't enter into a transaction and cannot result
+ * from transaction merging (there's no way for JOB_RELOAD and
+ * JOB_START to meet in one transaction). It can result from a merge
+ * during job installation, but then it will immediately collapse into
+ * one of the two simpler types. */
+ JOB_RELOAD_OR_START, /* if running, reload, otherwise start */
+
+ _JOB_TYPE_MAX,
+ _JOB_TYPE_INVALID = -EINVAL,
+};
+
+/* Lifecycle of an installed job: waiting for ordering dependencies, or running. */
+enum JobState {
+        JOB_WAITING,
+        JOB_RUNNING,
+        _JOB_STATE_MAX,
+        _JOB_STATE_INVALID = -EINVAL,
+};
+
+enum JobMode {
+ JOB_FAIL, /* Fail if a conflicting job is already queued */
+ JOB_REPLACE, /* Replace an existing conflicting job */
+ JOB_REPLACE_IRREVERSIBLY,/* Like JOB_REPLACE + produce irreversible jobs */
+ JOB_ISOLATE, /* Start a unit, and stop all others */
+ JOB_FLUSH, /* Flush out all other queued jobs when queueing this one */
+ JOB_IGNORE_DEPENDENCIES, /* Ignore both requirement and ordering dependencies */
+ JOB_IGNORE_REQUIREMENTS, /* Ignore requirement dependencies */
+ JOB_TRIGGERING, /* Adds TRIGGERED_BY dependencies to the same transaction */
+ JOB_RESTART_DEPENDENCIES,/* A "start" job for the specified unit becomes "restart" for depending units */
+ _JOB_MODE_MAX,
+ _JOB_MODE_INVALID = -EINVAL,
+};
+
+enum JobResult {
+ JOB_DONE, /* Job completed successfully (or skipped due to an unmet ConditionXYZ=) */
+ JOB_CANCELED, /* Job canceled by a conflicting job installation or by explicit cancel request */
+ JOB_TIMEOUT, /* Job timeout elapsed */
+ JOB_FAILED, /* Job failed */
+ JOB_DEPENDENCY, /* A required dependency job did not result in JOB_DONE */
+ JOB_SKIPPED, /* Negative result of JOB_VERIFY_ACTIVE or skip due to ExecCondition= */
+ JOB_INVALID, /* JOB_RELOAD of inactive unit */
+ JOB_ASSERT, /* Couldn't start a unit, because an assert didn't hold */
+ JOB_UNSUPPORTED, /* Couldn't start a unit, because the unit type is not supported on the system */
+ JOB_COLLECTED, /* Job was garbage collected, since nothing needed it anymore */
+ JOB_ONCE, /* Unit was started before, and hence can't be started again */
+ _JOB_RESULT_MAX,
+ _JOB_RESULT_INVALID = -EINVAL,
+};
+
+struct JobDependency {
+ /* Encodes that the 'subject' job needs the 'object' job in
+ * some way. This structure is used only while building a transaction. */
+ Job *subject;
+ Job *object;
+
+ LIST_FIELDS(JobDependency, subject);
+ LIST_FIELDS(JobDependency, object);
+
+ bool matters:1;
+ bool conflicts:1;
+};
+
+struct Job {
+        /* The manager owning this job, and the unit the job operates on */
+        Manager *manager;
+        Unit *unit;
+
+        /* Membership in the transaction, D-Bus notification and GC queues */
+        LIST_FIELDS(Job, transaction);
+        LIST_FIELDS(Job, dbus_queue);
+        LIST_FIELDS(Job, gc_queue);
+
+        /* JobDependency links in which this job is the subject resp. the object */
+        LIST_HEAD(JobDependency, subject_list);
+        LIST_HEAD(JobDependency, object_list);
+
+        /* Used for graph algs as a "I have been here" marker */
+        Job* marker;
+        unsigned generation;
+
+        /* Unique (per manager) job id; also used in the D-Bus object path */
+        uint32_t id;
+
+        JobType type;
+        JobState state;
+
+        /* Timeout handling: the armed timer plus the CLOCK_MONOTONIC
+         * timestamps when the job was enqueued resp. began running */
+        sd_event_source *timer_event_source;
+        usec_t begin_usec;
+        usec_t begin_running_usec;
+
+        /*
+         * This tracks where to send signals, and also which clients
+         * are allowed to call DBus methods on the job (other than
+         * root).
+         *
+         * There can be more than one client, because of job merging.
+         */
+        sd_bus_track *bus_track;
+        char **deserialized_clients;
+
+        JobResult result;
+
+        /* Position within the manager's run_queue prioq */
+        unsigned run_queue_idx;
+
+        /* If the job had a specific trigger that needs to be advertised (eg: a path unit), store it. */
+        ActivationDetails *activation_details;
+
+        /* Status flags, mostly mirroring the queue memberships above */
+        bool installed:1;
+        bool in_run_queue:1;
+        bool matters_to_anchor:1;
+        bool in_dbus_queue:1;
+        bool sent_dbus_new_signal:1;
+        bool ignore_order:1;
+        bool irreversible:1;
+        bool in_gc_queue:1;
+        bool ref_by_private_bus:1;
+};
+
+Job* job_new(Unit *unit, JobType type);
+Job* job_new_raw(Unit *unit);
+void job_unlink(Job *job);
+Job* job_free(Job *job);
+Job* job_install(Job *j, bool refuse_late_merge);
+int job_install_deserialized(Job *j);
+void job_uninstall(Job *j);
+void job_dump(Job *j, FILE *f, const char *prefix);
+int job_serialize(Job *j, FILE *f);
+int job_deserialize(Job *j, FILE *f);
+int job_coldplug(Job *j);
+
+JobDependency* job_dependency_new(Job *subject, Job *object, bool matters, bool conflicts);
+void job_dependency_free(JobDependency *l);
+
+int job_merge(Job *j, Job *other);
+
+JobType job_type_lookup_merge(JobType a, JobType b) _pure_;
+
+_pure_ static inline bool job_type_is_mergeable(JobType a, JobType b) {
+ return job_type_lookup_merge(a, b) >= 0;
+}
+
+_pure_ static inline bool job_type_is_conflicting(JobType a, JobType b) {
+ return a != JOB_NOP && b != JOB_NOP && !job_type_is_mergeable(a, b);
+}
+
+_pure_ static inline bool job_type_is_superset(JobType a, JobType b) {
+ /* Checks whether operation a is a "superset" of b in its actions */
+ if (b == JOB_NOP)
+ return true;
+ if (a == JOB_NOP)
+ return false;
+ return a == job_type_lookup_merge(a, b);
+}
+
+bool job_type_is_redundant(JobType a, UnitActiveState b) _pure_;
+
+/* Collapses a state-dependent job type into a simpler type by observing
+ * the state of the unit which it is going to be applied to. */
+JobType job_type_collapse(JobType t, Unit *u);
+
+int job_type_merge_and_collapse(JobType *a, JobType b, Unit *u);
+
+void job_add_to_run_queue(Job *j);
+void job_add_to_dbus_queue(Job *j);
+
+int job_start_timer(Job *j, bool job_running);
+
+int job_run_and_invalidate(Job *j);
+int job_finish_and_invalidate(Job *j, JobResult result, bool recursive, bool already);
+
+char *job_dbus_path(Job *j);
+
+void job_shutdown_magic(Job *j);
+
+int job_get_timeout(Job *j, usec_t *ret);
+
+bool job_may_gc(Job *j);
+void job_add_to_gc_queue(Job *j);
+
+int job_get_before(Job *j, Job*** ret);
+int job_get_after(Job *j, Job*** ret);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(Job*, job_free);
+
+const char* job_type_to_string(JobType t) _const_;
+JobType job_type_from_string(const char *s) _pure_;
+
+const char* job_state_to_string(JobState t) _const_;
+JobState job_state_from_string(const char *s) _pure_;
+
+const char* job_mode_to_string(JobMode t) _const_;
+JobMode job_mode_from_string(const char *s) _pure_;
+
+const char* job_result_to_string(JobResult t) _const_;
+JobResult job_result_from_string(const char *s) _pure_;
+
+const char* job_type_to_access_method(JobType t);
+
+int job_compare(Job *a, Job *b, UnitDependencyAtom assume_dep);
+
+void job_set_activation_details(Job *j, ActivationDetails *info);
diff --git a/src/core/kill.c b/src/core/kill.c
new file mode 100644
index 0000000..c8b581d
--- /dev/null
+++ b/src/core/kill.c
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "kill.h"
+#include "signal-util.h"
+#include "string-table.h"
+
+/* Initializes c with the default kill settings: SIGTERM to stop, SIGKILL as
+ * the final fallback (enabled), SIGABRT on watchdog, no SIGHUP. */
+void kill_context_init(KillContext *c) {
+        assert(c);
+
+        c->kill_signal = SIGTERM;
+        /* restart_kill_signal is unset by default and we fall back to kill_signal */
+        c->final_kill_signal = SIGKILL;
+        c->send_sigkill = true;
+        c->send_sighup = false;
+        c->watchdog_signal = SIGABRT;
+}
+
+/* Dumps the kill settings in human-readable form to f, each line prefixed
+ * with prefix (NULL is treated as ""). Used for unit state dumps. */
+void kill_context_dump(KillContext *c, FILE *f, const char *prefix) {
+        assert(c);
+
+        prefix = strempty(prefix);
+
+        fprintf(f,
+                "%sKillMode: %s\n"
+                "%sKillSignal: SIG%s\n"
+                "%sRestartKillSignal: SIG%s\n"
+                "%sFinalKillSignal: SIG%s\n"
+                "%sSendSIGKILL: %s\n"
+                "%sSendSIGHUP: %s\n",
+                prefix, kill_mode_to_string(c->kill_mode),
+                prefix, signal_to_string(c->kill_signal),
+                prefix, signal_to_string(restart_kill_signal(c)),
+                prefix, signal_to_string(c->final_kill_signal),
+                prefix, yes_no(c->send_sigkill),
+                prefix, yes_no(c->send_sighup));
+}
+
+/* String tables for the KillMode and KillWho enums declared in kill.h;
+ * DEFINE_STRING_TABLE_LOOKUP generates the *_to_string()/*_from_string()
+ * pairs. Keep in sync with the enums. */
+static const char* const kill_mode_table[_KILL_MODE_MAX] = {
+        [KILL_CONTROL_GROUP] = "control-group",
+        [KILL_PROCESS] = "process",
+        [KILL_MIXED] = "mixed",
+        [KILL_NONE] = "none",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(kill_mode, KillMode);
+
+static const char* const kill_who_table[_KILL_WHO_MAX] = {
+        [KILL_MAIN] = "main",
+        [KILL_CONTROL] = "control",
+        [KILL_ALL] = "all",
+        [KILL_MAIN_FAIL] = "main-fail",
+        [KILL_CONTROL_FAIL] = "control-fail",
+        [KILL_ALL_FAIL] = "all-fail",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(kill_who, KillWho);
diff --git a/src/core/kill.h b/src/core/kill.h
new file mode 100644
index 0000000..dbf884d
--- /dev/null
+++ b/src/core/kill.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct KillContext KillContext;
+
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "macro.h"
+
+typedef enum KillMode {
+ /* The kill mode is a property of a unit. */
+ KILL_CONTROL_GROUP = 0,
+ KILL_PROCESS,
+ KILL_MIXED,
+ KILL_NONE,
+ _KILL_MODE_MAX,
+ _KILL_MODE_INVALID = -EINVAL,
+} KillMode;
+
+struct KillContext {
+ KillMode kill_mode;
+ int kill_signal;
+ int restart_kill_signal;
+ int final_kill_signal;
+ int watchdog_signal;
+ bool send_sigkill;
+ bool send_sighup;
+};
+
+typedef enum KillWho {
+ /* Kill who is a property of an operation */
+ KILL_MAIN,
+ KILL_CONTROL,
+ KILL_ALL,
+ KILL_MAIN_FAIL,
+ KILL_CONTROL_FAIL,
+ KILL_ALL_FAIL,
+ _KILL_WHO_MAX,
+ _KILL_WHO_INVALID = -EINVAL,
+} KillWho;
+
+void kill_context_init(KillContext *c);
+void kill_context_dump(KillContext *c, FILE *f, const char *prefix);
+
+const char *kill_mode_to_string(KillMode k) _const_;
+KillMode kill_mode_from_string(const char *s) _pure_;
+
+const char *kill_who_to_string(KillWho k) _const_;
+KillWho kill_who_from_string(const char *s) _pure_;
+
+/* RestartKillSignal= falls back to KillSignal= when unset (0). */
+static inline int restart_kill_signal(const KillContext *c) {
+        if (c->restart_kill_signal != 0)
+                return c->restart_kill_signal;
+        return c->kill_signal;
+}
diff --git a/src/core/kmod-setup.c b/src/core/kmod-setup.c
new file mode 100644
index 0000000..b8e3f7a
--- /dev/null
+++ b/src/core/kmod-setup.c
@@ -0,0 +1,201 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "bus-util.h"
+#include "capability-util.h"
+#include "efi-api.h"
+#include "fileio.h"
+#include "kmod-setup.h"
+#include "macro.h"
+#include "recurse-dir.h"
+#include "string-util.h"
+#include "strv.h"
+#include "virt.h"
+
+#if HAVE_KMOD
+#include "module-util.h"
+
+/* libkmod log callback: funnels library messages into systemd's logging at
+ * LOG_DEBUG, regardless of the priority libkmod assigned. */
+static void systemd_kmod_log(
+                void *data,
+                int priority,
+                const char *file, int line,
+                const char *fn,
+                const char *format,
+                va_list args) {
+
+        /* library logging is enabled at debug only */
+        DISABLE_WARNING_FORMAT_NONLITERAL;
+        log_internalv(LOG_DEBUG, 0, file, line, fn, format, args);
+        REENABLE_WARNING;
+}
+
+/* recurse_dir_at() callback: for each regular file named "modalias", reads its
+ * single line and returns 1 if it starts with any of the prefixes in userdata
+ * (a strv), which ends the traversal with a positive result. Any other file
+ * or an unreadable modalias makes us skip the rest of that directory. */
+static int match_modalias_recurse_dir_cb(
+                RecurseDirEvent event,
+                const char *path,
+                int dir_fd,
+                int inode_fd,
+                const struct dirent *de,
+                const struct statx *sx,
+                void *userdata) {
+
+        _cleanup_free_ char *alias = NULL;
+        char **modaliases = ASSERT_PTR(userdata);
+        int r;
+
+        if (event != RECURSE_DIR_ENTRY)
+                return RECURSE_DIR_CONTINUE;
+
+        if (de->d_type != DT_REG)
+                return RECURSE_DIR_CONTINUE;
+
+        if (!streq(de->d_name, "modalias"))
+                return RECURSE_DIR_CONTINUE;
+
+        r = read_one_line_file(path, &alias);
+        if (r < 0) {
+                log_debug_errno(r, "Failed to read %s, ignoring: %m", path);
+                return RECURSE_DIR_LEAVE_DIRECTORY;
+        }
+
+        if (startswith_strv(alias, modaliases))
+                return 1;
+
+        /* A directory contains at most one modalias file, no point going on here. */
+        return RECURSE_DIR_LEAVE_DIRECTORY;
+}
+
+/* Returns true if a device matching one of the given modalias prefixes exists
+ * under /sys/devices/pci0000:00 (searched up to 3 levels deep). Only ever
+ * true inside a VM; name is used for log messages only. */
+static bool has_virtio_feature(const char *name, char **modaliases) {
+        int r;
+
+        /* Directory traversal might be slow, hence let's do a cheap check first if it's even worth it */
+        if (detect_vm() == VIRTUALIZATION_NONE)
+                return false;
+
+        r = recurse_dir_at(
+                        AT_FDCWD,
+                        "/sys/devices/pci0000:00",
+                        /* statx_mask= */ 0,
+                        /* n_depth_max= */ 3,
+                        RECURSE_DIR_ENSURE_TYPE,
+                        match_modalias_recurse_dir_cb,
+                        modaliases);
+        if (r < 0)
+                log_debug_errno(r, "Failed to determine whether host has %s device, ignoring: %m", name);
+
+        /* The callback returns 1 on a match; errors count as "not found". */
+        return r > 0;
+}
+
+/* Thin wrappers around has_virtio_feature() probing for specific virtio
+ * devices by their PCI/virtio modalias prefixes, plus a QEMU/KVM check. */
+static bool has_virtio_rng(void) {
+        return has_virtio_feature("virtio-rng", STRV_MAKE("pci:v00001AF4d00001005", "pci:v00001AF4d00001044"));
+}
+
+static bool has_virtio_console(void) {
+        return has_virtio_feature("virtio-console", STRV_MAKE("virtio:d00000003v", "virtio:d0000000Bv"));
+}
+
+static bool has_virtio_vsock(void) {
+        return has_virtio_feature("virtio-vsock", STRV_MAKE("virtio:d00000013v"));
+}
+
+static bool has_virtiofs(void) {
+        return has_virtio_feature("virtiofs", STRV_MAKE("virtio:d0000001Av"));
+}
+
+static bool has_virtio_pci(void) {
+        return has_virtio_feature("virtio-pci", STRV_MAKE("pci:v00001AF4d"));
+}
+
+static bool in_qemu(void) {
+        return IN_SET(detect_vm(), VIRTUALIZATION_KVM, VIRTUALIZATION_QEMU);
+}
+#endif
+
+/* Pre-loads kernel modules PID 1 needs before udev can auto-load them. Each
+ * table entry is skipped when the sysfs/procfs path already exists (module
+ * present or built in) or when its condition function says it isn't needed.
+ * Requires CAP_SYS_MODULE; a no-op (returning 0) without it or when built
+ * without kmod support. Module load failures are logged but not fatal. */
+int kmod_setup(void) {
+#if HAVE_KMOD
+
+        static const struct {
+                const char *module;
+                const char *path;
+                bool warn_if_unavailable:1;
+                bool warn_if_module:1;
+                bool (*condition_fn)(void);
+        } kmod_table[] = {
+                /* This one we need to load explicitly, since auto-loading on use doesn't work
+                 * before udev created the ghost device nodes, and we need it earlier than that. */
+                { "autofs4",   "/sys/class/misc/autofs", true,   false,   NULL      },
+
+                /* This one we need to load explicitly, since auto-loading of IPv6 is not done when
+                 * we try to configure ::1 on the loopback device. */
+                { "ipv6",      "/sys/module/ipv6",       false,  true,    NULL      },
+
+                /* This should never be a module */
+                { "unix",      "/proc/net/unix",         true,   true,    NULL      },
+
+#if HAVE_LIBIPTC
+                /* netfilter is needed by networkd, nspawn among others, and cannot be autoloaded */
+                { "ip_tables", "/proc/net/ip_tables_names", false, false, NULL      },
+#endif
+                /* virtio_rng would be loaded by udev later, but real entropy might be needed very early */
+                { "virtio_rng", NULL,                    false,  false,   has_virtio_rng },
+
+                /* we want early logging to hvc consoles if possible, and make sure systemd-getty-generator
+                 * can rely on all consoles being probed already. */
+                { "virtio_console", NULL,                false,  false,   has_virtio_console },
+
+                /* Make sure we can send sd-notify messages over vsock as early as possible. */
+                { "vmw_vsock_virtio_transport", NULL,    false,  false,   has_virtio_vsock },
+
+                /* We can't wait for specific virtiofs tags to show up as device nodes so we have to load the
+                 * virtiofs and virtio_pci modules early to make sure the virtiofs tags are found when
+                 * sysroot.mount is started.
+                 *
+                 * TODO: Remove these again once https://gitlab.com/virtio-fs/virtiofsd/-/issues/128 is
+                 * resolved and the kernel fix is widely available. */
+                { "virtiofs",  "/sys/module/virtiofs",   false,  false,   has_virtiofs },
+                { "virtio_pci", "/sys/module/virtio_pci", false, false,   has_virtio_pci },
+
+                /* qemu_fw_cfg would be loaded by udev later, but we want to import credentials from it super early */
+                { "qemu_fw_cfg", "/sys/firmware/qemu_fw_cfg", false, false, in_qemu },
+
+                /* dmi-sysfs is needed to import credentials from it super early */
+                { "dmi-sysfs", "/sys/firmware/dmi/entries", false, false, NULL },
+
+#if HAVE_TPM2
+                /* Make sure the tpm subsystem is available which ConditionSecurity=tpm2 depends on. */
+                { "tpm",       "/sys/class/tpmrm",       false,  false,   efi_has_tpm2 },
+#endif
+        };
+        _cleanup_(kmod_unrefp) struct kmod_ctx *ctx = NULL;
+        unsigned i;
+
+        if (have_effective_cap(CAP_SYS_MODULE) <= 0)
+                return 0;
+
+        for (i = 0; i < ELEMENTSOF(kmod_table); i++) {
+                if (kmod_table[i].path && access(kmod_table[i].path, F_OK) >= 0)
+                        continue;
+
+                if (kmod_table[i].condition_fn && !kmod_table[i].condition_fn())
+                        continue;
+
+                if (kmod_table[i].warn_if_module)
+                        log_debug("Your kernel apparently lacks built-in %s support. Might be "
+                                  "a good idea to compile it in. We'll now try to work around "
+                                  "this by loading the module...", kmod_table[i].module);
+
+                /* Set up the libkmod context lazily, only once we know we actually need to load something. */
+                if (!ctx) {
+                        ctx = kmod_new(NULL, NULL);
+                        if (!ctx)
+                                return log_oom();
+
+                        kmod_set_log_fn(ctx, systemd_kmod_log, NULL);
+                        kmod_load_resources(ctx);
+                }
+
+                (void) module_load_and_warn(ctx, kmod_table[i].module, kmod_table[i].warn_if_unavailable);
+        }
+
+#endif
+        return 0;
+}
diff --git a/src/core/kmod-setup.h b/src/core/kmod-setup.h
new file mode 100644
index 0000000..1c842d3
--- /dev/null
+++ b/src/core/kmod-setup.h
@@ -0,0 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+int kmod_setup(void);
diff --git a/src/core/load-dropin.c b/src/core/load-dropin.c
new file mode 100644
index 0000000..fd45744
--- /dev/null
+++ b/src/core/load-dropin.c
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "conf-parser.h"
+#include "fs-util.h"
+#include "load-dropin.h"
+#include "load-fragment.h"
+#include "log.h"
+#include "stat-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit-name.h"
+#include "unit.h"
+
+/* Add dependencies of the given type from the unit's "<name><dir_suffix>"
+ * drop-in directories (e.g. foo.service.wants/). Each directory entry must
+ * be a symlink whose own name is a valid unit name; entries that fail any
+ * check are logged and skipped. Returns < 0 only if the path lookup itself
+ * fails. */
+static int process_deps(Unit *u, UnitDependency dependency, const char *dir_suffix) {
+        _cleanup_strv_free_ char **paths = NULL;
+        int r;
+
+        r = unit_file_find_dropin_paths(NULL,
+                                        u->manager->lookup_paths.search_path,
+                                        u->manager->unit_path_cache,
+                                        dir_suffix, NULL,
+                                        u->id, u->aliases,
+                                        &paths);
+        if (r < 0)
+                return r;
+
+        STRV_FOREACH(p, paths) {
+                _cleanup_free_ char *target = NULL;
+                const char *entry;
+
+                entry = basename(*p);
+
+                /* Only a positive return (link resolves to an empty file) is a
+                 * mask; a negative return — usually an invalid symlink — is not
+                 * a mask and falls through to the checks below. */
+                if (null_or_empty_path(*p) > 0) {
+                        log_unit_debug(u, "%s dependency on %s is masked by %s, ignoring.",
+                                       unit_dependency_to_string(dependency), entry, *p);
+                        continue;
+                }
+
+                r = is_symlink(*p);
+                if (r < 0) {
+                        log_unit_warning_errno(u, r, "%s dropin %s unreadable, ignoring: %m",
+                                               unit_dependency_to_string(dependency), *p);
+                        continue;
+                }
+                if (r == 0) {
+                        /* Plain files in a .wants/-style dir carry no dependency info. */
+                        log_unit_warning(u, "%s dependency dropin %s is not a symlink, ignoring.",
+                                         unit_dependency_to_string(dependency), *p);
+                        continue;
+                }
+
+                if (!unit_name_is_valid(entry, UNIT_NAME_ANY)) {
+                        log_unit_warning(u, "%s dependency dropin %s is not a valid unit name, ignoring.",
+                                         unit_dependency_to_string(dependency), *p);
+                        continue;
+                }
+
+                r = readlink_malloc(*p, &target);
+                if (r < 0) {
+                        log_unit_warning_errno(u, r, "readlink(\"%s\") failed, ignoring: %m", *p);
+                        continue;
+                }
+
+                /* We don't treat this as an error, especially because we didn't check this for a
+                 * long time. Nevertheless, we warn, because such mismatch can be mighty confusing. */
+                r = unit_symlink_name_compatible(entry, basename(target), u->instance);
+                if (r < 0) {
+                        log_unit_warning_errno(u, r, "Can't check if names %s and %s are compatible, ignoring: %m",
+                                               entry, basename(target));
+                        continue;
+                }
+                if (r == 0)
+                        log_unit_warning(u, "%s dependency dropin %s target %s has different name",
+                                         unit_dependency_to_string(dependency), *p, target);
+
+                /* The dependency is keyed on the symlink's own name, not its target. */
+                r = unit_add_dependency_by_name(u, dependency, entry, true, UNIT_DEPENDENCY_FILE);
+                if (r < 0)
+                        log_unit_warning_errno(u, r, "Cannot add %s dependency on %s, ignoring: %m",
+                                               unit_dependency_to_string(dependency), entry);
+        }
+
+        return 0;
+}
+
+/* Load all drop-in configuration for a unit: first the symlink-based
+ * dependency directories (.wants/, .requires/, .upholds/), then the
+ * .d/*.conf override fragments, which are parsed into the unit and
+ * remembered in u->dropin_paths. Returns 0 on success (including when no
+ * drop-ins exist), < 0 on lookup or OOM failure. */
+int unit_load_dropin(Unit *u) {
+        _cleanup_strv_free_ char **l = NULL;
+        int r;
+
+        assert(u);
+
+        /* Load dependencies from .wants, .requires and .upholds directories */
+        r = process_deps(u, UNIT_WANTS, ".wants");
+        if (r < 0)
+                return r;
+
+        r = process_deps(u, UNIT_REQUIRES, ".requires");
+        if (r < 0)
+                return r;
+
+        r = process_deps(u, UNIT_UPHOLDS, ".upholds");
+        if (r < 0)
+                return r;
+
+        /* Load .conf dropins; r == 0 means none were found, and we are done. */
+        r = unit_find_dropin_paths(u, &l);
+        if (r <= 0)
+                return 0;
+
+        /* Merge the newly found paths with any the unit already tracks. */
+        if (!u->dropin_paths)
+                u->dropin_paths = TAKE_PTR(l);
+        else {
+                r = strv_extend_strv(&u->dropin_paths, l, true);
+                if (r < 0)
+                        return log_oom();
+        }
+
+        /* Track the newest mtime across all drop-ins — presumably used to
+         * detect on-disk changes on reload; config_parse() appears to return
+         * > 0 with 'st' filled in when it actually read the file (TODO:
+         * confirm against conf-parser.h). Parse errors are deliberately
+         * non-fatal here. */
+        u->dropin_mtime = 0;
+        STRV_FOREACH(f, u->dropin_paths) {
+                struct stat st;
+
+                r = config_parse(u->id, *f, NULL,
+                                 UNIT_VTABLE(u)->sections,
+                                 config_item_perf_lookup, load_fragment_gperf_lookup,
+                                 0, u, &st);
+                if (r > 0)
+                        u->dropin_mtime = MAX(u->dropin_mtime, timespec_load(&st.st_mtim));
+        }
+
+        return 0;
+}
diff --git a/src/core/load-dropin.h b/src/core/load-dropin.h
new file mode 100644
index 0000000..f0b87d3
--- /dev/null
+++ b/src/core/load-dropin.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "dropin.h"
+#include "unit.h"
+
+/* Read service data supplementary drop-in directories */
+
+/* Collect the unit's ".d/*.conf" drop-in fragment paths into *paths, using
+ * the manager's unit search path and unit path cache. Thin wrapper around
+ * unit_file_find_dropin_paths(); returns its result unchanged. */
+static inline int unit_find_dropin_paths(Unit *u, char ***paths) {
+        assert(u);
+
+        return unit_file_find_dropin_paths(NULL,
+                                           u->manager->lookup_paths.search_path,
+                                           u->manager->unit_path_cache,
+                                           ".d", ".conf",
+                                           u->id, u->aliases,
+                                           paths);
+}
+
+int unit_load_dropin(Unit *u);
diff --git a/src/core/load-fragment-gperf-nulstr.awk b/src/core/load-fragment-gperf-nulstr.awk
new file mode 100644
index 0000000..a1b7d1c
--- /dev/null
+++ b/src/core/load-fragment-gperf-nulstr.awk
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+# Emit every gperf keyword (the first comma-separated field of each line
+# after the "%%" separator) as one NUL-terminated entry of a single C string
+# constant, load_fragment_gperf_nulstr.
+#
+# Fix: BEGIN initialized "keywords" while the rules test and set "keyword" —
+# it only worked because awk defaults uninitialized variables to 0. Use one
+# name throughout.
+BEGIN{
+        keyword=0 ; FS="," ;
+        print "extern const char load_fragment_gperf_nulstr[];" ;
+        print "const char load_fragment_gperf_nulstr[] ="
+}
+# After the separator has been seen, print each keyword with an embedded NUL.
+keyword==1 {
+        print "\"" $1 "\\0\""
+}
+# The "%%" line separates the gperf declarations from the keyword table.
+/%%/ {
+        keyword=1
+}
+END {
+        print ";"
+}
diff --git a/src/core/load-fragment-gperf.gperf.in b/src/core/load-fragment-gperf.gperf.in
new file mode 100644
index 0000000..45f9ab0
--- /dev/null
+++ b/src/core/load-fragment-gperf.gperf.in
@@ -0,0 +1,595 @@
+{# SPDX-License-Identifier: LGPL-2.1-or-later #}
+
+{%- macro EXEC_CONTEXT_CONFIG_ITEMS(type) -%}
+{# Define the context options only once #}
+{{type}}.WorkingDirectory, config_parse_working_directory, 0, offsetof({{type}}, exec_context)
+{{type}}.RootDirectory, config_parse_unit_path_printf, true, offsetof({{type}}, exec_context.root_directory)
+{{type}}.RootImage, config_parse_unit_path_printf, true, offsetof({{type}}, exec_context.root_image)
+{{type}}.RootImageOptions, config_parse_root_image_options, 0, offsetof({{type}}, exec_context)
+{{type}}.RootImagePolicy, config_parse_image_policy, 0, offsetof({{type}}, exec_context.root_image_policy)
+{{type}}.RootHash, config_parse_exec_root_hash, 0, offsetof({{type}}, exec_context)
+{{type}}.RootHashSignature, config_parse_exec_root_hash_sig, 0, offsetof({{type}}, exec_context)
+{{type}}.RootVerity, config_parse_unit_path_printf, true, offsetof({{type}}, exec_context.root_verity)
+{{type}}.RootEphemeral, config_parse_bool, 0, offsetof({{type}}, exec_context.root_ephemeral)
+{{type}}.ExtensionDirectories, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.extension_directories)
+{{type}}.ExtensionImages, config_parse_extension_images, 0, offsetof({{type}}, exec_context)
+{{type}}.ExtensionImagePolicy, config_parse_image_policy, 0, offsetof({{type}}, exec_context.extension_image_policy)
+{{type}}.MountImages, config_parse_mount_images, 0, offsetof({{type}}, exec_context)
+{{type}}.MountImagePolicy, config_parse_image_policy, 0, offsetof({{type}}, exec_context.mount_image_policy)
+{{type}}.User, config_parse_user_group_compat, 0, offsetof({{type}}, exec_context.user)
+{{type}}.Group, config_parse_user_group_compat, 0, offsetof({{type}}, exec_context.group)
+{{type}}.SupplementaryGroups, config_parse_user_group_strv_compat, 0, offsetof({{type}}, exec_context.supplementary_groups)
+{{type}}.SetLoginEnvironment, config_parse_tristate, 0, offsetof({{type}}, exec_context.set_login_environment)
+{{type}}.Nice, config_parse_exec_nice, 0, offsetof({{type}}, exec_context)
+{{type}}.OOMScoreAdjust, config_parse_exec_oom_score_adjust, 0, offsetof({{type}}, exec_context)
+{{type}}.CoredumpFilter, config_parse_exec_coredump_filter, 0, offsetof({{type}}, exec_context)
+{{type}}.IOSchedulingClass, config_parse_exec_io_class, 0, offsetof({{type}}, exec_context)
+{{type}}.IOSchedulingPriority, config_parse_exec_io_priority, 0, offsetof({{type}}, exec_context)
+{{type}}.CPUSchedulingPolicy, config_parse_exec_cpu_sched_policy, 0, offsetof({{type}}, exec_context)
+{{type}}.CPUSchedulingPriority, config_parse_exec_cpu_sched_prio, 0, offsetof({{type}}, exec_context)
+{{type}}.CPUSchedulingResetOnFork, config_parse_bool, 0, offsetof({{type}}, exec_context.cpu_sched_reset_on_fork)
+{{type}}.CPUAffinity, config_parse_exec_cpu_affinity, 0, offsetof({{type}}, exec_context)
+{{type}}.NUMAPolicy, config_parse_numa_policy, 0, offsetof({{type}}, exec_context.numa_policy.type)
+{{type}}.NUMAMask, config_parse_numa_mask, 0, offsetof({{type}}, exec_context.numa_policy)
+{{type}}.UMask, config_parse_mode, 0, offsetof({{type}}, exec_context.umask)
+{{type}}.Environment, config_parse_environ, 0, offsetof({{type}}, exec_context.environment)
+{{type}}.EnvironmentFile, config_parse_unit_env_file, 0, offsetof({{type}}, exec_context.environment_files)
+{{type}}.PassEnvironment, config_parse_pass_environ, 0, offsetof({{type}}, exec_context.pass_environment)
+{{type}}.UnsetEnvironment, config_parse_unset_environ, 0, offsetof({{type}}, exec_context.unset_environment)
+{{type}}.DynamicUser, config_parse_bool, true, offsetof({{type}}, exec_context.dynamic_user)
+{{type}}.RemoveIPC, config_parse_bool, 0, offsetof({{type}}, exec_context.remove_ipc)
+{{type}}.StandardInput, config_parse_exec_input, 0, offsetof({{type}}, exec_context)
+{{type}}.StandardOutput, config_parse_exec_output, 0, offsetof({{type}}, exec_context)
+{{type}}.StandardError, config_parse_exec_output, 0, offsetof({{type}}, exec_context)
+{{type}}.StandardInputText, config_parse_exec_input_text, 0, offsetof({{type}}, exec_context)
+{{type}}.StandardInputData, config_parse_exec_input_data, 0, offsetof({{type}}, exec_context)
+{{type}}.TTYPath, config_parse_unit_path_printf, 0, offsetof({{type}}, exec_context.tty_path)
+{{type}}.TTYReset, config_parse_bool, 0, offsetof({{type}}, exec_context.tty_reset)
+{{type}}.TTYVHangup, config_parse_bool, 0, offsetof({{type}}, exec_context.tty_vhangup)
+{{type}}.TTYVTDisallocate, config_parse_bool, 0, offsetof({{type}}, exec_context.tty_vt_disallocate)
+{{type}}.TTYRows, config_parse_tty_size, 0, offsetof({{type}}, exec_context.tty_rows)
+{{type}}.TTYColumns, config_parse_tty_size, 0, offsetof({{type}}, exec_context.tty_cols)
+{{type}}.SyslogIdentifier, config_parse_unit_string_printf, 0, offsetof({{type}}, exec_context.syslog_identifier)
+{{type}}.SyslogFacility, config_parse_log_facility, 0, offsetof({{type}}, exec_context.syslog_priority)
+{{type}}.SyslogLevel, config_parse_log_level, 0, offsetof({{type}}, exec_context.syslog_priority)
+{{type}}.SyslogLevelPrefix, config_parse_bool, 0, offsetof({{type}}, exec_context.syslog_level_prefix)
+{{type}}.LogLevelMax, config_parse_log_level, 0, offsetof({{type}}, exec_context.log_level_max)
+{{type}}.LogRateLimitIntervalSec, config_parse_sec, 0, offsetof({{type}}, exec_context.log_ratelimit_interval_usec)
+{{type}}.LogRateLimitBurst, config_parse_unsigned, 0, offsetof({{type}}, exec_context.log_ratelimit_burst)
+{{type}}.LogExtraFields, config_parse_log_extra_fields, 0, offsetof({{type}}, exec_context)
+{{type}}.LogFilterPatterns, config_parse_log_filter_patterns, 0, offsetof({{type}}, exec_context)
+{{type}}.Capabilities, config_parse_warn_compat, DISABLED_LEGACY, offsetof({{type}}, exec_context)
+{{type}}.SecureBits, config_parse_exec_secure_bits, 0, offsetof({{type}}, exec_context.secure_bits)
+{{type}}.CapabilityBoundingSet, config_parse_capability_set, 0, offsetof({{type}}, exec_context.capability_bounding_set)
+{{type}}.AmbientCapabilities, config_parse_capability_set, 0, offsetof({{type}}, exec_context.capability_ambient_set)
+{{type}}.TimerSlackNSec, config_parse_nsec, 0, offsetof({{type}}, exec_context.timer_slack_nsec)
+{{type}}.NoNewPrivileges, config_parse_bool, 0, offsetof({{type}}, exec_context.no_new_privileges)
+{{type}}.KeyringMode, config_parse_exec_keyring_mode, 0, offsetof({{type}}, exec_context.keyring_mode)
+{{type}}.ProtectProc, config_parse_protect_proc, 0, offsetof({{type}}, exec_context.protect_proc)
+{{type}}.ProcSubset, config_parse_proc_subset, 0, offsetof({{type}}, exec_context.proc_subset)
+{% if HAVE_SECCOMP %}
+{{type}}.SystemCallFilter, config_parse_syscall_filter, 0, offsetof({{type}}, exec_context)
+{{type}}.SystemCallArchitectures, config_parse_syscall_archs, 0, offsetof({{type}}, exec_context.syscall_archs)
+{{type}}.SystemCallErrorNumber, config_parse_syscall_errno, 0, offsetof({{type}}, exec_context)
+{{type}}.SystemCallLog, config_parse_syscall_log, 0, offsetof({{type}}, exec_context)
+{{type}}.MemoryDenyWriteExecute, config_parse_bool, 0, offsetof({{type}}, exec_context.memory_deny_write_execute)
+{{type}}.RestrictNamespaces, config_parse_restrict_namespaces, 0, offsetof({{type}}, exec_context)
+{{type}}.RestrictRealtime, config_parse_bool, 0, offsetof({{type}}, exec_context.restrict_realtime)
+{{type}}.RestrictSUIDSGID, config_parse_bool, 0, offsetof({{type}}, exec_context.restrict_suid_sgid)
+{{type}}.RestrictAddressFamilies, config_parse_address_families, 0, offsetof({{type}}, exec_context)
+{{type}}.LockPersonality, config_parse_bool, 0, offsetof({{type}}, exec_context.lock_personality)
+{% else %}
+{{type}}.SystemCallFilter, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{{type}}.SystemCallArchitectures, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{{type}}.SystemCallErrorNumber, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{{type}}.SystemCallLog, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{{type}}.MemoryDenyWriteExecute, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{{type}}.RestrictNamespaces, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{{type}}.RestrictRealtime, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{{type}}.RestrictSUIDSGID, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{{type}}.RestrictAddressFamilies, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{{type}}.LockPersonality, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{% endif %}
+{{type}}.RestrictFileSystems, config_parse_restrict_filesystems, 0, offsetof({{type}}, exec_context)
+{{type}}.LimitCPU, config_parse_rlimit, RLIMIT_CPU, offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitFSIZE, config_parse_rlimit, RLIMIT_FSIZE, offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitDATA, config_parse_rlimit, RLIMIT_DATA, offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitSTACK, config_parse_rlimit, RLIMIT_STACK, offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitCORE, config_parse_rlimit, RLIMIT_CORE, offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitRSS, config_parse_rlimit, RLIMIT_RSS, offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitNOFILE, config_parse_rlimit, RLIMIT_NOFILE, offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitAS, config_parse_rlimit, RLIMIT_AS, offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitNPROC, config_parse_rlimit, RLIMIT_NPROC, offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitMEMLOCK, config_parse_rlimit, RLIMIT_MEMLOCK, offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitLOCKS, config_parse_rlimit, RLIMIT_LOCKS, offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitSIGPENDING, config_parse_rlimit, RLIMIT_SIGPENDING, offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitMSGQUEUE, config_parse_rlimit, RLIMIT_MSGQUEUE, offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitNICE, config_parse_rlimit, RLIMIT_NICE, offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitRTPRIO, config_parse_rlimit, RLIMIT_RTPRIO, offsetof({{type}}, exec_context.rlimit)
+{{type}}.LimitRTTIME, config_parse_rlimit, RLIMIT_RTTIME, offsetof({{type}}, exec_context.rlimit)
+{{type}}.ReadWriteDirectories, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.read_write_paths)
+{{type}}.ReadOnlyDirectories, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.read_only_paths)
+{{type}}.InaccessibleDirectories, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.inaccessible_paths)
+{{type}}.ReadWritePaths, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.read_write_paths)
+{{type}}.ReadOnlyPaths, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.read_only_paths)
+{{type}}.InaccessiblePaths, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.inaccessible_paths)
+{{type}}.ExecPaths, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.exec_paths)
+{{type}}.NoExecPaths, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.no_exec_paths)
+{{type}}.ExecSearchPath, config_parse_colon_separated_paths, 0, offsetof({{type}}, exec_context.exec_search_path)
+{{type}}.BindPaths, config_parse_bind_paths, 0, offsetof({{type}}, exec_context)
+{{type}}.BindReadOnlyPaths, config_parse_bind_paths, 0, offsetof({{type}}, exec_context)
+{{type}}.TemporaryFileSystem, config_parse_temporary_filesystems, 0, offsetof({{type}}, exec_context)
+{{type}}.PrivateTmp, config_parse_bool, 0, offsetof({{type}}, exec_context.private_tmp)
+{{type}}.PrivateDevices, config_parse_bool, 0, offsetof({{type}}, exec_context.private_devices)
+{{type}}.ProtectKernelTunables, config_parse_bool, 0, offsetof({{type}}, exec_context.protect_kernel_tunables)
+{{type}}.ProtectKernelModules, config_parse_bool, 0, offsetof({{type}}, exec_context.protect_kernel_modules)
+{{type}}.ProtectKernelLogs, config_parse_bool, 0, offsetof({{type}}, exec_context.protect_kernel_logs)
+{{type}}.ProtectClock, config_parse_bool, 0, offsetof({{type}}, exec_context.protect_clock)
+{{type}}.ProtectControlGroups, config_parse_bool, 0, offsetof({{type}}, exec_context.protect_control_groups)
+{{type}}.NetworkNamespacePath, config_parse_unit_path_printf, 0, offsetof({{type}}, exec_context.network_namespace_path)
+{{type}}.IPCNamespacePath, config_parse_unit_path_printf, 0, offsetof({{type}}, exec_context.ipc_namespace_path)
+{{type}}.LogNamespace, config_parse_log_namespace, 0, offsetof({{type}}, exec_context)
+{{type}}.PrivateNetwork, config_parse_bool, 0, offsetof({{type}}, exec_context.private_network)
+{{type}}.PrivateUsers, config_parse_bool, 0, offsetof({{type}}, exec_context.private_users)
+{{type}}.PrivateMounts, config_parse_tristate, 0, offsetof({{type}}, exec_context.private_mounts)
+{{type}}.PrivateIPC, config_parse_bool, 0, offsetof({{type}}, exec_context.private_ipc)
+{{type}}.ProtectSystem, config_parse_protect_system, 0, offsetof({{type}}, exec_context.protect_system)
+{{type}}.ProtectHome, config_parse_protect_home, 0, offsetof({{type}}, exec_context.protect_home)
+{{type}}.MountFlags, config_parse_exec_mount_propagation_flag, 0, offsetof({{type}}, exec_context.mount_propagation_flag)
+{{type}}.MountAPIVFS, config_parse_exec_mount_apivfs, 0, offsetof({{type}}, exec_context)
+{{type}}.Personality, config_parse_personality, 0, offsetof({{type}}, exec_context.personality)
+{{type}}.RuntimeDirectoryPreserve, config_parse_exec_preserve_mode, 0, offsetof({{type}}, exec_context.runtime_directory_preserve_mode)
+{{type}}.RuntimeDirectoryMode, config_parse_mode, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_RUNTIME].mode)
+{{type}}.RuntimeDirectory, config_parse_exec_directories, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_RUNTIME])
+{{type}}.StateDirectoryMode, config_parse_mode, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_STATE].mode)
+{{type}}.StateDirectory, config_parse_exec_directories, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_STATE])
+{{type}}.CacheDirectoryMode, config_parse_mode, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_CACHE].mode)
+{{type}}.CacheDirectory, config_parse_exec_directories, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_CACHE])
+{{type}}.LogsDirectoryMode, config_parse_mode, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_LOGS].mode)
+{{type}}.LogsDirectory, config_parse_exec_directories, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_LOGS])
+{{type}}.ConfigurationDirectoryMode, config_parse_mode, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_CONFIGURATION].mode)
+{{type}}.ConfigurationDirectory, config_parse_exec_directories, 0, offsetof({{type}}, exec_context.directories[EXEC_DIRECTORY_CONFIGURATION])
+{{type}}.SetCredential, config_parse_set_credential, 0, offsetof({{type}}, exec_context)
+{{type}}.SetCredentialEncrypted, config_parse_set_credential, 1, offsetof({{type}}, exec_context)
+{{type}}.LoadCredential, config_parse_load_credential, 0, offsetof({{type}}, exec_context)
+{{type}}.LoadCredentialEncrypted, config_parse_load_credential, 1, offsetof({{type}}, exec_context)
+{{type}}.ImportCredential, config_parse_import_credential, 0, offsetof({{type}}, exec_context.import_credentials)
+{{type}}.TimeoutCleanSec, config_parse_sec, 0, offsetof({{type}}, exec_context.timeout_clean_usec)
+{% if HAVE_PAM %}
+{{type}}.PAMName, config_parse_unit_string_printf, 0, offsetof({{type}}, exec_context.pam_name)
+{% else %}
+{{type}}.PAMName, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{% endif %}
+{{type}}.IgnoreSIGPIPE, config_parse_bool, 0, offsetof({{type}}, exec_context.ignore_sigpipe)
+{{type}}.UtmpIdentifier, config_parse_unit_string_printf, 0, offsetof({{type}}, exec_context.utmp_id)
+{{type}}.UtmpMode, config_parse_exec_utmp_mode, 0, offsetof({{type}}, exec_context.utmp_mode)
+{% if HAVE_SELINUX %}
+{{type}}.SELinuxContext, config_parse_exec_selinux_context, 0, offsetof({{type}}, exec_context)
+{% else %}
+{{type}}.SELinuxContext, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{% endif %}
+{% if HAVE_APPARMOR %}
+{{type}}.AppArmorProfile, config_parse_exec_apparmor_profile, 0, offsetof({{type}}, exec_context)
+{% else %}
+{{type}}.AppArmorProfile, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{% endif %}
+{% if ENABLE_SMACK %}
+{{type}}.SmackProcessLabel, config_parse_exec_smack_process_label, 0, offsetof({{type}}, exec_context)
+{% else %}
+{{type}}.SmackProcessLabel, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{% endif %}
+{{type}}.ProtectHostname, config_parse_bool, 0, offsetof({{type}}, exec_context.protect_hostname)
+{{type}}.MemoryKSM, config_parse_tristate, 0, offsetof({{type}}, exec_context.memory_ksm)
+{%- endmacro -%}
+
+{%- macro KILL_CONTEXT_CONFIG_ITEMS(type) -%}
+{{type}}.SendSIGKILL, config_parse_bool, 0, offsetof({{type}}, kill_context.send_sigkill)
+{{type}}.SendSIGHUP, config_parse_bool, 0, offsetof({{type}}, kill_context.send_sighup)
+{{type}}.KillMode, config_parse_kill_mode, 0, offsetof({{type}}, kill_context.kill_mode)
+{{type}}.KillSignal, config_parse_signal, 0, offsetof({{type}}, kill_context.kill_signal)
+{{type}}.RestartKillSignal, config_parse_signal, 0, offsetof({{type}}, kill_context.restart_kill_signal)
+{{type}}.FinalKillSignal, config_parse_signal, 0, offsetof({{type}}, kill_context.final_kill_signal)
+{{type}}.WatchdogSignal, config_parse_signal, 0, offsetof({{type}}, kill_context.watchdog_signal)
+{%- endmacro -%}
+
+{%- macro CGROUP_CONTEXT_CONFIG_ITEMS(type) -%}
+{{type}}.Slice, config_parse_unit_slice, 0, 0
+{{type}}.AllowedCPUs, config_parse_allowed_cpuset, 0, offsetof({{type}}, cgroup_context.cpuset_cpus)
+{{type}}.StartupAllowedCPUs, config_parse_allowed_cpuset, 0, offsetof({{type}}, cgroup_context.startup_cpuset_cpus)
+{{type}}.AllowedMemoryNodes, config_parse_allowed_cpuset, 0, offsetof({{type}}, cgroup_context.cpuset_mems)
+{{type}}.StartupAllowedMemoryNodes, config_parse_allowed_cpuset, 0, offsetof({{type}}, cgroup_context.startup_cpuset_mems)
+{{type}}.CPUAccounting, config_parse_bool, 0, offsetof({{type}}, cgroup_context.cpu_accounting)
+{{type}}.CPUWeight, config_parse_cg_cpu_weight, 0, offsetof({{type}}, cgroup_context.cpu_weight)
+{{type}}.StartupCPUWeight, config_parse_cg_cpu_weight, 0, offsetof({{type}}, cgroup_context.startup_cpu_weight)
+{{type}}.CPUShares, config_parse_cpu_shares, 0, offsetof({{type}}, cgroup_context.cpu_shares)
+{{type}}.StartupCPUShares, config_parse_cpu_shares, 0, offsetof({{type}}, cgroup_context.startup_cpu_shares)
+{{type}}.CPUQuota, config_parse_cpu_quota, 0, offsetof({{type}}, cgroup_context)
+{{type}}.CPUQuotaPeriodSec, config_parse_sec_def_infinity, 0, offsetof({{type}}, cgroup_context.cpu_quota_period_usec)
+{{type}}.MemoryAccounting, config_parse_bool, 0, offsetof({{type}}, cgroup_context.memory_accounting)
+{{type}}.MemoryMin, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.DefaultMemoryMin, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.DefaultMemoryLow, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.DefaultStartupMemoryLow, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.MemoryLow, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.StartupMemoryLow, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.MemoryHigh, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.StartupMemoryHigh, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.MemoryMax, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.StartupMemoryMax, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.MemorySwapMax, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.StartupMemorySwapMax, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.MemoryZSwapMax, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.StartupMemoryZSwapMax, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.MemoryLimit, config_parse_memory_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.DeviceAllow, config_parse_device_allow, 0, offsetof({{type}}, cgroup_context)
+{{type}}.DevicePolicy, config_parse_device_policy, 0, offsetof({{type}}, cgroup_context.device_policy)
+{{type}}.IOAccounting, config_parse_bool, 0, offsetof({{type}}, cgroup_context.io_accounting)
+{{type}}.IOWeight, config_parse_cg_weight, 0, offsetof({{type}}, cgroup_context.io_weight)
+{{type}}.StartupIOWeight, config_parse_cg_weight, 0, offsetof({{type}}, cgroup_context.startup_io_weight)
+{{type}}.IODeviceWeight, config_parse_io_device_weight, 0, offsetof({{type}}, cgroup_context)
+{{type}}.IOReadBandwidthMax, config_parse_io_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.IOWriteBandwidthMax, config_parse_io_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.IOReadIOPSMax, config_parse_io_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.IOWriteIOPSMax, config_parse_io_limit, 0, offsetof({{type}}, cgroup_context)
+{{type}}.IODeviceLatencyTargetSec, config_parse_io_device_latency, 0, offsetof({{type}}, cgroup_context)
+{{type}}.BlockIOAccounting, config_parse_bool, 0, offsetof({{type}}, cgroup_context.blockio_accounting)
+{{type}}.BlockIOWeight, config_parse_blockio_weight, 0, offsetof({{type}}, cgroup_context.blockio_weight)
+{{type}}.StartupBlockIOWeight, config_parse_blockio_weight, 0, offsetof({{type}}, cgroup_context.startup_blockio_weight)
+{{type}}.BlockIODeviceWeight, config_parse_blockio_device_weight, 0, offsetof({{type}}, cgroup_context)
+{{type}}.BlockIOReadBandwidth, config_parse_blockio_bandwidth, 0, offsetof({{type}}, cgroup_context)
+{{type}}.BlockIOWriteBandwidth, config_parse_blockio_bandwidth, 0, offsetof({{type}}, cgroup_context)
+{{type}}.TasksAccounting, config_parse_bool, 0, offsetof({{type}}, cgroup_context.tasks_accounting)
+{{type}}.TasksMax, config_parse_tasks_max, 0, offsetof({{type}}, cgroup_context.tasks_max)
+{{type}}.Delegate, config_parse_delegate, 0, offsetof({{type}}, cgroup_context)
+{{type}}.DelegateSubgroup, config_parse_delegate_subgroup, 0, offsetof({{type}}, cgroup_context)
+{{type}}.DisableControllers, config_parse_disable_controllers, 0, offsetof({{type}}, cgroup_context)
+{{type}}.IPAccounting, config_parse_bool, 0, offsetof({{type}}, cgroup_context.ip_accounting)
+{{type}}.IPAddressAllow, config_parse_in_addr_prefixes, AF_UNSPEC, offsetof({{type}}, cgroup_context.ip_address_allow)
+{{type}}.IPAddressDeny, config_parse_in_addr_prefixes, AF_UNSPEC, offsetof({{type}}, cgroup_context.ip_address_deny)
+{{type}}.IPIngressFilterPath, config_parse_ip_filter_bpf_progs, 0, offsetof({{type}}, cgroup_context.ip_filters_ingress)
+{{type}}.IPEgressFilterPath, config_parse_ip_filter_bpf_progs, 0, offsetof({{type}}, cgroup_context.ip_filters_egress)
+{{type}}.ManagedOOMSwap, config_parse_managed_oom_mode, 0, offsetof({{type}}, cgroup_context.moom_swap)
+{{type}}.ManagedOOMMemoryPressure, config_parse_managed_oom_mode, 0, offsetof({{type}}, cgroup_context.moom_mem_pressure)
+{{type}}.ManagedOOMMemoryPressureLimit, config_parse_managed_oom_mem_pressure_limit, 0, offsetof({{type}}, cgroup_context.moom_mem_pressure_limit)
+{{type}}.ManagedOOMPreference, config_parse_managed_oom_preference, 0, offsetof({{type}}, cgroup_context.moom_preference)
+{{type}}.NetClass, config_parse_warn_compat, DISABLED_LEGACY, 0
+{{type}}.BPFProgram, config_parse_bpf_foreign_program, 0, offsetof({{type}}, cgroup_context)
+{{type}}.SocketBindAllow, config_parse_cgroup_socket_bind, 0, offsetof({{type}}, cgroup_context.socket_bind_allow)
+{{type}}.SocketBindDeny, config_parse_cgroup_socket_bind, 0, offsetof({{type}}, cgroup_context.socket_bind_deny)
+{{type}}.RestrictNetworkInterfaces, config_parse_restrict_network_interfaces, 0, offsetof({{type}}, cgroup_context)
+{{type}}.MemoryPressureThresholdSec, config_parse_sec, 0, offsetof({{type}}, cgroup_context.memory_pressure_threshold_usec)
+{{type}}.MemoryPressureWatch, config_parse_memory_pressure_watch, 0, offsetof({{type}}, cgroup_context.memory_pressure_watch)
+{{type}}.NFTSet, config_parse_cgroup_nft_set, NFT_SET_PARSE_CGROUP, offsetof({{type}}, cgroup_context)
+{{type}}.CoredumpReceive, config_parse_bool, 0, offsetof({{type}}, cgroup_context.coredump_receive)
+{%- endmacro -%}
+
+%{
+#if __GNUC__ >= 7
+_Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
+#endif
+#include <stddef.h>
+#include "all-units.h"
+#include "conf-parser.h"
+#include "image-policy.h"
+#include "in-addr-prefix-util.h"
+#include "load-fragment.h"
+%}
+struct ConfigPerfItem;
+%null_strings
+%language=ANSI-C
+%define slot-name section_and_lvalue
+%define hash-function-name load_fragment_gperf_hash
+%define lookup-function-name load_fragment_gperf_lookup
+%readonly-tables
+%omit-struct-type
+%struct-type
+%includes
+%%
+Unit.Description, config_parse_unit_string_printf, 0, offsetof(Unit, description)
+Unit.Documentation, config_parse_documentation, 0, offsetof(Unit, documentation)
+Unit.SourcePath, config_parse_unit_path_printf, 0, offsetof(Unit, source_path)
+Unit.Requires, config_parse_unit_deps, UNIT_REQUIRES, 0
+Unit.Requisite, config_parse_unit_deps, UNIT_REQUISITE, 0
+Unit.Wants, config_parse_unit_deps, UNIT_WANTS, 0
+Unit.BindsTo, config_parse_unit_deps, UNIT_BINDS_TO, 0
+Unit.BindTo, config_parse_unit_deps, UNIT_BINDS_TO, 0
+Unit.Upholds, config_parse_unit_deps, UNIT_UPHOLDS, 0
+Unit.Conflicts, config_parse_unit_deps, UNIT_CONFLICTS, 0
+Unit.Before, config_parse_unit_deps, UNIT_BEFORE, 0
+Unit.After, config_parse_unit_deps, UNIT_AFTER, 0
+Unit.OnSuccess, config_parse_unit_deps, UNIT_ON_SUCCESS, 0
+Unit.OnFailure, config_parse_unit_deps, UNIT_ON_FAILURE, 0
+Unit.PropagatesReloadTo, config_parse_unit_deps, UNIT_PROPAGATES_RELOAD_TO, 0
+Unit.PropagateReloadTo, config_parse_unit_deps, UNIT_PROPAGATES_RELOAD_TO, 0
+Unit.ReloadPropagatedFrom, config_parse_unit_deps, UNIT_RELOAD_PROPAGATED_FROM, 0
+Unit.PropagateReloadFrom, config_parse_unit_deps, UNIT_RELOAD_PROPAGATED_FROM, 0
+Unit.PropagatesStopTo, config_parse_unit_deps, UNIT_PROPAGATES_STOP_TO, 0
+Unit.StopPropagatedFrom, config_parse_unit_deps, UNIT_STOP_PROPAGATED_FROM, 0
+Unit.PartOf, config_parse_unit_deps, UNIT_PART_OF, 0
+Unit.JoinsNamespaceOf, config_parse_unit_deps, UNIT_JOINS_NAMESPACE_OF, 0
+Unit.RequiresOverridable, config_parse_obsolete_unit_deps, UNIT_REQUIRES, 0
+Unit.RequisiteOverridable, config_parse_obsolete_unit_deps, UNIT_REQUISITE, 0
+Unit.RequiresMountsFor, config_parse_unit_requires_mounts_for, 0, 0
+Unit.StopWhenUnneeded, config_parse_bool, 0, offsetof(Unit, stop_when_unneeded)
+Unit.RefuseManualStart, config_parse_bool, 0, offsetof(Unit, refuse_manual_start)
+Unit.RefuseManualStop, config_parse_bool, 0, offsetof(Unit, refuse_manual_stop)
+Unit.AllowIsolate, config_parse_bool, 0, offsetof(Unit, allow_isolate)
+Unit.DefaultDependencies, config_parse_bool, 0, offsetof(Unit, default_dependencies)
+Unit.SurviveFinalKillSignal, config_parse_bool, 0, offsetof(Unit, survive_final_kill_signal)
+Unit.OnSuccessJobMode, config_parse_job_mode, 0, offsetof(Unit, on_success_job_mode)
+Unit.OnFailureJobMode, config_parse_job_mode, 0, offsetof(Unit, on_failure_job_mode)
+{# The following is a legacy alias name for compatibility #}
+Unit.OnFailureIsolate, config_parse_job_mode_isolate, 0, offsetof(Unit, on_failure_job_mode)
+Unit.IgnoreOnIsolate, config_parse_bool, 0, offsetof(Unit, ignore_on_isolate)
+Unit.IgnoreOnSnapshot, config_parse_warn_compat, DISABLED_LEGACY, 0
+Unit.JobTimeoutSec, config_parse_job_timeout_sec, 0, 0
+Unit.JobRunningTimeoutSec, config_parse_job_running_timeout_sec, 0, 0
+Unit.JobTimeoutAction, config_parse_emergency_action, 0, offsetof(Unit, job_timeout_action)
+Unit.JobTimeoutRebootArgument, config_parse_unit_string_printf, 0, offsetof(Unit, job_timeout_reboot_arg)
+Unit.StartLimitIntervalSec, config_parse_sec, 0, offsetof(Unit, start_ratelimit.interval)
+{# The following is a legacy alias name for compatibility #}
+Unit.StartLimitInterval, config_parse_sec, 0, offsetof(Unit, start_ratelimit.interval)
+Unit.StartLimitBurst, config_parse_unsigned, 0, offsetof(Unit, start_ratelimit.burst)
+Unit.StartLimitAction, config_parse_emergency_action, 0, offsetof(Unit, start_limit_action)
+Unit.FailureAction, config_parse_emergency_action, 0, offsetof(Unit, failure_action)
+Unit.SuccessAction, config_parse_emergency_action, 0, offsetof(Unit, success_action)
+Unit.FailureActionExitStatus, config_parse_exit_status, 0, offsetof(Unit, failure_action_exit_status)
+Unit.SuccessActionExitStatus, config_parse_exit_status, 0, offsetof(Unit, success_action_exit_status)
+Unit.RebootArgument, config_parse_unit_string_printf, 0, offsetof(Unit, reboot_arg)
+Unit.ConditionPathExists, config_parse_unit_condition_path, CONDITION_PATH_EXISTS, offsetof(Unit, conditions)
+Unit.ConditionPathExistsGlob, config_parse_unit_condition_path, CONDITION_PATH_EXISTS_GLOB, offsetof(Unit, conditions)
+Unit.ConditionPathIsDirectory, config_parse_unit_condition_path, CONDITION_PATH_IS_DIRECTORY, offsetof(Unit, conditions)
+Unit.ConditionPathIsSymbolicLink, config_parse_unit_condition_path, CONDITION_PATH_IS_SYMBOLIC_LINK, offsetof(Unit, conditions)
+Unit.ConditionPathIsMountPoint, config_parse_unit_condition_path, CONDITION_PATH_IS_MOUNT_POINT, offsetof(Unit, conditions)
+Unit.ConditionPathIsReadWrite, config_parse_unit_condition_path, CONDITION_PATH_IS_READ_WRITE, offsetof(Unit, conditions)
+Unit.ConditionPathIsEncrypted, config_parse_unit_condition_path, CONDITION_PATH_IS_ENCRYPTED, offsetof(Unit, conditions)
+Unit.ConditionDirectoryNotEmpty, config_parse_unit_condition_path, CONDITION_DIRECTORY_NOT_EMPTY, offsetof(Unit, conditions)
+Unit.ConditionFileNotEmpty, config_parse_unit_condition_path, CONDITION_FILE_NOT_EMPTY, offsetof(Unit, conditions)
+Unit.ConditionFileIsExecutable, config_parse_unit_condition_path, CONDITION_FILE_IS_EXECUTABLE, offsetof(Unit, conditions)
+Unit.ConditionNeedsUpdate, config_parse_unit_condition_path, CONDITION_NEEDS_UPDATE, offsetof(Unit, conditions)
+Unit.ConditionFirstBoot, config_parse_unit_condition_string, CONDITION_FIRST_BOOT, offsetof(Unit, conditions)
+Unit.ConditionArchitecture, config_parse_unit_condition_string, CONDITION_ARCHITECTURE, offsetof(Unit, conditions)
+Unit.ConditionFirmware, config_parse_unit_condition_string, CONDITION_FIRMWARE, offsetof(Unit, conditions)
+Unit.ConditionVirtualization, config_parse_unit_condition_string, CONDITION_VIRTUALIZATION, offsetof(Unit, conditions)
+Unit.ConditionHost, config_parse_unit_condition_string, CONDITION_HOST, offsetof(Unit, conditions)
+Unit.ConditionKernelCommandLine, config_parse_unit_condition_string, CONDITION_KERNEL_COMMAND_LINE, offsetof(Unit, conditions)
+Unit.ConditionKernelVersion, config_parse_unit_condition_string, CONDITION_KERNEL_VERSION, offsetof(Unit, conditions)
+Unit.ConditionCredential, config_parse_unit_condition_string, CONDITION_CREDENTIAL, offsetof(Unit, conditions)
+Unit.ConditionSecurity, config_parse_unit_condition_string, CONDITION_SECURITY, offsetof(Unit, conditions)
+Unit.ConditionCapability, config_parse_unit_condition_string, CONDITION_CAPABILITY, offsetof(Unit, conditions)
+Unit.ConditionACPower, config_parse_unit_condition_string, CONDITION_AC_POWER, offsetof(Unit, conditions)
+Unit.ConditionMemory, config_parse_unit_condition_string, CONDITION_MEMORY, offsetof(Unit, conditions)
+Unit.ConditionCPUFeature, config_parse_unit_condition_string, CONDITION_CPU_FEATURE, offsetof(Unit, conditions)
+Unit.ConditionCPUs, config_parse_unit_condition_string, CONDITION_CPUS, offsetof(Unit, conditions)
+Unit.ConditionEnvironment, config_parse_unit_condition_string, CONDITION_ENVIRONMENT, offsetof(Unit, conditions)
+Unit.ConditionUser, config_parse_unit_condition_string, CONDITION_USER, offsetof(Unit, conditions)
+Unit.ConditionGroup, config_parse_unit_condition_string, CONDITION_GROUP, offsetof(Unit, conditions)
+Unit.ConditionControlGroupController, config_parse_unit_condition_string, CONDITION_CONTROL_GROUP_CONTROLLER, offsetof(Unit, conditions)
+Unit.ConditionOSRelease, config_parse_unit_condition_string, CONDITION_OS_RELEASE, offsetof(Unit, conditions)
+Unit.ConditionMemoryPressure, config_parse_unit_condition_string, CONDITION_MEMORY_PRESSURE, offsetof(Unit, conditions)
+Unit.ConditionCPUPressure, config_parse_unit_condition_string, CONDITION_CPU_PRESSURE, offsetof(Unit, conditions)
+Unit.ConditionIOPressure, config_parse_unit_condition_string, CONDITION_IO_PRESSURE, offsetof(Unit, conditions)
+Unit.AssertPathExists, config_parse_unit_condition_path, CONDITION_PATH_EXISTS, offsetof(Unit, asserts)
+Unit.AssertPathExistsGlob, config_parse_unit_condition_path, CONDITION_PATH_EXISTS_GLOB, offsetof(Unit, asserts)
+Unit.AssertPathIsDirectory, config_parse_unit_condition_path, CONDITION_PATH_IS_DIRECTORY, offsetof(Unit, asserts)
+Unit.AssertPathIsSymbolicLink, config_parse_unit_condition_path, CONDITION_PATH_IS_SYMBOLIC_LINK, offsetof(Unit, asserts)
+Unit.AssertPathIsMountPoint, config_parse_unit_condition_path, CONDITION_PATH_IS_MOUNT_POINT, offsetof(Unit, asserts)
+Unit.AssertPathIsReadWrite, config_parse_unit_condition_path, CONDITION_PATH_IS_READ_WRITE, offsetof(Unit, asserts)
+Unit.AssertPathIsEncrypted, config_parse_unit_condition_path, CONDITION_PATH_IS_ENCRYPTED, offsetof(Unit, asserts)
+Unit.AssertDirectoryNotEmpty, config_parse_unit_condition_path, CONDITION_DIRECTORY_NOT_EMPTY, offsetof(Unit, asserts)
+Unit.AssertFileNotEmpty, config_parse_unit_condition_path, CONDITION_FILE_NOT_EMPTY, offsetof(Unit, asserts)
+Unit.AssertFileIsExecutable, config_parse_unit_condition_path, CONDITION_FILE_IS_EXECUTABLE, offsetof(Unit, asserts)
+Unit.AssertNeedsUpdate, config_parse_unit_condition_path, CONDITION_NEEDS_UPDATE, offsetof(Unit, asserts)
+Unit.AssertFirstBoot, config_parse_unit_condition_string, CONDITION_FIRST_BOOT, offsetof(Unit, asserts)
+Unit.AssertArchitecture, config_parse_unit_condition_string, CONDITION_ARCHITECTURE, offsetof(Unit, asserts)
+Unit.AssertVirtualization, config_parse_unit_condition_string, CONDITION_VIRTUALIZATION, offsetof(Unit, asserts)
+Unit.AssertHost, config_parse_unit_condition_string, CONDITION_HOST, offsetof(Unit, asserts)
+Unit.AssertKernelCommandLine, config_parse_unit_condition_string, CONDITION_KERNEL_COMMAND_LINE, offsetof(Unit, asserts)
+Unit.AssertKernelVersion, config_parse_unit_condition_string, CONDITION_KERNEL_VERSION, offsetof(Unit, asserts)
+Unit.AssertCredential, config_parse_unit_condition_string, CONDITION_CREDENTIAL, offsetof(Unit, asserts)
+Unit.AssertSecurity, config_parse_unit_condition_string, CONDITION_SECURITY, offsetof(Unit, asserts)
+Unit.AssertCapability, config_parse_unit_condition_string, CONDITION_CAPABILITY, offsetof(Unit, asserts)
+Unit.AssertACPower, config_parse_unit_condition_string, CONDITION_AC_POWER, offsetof(Unit, asserts)
+Unit.AssertMemory, config_parse_unit_condition_string, CONDITION_MEMORY, offsetof(Unit, asserts)
+Unit.AssertCPUFeature, config_parse_unit_condition_string, CONDITION_CPU_FEATURE, offsetof(Unit, asserts)
+Unit.AssertCPUs, config_parse_unit_condition_string, CONDITION_CPUS, offsetof(Unit, asserts)
+Unit.AssertEnvironment, config_parse_unit_condition_string, CONDITION_ENVIRONMENT, offsetof(Unit, asserts)
+Unit.AssertUser, config_parse_unit_condition_string, CONDITION_USER, offsetof(Unit, asserts)
+Unit.AssertGroup, config_parse_unit_condition_string, CONDITION_GROUP, offsetof(Unit, asserts)
+Unit.AssertControlGroupController, config_parse_unit_condition_string, CONDITION_CONTROL_GROUP_CONTROLLER, offsetof(Unit, asserts)
+Unit.AssertOSRelease, config_parse_unit_condition_string, CONDITION_OS_RELEASE, offsetof(Unit, asserts)
+Unit.AssertMemoryPressure, config_parse_unit_condition_string, CONDITION_MEMORY_PRESSURE, offsetof(Unit, asserts)
+Unit.AssertCPUPressure, config_parse_unit_condition_string, CONDITION_CPU_PRESSURE, offsetof(Unit, asserts)
+Unit.AssertIOPressure, config_parse_unit_condition_string, CONDITION_IO_PRESSURE, offsetof(Unit, asserts)
+Unit.CollectMode, config_parse_collect_mode, 0, offsetof(Unit, collect_mode)
+Service.PIDFile, config_parse_pid_file, 0, offsetof(Service, pid_file)
+Service.ExecCondition, config_parse_exec, SERVICE_EXEC_CONDITION, offsetof(Service, exec_command)
+Service.ExecStartPre, config_parse_exec, SERVICE_EXEC_START_PRE, offsetof(Service, exec_command)
+Service.ExecStart, config_parse_exec, SERVICE_EXEC_START, offsetof(Service, exec_command)
+Service.ExecStartPost, config_parse_exec, SERVICE_EXEC_START_POST, offsetof(Service, exec_command)
+Service.ExecReload, config_parse_exec, SERVICE_EXEC_RELOAD, offsetof(Service, exec_command)
+Service.ExecStop, config_parse_exec, SERVICE_EXEC_STOP, offsetof(Service, exec_command)
+Service.ExecStopPost, config_parse_exec, SERVICE_EXEC_STOP_POST, offsetof(Service, exec_command)
+Service.RestartSec, config_parse_sec, 0, offsetof(Service, restart_usec)
+Service.RestartSteps, config_parse_unsigned, 0, offsetof(Service, restart_steps)
+Service.RestartMaxDelaySec, config_parse_sec, 0, offsetof(Service, restart_max_delay_usec)
+Service.TimeoutSec, config_parse_service_timeout, 0, 0
+Service.TimeoutStartSec, config_parse_service_timeout, 0, 0
+Service.TimeoutStopSec, config_parse_sec_fix_0, 0, offsetof(Service, timeout_stop_usec)
+Service.TimeoutAbortSec, config_parse_service_timeout_abort, 0, 0
+Service.TimeoutStartFailureMode, config_parse_service_timeout_failure_mode, 0, offsetof(Service, timeout_start_failure_mode)
+Service.TimeoutStopFailureMode, config_parse_service_timeout_failure_mode, 0, offsetof(Service, timeout_stop_failure_mode)
+Service.RuntimeMaxSec, config_parse_sec, 0, offsetof(Service, runtime_max_usec)
+Service.RuntimeRandomizedExtraSec, config_parse_sec, 0, offsetof(Service, runtime_rand_extra_usec)
+Service.WatchdogSec, config_parse_sec, 0, offsetof(Service, watchdog_usec)
+{# The following five only exist for compatibility, they moved into Unit, see above #}
+Service.StartLimitInterval, config_parse_sec, 0, offsetof(Unit, start_ratelimit.interval)
+Service.StartLimitBurst, config_parse_unsigned, 0, offsetof(Unit, start_ratelimit.burst)
+Service.StartLimitAction, config_parse_emergency_action, 0, offsetof(Unit, start_limit_action)
+Service.FailureAction, config_parse_emergency_action, 0, offsetof(Unit, failure_action)
+Service.RebootArgument, config_parse_unit_string_printf, 0, offsetof(Unit, reboot_arg)
+Service.Type, config_parse_service_type, 0, offsetof(Service, type)
+Service.ExitType, config_parse_service_exit_type, 0, offsetof(Service, exit_type)
+Service.Restart, config_parse_service_restart, 0, offsetof(Service, restart)
+Service.RestartMode, config_parse_service_restart_mode, 0, offsetof(Service, restart_mode)
+Service.PermissionsStartOnly, config_parse_bool, 0, offsetof(Service, permissions_start_only)
+Service.RootDirectoryStartOnly, config_parse_bool, 0, offsetof(Service, root_directory_start_only)
+Service.RemainAfterExit, config_parse_bool, 0, offsetof(Service, remain_after_exit)
+Service.GuessMainPID, config_parse_bool, 0, offsetof(Service, guess_main_pid)
+Service.RestartPreventExitStatus, config_parse_set_status, 0, offsetof(Service, restart_prevent_status)
+Service.RestartForceExitStatus, config_parse_set_status, 0, offsetof(Service, restart_force_status)
+Service.SuccessExitStatus, config_parse_set_status, 0, offsetof(Service, success_status)
+Service.SysVStartPriority, config_parse_warn_compat, DISABLED_LEGACY, 0
+Service.NonBlocking, config_parse_bool, 0, offsetof(Service, exec_context.non_blocking)
+Service.BusName, config_parse_bus_name, 0, offsetof(Service, bus_name)
+Service.FileDescriptorStoreMax, config_parse_unsigned, 0, offsetof(Service, n_fd_store_max)
+Service.FileDescriptorStorePreserve, config_parse_exec_preserve_mode, 0, offsetof(Service, fd_store_preserve_mode)
+Service.NotifyAccess, config_parse_notify_access, 0, offsetof(Service, notify_access)
+Service.Sockets, config_parse_service_sockets, 0, 0
+Service.BusPolicy, config_parse_warn_compat, DISABLED_LEGACY, 0
+Service.USBFunctionDescriptors, config_parse_unit_path_printf, 0, offsetof(Service, usb_function_descriptors)
+Service.USBFunctionStrings, config_parse_unit_path_printf, 0, offsetof(Service, usb_function_strings)
+Service.OOMPolicy, config_parse_oom_policy, 0, offsetof(Service, oom_policy)
+Service.OpenFile, config_parse_open_file, 0, offsetof(Service, open_files)
+Service.ReloadSignal, config_parse_signal, 0, offsetof(Service, reload_signal)
+{{ EXEC_CONTEXT_CONFIG_ITEMS('Service') }}
+{{ CGROUP_CONTEXT_CONFIG_ITEMS('Service') }}
+{{ KILL_CONTEXT_CONFIG_ITEMS('Service') }}
+Socket.ListenStream, config_parse_socket_listen, SOCKET_SOCKET, 0
+Socket.ListenDatagram, config_parse_socket_listen, SOCKET_SOCKET, 0
+Socket.ListenSequentialPacket, config_parse_socket_listen, SOCKET_SOCKET, 0
+Socket.ListenFIFO, config_parse_socket_listen, SOCKET_FIFO, 0
+Socket.ListenNetlink, config_parse_socket_listen, SOCKET_SOCKET, 0
+Socket.ListenSpecial, config_parse_socket_listen, SOCKET_SPECIAL, 0
+Socket.ListenMessageQueue, config_parse_socket_listen, SOCKET_MQUEUE, 0
+Socket.ListenUSBFunction, config_parse_socket_listen, SOCKET_USB_FUNCTION, 0
+Socket.SocketProtocol, config_parse_socket_protocol, 0, offsetof(Socket, socket_protocol)
+Socket.BindIPv6Only, config_parse_socket_bind, 0, offsetof(Socket, bind_ipv6_only)
+Socket.Backlog, config_parse_unsigned, 0, offsetof(Socket, backlog)
+Socket.BindToDevice, config_parse_socket_bindtodevice, 0, 0
+Socket.ExecStartPre, config_parse_exec, SOCKET_EXEC_START_PRE, offsetof(Socket, exec_command)
+Socket.ExecStartPost, config_parse_exec, SOCKET_EXEC_START_POST, offsetof(Socket, exec_command)
+Socket.ExecStopPre, config_parse_exec, SOCKET_EXEC_STOP_PRE, offsetof(Socket, exec_command)
+Socket.ExecStopPost, config_parse_exec, SOCKET_EXEC_STOP_POST, offsetof(Socket, exec_command)
+Socket.TimeoutSec, config_parse_sec_fix_0, 0, offsetof(Socket, timeout_usec)
+Socket.SocketUser, config_parse_user_group_compat, 0, offsetof(Socket, user)
+Socket.SocketGroup, config_parse_user_group_compat, 0, offsetof(Socket, group)
+Socket.SocketMode, config_parse_mode, 0, offsetof(Socket, socket_mode)
+Socket.DirectoryMode, config_parse_mode, 0, offsetof(Socket, directory_mode)
+Socket.Accept, config_parse_bool, 0, offsetof(Socket, accept)
+Socket.FlushPending, config_parse_bool, 0, offsetof(Socket, flush_pending)
+Socket.Writable, config_parse_bool, 0, offsetof(Socket, writable)
+Socket.MaxConnections, config_parse_unsigned, 0, offsetof(Socket, max_connections)
+Socket.MaxConnectionsPerSource, config_parse_unsigned, 0, offsetof(Socket, max_connections_per_source)
+Socket.KeepAlive, config_parse_bool, 0, offsetof(Socket, keep_alive)
+Socket.KeepAliveTimeSec, config_parse_sec, 0, offsetof(Socket, keep_alive_time)
+Socket.KeepAliveIntervalSec, config_parse_sec, 0, offsetof(Socket, keep_alive_interval)
+Socket.KeepAliveProbes, config_parse_unsigned, 0, offsetof(Socket, keep_alive_cnt)
+Socket.DeferAcceptSec, config_parse_sec, 0, offsetof(Socket, defer_accept)
+Socket.NoDelay, config_parse_bool, 0, offsetof(Socket, no_delay)
+Socket.Priority, config_parse_int, 0, offsetof(Socket, priority)
+Socket.ReceiveBuffer, config_parse_iec_size, 0, offsetof(Socket, receive_buffer)
+Socket.SendBuffer, config_parse_iec_size, 0, offsetof(Socket, send_buffer)
+Socket.IPTOS, config_parse_ip_tos, 0, offsetof(Socket, ip_tos)
+Socket.IPTTL, config_parse_int, 0, offsetof(Socket, ip_ttl)
+Socket.Mark, config_parse_int, 0, offsetof(Socket, mark)
+Socket.PipeSize, config_parse_iec_size, 0, offsetof(Socket, pipe_size)
+Socket.FreeBind, config_parse_bool, 0, offsetof(Socket, free_bind)
+Socket.Transparent, config_parse_bool, 0, offsetof(Socket, transparent)
+Socket.Broadcast, config_parse_bool, 0, offsetof(Socket, broadcast)
+Socket.PassCredentials, config_parse_bool, 0, offsetof(Socket, pass_cred)
+Socket.PassSecurity, config_parse_bool, 0, offsetof(Socket, pass_sec)
+Socket.PassPacketInfo, config_parse_bool, 0, offsetof(Socket, pass_pktinfo)
+Socket.Timestamping, config_parse_socket_timestamping, 0, offsetof(Socket, timestamping)
+Socket.TCPCongestion, config_parse_string, 0, offsetof(Socket, tcp_congestion)
+Socket.ReusePort, config_parse_bool, 0, offsetof(Socket, reuse_port)
+Socket.MessageQueueMaxMessages, config_parse_long, 0, offsetof(Socket, mq_maxmsg)
+Socket.MessageQueueMessageSize, config_parse_long, 0, offsetof(Socket, mq_msgsize)
+Socket.RemoveOnStop, config_parse_bool, 0, offsetof(Socket, remove_on_stop)
+Socket.Symlinks, config_parse_unit_path_strv_printf, 0, offsetof(Socket, symlinks)
+Socket.FileDescriptorName, config_parse_fdname, 0, 0
+Socket.Service, config_parse_socket_service, 0, 0
+Socket.TriggerLimitIntervalSec, config_parse_sec, 0, offsetof(Socket, trigger_limit.interval)
+Socket.TriggerLimitBurst, config_parse_unsigned, 0, offsetof(Socket, trigger_limit.burst)
+Socket.PollLimitIntervalSec, config_parse_sec, 0, offsetof(Socket, poll_limit_interval)
+Socket.PollLimitBurst, config_parse_unsigned, 0, offsetof(Socket, poll_limit_burst)
+{% if ENABLE_SMACK %}
+Socket.SmackLabel, config_parse_unit_string_printf, 0, offsetof(Socket, smack)
+Socket.SmackLabelIPIn, config_parse_unit_string_printf, 0, offsetof(Socket, smack_ip_in)
+Socket.SmackLabelIPOut, config_parse_unit_string_printf, 0, offsetof(Socket, smack_ip_out)
+{% else %}
+Socket.SmackLabel, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+Socket.SmackLabelIPIn, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+Socket.SmackLabelIPOut, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{% endif %}
+{% if HAVE_SELINUX %}
+Socket.SELinuxContextFromNet, config_parse_bool, 0, offsetof(Socket, selinux_context_from_net)
+{% else %}
+Socket.SELinuxContextFromNet, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
+{% endif %}
+{{ EXEC_CONTEXT_CONFIG_ITEMS('Socket') }}
+{{ CGROUP_CONTEXT_CONFIG_ITEMS('Socket') }}
+{{ KILL_CONTEXT_CONFIG_ITEMS('Socket') }}
+Mount.What, config_parse_unit_string_printf, 0, offsetof(Mount, parameters_fragment.what)
+Mount.Where, config_parse_unit_path_printf, 0, offsetof(Mount, where)
+Mount.Options, config_parse_unit_string_printf, 0, offsetof(Mount, parameters_fragment.options)
+Mount.Type, config_parse_unit_string_printf, 0, offsetof(Mount, parameters_fragment.fstype)
+Mount.TimeoutSec, config_parse_sec_fix_0, 0, offsetof(Mount, timeout_usec)
+Mount.DirectoryMode, config_parse_mode, 0, offsetof(Mount, directory_mode)
+Mount.SloppyOptions, config_parse_bool, 0, offsetof(Mount, sloppy_options)
+Mount.LazyUnmount, config_parse_bool, 0, offsetof(Mount, lazy_unmount)
+Mount.ForceUnmount, config_parse_bool, 0, offsetof(Mount, force_unmount)
+Mount.ReadWriteOnly, config_parse_bool, 0, offsetof(Mount, read_write_only)
+{{ EXEC_CONTEXT_CONFIG_ITEMS('Mount') }}
+{{ CGROUP_CONTEXT_CONFIG_ITEMS('Mount') }}
+{{ KILL_CONTEXT_CONFIG_ITEMS('Mount') }}
+Automount.Where, config_parse_unit_path_printf, 0, offsetof(Automount, where)
+Automount.ExtraOptions, config_parse_unit_string_printf, 0, offsetof(Automount, extra_options)
+Automount.DirectoryMode, config_parse_mode, 0, offsetof(Automount, directory_mode)
+Automount.TimeoutIdleSec, config_parse_sec_fix_0, 0, offsetof(Automount, timeout_idle_usec)
+Swap.What, config_parse_unit_path_printf, 0, offsetof(Swap, parameters_fragment.what)
+Swap.Priority, config_parse_swap_priority, 0, 0
+Swap.Options, config_parse_unit_string_printf, 0, offsetof(Swap, parameters_fragment.options)
+Swap.TimeoutSec, config_parse_sec_fix_0, 0, offsetof(Swap, timeout_usec)
+{{ EXEC_CONTEXT_CONFIG_ITEMS('Swap') }}
+{{ CGROUP_CONTEXT_CONFIG_ITEMS('Swap') }}
+{{ KILL_CONTEXT_CONFIG_ITEMS('Swap') }}
+Timer.OnCalendar, config_parse_timer, TIMER_CALENDAR, 0
+Timer.OnActiveSec, config_parse_timer, TIMER_ACTIVE, 0
+Timer.OnBootSec, config_parse_timer, TIMER_BOOT, 0
+Timer.OnStartupSec, config_parse_timer, TIMER_STARTUP, 0
+Timer.OnUnitActiveSec, config_parse_timer, TIMER_UNIT_ACTIVE, 0
+Timer.OnUnitInactiveSec, config_parse_timer, TIMER_UNIT_INACTIVE, 0
+Timer.OnClockChange, config_parse_bool, 0, offsetof(Timer, on_clock_change)
+Timer.OnTimezoneChange, config_parse_bool, 0, offsetof(Timer, on_timezone_change)
+Timer.Persistent, config_parse_bool, 0, offsetof(Timer, persistent)
+Timer.WakeSystem, config_parse_bool, 0, offsetof(Timer, wake_system)
+Timer.RemainAfterElapse, config_parse_bool, 0, offsetof(Timer, remain_after_elapse)
+Timer.FixedRandomDelay, config_parse_bool, 0, offsetof(Timer, fixed_random_delay)
+Timer.AccuracySec, config_parse_sec, 0, offsetof(Timer, accuracy_usec)
+Timer.RandomizedDelaySec, config_parse_sec, 0, offsetof(Timer, random_usec)
+Timer.Unit, config_parse_trigger_unit, 0, 0
+Path.PathExists, config_parse_path_spec, 0, 0
+Path.PathExistsGlob, config_parse_path_spec, 0, 0
+Path.PathChanged, config_parse_path_spec, 0, 0
+Path.PathModified, config_parse_path_spec, 0, 0
+Path.DirectoryNotEmpty, config_parse_path_spec, 0, 0
+Path.Unit, config_parse_trigger_unit, 0, 0
+Path.MakeDirectory, config_parse_bool, 0, offsetof(Path, make_directory)
+Path.DirectoryMode, config_parse_mode, 0, offsetof(Path, directory_mode)
+Path.TriggerLimitIntervalSec, config_parse_sec, 0, offsetof(Path, trigger_limit.interval)
+Path.TriggerLimitBurst, config_parse_unsigned, 0, offsetof(Path, trigger_limit.burst)
+{{ CGROUP_CONTEXT_CONFIG_ITEMS('Slice') }}
+{{ CGROUP_CONTEXT_CONFIG_ITEMS('Scope') }}
+{{ KILL_CONTEXT_CONFIG_ITEMS('Scope') }}
+Scope.RuntimeMaxSec, config_parse_sec, 0, offsetof(Scope, runtime_max_usec)
+Scope.RuntimeRandomizedExtraSec, config_parse_sec, 0, offsetof(Scope, runtime_rand_extra_usec)
+Scope.TimeoutStopSec, config_parse_sec, 0, offsetof(Scope, timeout_stop_usec)
+Scope.OOMPolicy, config_parse_oom_policy, 0, offsetof(Scope, oom_policy)
+{# The [Install] section is ignored here #}
+Install.Alias, NULL, 0, 0
+Install.WantedBy, NULL, 0, 0
+Install.RequiredBy, NULL, 0, 0
+Install.UpheldBy, NULL, 0, 0
+Install.Also, NULL, 0, 0
+Install.DefaultInstance, NULL, 0, 0
diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c
new file mode 100644
index 0000000..0baf08e
--- /dev/null
+++ b/src/core/load-fragment.c
@@ -0,0 +1,6735 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+/***
+ Copyright © 2012 Holger Hans Peter Freyther
+***/
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/fs.h>
+#include <linux/oom.h>
+#include <sched.h>
+#include <sys/resource.h>
+
+#include "sd-messages.h"
+
+#include "af-list.h"
+#include "all-units.h"
+#include "alloc-util.h"
+#include "bpf-firewall.h"
+#include "bpf-lsm.h"
+#include "bpf-program.h"
+#include "bpf-socket-bind.h"
+#include "bus-error.h"
+#include "bus-internal.h"
+#include "bus-util.h"
+#include "cap-list.h"
+#include "capability-util.h"
+#include "cgroup-setup.h"
+#include "conf-parser.h"
+#include "core-varlink.h"
+#include "cpu-set-util.h"
+#include "creds-util.h"
+#include "env-util.h"
+#include "errno-list.h"
+#include "escape.h"
+#include "exec-credential.h"
+#include "execute.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "firewall-util.h"
+#include "fs-util.h"
+#include "hexdecoct.h"
+#include "iovec-util.h"
+#include "ioprio-util.h"
+#include "ip-protocol-list.h"
+#include "journal-file.h"
+#include "limits-util.h"
+#include "load-fragment.h"
+#include "log.h"
+#include "missing_ioprio.h"
+#include "mountpoint-util.h"
+#include "nulstr-util.h"
+#include "open-file.h"
+#include "parse-helpers.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "pcre2-util.h"
+#include "percent-util.h"
+#include "process-util.h"
+#include "seccomp-util.h"
+#include "securebits-util.h"
+#include "selinux-util.h"
+#include "signal-util.h"
+#include "socket-netlink.h"
+#include "specifier.h"
+#include "stat-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "syslog-util.h"
+#include "time-util.h"
+#include "unit-name.h"
+#include "unit-printf.h"
+#include "user-util.h"
+#include "utf8.h"
+#include "web-util.h"
+
/* Parse the value of SocketProtocol=: accept a protocol name/number, but only
 * allow the two protocols that setting supports (UDP-Lite and SCTP). Returns
 * the protocol number on success, a negative parse error from
 * parse_ip_protocol(), or -EPROTONOSUPPORT for any other valid protocol. */
static int parse_socket_protocol(const char *s) {
        int protocol;

        protocol = parse_ip_protocol(s);
        if (protocol < 0)
                return protocol;

        return (protocol == IPPROTO_UDPLITE || protocol == IPPROTO_SCTP)
                ? protocol
                : -EPROTONOSUPPORT;
}
+
/* Parse the CrashChangeVT= setting into *data: a plain integer selects that VT
 * number directly; otherwise the value is interpreted as a boolean, where true
 * means "switch to the VT kmsg goes to" (0) and false disables switching (-1).
 * Returns 0 on success or a negative errno-style error. */
int parse_crash_chvt(const char *value, int *data) {
        int r;

        /* An explicit VT number takes precedence. */
        if (safe_atoi(value, data) >= 0)
                return 0;

        r = parse_boolean(value);
        if (r < 0)
                return r;

        *data = r ? 0 : -1;
        return 0;
}
+
/* Parse the ConfirmSpawn= setting into *console. A false boolean disables
 * confirmation (NULL console); a true boolean (or unset value) uses the
 * default /dev/console; any non-boolean value is taken as a TTY, either as a
 * fully qualified path or as a bare device name under /dev/. Returns 0 on
 * success, -ENOMEM on allocation failure. */
int parse_confirm_spawn(const char *value, char **console) {
        char *tty;
        int enabled;

        enabled = value ? parse_boolean(value) : 1;
        if (enabled == 0) {
                /* Confirmation explicitly turned off. */
                *console = NULL;
                return 0;
        }

        if (enabled > 0)
                tty = strdup("/dev/console");
        else if (is_path(value))
                tty = strdup(value);
        else
                tty = path_join("/dev/", value);
        if (!tty)
                return -ENOMEM;

        *console = tty;
        return 0;
}
+
+/* Boilerplate conf-parser callbacks generated via the DEFINE_CONFIG_PARSE*
+ * helper macros (presumably declared in conf-parser.h — confirm there): each
+ * invocation defines a config_parse_*() function that wraps the named
+ * low-level value parser (or enum from_string table), storing the result and
+ * logging the given message on parse failure. */
+DEFINE_CONFIG_PARSE(config_parse_socket_protocol, parse_socket_protocol, "Failed to parse socket protocol");
+DEFINE_CONFIG_PARSE(config_parse_exec_secure_bits, secure_bits_from_string, "Failed to parse secure bits");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_collect_mode, collect_mode, CollectMode, "Failed to parse garbage collection mode");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_device_policy, cgroup_device_policy, CGroupDevicePolicy, "Failed to parse device policy");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode, "Failed to parse keyring mode");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_proc, protect_proc, ProtectProc, "Failed to parse /proc/ protection mode");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_proc_subset, proc_subset, ProcSubset, "Failed to parse /proc/ subset mode");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_utmp_mode, exec_utmp_mode, ExecUtmpMode, "Failed to parse utmp mode");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_job_mode, job_mode, JobMode, "Failed to parse job mode");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_notify_access, notify_access, NotifyAccess, "Failed to parse notify access specifier");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_home, protect_home, ProtectHome, "Failed to parse protect home value");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_system, protect_system, ProtectSystem, "Failed to parse protect system value");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_preserve_mode, exec_preserve_mode, ExecPreserveMode, "Failed to parse resource preserve mode");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_service_type, service_type, ServiceType, "Failed to parse service type");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_service_exit_type, service_exit_type, ServiceExitType, "Failed to parse service exit type");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_service_restart, service_restart, ServiceRestart, "Failed to parse service restart specifier");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_service_restart_mode, service_restart_mode, ServiceRestartMode, "Failed to parse service restart mode");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_service_timeout_failure_mode, service_timeout_failure_mode, ServiceTimeoutFailureMode, "Failed to parse timeout failure mode");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_socket_bind, socket_address_bind_ipv6_only_or_bool, SocketAddressBindIPv6Only, "Failed to parse bind IPv6 only value");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_oom_policy, oom_policy, OOMPolicy, "Failed to parse OOM policy");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_managed_oom_preference, managed_oom_preference, ManagedOOMPreference, "Failed to parse ManagedOOMPreference=");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_memory_pressure_watch, cgroup_pressure_watch, CGroupPressureWatch, "Failed to parse memory pressure watch setting");
+DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_ip_tos, ip_tos, int, -1, "Failed to parse IP TOS value");
+DEFINE_CONFIG_PARSE_PTR(config_parse_blockio_weight, cg_blkio_weight_parse, uint64_t, "Invalid block IO weight");
+DEFINE_CONFIG_PARSE_PTR(config_parse_cg_weight, cg_weight_parse, uint64_t, "Invalid weight");
+DEFINE_CONFIG_PARSE_PTR(config_parse_cg_cpu_weight, cg_cpu_weight_parse, uint64_t, "Invalid CPU weight");
+/* static: only reachable through the deprecation wrapper config_parse_cpu_shares() below. */
+static DEFINE_CONFIG_PARSE_PTR(config_parse_cpu_shares_internal, cg_cpu_shares_parse, uint64_t, "Invalid CPU shares");
+DEFINE_CONFIG_PARSE_PTR(config_parse_exec_mount_propagation_flag, mount_propagation_flag_from_string, unsigned long, "Failed to parse mount propagation flag");
+DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_numa_policy, mpol, int, -1, "Invalid NUMA policy type");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_status_unit_format, status_unit_format, StatusUnitFormat, "Failed to parse status unit format");
+DEFINE_CONFIG_PARSE_ENUM_FULL(config_parse_socket_timestamping, socket_timestamping_from_string_harder, SocketTimestamping, "Failed to parse timestamping precision");
+
+int config_parse_cpu_shares(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+
+ log_syntax(unit, LOG_WARNING, filename, line, 0,
+ "Unit uses %s=; please use CPUWeight= instead. Support for %s= will be removed soon.",
+ lvalue, lvalue);
+
+ return config_parse_cpu_shares_internal(unit, filename, line, section, section_line, lvalue, ltype, rvalue, data, userdata);
+}
+
/* Returns true if the instance part of template-dependency string @s (the text
 * between '@' and the final '.') contains an unescaped specifier that expands
 * to the instance name or a superset of it (%i, %n or %N). An instance part
 * that is exactly "%i" expands to the instance itself and does not count.
 * "%%" escapes a literal '%'. */
bool contains_instance_specifier_superset(const char *s) {
        const char *at, *instance, *end;
        bool after_percent = false;

        assert(s);

        at = strchr(s, '@');
        if (!at)
                return false;

        instance = at + 1; /* skip the '@' */

        /* The unit type suffix starts at the last '.'; without one, scan to NUL. */
        end = strrchr(instance, '.');
        if (!end)
                end = instance + strlen(instance);

        /* Exactly "%i" is not a superset of the instance. */
        if (end - instance == 2 && instance[0] == '%' && instance[1] == 'i')
                return false;

        for (const char *p = instance; p < end; p++)
                if (*p == '%')
                        after_percent = !after_percent; /* "%%" toggles back to literal */
                else if (after_percent) {
                        if (*p == 'i' || *p == 'n' || *p == 'N')
                                return true;
                        after_percent = false;
                }

        return false;
}
+
/* `name` is the rendered version of `format` via `unit_printf` or similar functions.
 *
 * Returns > 0 if adding a dependency from unit 'u' on 'name' would likely recurse forever,
 * 0 if not, and a negative errno on lookup failure. */
int unit_is_likely_recursive_template_dependency(Unit *u, const char *name, const char *format) {
        const char *fragment_path;
        int r;

        assert(u);
        assert(name);

        /* If a template unit has a direct dependency on itself that includes the unit instance as part of
         * the template instance via a unit specifier (%i, %n or %N), this will almost certainly lead to
         * infinite recursion as systemd will keep instantiating new instances of the template unit.
         * https://github.com/systemd/systemd/issues/17602 shows a good example of how this can happen in
         * practice. To guard against this, we check for templates that depend on themselves and have the
         * instantiated unit instance included as part of the template instance of the dependency via a
         * specifier.
         *
         * For example, if systemd-notify@.service depends on systemd-notify@%n.service, this will result in
         * infinite recursion.
         */

        /* Only instantiated template units can recurse this way. */
        if (!unit_name_is_valid(name, UNIT_NAME_INSTANCE))
                return false;

        /* The dependency must be on the same template (same prefix before '@'). */
        if (!unit_name_prefix_equal(u->id, name))
                return false;

        /* ... and of the same unit type. */
        if (u->type != unit_name_to_type(name))
                return false;

        r = unit_file_find_fragment(u->manager->unit_id_map, u->manager->unit_name_map, name, &fragment_path, NULL);
        if (r < 0)
                return r;

        /* Fragment paths should also be equal as a custom fragment for a specific template instance
         * wouldn't necessarily lead to infinite recursion. */
        if (!path_equal_ptr(u->fragment_path, fragment_path))
                return false;

        /* Finally, the unexpanded dependency must embed the instance via %i/%n/%N. */
        if (!contains_instance_specifier_superset(format))
                return false;

        return true;
}
+
+int config_parse_unit_deps(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ UnitDependency d = ltype;
+ Unit *u = userdata;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *word = NULL, *k = NULL;
+ int r;
+
+ r = extract_first_word(&p, &word, NULL, EXTRACT_RETAIN_ESCAPE);
+ if (r == 0)
+ return 0;
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ r = unit_name_printf(u, word, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", word);
+ continue;
+ }
+
+ r = unit_is_likely_recursive_template_dependency(u, k, word);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to determine if '%s' is a recursive dependency, ignoring: %m", k);
+ continue;
+ }
+ if (r > 0) {
+ log_syntax(unit, LOG_DEBUG, filename, line, 0,
+ "Dropping dependency %s=%s that likely leads to infinite recursion.",
+ unit_dependency_to_string(d), word);
+ continue;
+ }
+
+ r = unit_add_dependency_by_name(u, d, k, true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to add dependency on %s, ignoring: %m", k);
+ }
+}
+
/* Parser for obsolete dependency setting names: warns that the lvalue is obsolete, names the
 * modern replacement (derived from ltype), then parses the value as the replacement would. */
int config_parse_obsolete_unit_deps(
                const char *unit,
                const char *filename,
                unsigned line,
                const char *section,
                unsigned section_line,
                const char *lvalue,
                int ltype,
                const char *rvalue,
                void *data,
                void *userdata) {

        log_syntax(unit, LOG_WARNING, filename, line, 0,
                   "Unit dependency type %s= is obsolete, replacing by %s=, please update your unit file", lvalue, unit_dependency_to_string(ltype));

        return config_parse_unit_deps(unit, filename, line, section, section_line, lvalue, ltype, rvalue, data, userdata);
}
+
+int config_parse_unit_string_printf(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *k = NULL;
+ const Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ r = unit_full_printf(u, rvalue, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
+ return 0;
+ }
+
+ return config_parse_string(unit, filename, line, section, section_line, lvalue, ltype, k, data, userdata);
+}
+
+int config_parse_unit_strv_printf(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ const Unit *u = ASSERT_PTR(userdata);
+ _cleanup_free_ char *k = NULL;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ r = unit_full_printf(u, rvalue, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
+ return 0;
+ }
+
+ return config_parse_strv(unit, filename, line, section, section_line, lvalue, ltype, k, data, userdata);
+}
+
+int config_parse_unit_path_printf(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *k = NULL;
+ const Unit *u = ASSERT_PTR(userdata);
+ int r;
+ bool fatal = ltype;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ r = unit_path_printf(u, rvalue, &k);
+ if (r < 0) {
+ log_syntax(unit, fatal ? LOG_ERR : LOG_WARNING, filename, line, r,
+ "Failed to resolve unit specifiers in '%s'%s: %m",
+ rvalue, fatal ? "" : ", ignoring");
+ return fatal ? -ENOEXEC : 0;
+ }
+
+ return config_parse_path(unit, filename, line, section, section_line, lvalue, ltype, k, data, userdata);
+}
+
+int config_parse_colon_separated_paths(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+ char ***sv = ASSERT_PTR(data);
+ const Unit *u = userdata;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment resets the list */
+ *sv = strv_free(*sv);
+ return 0;
+ }
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *word = NULL, *k = NULL;
+
+ r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to extract first word, ignoring: %s", rvalue);
+ return 0;
+ }
+ if (r == 0)
+ break;
+
+ r = unit_path_printf(u, word, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to resolve unit specifiers in '%s', ignoring: %m", word);
+ return 0;
+ }
+
+ r = path_simplify_and_warn(k, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+ if (r < 0)
+ return 0;
+
+ r = strv_consume(sv, TAKE_PTR(k));
+ if (r < 0)
+ return log_oom();
+ }
+
+ return 0;
+}
+
+int config_parse_unit_path_strv_printf(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ char ***x = data;
+ const Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ *x = strv_free(*x);
+ return 0;
+ }
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *word = NULL, *k = NULL;
+
+ r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE);
+ if (r == 0)
+ return 0;
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Invalid syntax, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ r = unit_path_printf(u, word, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to resolve unit specifiers in '%s', ignoring: %m", word);
+ return 0;
+ }
+
+ r = path_simplify_and_warn(k, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+ if (r < 0)
+ return 0;
+
+ r = strv_consume(x, TAKE_PTR(k));
+ if (r < 0)
+ return log_oom();
+ }
+}
+
+static int patch_var_run(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *lvalue,
+ char **path) {
+
+ const char *e;
+ char *z;
+
+ e = path_startswith(*path, "/var/run/");
+ if (!e)
+ return 0;
+
+ z = path_join("/run/", e);
+ if (!z)
+ return log_oom();
+
+ log_syntax(unit, LOG_NOTICE, filename, line, 0,
+ "%s= references a path below legacy directory /var/run/, updating %s → %s; "
+ "please update the unit file accordingly.", lvalue, *path, z);
+
+ free_and_replace(*path, z);
+
+ return 1;
+}
+
/* Parser for the Listen*= socket settings (ListenStream=, ListenDatagram=,
 * ListenSequentialPacket=, ListenNetlink=, and the path-based types selected via ltype such as
 * FIFOs). Appends one SocketPort to the socket's port list per call; an empty assignment clears
 * all previously configured ports. */
int config_parse_socket_listen(
                const char *unit,
                const char *filename,
                unsigned line,
                const char *section,
                unsigned section_line,
                const char *lvalue,
                int ltype,
                const char *rvalue,
                void *data,
                void *userdata) {

        _cleanup_free_ SocketPort *p = NULL;
        SocketPort *tail;
        Socket *s;
        int r;

        assert(filename);
        assert(lvalue);
        assert(rvalue);
        assert(data);

        s = SOCKET(data);

        if (isempty(rvalue)) {
                /* An empty assignment removes all ports */
                socket_free_ports(s);
                return 0;
        }

        p = new0(SocketPort, 1);
        if (!p)
                return log_oom();

        if (ltype != SOCKET_SOCKET) {
                /* Path-based port types (FIFO, special file, …): the value is an absolute path. */
                _cleanup_free_ char *k = NULL;

                r = unit_path_printf(UNIT(s), rvalue, &k);
                if (r < 0) {
                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
                        return 0;
                }

                r = path_simplify_and_warn(k, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
                if (r < 0)
                        return 0;

                if (ltype == SOCKET_FIFO) {
                        /* Transparently migrate legacy /var/run/ FIFO paths to /run/. */
                        r = patch_var_run(unit, filename, line, lvalue, &k);
                        if (r < 0)
                                return r;
                }

                free_and_replace(p->path, k);
                p->type = ltype;

        } else if (streq(lvalue, "ListenNetlink")) {
                _cleanup_free_ char *k = NULL;

                r = unit_path_printf(UNIT(s), rvalue, &k);
                if (r < 0) {
                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
                        return 0;
                }

                r = socket_address_parse_netlink(&p->address, k);
                if (r < 0) {
                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse address value in '%s', ignoring: %m", k);
                        return 0;
                }

                p->type = SOCKET_SOCKET;

        } else {
                /* ListenStream=/ListenDatagram=/ListenSequentialPacket=: a socket address. */
                _cleanup_free_ char *k = NULL;

                r = unit_path_printf(UNIT(s), rvalue, &k);
                if (r < 0) {
                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
                        return 0;
                }

                if (k[0] == '/') { /* Only for AF_UNIX file system sockets… */
                        r = patch_var_run(unit, filename, line, lvalue, &k);
                        if (r < 0)
                                return r;
                }

                r = socket_address_parse_and_warn(&p->address, k);
                if (r < 0) {
                        /* -EAFNOSUPPORT was already warned about by the parse helper, hence the
                         * quiet skip here. */
                        if (r != -EAFNOSUPPORT)
                                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse address value in '%s', ignoring: %m", k);
                        return 0;
                }

                /* The lvalue determines the socket type of the parsed address. */
                if (streq(lvalue, "ListenStream"))
                        p->address.type = SOCK_STREAM;
                else if (streq(lvalue, "ListenDatagram"))
                        p->address.type = SOCK_DGRAM;
                else {
                        assert(streq(lvalue, "ListenSequentialPacket"));
                        p->address.type = SOCK_SEQPACKET;
                }

                /* SOCK_SEQPACKET is only accepted for AF_UNIX here. */
                if (socket_address_family(&p->address) != AF_UNIX && p->address.type == SOCK_SEQPACKET) {
                        log_syntax(unit, LOG_WARNING, filename, line, 0, "Address family not supported, ignoring: %s", rvalue);
                        return 0;
                }

                p->type = SOCKET_SOCKET;
        }

        p->fd = -EBADF;
        p->auxiliary_fds = NULL;
        p->n_auxiliary_fds = 0;
        p->socket = s;

        /* Append to the end of the socket's port list. */
        tail = LIST_FIND_TAIL(port, s->ports);
        LIST_INSERT_AFTER(port, s->ports, tail, p);

        /* Ownership transferred to the list; prevent the _cleanup_free_ from freeing it. */
        p = NULL;

        return 0;
}
+
+int config_parse_exec_nice(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = ASSERT_PTR(data);
+ int priority, r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ c->nice_set = false;
+ return 0;
+ }
+
+ r = parse_nice(rvalue, &priority);
+ if (r < 0) {
+ if (r == -ERANGE)
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Nice priority out of range, ignoring: %s", rvalue);
+ else
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse nice priority '%s', ignoring: %m", rvalue);
+ return 0;
+ }
+
+ c->nice = priority;
+ c->nice_set = true;
+
+ return 0;
+}
+
+int config_parse_exec_oom_score_adjust(
+ const char* unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = ASSERT_PTR(data);
+ int oa, r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ c->oom_score_adjust_set = false;
+ return 0;
+ }
+
+ r = parse_oom_score_adjust(rvalue, &oa);
+ if (r < 0) {
+ if (r == -ERANGE)
+ log_syntax(unit, LOG_WARNING, filename, line, r, "OOM score adjust value out of range, ignoring: %s", rvalue);
+ else
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse the OOM score adjust value '%s', ignoring: %m", rvalue);
+ return 0;
+ }
+
+ c->oom_score_adjust = oa;
+ c->oom_score_adjust_set = true;
+
+ return 0;
+}
+
+int config_parse_exec_coredump_filter(
+ const char* unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = ASSERT_PTR(data);
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ c->coredump_filter = 0;
+ c->coredump_filter_set = false;
+ return 0;
+ }
+
+ uint64_t f;
+ r = coredump_filter_mask_from_string(rvalue, &f);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to parse the CoredumpFilter=%s, ignoring: %m", rvalue);
+ return 0;
+ }
+
+ c->coredump_filter |= f;
+ c->coredump_filter_set = true;
+ return 0;
+}
+
+int config_parse_kill_mode(
+ const char* unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ KillMode *k = data, m;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+ assert(data);
+
+ if (isempty(rvalue)) {
+ *k = KILL_CONTROL_GROUP;
+ return 0;
+ }
+
+ m = kill_mode_from_string(rvalue);
+ if (m < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, m,
+ "Failed to parse kill mode specification, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ if (m == KILL_NONE)
+ log_syntax(unit, LOG_WARNING, filename, line, 0,
+ "Unit uses KillMode=none. "
+ "This is unsafe, as it disables systemd's process lifecycle management for the service. "
+ "Please update the service to use a safer KillMode=, such as 'mixed' or 'control-group'. "
+ "Support for KillMode=none is deprecated and will eventually be removed.");
+
+ *k = m;
+ return 0;
+}
+
/* Parser for ExecStart= and related settings: parses one or more command lines (separated by
 * unquoted ';' tokens), handling the executable-name prefix characters (-, @, :, +, !, !!),
 * expanding unit specifiers in the executable path and each argument, and appending the
 * resulting ExecCommand entries to the command list selected via ltype. An empty assignment
 * resets the list. */
int config_parse_exec(
                const char *unit,
                const char *filename,
                unsigned line,
                const char *section,
                unsigned section_line,
                const char *lvalue,
                int ltype,
                const char *rvalue,
                void *data,
                void *userdata) {

        ExecCommand **e = ASSERT_PTR(data);
        const Unit *u = userdata;
        const char *p;
        bool semicolon;
        int r;

        assert(filename);
        assert(lvalue);
        assert(rvalue);

        /* 'data' points at an array of command lists; ltype selects which one this setting feeds. */
        e += ltype;

        if (isempty(rvalue)) {
                /* An empty assignment resets the list */
                *e = exec_command_free_list(*e);
                return 0;
        }

        p = rvalue;
        do {
                _cleanup_free_ char *path = NULL, *firstword = NULL;
                ExecCommandFlags flags = 0;
                bool ignore = false, separate_argv0 = false;
                _cleanup_free_ ExecCommand *nce = NULL;
                _cleanup_strv_free_ char **n = NULL;
                size_t nlen = 0;
                const char *f;

                semicolon = false;

                r = extract_first_word_and_warn(&p, &firstword, NULL, EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE, unit, filename, line, rvalue);
                if (r <= 0)
                        return 0;

                /* A lone ";" is a separator. Let's make sure we don't treat it as an executable name. */
                if (streq(firstword, ";")) {
                        semicolon = true;
                        continue;
                }

                f = firstword;
                for (;;) {
                        /* We accept an absolute path as first argument. If it's prefixed with - and the path doesn't
                         * exist, we ignore it instead of erroring out; if it's prefixed with @, we allow overriding of
                         * argv[0]; if it's prefixed with :, we will not do environment variable substitution;
                         * if it's prefixed with +, it will be run with full privileges and no sandboxing; if
                         * it's prefixed with '!' we apply sandboxing, but do not change user/group credentials; if
                         * it's prefixed with '!!', then we apply user/group credentials if the kernel supports ambient
                         * capabilities -- if it doesn't we don't apply the credentials themselves, but do apply most
                         * other sandboxing, with some special exceptions for changing UID.
                         *
                         * The idea is that '!!' may be used to write services that can take benefit of systemd's
                         * UID/GID dropping if the kernel supports ambient creds, but provide an automatic fallback to
                         * privilege dropping within the daemon if the kernel does not offer that. */

                        if (*f == '-' && !(flags & EXEC_COMMAND_IGNORE_FAILURE)) {
                                flags |= EXEC_COMMAND_IGNORE_FAILURE;
                                ignore = true;
                        } else if (*f == '@' && !separate_argv0)
                                separate_argv0 = true;
                        else if (*f == ':' && !(flags & EXEC_COMMAND_NO_ENV_EXPAND))
                                flags |= EXEC_COMMAND_NO_ENV_EXPAND;
                        else if (*f == '+' && !(flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID|EXEC_COMMAND_AMBIENT_MAGIC)))
                                flags |= EXEC_COMMAND_FULLY_PRIVILEGED;
                        else if (*f == '!' && !(flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID|EXEC_COMMAND_AMBIENT_MAGIC)))
                                flags |= EXEC_COMMAND_NO_SETUID;
                        else if (*f == '!' && !(flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_AMBIENT_MAGIC))) {
                                /* Second '!': upgrade from plain '!' to the '!!' ambient-capability mode. */
                                flags &= ~EXEC_COMMAND_NO_SETUID;
                                flags |= EXEC_COMMAND_AMBIENT_MAGIC;
                        } else
                                break;
                        f++;
                }

                r = unit_path_printf(u, f, &path);
                if (r < 0) {
                        log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, r,
                                   "Failed to resolve unit specifiers in '%s'%s: %m",
                                   f, ignore ? ", ignoring" : "");
                        return ignore ? 0 : -ENOEXEC;
                }

                if (isempty(path)) {
                        /* First word is either "-" or "@" with no command. */
                        log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0,
                                   "Empty path in command line%s: '%s'",
                                   ignore ? ", ignoring" : "", rvalue);
                        return ignore ? 0 : -ENOEXEC;
                }
                if (!string_is_safe(path)) {
                        log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0,
                                   "Executable name contains special characters%s: %s",
                                   ignore ? ", ignoring" : "", path);
                        return ignore ? 0 : -ENOEXEC;
                }
                if (endswith(path, "/")) {
                        log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0,
                                   "Executable path specifies a directory%s: %s",
                                   ignore ? ", ignoring" : "", path);
                        return ignore ? 0 : -ENOEXEC;
                }

                if (!(path_is_absolute(path) ? path_is_valid(path) : filename_is_valid(path))) {
                        log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0,
                                   "Neither a valid executable name nor an absolute path%s: %s",
                                   ignore ? ", ignoring" : "", path);
                        return ignore ? 0 : -ENOEXEC;
                }

                if (!separate_argv0) {
                        /* Without '@', argv[0] is simply a copy of the executable path. */
                        char *w = NULL;

                        if (!GREEDY_REALLOC0(n, nlen + 2))
                                return log_oom();

                        w = strdup(path);
                        if (!w)
                                return log_oom();
                        n[nlen++] = w;
                        n[nlen] = NULL;
                }

                path_simplify(path);

                /* Collect the remaining arguments, up to an unquoted ';' separator. */
                while (!isempty(p)) {
                        _cleanup_free_ char *word = NULL, *resolved = NULL;

                        /* Check explicitly for an unquoted semicolon as
                         * command separator token. */
                        if (p[0] == ';' && (!p[1] || strchr(WHITESPACE, p[1]))) {
                                p++;
                                p += strspn(p, WHITESPACE);
                                semicolon = true;
                                break;
                        }

                        /* Check for \; explicitly, to not confuse it with \\; or "\;" or "\\;" etc.
                         * extract_first_word() would return the same for all of those. */
                        if (p[0] == '\\' && p[1] == ';' && (!p[2] || strchr(WHITESPACE, p[2]))) {
                                char *w;

                                p += 2;
                                p += strspn(p, WHITESPACE);

                                if (!GREEDY_REALLOC0(n, nlen + 2))
                                        return log_oom();

                                w = strdup(";");
                                if (!w)
                                        return log_oom();
                                n[nlen++] = w;
                                n[nlen] = NULL;
                                continue;
                        }

                        r = extract_first_word_and_warn(&p, &word, NULL, EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE, unit, filename, line, rvalue);
                        if (r == 0)
                                break;
                        if (r < 0)
                                return ignore ? 0 : -ENOEXEC;

                        r = unit_full_printf(u, word, &resolved);
                        if (r < 0) {
                                log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, r,
                                           "Failed to resolve unit specifiers in %s%s: %m",
                                           word, ignore ? ", ignoring" : "");
                                return ignore ? 0 : -ENOEXEC;
                        }

                        if (!GREEDY_REALLOC(n, nlen + 2))
                                return log_oom();

                        n[nlen++] = TAKE_PTR(resolved);
                        n[nlen] = NULL;
                }

                if (!n || !n[0]) {
                        log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, 0,
                                   "Empty executable name or zeroeth argument%s: %s",
                                   ignore ? ", ignoring" : "", rvalue);
                        return ignore ? 0 : -ENOEXEC;
                }

                nce = new0(ExecCommand, 1);
                if (!nce)
                        return log_oom();

                nce->argv = TAKE_PTR(n);
                nce->path = TAKE_PTR(path);
                nce->flags = flags;

                /* Appends the new entry to the list and takes ownership of it. */
                exec_command_append_list(e, nce);

                /* Do not _cleanup_free_ these. */
                nce = NULL;

                rvalue = p;
        } while (semicolon);

        return 0;
}
+
+int config_parse_socket_bindtodevice(
+ const char* unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ Socket *s = ASSERT_PTR(data);
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue) || streq(rvalue, "*")) {
+ s->bind_to_device = mfree(s->bind_to_device);
+ return 0;
+ }
+
+ if (!ifname_valid(rvalue)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid interface name, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ return free_and_strdup_warn(&s->bind_to_device, rvalue);
+}
+
/* Parser for StandardInput=: accepts "fd:<name>" (named file descriptor), "file:<path>"
 * (absolute path), or one of the plain ExecInput enum strings, and records the result in the
 * ExecContext. */
int config_parse_exec_input(
                const char *unit,
                const char *filename,
                unsigned line,
                const char *section,
                unsigned section_line,
                const char *lvalue,
                int ltype,
                const char *rvalue,
                void *data,
                void *userdata) {

        ExecContext *c = ASSERT_PTR(data);
        const Unit *u = userdata;
        const char *n;
        ExecInput ei;
        int r;

        assert(filename);
        assert(line);
        assert(rvalue);

        n = startswith(rvalue, "fd:");
        if (n) {
                /* "fd:<name>" — a file descriptor passed in by name. */
                _cleanup_free_ char *resolved = NULL;

                r = unit_fd_printf(u, n, &resolved);
                if (r < 0) {
                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", n);
                        return 0;
                }

                /* An empty name means "no explicit name"; otherwise it must be a valid fd name. */
                if (isempty(resolved))
                        resolved = mfree(resolved);
                else if (!fdname_is_valid(resolved)) {
                        log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid file descriptor name, ignoring: %s", resolved);
                        return 0;
                }

                free_and_replace(c->stdio_fdname[STDIN_FILENO], resolved);

                ei = EXEC_INPUT_NAMED_FD;

        } else if ((n = startswith(rvalue, "file:"))) {
                /* "file:<path>" — stdin connected to an absolute path. */
                _cleanup_free_ char *resolved = NULL;

                r = unit_path_printf(u, n, &resolved);
                if (r < 0) {
                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", n);
                        return 0;
                }

                r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE | PATH_CHECK_FATAL, unit, filename, line, lvalue);
                if (r < 0)
                        return 0;

                free_and_replace(c->stdio_file[STDIN_FILENO], resolved);

                ei = EXEC_INPUT_FILE;

        } else {
                /* Plain enum value: "null", "tty", "data", … */
                ei = exec_input_from_string(rvalue);
                if (ei < 0) {
                        log_syntax(unit, LOG_WARNING, filename, line, ei, "Failed to parse input specifier, ignoring: %s", rvalue);
                        return 0;
                }
        }

        c->std_input = ei;
        return 0;
}
+
/* Parser for StandardInputText=: C-unescapes the value, resolves unit specifiers, and appends
 * the text plus a trailing newline to the accumulated stdin data buffer. An empty assignment
 * resets the buffer. */
int config_parse_exec_input_text(
                const char *unit,
                const char *filename,
                unsigned line,
                const char *section,
                unsigned section_line,
                const char *lvalue,
                int ltype,
                const char *rvalue,
                void *data,
                void *userdata) {

        _cleanup_free_ char *unescaped = NULL, *resolved = NULL;
        ExecContext *c = ASSERT_PTR(data);
        const Unit *u = userdata;
        int r;

        assert(filename);
        assert(line);
        assert(rvalue);

        if (isempty(rvalue)) {
                /* Reset if the empty string is assigned */
                c->stdin_data = mfree(c->stdin_data);
                c->stdin_data_size = 0;
                return 0;
        }

        ssize_t l = cunescape(rvalue, 0, &unescaped);
        if (l < 0) {
                log_syntax(unit, LOG_WARNING, filename, line, l,
                           "Failed to decode C escaped text '%s', ignoring: %m", rvalue);
                return 0;
        }

        /* Specifier expansion is size-limited so the buffer cap below cannot be bypassed. */
        r = unit_full_printf_full(u, unescaped, EXEC_STDIN_DATA_MAX, &resolved);
        if (r < 0) {
                log_syntax(unit, LOG_WARNING, filename, line, r,
                           "Failed to resolve unit specifiers in '%s', ignoring: %m", unescaped);
                return 0;
        }

        /* +1 accounts for the newline appended below. */
        size_t sz = strlen(resolved);
        if (c->stdin_data_size + sz + 1 < c->stdin_data_size || /* check for overflow */
            c->stdin_data_size + sz + 1 > EXEC_STDIN_DATA_MAX) {
                log_syntax(unit, LOG_WARNING, filename, line, 0,
                           "Standard input data too large (%zu), maximum of %zu permitted, ignoring.",
                           c->stdin_data_size + sz, (size_t) EXEC_STDIN_DATA_MAX);
                return 0;
        }

        void *p = realloc(c->stdin_data, c->stdin_data_size + sz + 1);
        if (!p)
                return log_oom();

        /* Copy the text after the existing data, then append '\n' (mempcpy returns the end). */
        *((char*) mempcpy((char*) p + c->stdin_data_size, resolved, sz)) = '\n';

        c->stdin_data = p;
        c->stdin_data_size += sz + 1;

        return 0;
}
+
+int config_parse_exec_input_data(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ void *p = NULL;
+ ExecContext *c = ASSERT_PTR(data);
+ size_t sz;
+ void *q;
+ int r;
+
+ assert(filename);
+ assert(line);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Reset if the empty string is assigned */
+ c->stdin_data = mfree(c->stdin_data);
+ c->stdin_data_size = 0;
+ return 0;
+ }
+
+ r = unbase64mem(rvalue, SIZE_MAX, &p, &sz);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to decode base64 data, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ assert(sz > 0);
+
+ if (c->stdin_data_size + sz < c->stdin_data_size || /* check for overflow */
+ c->stdin_data_size + sz > EXEC_STDIN_DATA_MAX) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0,
+ "Standard input data too large (%zu), maximum of %zu permitted, ignoring.",
+ c->stdin_data_size + sz, (size_t) EXEC_STDIN_DATA_MAX);
+ return 0;
+ }
+
+ q = realloc(c->stdin_data, c->stdin_data_size + sz);
+ if (!q)
+ return log_oom();
+
+ memcpy((uint8_t*) q + c->stdin_data_size, p, sz);
+
+ c->stdin_data = q;
+ c->stdin_data_size += sz;
+
+ return 0;
+}
+
/* Parser for StandardOutput= and StandardError=: accepts "fd:<name>", the obsolete "syslog"
 * spellings (mapped to the journal), "file:"/"append:"/"truncate:" path forms, or a plain
 * ExecOutput enum string, and stores both the mode and any associated fd name or file path in
 * the ExecContext slot selected by lvalue. */
int config_parse_exec_output(
                const char *unit,
                const char *filename,
                unsigned line,
                const char *section,
                unsigned section_line,
                const char *lvalue,
                int ltype,
                const char *rvalue,
                void *data,
                void *userdata) {

        _cleanup_free_ char *resolved = NULL;
        const char *n;
        ExecContext *c = ASSERT_PTR(data);
        const Unit *u = userdata;
        bool obsolete = false;
        ExecOutput eo;
        int r;

        assert(filename);
        assert(line);
        assert(lvalue);
        assert(rvalue);

        n = startswith(rvalue, "fd:");
        if (n) {
                /* "fd:<name>" — a named file descriptor. */
                r = unit_fd_printf(u, n, &resolved);
                if (r < 0) {
                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s: %m", n);
                        return 0;
                }

                if (isempty(resolved))
                        resolved = mfree(resolved);
                else if (!fdname_is_valid(resolved)) {
                        log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid file descriptor name, ignoring: %s", resolved);
                        return 0;
                }

                eo = EXEC_OUTPUT_NAMED_FD;

        } else if (streq(rvalue, "syslog")) {
                /* Obsolete spelling, silently mapped to the journal (with a notice below). */
                eo = EXEC_OUTPUT_JOURNAL;
                obsolete = true;

        } else if (streq(rvalue, "syslog+console")) {
                eo = EXEC_OUTPUT_JOURNAL_AND_CONSOLE;
                obsolete = true;

        } else if ((n = startswith(rvalue, "file:"))) {

                r = unit_path_printf(u, n, &resolved);
                if (r < 0) {
                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", n);
                        return 0;
                }

                r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE | PATH_CHECK_FATAL, unit, filename, line, lvalue);
                if (r < 0)
                        return 0;

                eo = EXEC_OUTPUT_FILE;

        } else if ((n = startswith(rvalue, "append:"))) {

                r = unit_path_printf(u, n, &resolved);
                if (r < 0) {
                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", n);
                        return 0;
                }

                r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE | PATH_CHECK_FATAL, unit, filename, line, lvalue);
                if (r < 0)
                        return 0;

                eo = EXEC_OUTPUT_FILE_APPEND;

        } else if ((n = startswith(rvalue, "truncate:"))) {

                r = unit_path_printf(u, n, &resolved);
                if (r < 0) {
                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", n);
                        return 0;
                }

                r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE | PATH_CHECK_FATAL, unit, filename, line, lvalue);
                if (r < 0)
                        return 0;

                eo = EXEC_OUTPUT_FILE_TRUNCATE;
        } else {
                /* Plain enum value: "journal", "inherit", "null", … */
                eo = exec_output_from_string(rvalue);
                if (eo < 0) {
                        log_syntax(unit, LOG_WARNING, filename, line, eo, "Failed to parse output specifier, ignoring: %s", rvalue);
                        return 0;
                }
        }

        if (obsolete)
                log_syntax(unit, LOG_NOTICE, filename, line, 0,
                           "Standard output type %s is obsolete, automatically updating to %s. Please update your unit file, and consider removing the setting altogether.",
                           rvalue, exec_output_to_string(eo));

        /* Store the parsed mode, plus either the fd name or the file path, in the slot for the
         * stream named by the lvalue. */
        if (streq(lvalue, "StandardOutput")) {
                if (eo == EXEC_OUTPUT_NAMED_FD)
                        free_and_replace(c->stdio_fdname[STDOUT_FILENO], resolved);
                else
                        free_and_replace(c->stdio_file[STDOUT_FILENO], resolved);

                c->std_output = eo;

        } else {
                assert(streq(lvalue, "StandardError"));

                if (eo == EXEC_OUTPUT_NAMED_FD)
                        free_and_replace(c->stdio_fdname[STDERR_FILENO], resolved);
                else
                        free_and_replace(c->stdio_file[STDERR_FILENO], resolved);

                c->std_error = eo;
        }

        return 0;
}
+
+int config_parse_exec_io_class(const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = ASSERT_PTR(data);
+ int x;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ c->ioprio_set = false;
+ c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
+ return 0;
+ }
+
+ x = ioprio_class_from_string(rvalue);
+ if (x < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, x, "Failed to parse IO scheduling class, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ c->ioprio = ioprio_normalize(ioprio_prio_value(x, ioprio_prio_data(c->ioprio)));
+ c->ioprio_set = true;
+
+ return 0;
+}
+
+int config_parse_exec_io_priority(const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = ASSERT_PTR(data);
+ int i, r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ c->ioprio_set = false;
+ c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
+ return 0;
+ }
+
+ r = ioprio_parse_priority(rvalue, &i);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse IO priority, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ c->ioprio = ioprio_normalize(ioprio_prio_value(ioprio_prio_class(c->ioprio), i));
+ c->ioprio_set = true;
+
+ return 0;
+}
+
+int config_parse_exec_cpu_sched_policy(const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = ASSERT_PTR(data);
+ int x;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ c->cpu_sched_set = false;
+ c->cpu_sched_policy = SCHED_OTHER;
+ c->cpu_sched_priority = 0;
+ return 0;
+ }
+
+ x = sched_policy_from_string(rvalue);
+ if (x < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, x, "Failed to parse CPU scheduling policy, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ c->cpu_sched_policy = x;
+ /* Moving to or from real-time policy? We need to adjust the priority */
+ c->cpu_sched_priority = CLAMP(c->cpu_sched_priority, sched_get_priority_min(x), sched_get_priority_max(x));
+ c->cpu_sched_set = true;
+
+ return 0;
+}
+
+/* Parser for MountAPIVFS=. A plain boolean; the separate *_set flag records whether the
+ * option was configured explicitly so an unset option can fall back to a computed default. */
+int config_parse_exec_mount_apivfs(const char *unit,
+                                   const char *filename,
+                                   unsigned line,
+                                   const char *section,
+                                   unsigned section_line,
+                                   const char *lvalue,
+                                   int ltype,
+                                   const char *rvalue,
+                                   void *data,
+                                   void *userdata) {
+
+        ExecContext *c = ASSERT_PTR(data);
+        int k;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets both the value and the explicitly-set marker */
+                c->mount_apivfs_set = false;
+                c->mount_apivfs = false;
+                return 0;
+        }
+
+        k = parse_boolean(rvalue);
+        if (k < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, k,
+                           "Failed to parse boolean value, ignoring: %s",
+                           rvalue);
+                return 0;
+        }
+
+        c->mount_apivfs_set = true;
+        c->mount_apivfs = k;
+        return 0;
+}
+
+/* Parser for NUMAMask=. The special value "all" selects every NUMA node; otherwise the value
+ * is parsed as a CPU-set-style node list and merged into the existing mask. */
+int config_parse_numa_mask(const char *unit,
+                           const char *filename,
+                           unsigned line,
+                           const char *section,
+                           unsigned section_line,
+                           const char *lvalue,
+                           int ltype,
+                           const char *rvalue,
+                           void *data,
+                           void *userdata) {
+        int r;
+        NUMAPolicy *p = ASSERT_PTR(data);
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (streq(rvalue, "all")) {
+                r = numa_mask_add_all(&p->nodes);
+                if (r < 0)
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Failed to create NUMA mask representing \"all\" NUMA nodes, ignoring: %m");
+        } else {
+                r = parse_cpu_set_extend(rvalue, &p->nodes, true, unit, filename, line, lvalue);
+                if (r < 0)
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse NUMA node mask, ignoring: %s", rvalue);
+        }
+
+        return 0;
+}
+
+/* Parser for CPUSchedulingPriority=. Only the generic 0..99 envelope is validated here; the
+ * policy-specific range cannot be checked yet because the policy may be assigned later. */
+int config_parse_exec_cpu_sched_prio(const char *unit,
+                                     const char *filename,
+                                     unsigned line,
+                                     const char *section,
+                                     unsigned section_line,
+                                     const char *lvalue,
+                                     int ltype,
+                                     const char *rvalue,
+                                     void *data,
+                                     void *userdata) {
+
+        ExecContext *c = ASSERT_PTR(data);
+        int i, r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        r = safe_atoi(rvalue, &i);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse CPU scheduling priority, ignoring: %s", rvalue);
+                return 0;
+        }
+
+        /* On Linux RR/FIFO range from 1 to 99 and OTHER/BATCH may only be 0. Policy might be set later so
+         * we do not check the precise range, but only the generic outer bounds. */
+        if (i < 0 || i > 99) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "CPU scheduling priority is out of range, ignoring: %s", rvalue);
+                return 0;
+        }
+
+        c->cpu_sched_priority = i;
+        c->cpu_sched_set = true;
+
+        return 0;
+}
+
+/* Parser for RootImageOptions=: a whitespace-separated list of [partition:]mount-options
+ * entries ("root" is implied when no partition is given). Successfully parsed entries are
+ * appended to any previously configured list; unit specifiers in the options are resolved. */
+int config_parse_root_image_options(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
+        _cleanup_strv_free_ char **l = NULL;
+        ExecContext *c = ASSERT_PTR(data);
+        const Unit *u = userdata;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets the whole list */
+                c->root_image_options = mount_options_free_all(c->root_image_options);
+                return 0;
+        }
+
+        r = strv_split_colon_pairs(&l, rvalue);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s, ignoring: %s", lvalue, rvalue);
+                return 0;
+        }
+
+        STRV_FOREACH_PAIR(first, second, l) {
+                MountOptions *o = NULL;
+                _cleanup_free_ char *mount_options_resolved = NULL;
+                const char *mount_options = NULL, *partition = "root";
+                PartitionDesignator partition_designator;
+
+                /* Format is either 'root:foo' or 'foo' (root is implied) */
+                if (!isempty(*second)) {
+                        partition = *first;
+                        mount_options = *second;
+                } else
+                        mount_options = *first;
+
+                partition_designator = partition_designator_from_string(partition);
+                if (partition_designator < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, partition_designator,
+                                   "Invalid partition name %s, ignoring", partition);
+                        continue;
+                }
+                r = unit_full_printf(u, mount_options, &mount_options_resolved);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", mount_options);
+                        continue;
+                }
+
+                o = new(MountOptions, 1);
+                if (!o)
+                        return log_oom();
+                *o = (MountOptions) {
+                        .partition_designator = partition_designator,
+                        .options = TAKE_PTR(mount_options_resolved),
+                };
+                LIST_APPEND(mount_options, options, TAKE_PTR(o));
+        }
+
+        if (options)
+                LIST_JOIN(mount_options, c->root_image_options, options);
+        else
+                /* empty spaces/separators only */
+                c->root_image_options = mount_options_free_all(c->root_image_options);
+
+        return 0;
+}
+
+/* Parser for RootHash=. Accepts either an absolute path to a file carrying the dm-verity
+ * root hash (stored in root_hash_path) or the hash itself, hex-encoded, inline (decoded
+ * into root_hash/root_hash_size). The two forms are mutually exclusive, hence setting one
+ * clears the other. Invalid values are logged and ignored; only OOM is fatal. */
+int config_parse_exec_root_hash(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_free_ void *roothash_decoded = NULL;
+        ExecContext *c = ASSERT_PTR(data);
+        size_t roothash_decoded_size = 0;
+        int r;
+
+        assert(filename);
+        assert(line);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Reset if the empty string is assigned */
+                c->root_hash_path = mfree(c->root_hash_path);
+                c->root_hash = mfree(c->root_hash);
+                c->root_hash_size = 0;
+                return 0;
+        }
+
+        if (path_is_absolute(rvalue)) {
+                /* We have the path to a roothash to load and decode, eg: RootHash=/foo/bar.roothash */
+                _cleanup_free_ char *p = NULL;
+
+                p = strdup(rvalue);
+                if (!p)
+                        return log_oom(); /* log OOM like the sibling parsers, instead of a bare -ENOMEM */
+
+                free_and_replace(c->root_hash_path, p);
+                c->root_hash = mfree(c->root_hash);
+                c->root_hash_size = 0;
+                return 0;
+        }
+
+        /* We have a roothash to decode, eg: RootHash=012345789abcdef */
+        r = unhexmem(rvalue, strlen(rvalue), &roothash_decoded, &roothash_decoded_size);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to decode RootHash=, ignoring: %s", rvalue);
+                return 0;
+        }
+        if (roothash_decoded_size < sizeof(sd_id128_t)) {
+                /* Require at least 128 bits; anything shorter cannot be a real digest */
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "RootHash= is too short, ignoring: %s", rvalue);
+                return 0;
+        }
+
+        free_and_replace(c->root_hash, roothash_decoded);
+        c->root_hash_size = roothash_decoded_size;
+        c->root_hash_path = mfree(c->root_hash_path);
+
+        return 0;
+}
+
+/* Parser for RootHashSignature=. Accepts either an absolute path to a signature file
+ * (stored in root_hash_sig_path) or an inline "base64:"-prefixed signature (decoded into
+ * root_hash_sig/root_hash_sig_size). The two forms are mutually exclusive, hence setting
+ * one clears the other. */
+int config_parse_exec_root_hash_sig(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_free_ void *roothash_sig_decoded = NULL;
+        char *value;
+        ExecContext *c = ASSERT_PTR(data);
+        size_t roothash_sig_decoded_size = 0;
+        int r;
+
+        assert(filename);
+        assert(line);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Reset if the empty string is assigned */
+                c->root_hash_sig_path = mfree(c->root_hash_sig_path);
+                c->root_hash_sig = mfree(c->root_hash_sig);
+                c->root_hash_sig_size = 0;
+                return 0;
+        }
+
+        if (path_is_absolute(rvalue)) {
+                /* We have the path to a roothash signature to load and decode, eg: RootHashSignature=/foo/bar.roothash.p7s */
+                _cleanup_free_ char *p = NULL;
+
+                p = strdup(rvalue);
+                if (!p)
+                        return log_oom();
+
+                free_and_replace(c->root_hash_sig_path, p);
+                c->root_hash_sig = mfree(c->root_hash_sig);
+                c->root_hash_sig_size = 0;
+                return 0;
+        }
+
+        if (!(value = startswith(rvalue, "base64:"))) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "Failed to decode RootHashSignature=, not a path but doesn't start with 'base64:', ignoring: %s", rvalue);
+                return 0;
+        }
+
+        /* We have a roothash signature to decode, eg: RootHashSignature=base64:012345789abcdef */
+        r = unbase64mem(value, strlen(value), &roothash_sig_decoded, &roothash_sig_decoded_size);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to decode RootHashSignature=, ignoring: %s", rvalue);
+                return 0;
+        }
+
+        free_and_replace(c->root_hash_sig, roothash_sig_decoded);
+        c->root_hash_sig_size = roothash_sig_decoded_size;
+        c->root_hash_sig_path = mfree(c->root_hash_sig_path);
+
+        return 0;
+}
+
+/* Parser for CPUAffinity=. The special value "numa" defers the affinity to the configured
+ * NUMA policy; any other value is parsed (after specifier resolution) as a CPU set and
+ * merged into the existing set. */
+int config_parse_exec_cpu_affinity(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        ExecContext *c = ASSERT_PTR(data);
+        const Unit *u = userdata;
+        _cleanup_free_ char *k = NULL;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (streq(rvalue, "numa")) {
+                /* "numa" switches to NUMA-derived affinity and drops any explicit CPU set */
+                c->cpu_affinity_from_numa = true;
+                cpu_set_reset(&c->cpu_set);
+
+                return 0;
+        }
+
+        r = unit_full_printf(u, rvalue, &k);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to resolve unit specifiers in '%s', ignoring: %m",
+                           rvalue);
+                return 0;
+        }
+
+        r = parse_cpu_set_extend(k, &c->cpu_set, true, unit, filename, line, lvalue);
+        if (r >= 0)
+                /* An explicit CPU set overrides a previously configured "numa" */
+                c->cpu_affinity_from_numa = false;
+
+        return 0;
+}
+
+/* Parser for CapabilityBoundingSet= and the other capability-mask settings. A leading '~'
+ * inverts the listed capabilities. The first assignment (or "" / "~", or a still-default
+ * mask) replaces the stored mask outright; subsequent assignments are merged into it
+ * (OR, or AND-NOT when inverted). Note: the former local "initial" was assigned but never
+ * read, so it has been dropped. */
+int config_parse_capability_set(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        uint64_t *capability_set = ASSERT_PTR(data);
+        uint64_t sum = 0, def;
+        bool invert = false;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (rvalue[0] == '~') {
+                invert = true;
+                rvalue++;
+        }
+
+        /* "def" is the value the mask holds while still unconfigured, used below to detect
+         * whether we are looking at a first assignment (replace) or a follow-up (merge). */
+        if (streq(lvalue, "CapabilityBoundingSet"))
+                def = CAP_MASK_UNSET; /* not set */
+        else
+                def = 0; /* all bits off */
+
+        r = capability_set_from_string(rvalue, &sum);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s= specifier '%s', ignoring: %m", lvalue, rvalue);
+                return 0;
+        }
+
+        if (sum == 0 || *capability_set == def)
+                /* "", "~" or uninitialized data -> replace */
+                *capability_set = invert ? ~sum : sum;
+        else {
+                /* previous data -> merge */
+                if (invert)
+                        *capability_set &= ~sum;
+                else
+                        *capability_set |= sum;
+        }
+
+        return 0;
+}
+
+/* Parser for SELinuxContext=. A leading '-' marks the context as best-effort: resolution
+ * failures are then only warned about and ignored, otherwise they are fatal (-ENOEXEC). */
+int config_parse_exec_selinux_context(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        ExecContext *c = ASSERT_PTR(data);
+        const Unit *u = userdata;
+        bool ignore;
+        char *k;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets both the context and the ignore flag */
+                c->selinux_context = mfree(c->selinux_context);
+                c->selinux_context_ignore = false;
+                return 0;
+        }
+
+        if (rvalue[0] == '-') {
+                ignore = true;
+                rvalue++;
+        } else
+                ignore = false;
+
+        r = unit_full_printf(u, rvalue, &k);
+        if (r < 0) {
+                log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, r,
+                           "Failed to resolve unit specifiers in '%s'%s: %m",
+                           rvalue, ignore ? ", ignoring" : "");
+                return ignore ? 0 : -ENOEXEC;
+        }
+
+        /* ownership of k is transferred to the context */
+        free_and_replace(c->selinux_context, k);
+        c->selinux_context_ignore = ignore;
+
+        return 0;
+}
+
+/* Parser for AppArmorProfile=. Mirrors config_parse_exec_selinux_context(): a leading '-'
+ * makes specifier-resolution failures non-fatal. */
+int config_parse_exec_apparmor_profile(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        ExecContext *c = ASSERT_PTR(data);
+        const Unit *u = userdata;
+        bool ignore;
+        char *k;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets both the profile and the ignore flag */
+                c->apparmor_profile = mfree(c->apparmor_profile);
+                c->apparmor_profile_ignore = false;
+                return 0;
+        }
+
+        if (rvalue[0] == '-') {
+                ignore = true;
+                rvalue++;
+        } else
+                ignore = false;
+
+        r = unit_full_printf(u, rvalue, &k);
+        if (r < 0) {
+                log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, r,
+                           "Failed to resolve unit specifiers in '%s'%s: %m",
+                           rvalue, ignore ? ", ignoring" : "");
+                return ignore ? 0 : -ENOEXEC;
+        }
+
+        /* ownership of k is transferred to the context */
+        free_and_replace(c->apparmor_profile, k);
+        c->apparmor_profile_ignore = ignore;
+
+        return 0;
+}
+
+/* Parser for SmackProcessLabel=. Mirrors config_parse_exec_selinux_context(): a leading '-'
+ * makes specifier-resolution failures non-fatal. */
+int config_parse_exec_smack_process_label(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        ExecContext *c = ASSERT_PTR(data);
+        const Unit *u = userdata;
+        bool ignore;
+        char *k;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets both the label and the ignore flag */
+                c->smack_process_label = mfree(c->smack_process_label);
+                c->smack_process_label_ignore = false;
+                return 0;
+        }
+
+        if (rvalue[0] == '-') {
+                ignore = true;
+                rvalue++;
+        } else
+                ignore = false;
+
+        r = unit_full_printf(u, rvalue, &k);
+        if (r < 0) {
+                log_syntax(unit, ignore ? LOG_WARNING : LOG_ERR, filename, line, r,
+                           "Failed to resolve unit specifiers in '%s'%s: %m",
+                           rvalue, ignore ? ", ignoring" : "");
+                return ignore ? 0 : -ENOEXEC;
+        }
+
+        /* ownership of k is transferred to the context */
+        free_and_replace(c->smack_process_label, k);
+        c->smack_process_label_ignore = ignore;
+
+        return 0;
+}
+
+/* Parser for the various timer settings (OnCalendar=, OnBootSec=, OnUnitActiveSec=, ...).
+ * ltype selects the TimerBase; TIMER_CALENDAR values are parsed as calendar specs, all
+ * others as time spans. Each successfully parsed value is prepended to t->values. */
+int config_parse_timer(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_(calendar_spec_freep) CalendarSpec *c = NULL;
+        _cleanup_free_ char *k = NULL;
+        const Unit *u = userdata;
+        Timer *t = ASSERT_PTR(data);
+        usec_t usec = 0;
+        TimerValue *v;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets list */
+                timer_free_values(t);
+                return 0;
+        }
+
+        r = unit_full_printf(u, rvalue, &k);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
+                return 0;
+        }
+
+        if (ltype == TIMER_CALENDAR) {
+                r = calendar_spec_from_string(k, &c);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse calendar specification, ignoring: %s", k);
+                        return 0;
+                }
+        } else {
+                r = parse_sec(k, &usec);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse timer value, ignoring: %s", k);
+                        return 0;
+                }
+        }
+
+        v = new(TimerValue, 1);
+        if (!v)
+                return log_oom();
+
+        *v = (TimerValue) {
+                .base = ltype,
+                .value = usec,
+                .calendar_spec = TAKE_PTR(c),
+        };
+
+        LIST_PREPEND(value, t->values, v);
+
+        return 0;
+}
+
+/* Parser for Unit= in triggering units (path/timer). Only a single trigger target is
+ * allowed, it must be a valid unit name of a known type, must not be the unit itself,
+ * and gets Before= plus Triggers= dependencies added. */
+int config_parse_trigger_unit(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_free_ char *p = NULL;
+        Unit *u = ASSERT_PTR(data);
+        UnitType type;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (UNIT_TRIGGER(u)) {
+                /* A trigger target was already configured earlier */
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Multiple units to trigger specified, ignoring: %s", rvalue);
+                return 0;
+        }
+
+        r = unit_name_printf(u, rvalue, &p);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue);
+                return 0;
+        }
+
+        type = unit_name_to_type(p);
+        if (type < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, type, "Unit type not valid, ignoring: %s", rvalue);
+                return 0;
+        }
+        if (unit_has_name(u, p)) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Units cannot trigger themselves, ignoring: %s", rvalue);
+                return 0;
+        }
+
+        r = unit_add_two_dependencies_by_name(u, UNIT_BEFORE, UNIT_TRIGGERS, p, true, UNIT_DEPENDENCY_FILE);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to add trigger on %s, ignoring: %m", p);
+                return 0;
+        }
+
+        return 0;
+}
+
+/* Parser for the path-watch settings of .path units (PathExists=, PathChanged=, ...).
+ * The lvalue itself selects the PathType; the rvalue, after specifier resolution, must be
+ * an absolute path. Each spec is prepended to p->specs. */
+int config_parse_path_spec(const char *unit,
+                           const char *filename,
+                           unsigned line,
+                           const char *section,
+                           unsigned section_line,
+                           const char *lvalue,
+                           int ltype,
+                           const char *rvalue,
+                           void *data,
+                           void *userdata) {
+
+        Path *p = ASSERT_PTR(data);
+        PathSpec *s;
+        PathType b;
+        _cleanup_free_ char *k = NULL;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment clears list */
+                path_free_specs(p);
+                return 0;
+        }
+
+        b = path_type_from_string(lvalue);
+        if (b < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, b, "Failed to parse path type, ignoring: %s", lvalue);
+                return 0;
+        }
+
+        r = unit_path_printf(UNIT(p), rvalue, &k);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue);
+                return 0;
+        }
+
+        r = path_simplify_and_warn(k, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+        if (r < 0)
+                return 0;
+
+        s = new0(PathSpec, 1);
+        if (!s)
+                return log_oom();
+
+        s->unit = UNIT(p);
+        s->path = TAKE_PTR(k);
+        s->type = b;
+        s->inotify_fd = -EBADF; /* no inotify watch established yet */
+
+        LIST_PREPEND(spec, p->specs, s);
+
+        return 0;
+}
+
+/* Parser for Service= in .socket units. The referenced unit must be of type .service and
+ * is loaded immediately; on success a unit reference to it is stored in the socket. */
+int config_parse_socket_service(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        _cleanup_free_ char *p = NULL;
+        Socket *s = ASSERT_PTR(data);
+        Unit *x;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        r = unit_name_printf(UNIT(s), rvalue, &p);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue);
+                return 0;
+        }
+
+        if (!endswith(p, ".service")) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Unit must be of type service, ignoring: %s", rvalue);
+                return 0;
+        }
+
+        r = manager_load_unit(UNIT(s)->manager, p, NULL, &error, &x);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to load unit %s, ignoring: %s", rvalue, bus_error_message(&error, r));
+                return 0;
+        }
+
+        unit_ref_set(&s->service, UNIT(s), x);
+
+        return 0;
+}
+
+/* Parser for FileDescriptorName= in .socket units. Resolves specifiers and validates the
+ * result as a file descriptor name before storing it. */
+int config_parse_fdname(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_free_ char *p = NULL;
+        Socket *s = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets to the default name */
+                s->fdname = mfree(s->fdname);
+                return 0;
+        }
+
+        r = unit_fd_printf(UNIT(s), rvalue, &p);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
+                return 0;
+        }
+
+        if (!fdname_is_valid(p)) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid file descriptor name, ignoring: %s", p);
+                return 0;
+        }
+
+        return free_and_replace(s->fdname, p);
+}
+
+/* Parser for Sockets= in .service units: a whitespace-separated list of .socket unit names.
+ * For each valid entry, Wants=+After= and TriggeredBy= dependencies are added; per-entry
+ * failures are logged and skipped. The loop only exits via return. */
+int config_parse_service_sockets(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        Service *s = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        for (const char *p = rvalue;;) {
+                _cleanup_free_ char *word = NULL, *k = NULL;
+
+                r = extract_first_word(&p, &word, NULL, 0);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Trailing garbage in sockets, ignoring: %s", rvalue);
+                        return 0;
+                }
+                if (r == 0)
+                        return 0;
+
+                r = unit_name_printf(UNIT(s), word, &k);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", word);
+                        continue;
+                }
+
+                if (!endswith(k, ".socket")) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0, "Unit must be of type socket, ignoring: %s", k);
+                        continue;
+                }
+
+                r = unit_add_two_dependencies_by_name(UNIT(s), UNIT_WANTS, UNIT_AFTER, k, true, UNIT_DEPENDENCY_FILE);
+                if (r < 0)
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to add dependency on %s, ignoring: %m", k);
+
+                r = unit_add_dependency_by_name(UNIT(s), UNIT_TRIGGERED_BY, k, true, UNIT_DEPENDENCY_FILE);
+                if (r < 0)
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to add dependency on %s, ignoring: %m", k);
+        }
+}
+
+/* Parser for BusName=. Resolves specifiers (capped at the bus name length limit), validates
+ * the result as a D-Bus service name and then delegates storage to config_parse_string(). */
+int config_parse_bus_name(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_free_ char *k = NULL;
+        const Unit *u = ASSERT_PTR(userdata);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        r = unit_full_printf_full(u, rvalue, SD_BUS_MAXIMUM_NAME_LENGTH, &k);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue);
+                return 0;
+        }
+
+        if (!sd_bus_service_name_is_valid(k)) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid bus name, ignoring: %s", k);
+                return 0;
+        }
+
+        return config_parse_string(unit, filename, line, section, section_line, lvalue, ltype, k, data, userdata);
+}
+
+/* Parser for TimeoutSec= and TimeoutStartSec= in .service units. Both set the start
+ * timeout; TimeoutSec= additionally sets the stop timeout to the same value. */
+int config_parse_service_timeout(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        Service *s = ASSERT_PTR(userdata);
+        usec_t usec;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        /* This is called for two cases: TimeoutSec= and TimeoutStartSec=. */
+
+        /* Traditionally, these options accepted 0 to disable the timeouts. However, a timeout of 0 suggests it happens
+         * immediately, hence fix this to become USEC_INFINITY instead. This is in-line with how we internally handle
+         * all other timeouts. */
+        r = parse_sec_fix_0(rvalue, &usec);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s= parameter, ignoring: %s", lvalue, rvalue);
+                return 0;
+        }
+
+        s->start_timeout_defined = true;
+        s->timeout_start_usec = usec;
+
+        if (streq(lvalue, "TimeoutSec"))
+                s->timeout_stop_usec = usec;
+
+        return 0;
+}
+
+/* Generic parser for the abort-timeout settings. Unlike most callbacks here the return
+ * value is meaningful to the caller: 1 means a value was set, 0 means it was reset/unset
+ * (or unparsable and ignored). */
+int config_parse_timeout_abort(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        usec_t *ret = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        /* Note: apart from setting the arg, this returns an extra bit of information in the return value. */
+
+        if (isempty(rvalue)) {
+                *ret = 0;
+                return 0; /* "not set" */
+        }
+
+        r = parse_sec(rvalue, ret);
+        if (r < 0)
+                return log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s= setting, ignoring: %s", lvalue, rvalue);
+
+        return 1; /* "set" */
+}
+
+/* Parser for TimeoutAbortSec= in .service units: a thin wrapper around
+ * config_parse_timeout_abort() that records its set/unset return bit in the service. */
+int config_parse_service_timeout_abort(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        Service *s = ASSERT_PTR(userdata);
+        int r;
+
+        r = config_parse_timeout_abort(unit, filename, line, section, section_line, lvalue, ltype, rvalue,
+                                       &s->timeout_abort_usec, s);
+        if (r >= 0)
+                /* r is 1 when a value was set, 0 when unset — see config_parse_timeout_abort() */
+                s->timeout_abort_set = r;
+        return 0;
+}
+
+/* Parser for User=/Group=. Unlike most parsers here, invalid values are fatal (-ENOEXEC)
+ * since running under the wrong identity would be a security problem. A relaxed name
+ * check with a warning is used for compatibility with existing units; use of the "nobody"
+ * user additionally triggers a structured notice. */
+int config_parse_user_group_compat(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_free_ char *k = NULL;
+        char **user = data;
+        const Unit *u = ASSERT_PTR(userdata);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets to the default identity */
+                *user = mfree(*user);
+                return 0;
+        }
+
+        r = unit_full_printf(u, rvalue, &k);
+        if (r < 0) {
+                log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in %s: %m", rvalue);
+                return -ENOEXEC;
+        }
+
+        if (!valid_user_group_name(k, VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX|VALID_USER_WARN)) {
+                log_syntax(unit, LOG_ERR, filename, line, 0, "Invalid user/group name or numeric ID: %s", k);
+                return -ENOEXEC;
+        }
+
+        if (strstr(lvalue, "User") && streq(k, NOBODY_USER_NAME))
+                log_struct(LOG_NOTICE,
+                           "MESSAGE=%s:%u: Special user %s configured, this is not safe!", filename, line, k,
+                           "UNIT=%s", unit,
+                           "MESSAGE_ID=" SD_MESSAGE_NOBODY_USER_UNSUITABLE_STR,
+                           "OFFENDING_USER=%s", k,
+                           "CONFIG_FILE=%s", filename,
+                           "CONFIG_LINE=%u", line);
+
+        return free_and_replace(*user, k);
+}
+
+/* Parser for SupplementaryGroups= and similar list-of-identities settings. Like
+ * config_parse_user_group_compat(), invalid entries are fatal (-ENOEXEC). Valid entries
+ * are appended to the existing list; an empty assignment resets it. */
+int config_parse_user_group_strv_compat(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        char ***users = data;
+        const Unit *u = ASSERT_PTR(userdata);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets the list */
+                *users = strv_free(*users);
+                return 0;
+        }
+
+        for (const char *p = rvalue;;) {
+                _cleanup_free_ char *word = NULL, *k = NULL;
+
+                r = extract_first_word(&p, &word, NULL, 0);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_ERR, filename, line, r, "Invalid syntax: %s", rvalue);
+                        return -ENOEXEC;
+                }
+                if (r == 0)
+                        return 0;
+
+                r = unit_full_printf(u, word, &k);
+                if (r < 0) {
+                        log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve unit specifiers in %s: %m", word);
+                        return -ENOEXEC;
+                }
+
+                if (!valid_user_group_name(k, VALID_USER_ALLOW_NUMERIC|VALID_USER_RELAX|VALID_USER_WARN)) {
+                        log_syntax(unit, LOG_ERR, filename, line, 0, "Invalid user/group name or numeric ID: %s", k);
+                        return -ENOEXEC;
+                }
+
+                r = strv_push(users, k);
+                if (r < 0)
+                        return log_oom();
+
+                /* ownership moved into the strv */
+                k = NULL;
+        }
+}
+
+/* Parser for WorkingDirectory=. A leading '-' marks the directory as optional at runtime
+ * (and downgrades parse failures here from fatal -ENOEXEC to ignored warnings); the
+ * special value "~" selects the user's home directory instead of a fixed path. */
+int config_parse_working_directory(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        ExecContext *c = ASSERT_PTR(data);
+        const Unit *u = ASSERT_PTR(userdata);
+        bool missing_ok;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets to the default working directory */
+                c->working_directory_home = false;
+                c->working_directory = mfree(c->working_directory);
+                return 0;
+        }
+
+        if (rvalue[0] == '-') {
+                missing_ok = true;
+                rvalue++;
+        } else
+                missing_ok = false;
+
+        if (streq(rvalue, "~")) {
+                c->working_directory_home = true;
+                c->working_directory = mfree(c->working_directory);
+        } else {
+                _cleanup_free_ char *k = NULL;
+
+                r = unit_path_printf(u, rvalue, &k);
+                if (r < 0) {
+                        log_syntax(unit, missing_ok ? LOG_WARNING : LOG_ERR, filename, line, r,
+                                   "Failed to resolve unit specifiers in working directory path '%s'%s: %m",
+                                   rvalue, missing_ok ? ", ignoring" : "");
+                        return missing_ok ? 0 : -ENOEXEC;
+                }
+
+                r = path_simplify_and_warn(k, PATH_CHECK_ABSOLUTE | (missing_ok ? 0 : PATH_CHECK_FATAL), unit, filename, line, lvalue);
+                if (r < 0)
+                        return missing_ok ? 0 : -ENOEXEC;
+
+                c->working_directory_home = false;
+                free_and_replace(c->working_directory, k);
+        }
+
+        c->working_directory_missing_ok = missing_ok;
+        return 0;
+}
+
+/* Parser for EnvironmentFile=. The path (after specifier resolution) must be absolute;
+ * a leading '-' (skipped for validation, kept in the stored value) marks the file as
+ * optional. Entries accumulate in the strv; empty assignment resets it. */
+int config_parse_unit_env_file(const char *unit,
+                               const char *filename,
+                               unsigned line,
+                               const char *section,
+                               unsigned section_line,
+                               const char *lvalue,
+                               int ltype,
+                               const char *rvalue,
+                               void *data,
+                               void *userdata) {
+
+        char ***env = ASSERT_PTR(data);
+        const Unit *u = userdata;
+        _cleanup_free_ char *n = NULL;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment frees the list */
+                *env = strv_free(*env);
+                return 0;
+        }
+
+        r = unit_full_printf_full(u, rvalue, PATH_MAX, &n);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
+                return 0;
+        }
+
+        /* validate the path without the optional '-' prefix */
+        r = path_simplify_and_warn(n[0] == '-' ? n + 1 : n, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+        if (r < 0)
+                return 0;
+
+        r = strv_push(env, n);
+        if (r < 0)
+                return log_oom();
+
+        /* ownership moved into the strv */
+        n = NULL;
+
+        return 0;
+}
+
+/* Parser for Environment=. Used both for units (userdata is the Unit) and for
+ * manager-level configuration (userdata NULL, ltype carries the runtime scope and a
+ * manager-specific specifier table is used). Each word is unquoted/unescaped, specifier-
+ * expanded, validated as a VAR=VALUE assignment and merged into the environment strv. */
+int config_parse_environ(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        const Unit *u = userdata;
+        char ***env = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets the list */
+                *env = strv_free(*env);
+                return 0;
+        }
+
+        /* If 'u' is set, we operate on the regular unit specifier table. Otherwise we use a manager-specific
+         * specifier table (in which case ltype must contain the runtime scope). */
+        const Specifier *table = u ? NULL : (const Specifier[]) {
+                COMMON_SYSTEM_SPECIFIERS,
+                COMMON_TMP_SPECIFIERS,
+                COMMON_CREDS_SPECIFIERS(ltype),
+                { 'h', specifier_user_home, NULL },
+                { 's', specifier_user_shell, NULL },
+        };
+
+        for (const char *p = rvalue;; ) {
+                _cleanup_free_ char *word = NULL, *resolved = NULL;
+
+                r = extract_first_word(&p, &word, NULL, EXTRACT_CUNESCAPE|EXTRACT_UNQUOTE);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Invalid syntax, ignoring: %s", rvalue);
+                        return 0;
+                }
+                if (r == 0)
+                        return 0;
+
+                if (table)
+                        r = specifier_printf(word, sc_arg_max(), table, NULL, NULL, &resolved);
+                else
+                        r = unit_env_printf(u, word, &resolved);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Failed to resolve specifiers in %s, ignoring: %m", word);
+                        continue;
+                }
+
+                if (!env_assignment_is_valid(resolved)) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0,
+                                   "Invalid environment assignment, ignoring: %s", resolved);
+                        continue;
+                }
+
+                /* replaces any earlier assignment of the same variable */
+                r = strv_env_replace_consume(env, TAKE_PTR(resolved));
+                if (r < 0)
+                        return log_error_errno(r, "Failed to update environment: %m");
+        }
+}
+
+/* Parser for PassEnvironment=: a list of environment variable *names* to pass through.
+ * Valid names are collected into a temporary strv and appended (deduplicated) to the
+ * stored list; per-entry failures are logged and skipped. */
+int config_parse_pass_environ(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_strv_free_ char **n = NULL;
+        const Unit *u = userdata;
+        char*** passenv = ASSERT_PTR(data);
+        size_t nlen = 0;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets the list */
+                *passenv = strv_free(*passenv);
+                return 0;
+        }
+
+        for (const char *p = rvalue;;) {
+                _cleanup_free_ char *word = NULL, *k = NULL;
+
+                r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Trailing garbage in %s, ignoring: %s", lvalue, rvalue);
+                        break;
+                }
+                if (r == 0)
+                        break;
+
+                if (u) {
+                        /* unit context: resolve specifiers; manager context: take word as-is */
+                        r = unit_env_printf(u, word, &k);
+                        if (r < 0) {
+                                log_syntax(unit, LOG_WARNING, filename, line, r,
+                                           "Failed to resolve specifiers in %s, ignoring: %m", word);
+                                continue;
+                        }
+                } else
+                        k = TAKE_PTR(word);
+
+                if (!env_name_is_valid(k)) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0,
+                                   "Invalid environment name for %s, ignoring: %s", lvalue, k);
+                        continue;
+                }
+
+                /* +2: room for the new entry plus the NULL terminator */
+                if (!GREEDY_REALLOC(n, nlen + 2))
+                        return log_oom();
+
+                n[nlen++] = TAKE_PTR(k);
+                n[nlen] = NULL;
+        }
+
+        if (n) {
+                r = strv_extend_strv(passenv, n, true);
+                if (r < 0)
+                        return log_oom();
+        }
+
+        return 0;
+}
+
+/* Parser for UnsetEnvironment=: a list of variable names or VAR=VALUE assignments to strip
+ * from the environment. Structured like config_parse_pass_environ(), but accepts
+ * assignments in addition to bare names. */
+int config_parse_unset_environ(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_strv_free_ char **n = NULL;
+        char*** unsetenv = ASSERT_PTR(data);
+        const Unit *u = userdata;
+        size_t nlen = 0;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets the list */
+                *unsetenv = strv_free(*unsetenv);
+                return 0;
+        }
+
+        for (const char *p = rvalue;;) {
+                _cleanup_free_ char *word = NULL, *k = NULL;
+
+                r = extract_first_word(&p, &word, NULL, EXTRACT_CUNESCAPE|EXTRACT_UNQUOTE);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Trailing garbage in %s, ignoring: %s", lvalue, rvalue);
+                        break;
+                }
+                if (r == 0)
+                        break;
+
+                if (u) {
+                        /* unit context: resolve specifiers; manager context: take word as-is */
+                        r = unit_env_printf(u, word, &k);
+                        if (r < 0) {
+                                log_syntax(unit, LOG_WARNING, filename, line, r,
+                                           "Failed to resolve unit specifiers in %s, ignoring: %m", word);
+                                continue;
+                        }
+                } else
+                        k = TAKE_PTR(word);
+
+                if (!env_assignment_is_valid(k) && !env_name_is_valid(k)) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0,
+                                   "Invalid environment name or assignment %s, ignoring: %s", lvalue, k);
+                        continue;
+                }
+
+                /* +2: room for the new entry plus the NULL terminator */
+                if (!GREEDY_REALLOC(n, nlen + 2))
+                        return log_oom();
+
+                n[nlen++] = TAKE_PTR(k);
+                n[nlen] = NULL;
+        }
+
+        if (n) {
+                r = strv_extend_strv(unsetenv, n, true);
+                if (r < 0)
+                        return log_oom();
+        }
+
+        return 0;
+}
+
+/* Parses LogExtraFields=: each word must be a FIELD=VALUE pair with a valid
+ * journal field name; accepted pairs are appended as iovecs to
+ * c->log_extra_fields. An empty rvalue frees all previously collected fields. */
+int config_parse_log_extra_fields(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = ASSERT_PTR(data);
+ const Unit *u = userdata;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ exec_context_free_log_extra_fields(c);
+ return 0;
+ }
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *word = NULL, *k = NULL;
+ struct iovec *t;
+ const char *eq;
+
+ r = extract_first_word(&p, &word, NULL, EXTRACT_CUNESCAPE|EXTRACT_UNQUOTE);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue);
+ return 0;
+ }
+ if (r == 0)
+ return 0;
+
+ r = unit_full_printf(u, word, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", word);
+ continue;
+ }
+
+ eq = strchr(k, '=');
+ if (!eq) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Log field lacks '=' character, ignoring: %s", k);
+ continue;
+ }
+
+ /* Validate only the name part (up to '='); the value is free-form. */
+ if (!journal_field_valid(k, eq-k, false)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Log field name is invalid, ignoring: %s", k);
+ continue;
+ }
+
+ t = reallocarray(c->log_extra_fields, c->n_log_extra_fields+1, sizeof(struct iovec));
+ if (!t)
+ return log_oom();
+
+ c->log_extra_fields = t;
+ c->log_extra_fields[c->n_log_extra_fields++] = IOVEC_MAKE_STRING(k);
+
+ /* Ownership of k transferred to the iovec; prevent the cleanup from freeing it. */
+ k = NULL;
+ }
+}
+
+/* Parses LogNamespace=: resolves unit specifiers (capped at NAME_MAX),
+ * validates the result as a log namespace name, and stores it in
+ * c->log_namespace. An empty rvalue clears the setting. */
+int config_parse_log_namespace(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *k = NULL;
+ ExecContext *c = ASSERT_PTR(data);
+ const Unit *u = userdata;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ c->log_namespace = mfree(c->log_namespace);
+ return 0;
+ }
+
+ /* NAME_MAX bounds the expanded string, since the namespace ends up in a file name. */
+ r = unit_full_printf_full(u, rvalue, NAME_MAX, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue);
+ return 0;
+ }
+
+ if (!log_namespace_name_valid(k)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Specified log namespace name is not valid, ignoring: %s", k);
+ return 0;
+ }
+
+ free_and_replace(c->log_namespace, k);
+ return 0;
+}
+
+/* Parses path-valued Condition*=/Assert*= settings (ConditionPathExists= etc.,
+ * with the concrete ConditionType passed via ltype). Leading '|' marks a
+ * triggering condition, leading '!' negates it. The path must be absolute
+ * after specifier expansion. An empty rvalue resets the condition list. */
+int config_parse_unit_condition_path(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *p = NULL;
+ Condition **list = ASSERT_PTR(data), *c;
+ ConditionType t = ltype;
+ bool trigger, negate;
+ const Unit *u = userdata;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment resets the list */
+ *list = condition_free_list(*list);
+ return 0;
+ }
+
+ trigger = rvalue[0] == '|';
+ if (trigger)
+ rvalue++;
+
+ negate = rvalue[0] == '!';
+ if (negate)
+ rvalue++;
+
+ r = unit_path_printf(u, rvalue, &p);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue);
+ return 0;
+ }
+
+ /* path_simplify_and_warn() logs on failure itself, hence the silent ignore here. */
+ r = path_simplify_and_warn(p, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+ if (r < 0)
+ return 0;
+
+ c = condition_new(t, p, trigger, negate);
+ if (!c)
+ return log_oom();
+
+ LIST_PREPEND(conditions, *list, c);
+ return 0;
+}
+
+/* Parses string-valued Condition*=/Assert*= settings. Like the path variant,
+ * '|' marks trigger and '!' negation, but here whitespace after each prefix
+ * character is also skipped, and the parameter is stored without path
+ * validation. An empty rvalue resets the condition list. */
+int config_parse_unit_condition_string(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *s = NULL;
+ Condition **list = ASSERT_PTR(data), *c;
+ ConditionType t = ltype;
+ bool trigger, negate;
+ const Unit *u = userdata;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment resets the list */
+ *list = condition_free_list(*list);
+ return 0;
+ }
+
+ /* Skip the prefix char plus any whitespace that follows it. */
+ trigger = *rvalue == '|';
+ if (trigger)
+ rvalue += 1 + strspn(rvalue + 1, WHITESPACE);
+
+ negate = *rvalue == '!';
+ if (negate)
+ rvalue += 1 + strspn(rvalue + 1, WHITESPACE);
+
+ r = unit_full_printf(u, rvalue, &s);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
+ return 0;
+ }
+
+ c = condition_new(t, s, trigger, negate);
+ if (!c)
+ return log_oom();
+
+ LIST_PREPEND(conditions, *list, c);
+ return 0;
+}
+
+/* Parses RequiresMountsFor=: for each word, expands unit specifiers, checks
+ * for an absolute path, and registers a mount dependency on the unit.
+ * Per-word failures are logged and skipped; parsing continues. */
+int config_parse_unit_requires_mounts_for(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ Unit *u = userdata;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+ assert(data);
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *word = NULL, *resolved = NULL;
+
+ r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Invalid syntax, ignoring: %s", rvalue);
+ return 0;
+ }
+ if (r == 0)
+ return 0;
+
+ r = unit_path_printf(u, word, &resolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", word);
+ continue;
+ }
+
+ r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+ if (r < 0)
+ continue;
+
+ /* Dependency is tagged as originating from a unit file fragment. */
+ r = unit_require_mounts_for(u, resolved, UNIT_DEPENDENCY_FILE);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to add required mount '%s', ignoring: %m", resolved);
+ continue;
+ }
+ }
+}
+
+/* Parses Documentation=: delegates word splitting and specifier expansion to
+ * config_parse_unit_strv_printf(), then compacts u->documentation in place,
+ * dropping entries that are not valid documentation URLs. An empty rvalue
+ * resets the list. */
+int config_parse_documentation(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ Unit *u = ASSERT_PTR(userdata);
+ int r;
+ char **a, **b;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment resets the list */
+ u->documentation = strv_free(u->documentation);
+ return 0;
+ }
+
+ r = config_parse_unit_strv_printf(unit, filename, line, section, section_line, lvalue, ltype,
+ rvalue, data, userdata);
+ if (r < 0)
+ return r;
+
+ /* In-place filter: a scans, b writes back only the valid URLs. */
+ for (a = b = u->documentation; a && *a; a++) {
+
+ if (documentation_url_is_valid(*a))
+ *(b++) = *a;
+ else {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid URL, ignoring: %s", *a);
+ free(*a);
+ }
+ }
+ if (b)
+ *b = NULL;
+
+ return 0;
+}
+
+#if HAVE_SECCOMP
+/* Parses SystemCallFilter= (seccomp builds only). A leading '~' selects
+ * deny-list mode; otherwise the filter is an allow-list that is pre-seeded
+ * with the @default group on first use. Each word is SYSCALL[:errno];
+ * explicit errnos are only meaningful on deny-listed entries. An empty
+ * rvalue resets the filter. */
+int config_parse_syscall_filter(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = data;
+ _unused_ const Unit *u = ASSERT_PTR(userdata);
+ bool invert = false;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment resets the list */
+ c->syscall_filter = hashmap_free(c->syscall_filter);
+ c->syscall_allow_list = false;
+ return 0;
+ }
+
+ if (rvalue[0] == '~') {
+ invert = true;
+ rvalue++;
+ }
+
+ /* First assignment decides allow-list vs. deny-list semantics. */
+ if (!c->syscall_filter) {
+ c->syscall_filter = hashmap_new(NULL);
+ if (!c->syscall_filter)
+ return log_oom();
+
+ if (invert)
+ /* Allow everything but the ones listed */
+ c->syscall_allow_list = false;
+ else {
+ /* Allow nothing but the ones listed */
+ c->syscall_allow_list = true;
+
+ /* Accept default syscalls if we are on an allow_list */
+ r = seccomp_parse_syscall_filter(
+ "@default", -1, c->syscall_filter,
+ SECCOMP_PARSE_PERMISSIVE|SECCOMP_PARSE_ALLOW_LIST,
+ unit,
+ NULL, 0);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *word = NULL, *name = NULL;
+ int num;
+
+ r = extract_first_word(&p, &word, NULL, 0);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Invalid syntax, ignoring: %s", rvalue);
+ return 0;
+ }
+ if (r == 0)
+ return 0;
+
+ r = parse_syscall_and_errno(word, &name, &num);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to parse syscall:errno, ignoring: %s", word);
+ continue;
+ }
+ /* num >= 0 means an explicit errno was given, which only makes sense when denying. */
+ if (!invert && num >= 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0,
+ "Allow-listed system calls cannot take error number, ignoring: %s", word);
+ continue;
+ }
+
+ r = seccomp_parse_syscall_filter(
+ name, num, c->syscall_filter,
+ SECCOMP_PARSE_LOG|SECCOMP_PARSE_PERMISSIVE|
+ (invert ? SECCOMP_PARSE_INVERT : 0)|
+ (c->syscall_allow_list ? SECCOMP_PARSE_ALLOW_LIST : 0),
+ unit, filename, line);
+ if (r < 0)
+ return r;
+ }
+}
+
+/* Parses SystemCallLog=: same allow/deny-list structure as SystemCallFilter=,
+ * but entries select which syscalls get audit-logged rather than blocked, so
+ * no @default seeding and no per-entry errnos. Empty rvalue resets. */
+int config_parse_syscall_log(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = data;
+ _unused_ const Unit *u = ASSERT_PTR(userdata);
+ bool invert = false;
+ const char *p;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment resets the list */
+ c->syscall_log = hashmap_free(c->syscall_log);
+ c->syscall_log_allow_list = false;
+ return 0;
+ }
+
+ if (rvalue[0] == '~') {
+ invert = true;
+ rvalue++;
+ }
+
+ /* First assignment decides the list polarity. */
+ if (!c->syscall_log) {
+ c->syscall_log = hashmap_new(NULL);
+ if (!c->syscall_log)
+ return log_oom();
+
+ if (invert)
+ /* Log everything but the ones listed */
+ c->syscall_log_allow_list = false;
+ else
+ /* Log nothing but the ones listed */
+ c->syscall_log_allow_list = true;
+ }
+
+ p = rvalue;
+ for (;;) {
+ _cleanup_free_ char *word = NULL;
+
+ r = extract_first_word(&p, &word, NULL, 0);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue);
+ return 0;
+ }
+ if (r == 0)
+ return 0;
+
+ /* -1: no errno is attached to logged syscalls. */
+ r = seccomp_parse_syscall_filter(
+ word, -1, c->syscall_log,
+ SECCOMP_PARSE_LOG|SECCOMP_PARSE_PERMISSIVE|
+ (invert ? SECCOMP_PARSE_INVERT : 0)|
+ (c->syscall_log_allow_list ? SECCOMP_PARSE_ALLOW_LIST : 0),
+ unit, filename, line);
+ if (r < 0)
+ return r;
+ }
+}
+
+/* Parses SystemCallArchitectures=: each word is resolved to a seccomp
+ * architecture id and inserted into the target set. Empty rvalue clears
+ * the set. Unknown architecture names are logged and skipped. */
+int config_parse_syscall_archs(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ Set **archs = data;
+ int r;
+
+ if (isempty(rvalue)) {
+ *archs = set_free(*archs);
+ return 0;
+ }
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *word = NULL;
+ uint32_t a;
+
+ r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Invalid syntax, ignoring: %s", rvalue);
+ return 0;
+ }
+ if (r == 0)
+ return 0;
+
+ r = seccomp_arch_from_string(word, &a);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to parse system call architecture \"%s\", ignoring: %m", word);
+ continue;
+ }
+
+ /* +1 biases the value so that arch id 0 is distinguishable from NULL in the set. */
+ r = set_ensure_put(archs, NULL, UINT32_TO_PTR(a + 1));
+ if (r < 0)
+ return log_oom();
+ }
+}
+
+/* Parses SystemCallErrorNumber=: the errno returned to the process when a
+ * filtered syscall is attempted. "kill" (or empty) selects killing the
+ * process instead of returning an error. */
+int config_parse_syscall_errno(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = data;
+ int e;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue) || streq(rvalue, "kill")) {
+ /* Empty assignment resets to KILL */
+ c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
+ return 0;
+ }
+
+ e = parse_errno(rvalue);
+ if (e < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, e, "Failed to parse error number, ignoring: %s", rvalue);
+ return 0;
+ }
+ /* errno 0 would mean "success" and is rejected as meaningless here. */
+ if (e == 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid error number, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ c->syscall_errno = e;
+ return 0;
+}
+
+/* Parses RestrictAddressFamilies=: "none" forbids all families (empty
+ * allow-list); '~' prefix switches to deny-list mode. Listing a family with
+ * the opposite polarity of the current list removes it from the list rather
+ * than adding it. An empty rvalue resets the setting. */
+int config_parse_address_families(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = data;
+ bool invert = false;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment resets the list */
+ c->address_families = set_free(c->address_families);
+ c->address_families_allow_list = false;
+ return 0;
+ }
+
+ if (streq(rvalue, "none")) {
+ /* Forbid all address families. */
+ c->address_families = set_free(c->address_families);
+ c->address_families_allow_list = true;
+ return 0;
+ }
+
+ if (rvalue[0] == '~') {
+ invert = true;
+ rvalue++;
+ }
+
+ /* First assignment decides list polarity. */
+ if (!c->address_families) {
+ c->address_families = set_new(NULL);
+ if (!c->address_families)
+ return log_oom();
+
+ c->address_families_allow_list = !invert;
+ }
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *word = NULL;
+ int af;
+
+ r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Invalid syntax, ignoring: %s", rvalue);
+ return 0;
+ }
+ if (r == 0)
+ return 0;
+
+ af = af_from_name(word);
+ if (af < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, af,
+ "Failed to parse address family, ignoring: %s", word);
+ continue;
+ }
+
+ /* If we previously wanted to forbid an address family and now
+ * we want to allow it, then just remove it from the list.
+ */
+ if (!invert == c->address_families_allow_list) {
+ r = set_put(c->address_families, INT_TO_PTR(af));
+ if (r < 0)
+ return log_oom();
+ } else
+ set_remove(c->address_families, INT_TO_PTR(af));
+ }
+}
+
+/* Parses RestrictNamespaces=: accepts a boolean (yes = restrict all, i.e.
+ * allow none; no = allow all), or a '~'-invertible list of namespace type
+ * names. A first list assignment replaces the initial sentinel outright;
+ * later assignments merge flags into the existing mask. */
+int config_parse_restrict_namespaces(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = data;
+ unsigned long flags;
+ bool invert = false;
+ int r;
+
+ if (isempty(rvalue)) {
+ /* Reset to the default. */
+ c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
+ return 0;
+ }
+
+ /* Boolean parameter ignores the previous settings */
+ r = parse_boolean(rvalue);
+ if (r > 0) {
+ c->restrict_namespaces = 0;
+ return 0;
+ } else if (r == 0) {
+ c->restrict_namespaces = NAMESPACE_FLAGS_ALL;
+ return 0;
+ }
+
+ if (rvalue[0] == '~') {
+ invert = true;
+ rvalue++;
+ }
+
+ /* Not a boolean argument, in this case it's a list of namespace types. */
+ r = namespace_flags_from_string(rvalue, &flags);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse namespace type string, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ if (c->restrict_namespaces == NAMESPACE_FLAGS_INITIAL)
+ /* Initial assignment. Just set the value. */
+ c->restrict_namespaces = invert ? (~flags) & NAMESPACE_FLAGS_ALL : flags;
+ else
+ /* Merge the value with the previous one. */
+ SET_FLAG(c->restrict_namespaces, flags, !invert);
+
+ return 0;
+}
+#endif
+
+/* Parses RestrictFileSystems= (BPF LSM): '~' prefix flips to deny-list mode;
+ * each word is handed to lsm_bpf_parse_filesystem() which validates it and
+ * updates the set. An empty rvalue resets the setting. */
+int config_parse_restrict_filesystems(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+ ExecContext *c = ASSERT_PTR(data);
+ bool invert = false;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment resets the list */
+ c->restrict_filesystems = set_free_free(c->restrict_filesystems);
+ c->restrict_filesystems_allow_list = false;
+ return 0;
+ }
+
+ if (rvalue[0] == '~') {
+ invert = true;
+ rvalue++;
+ }
+
+ /* First assignment decides list polarity; set allocation is left to the helper below. */
+ if (!c->restrict_filesystems) {
+ if (invert)
+ /* Allow everything but the ones listed */
+ c->restrict_filesystems_allow_list = false;
+ else
+ /* Allow nothing but the ones listed */
+ c->restrict_filesystems_allow_list = true;
+ }
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *word = NULL;
+
+ r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE);
+ if (r == 0)
+ break;
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Trailing garbage in %s, ignoring: %s", lvalue, rvalue);
+ break;
+ }
+
+ r = lsm_bpf_parse_filesystem(
+ word,
+ &c->restrict_filesystems,
+ FILESYSTEM_PARSE_LOG|
+ (invert ? FILESYSTEM_PARSE_INVERT : 0)|
+ (c->restrict_filesystems_allow_list ? FILESYSTEM_PARSE_ALLOW_LIST : 0),
+ unit, filename, line);
+
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+/* Parses Slice=: resolves unit specifiers in the slice name, loads (or
+ * references) the slice unit through the manager, and attaches this unit to
+ * it. All failures are logged and ignored so unit loading continues. */
+int config_parse_unit_slice(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ _cleanup_free_ char *k = NULL;
+ Unit *u = userdata, *slice;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+ assert(u);
+
+ r = unit_name_printf(u, rvalue, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", rvalue);
+ return 0;
+ }
+
+ r = manager_load_unit(u->manager, k, NULL, &error, &slice);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to load slice unit %s, ignoring: %s", k, bus_error_message(&error, r));
+ return 0;
+ }
+
+ r = unit_set_slice(u, slice);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to assign slice %s to unit %s, ignoring: %m", slice->id, u->id);
+ return 0;
+ }
+
+ return 0;
+}
+
+/* Parses CPUQuota=: a percentage (possibly above 100%) converted into
+ * microseconds of CPU time per wall-clock second. Empty rvalue resets to
+ * "no quota" (USEC_INFINITY). */
+int config_parse_cpu_quota(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ CGroupContext *c = data;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ c->cpu_quota_per_sec_usec = USEC_INFINITY;
+ return 0;
+ }
+
+ /* "unbounded": values above 100% are allowed (multi-CPU quotas); zero is not. */
+ r = parse_permyriad_unbounded(rvalue);
+ if (r <= 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid CPU quota '%s', ignoring.", rvalue);
+ return 0;
+ }
+
+ /* r is in permyriad (1/10000), hence the division by 10000. */
+ c->cpu_quota_per_sec_usec = ((usec_t) r * USEC_PER_SEC) / 10000U;
+ return 0;
+}
+
+/* Parses AllowedCPUs=/AllowedMemoryNodes=-style settings: expands unit
+ * specifiers, then merges the CPU/node range list into the target CPUSet.
+ * parse_cpu_set_extend() does its own diagnostics, so its result is ignored. */
+int config_parse_allowed_cpuset(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ CPUSet *c = data;
+ const Unit *u = userdata;
+ _cleanup_free_ char *k = NULL;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ r = unit_full_printf(u, rvalue, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to resolve unit specifiers in '%s', ignoring: %m",
+ rvalue);
+ return 0;
+ }
+
+ /* warn=true: helper emits its own log_syntax() messages on bad ranges. */
+ (void) parse_cpu_set_extend(k, c, true, unit, filename, line, lvalue);
+ return 0;
+}
+
+/* Shared parser for all memory limit settings (MemoryMin=, MemoryLow=,
+ * MemoryHigh=, MemoryMax=, swap/zswap variants, the Default*= manager-level
+ * settings and the deprecated MemoryLimit=). Accepts "infinity", a
+ * percentage of physical RAM, or an absolute size; the lvalue selects which
+ * CGroupContext field the resulting byte count is stored in. */
+int config_parse_memory_limit(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ CGroupContext *c = data;
+ uint64_t bytes = CGROUP_LIMIT_MAX;
+ int r;
+
+ /* Empty protection-type settings reset to "no protection" (CGROUP_LIMIT_MIN);
+ * empty limit-type settings reset to "no limit" (CGROUP_LIMIT_MAX). */
+ if (isempty(rvalue) && STR_IN_SET(lvalue, "DefaultMemoryLow",
+ "DefaultMemoryMin",
+ "MemoryLow",
+ "StartupMemoryLow",
+ "MemoryMin"))
+ bytes = CGROUP_LIMIT_MIN;
+ else if (!isempty(rvalue) && !streq(rvalue, "infinity")) {
+
+ /* Try a percentage of physical RAM first, then an absolute size. */
+ r = parse_permyriad(rvalue);
+ if (r < 0) {
+ r = parse_size(rvalue, 1024, &bytes);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid memory limit '%s', ignoring: %m", rvalue);
+ return 0;
+ }
+ } else
+ bytes = physical_memory_scale(r, 10000U);
+
+ /* Zero is only meaningful for the protection/swap settings listed here.
+ * (Fixed: this entry previously read "DefaultstartupMemoryLow", which
+ * never matched the "DefaultStartupMemoryLow" lvalue dispatched below.) */
+ if (bytes >= UINT64_MAX ||
+ (bytes <= 0 && !STR_IN_SET(lvalue,
+ "MemorySwapMax",
+ "StartupMemorySwapMax",
+ "MemoryZSwapMax",
+ "StartupMemoryZSwapMax",
+ "MemoryLow",
+ "StartupMemoryLow",
+ "MemoryMin",
+ "DefaultMemoryLow",
+ "DefaultStartupMemoryLow",
+ "DefaultMemoryMin"))) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Memory limit '%s' out of range, ignoring.", rvalue);
+ return 0;
+ }
+ }
+
+ /* Dispatch the parsed value into the field matching the lvalue. */
+ if (streq(lvalue, "DefaultMemoryLow")) {
+ c->default_memory_low = bytes;
+ c->default_memory_low_set = true;
+ } else if (streq(lvalue, "DefaultStartupMemoryLow")) {
+ c->default_startup_memory_low = bytes;
+ c->default_startup_memory_low_set = true;
+ } else if (streq(lvalue, "DefaultMemoryMin")) {
+ c->default_memory_min = bytes;
+ c->default_memory_min_set = true;
+ } else if (streq(lvalue, "MemoryMin")) {
+ c->memory_min = bytes;
+ c->memory_min_set = true;
+ } else if (streq(lvalue, "MemoryLow")) {
+ c->memory_low = bytes;
+ c->memory_low_set = true;
+ } else if (streq(lvalue, "StartupMemoryLow")) {
+ c->startup_memory_low = bytes;
+ c->startup_memory_low_set = true;
+ } else if (streq(lvalue, "MemoryHigh"))
+ c->memory_high = bytes;
+ else if (streq(lvalue, "StartupMemoryHigh")) {
+ c->startup_memory_high = bytes;
+ c->startup_memory_high_set = true;
+ } else if (streq(lvalue, "MemoryMax"))
+ c->memory_max = bytes;
+ else if (streq(lvalue, "StartupMemoryMax")) {
+ c->startup_memory_max = bytes;
+ c->startup_memory_max_set = true;
+ } else if (streq(lvalue, "MemorySwapMax"))
+ c->memory_swap_max = bytes;
+ else if (streq(lvalue, "StartupMemorySwapMax")) {
+ c->startup_memory_swap_max = bytes;
+ c->startup_memory_swap_max_set = true;
+ } else if (streq(lvalue, "MemoryZSwapMax"))
+ c->memory_zswap_max = bytes;
+ else if (streq(lvalue, "StartupMemoryZSwapMax")) {
+ c->startup_memory_zswap_max = bytes;
+ c->startup_memory_zswap_max_set = true;
+ } else if (streq(lvalue, "MemoryLimit")) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0,
+ "Unit uses MemoryLimit=; please use MemoryMax= instead. Support for MemoryLimit= will be removed soon.");
+ c->memory_limit = bytes;
+ } else
+ return -EINVAL;
+
+ return 0;
+}
+
+/* Parses TasksMax=: "infinity" or empty restores the (manager) default; a
+ * percentage is stored as a value/scale pair to be resolved against the
+ * system maximum later; otherwise an absolute task count is stored. */
+int config_parse_tasks_max(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ const Unit *u = userdata;
+ CGroupTasksMax *tasks_max = data;
+ uint64_t v;
+ int r;
+
+ if (isempty(rvalue)) {
+ /* Without a unit context (manager config) fall back to "unset". */
+ *tasks_max = u ? u->manager->defaults.tasks_max : CGROUP_TASKS_MAX_UNSET;
+ return 0;
+ }
+
+ if (streq(rvalue, "infinity")) {
+ *tasks_max = CGROUP_TASKS_MAX_UNSET;
+ return 0;
+ }
+
+ r = parse_permyriad(rvalue);
+ if (r >= 0)
+ *tasks_max = (CGroupTasksMax) { r, 10000U }; /* r‱ */
+ else {
+ r = safe_atou64(rvalue, &v);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid maximum tasks value '%s', ignoring: %m", rvalue);
+ return 0;
+ }
+
+ if (v <= 0 || v >= UINT64_MAX) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Maximum tasks value '%s' out of range, ignoring.", rvalue);
+ return 0;
+ }
+
+ /* Scale left zero: absolute count, not a fraction. */
+ *tasks_max = (CGroupTasksMax) { v };
+ }
+
+ return 0;
+}
+
+/* Parses Delegate=: a boolean enables/disables delegation of all/no
+ * controllers; a list of controller names enables delegation and adds those
+ * controllers to the delegation mask; an empty value means "Delegate=yes
+ * with no extra controllers". Only unit types that support delegation
+ * accept the setting. */
+int config_parse_delegate(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ CGroupContext *c = data;
+ UnitType t;
+ int r;
+
+ t = unit_name_to_type(unit);
+ assert(t != _UNIT_TYPE_INVALID);
+
+ if (!unit_vtable[t]->can_delegate) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Delegate= setting not supported for this unit type, ignoring.");
+ return 0;
+ }
+
+ /* We either accept a boolean value, which may be used to turn on delegation for all controllers, or
+ * turn it off for all. Or it takes a list of controller names, in which case we add the specified
+ * controllers to the mask to delegate. Delegate= enables delegation without any controllers. */
+
+ if (isempty(rvalue)) {
+ /* An empty string resets controllers and sets Delegate=yes. */
+ c->delegate = true;
+ c->delegate_controllers = 0;
+ return 0;
+ }
+
+ r = parse_boolean(rvalue);
+ if (r < 0) {
+ /* Not a boolean: treat as a list of controller names. */
+ CGroupMask mask = 0;
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *word = NULL;
+ CGroupController cc;
+
+ r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue);
+ return 0;
+ }
+ if (r == 0)
+ break;
+
+ cc = cgroup_controller_from_string(word);
+ if (cc < 0) {
+ /* Fixed: previously passed 'r' (the positive
+ * extract_first_word() result) as the error argument. */
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid controller name '%s', ignoring", word);
+ continue;
+ }
+
+ mask |= CGROUP_CONTROLLER_TO_MASK(cc);
+ }
+
+ c->delegate = true;
+ c->delegate_controllers |= mask;
+
+ } else if (r > 0) {
+ c->delegate = true;
+ c->delegate_controllers = CGROUP_MASK_DELEGATE;
+ } else {
+ c->delegate = false;
+ c->delegate_controllers = 0;
+ }
+
+ return 0;
+}
+
+/* Parses DelegateSubgroup=: the name of a cgroup subgroup to place delegated
+ * processes in. Rejected for unit types that cannot delegate, and for names
+ * that would require cgroup escaping. Empty rvalue clears the setting. */
+int config_parse_delegate_subgroup(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ CGroupContext *c = ASSERT_PTR(data);
+ UnitType t;
+
+ t = unit_name_to_type(unit);
+ assert(t >= 0);
+
+ if (!unit_vtable[t]->can_delegate) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "DelegateSubgroup= setting not supported for this unit type, ignoring.");
+ return 0;
+ }
+
+ if (isempty(rvalue)) {
+ c->delegate_subgroup = mfree(c->delegate_subgroup);
+ return 0;
+ }
+
+ if (cg_needs_escape(rvalue)) { /* Insist that specified names don't need escaping */
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid control group name, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ return free_and_strdup_warn(&c->delegate_subgroup, rvalue);
+}
+
+/* Parses ManagedOOMSwap=/ManagedOOMMemoryPressure=: maps the rvalue onto a
+ * ManagedOOMMode enum value; empty resets to "auto". Only unit types that
+ * support systemd-oomd management accept the setting. */
+int config_parse_managed_oom_mode(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ManagedOOMMode *mode = data, m;
+ UnitType t;
+
+ t = unit_name_to_type(unit);
+ assert(t != _UNIT_TYPE_INVALID);
+
+ if (!unit_vtable[t]->can_set_managed_oom)
+ return log_syntax(unit, LOG_WARNING, filename, line, 0, "%s= is not supported for this unit type, ignoring.", lvalue);
+
+ if (isempty(rvalue)) {
+ *mode = MANAGED_OOM_AUTO;
+ return 0;
+ }
+
+ m = managed_oom_mode_from_string(rvalue);
+ if (m < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, m, "Invalid syntax, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ *mode = m;
+ return 0;
+}
+
+/* Parses ManagedOOMMemoryPressureLimit=: a percentage stored normalized so
+ * that UINT32_MAX corresponds to 100%. Empty rvalue resets to 0 (unset).
+ * Only unit types managed by systemd-oomd accept the setting. */
+int config_parse_managed_oom_mem_pressure_limit(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ uint32_t *limit = data;
+ UnitType t;
+ int r;
+
+ t = unit_name_to_type(unit);
+ assert(t != _UNIT_TYPE_INVALID);
+
+ if (!unit_vtable[t]->can_set_managed_oom)
+ return log_syntax(unit, LOG_WARNING, filename, line, 0, "%s= is not supported for this unit type, ignoring.", lvalue);
+
+ if (isempty(rvalue)) {
+ *limit = 0;
+ return 0;
+ }
+
+ r = parse_permyriad(rvalue);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse memory pressure limit value, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ /* Normalize to 2^32-1 == 100% */
+ *limit = UINT32_SCALE_FROM_PERMYRIAD(r);
+ return 0;
+}
+
+/* Parses DeviceAllow=: "<path> [permissions]". The path may be a device
+ * node path or a "block-"/"char-" group specifier (which skips path
+ * validation); the remainder of the line encodes r/w/m permissions.
+ * Empty rvalue drops all previously added entries. */
+int config_parse_device_allow(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *path = NULL, *resolved = NULL;
+ CGroupDevicePermissions permissions;
+ CGroupContext *c = data;
+ const char *p = rvalue;
+ int r;
+
+ if (isempty(rvalue)) {
+ while (c->device_allow)
+ cgroup_context_free_device_allow(c, c->device_allow);
+
+ return 0;
+ }
+
+ r = extract_first_word(&p, &path, NULL, EXTRACT_UNQUOTE);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r <= 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to extract device path and rights from '%s', ignoring.", rvalue);
+ return 0;
+ }
+
+ r = unit_path_printf(userdata, path, &resolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to resolve unit specifiers in '%s', ignoring: %m", path);
+ return 0;
+ }
+
+ /* "block-"/"char-" group specifiers are not paths, so skip path checks for them. */
+ if (!STARTSWITH_SET(resolved, "block-", "char-")) {
+
+ r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue);
+ if (r < 0)
+ return 0;
+
+ if (!valid_device_node_path(resolved)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid device node path '%s', ignoring.", resolved);
+ return 0;
+ }
+ }
+
+ /* No permission string means "no permissions" (0). */
+ permissions = isempty(p) ? 0 : cgroup_device_permissions_from_string(p);
+ if (permissions < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, permissions, "Invalid device rights '%s', ignoring.", p);
+ return 0;
+ }
+
+ return cgroup_context_add_device_allow(c, resolved, permissions);
+}
+
+/* Parses IODeviceWeight=: "<path> <weight>". The device path gets specifier
+ * expansion and simplification, the weight is validated by cg_weight_parse(),
+ * and the pair is prepended to c->io_device_weights. Empty rvalue drops all
+ * previously configured per-device weights. */
+int config_parse_io_device_weight(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *path = NULL, *resolved = NULL;
+ CGroupIODeviceWeight *w;
+ CGroupContext *c = data;
+ const char *p = ASSERT_PTR(rvalue);
+ uint64_t u;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+
+ if (isempty(rvalue)) {
+ while (c->io_device_weights)
+ cgroup_context_free_io_device_weight(c, c->io_device_weights);
+
+ return 0;
+ }
+
+ r = extract_first_word(&p, &path, NULL, EXTRACT_UNQUOTE);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to extract device path and weight from '%s', ignoring.", rvalue);
+ return 0;
+ }
+ /* Both the path and a trailing weight are mandatory. */
+ if (r == 0 || isempty(p)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0,
+ "Invalid device path or weight specified in '%s', ignoring.", rvalue);
+ return 0;
+ }
+
+ r = unit_path_printf(userdata, path, &resolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to resolve unit specifiers in '%s', ignoring: %m", path);
+ return 0;
+ }
+
+ r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue);
+ if (r < 0)
+ return 0;
+
+ r = cg_weight_parse(p, &u);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "IO weight '%s' invalid, ignoring: %m", p);
+ return 0;
+ }
+
+ assert(u != CGROUP_WEIGHT_INVALID);
+
+ w = new0(CGroupIODeviceWeight, 1);
+ if (!w)
+ return log_oom();
+
+ w->path = TAKE_PTR(resolved);
+ w->weight = u;
+
+ LIST_PREPEND(device_weights, c->io_device_weights, w);
+ return 0;
+}
+
+/* Parses IODeviceLatencyTargetSec=: "<path> <seconds>". Structured like
+ * config_parse_io_device_weight(), but the second field is a time span
+ * parsed with parse_sec() and stored as the device's latency target.
+ * Empty rvalue drops all previously configured latency targets. */
+int config_parse_io_device_latency(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *path = NULL, *resolved = NULL;
+ CGroupIODeviceLatency *l;
+ CGroupContext *c = data;
+ const char *p = ASSERT_PTR(rvalue);
+ usec_t usec;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+
+ if (isempty(rvalue)) {
+ while (c->io_device_latencies)
+ cgroup_context_free_io_device_latency(c, c->io_device_latencies);
+
+ return 0;
+ }
+
+ r = extract_first_word(&p, &path, NULL, EXTRACT_UNQUOTE);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to extract device path and latency from '%s', ignoring.", rvalue);
+ return 0;
+ }
+ /* Both the path and a trailing latency value are mandatory. */
+ if (r == 0 || isempty(p)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0,
+ "Invalid device path or latency specified in '%s', ignoring.", rvalue);
+ return 0;
+ }
+
+ r = unit_path_printf(userdata, path, &resolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to resolve unit specifiers in '%s', ignoring: %m", path);
+ return 0;
+ }
+
+ r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue);
+ if (r < 0)
+ return 0;
+
+ r = parse_sec(p, &usec);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse timer value, ignoring: %s", p);
+ return 0;
+ }
+
+ l = new0(CGroupIODeviceLatency, 1);
+ if (!l)
+ return log_oom();
+
+ l->path = TAKE_PTR(resolved);
+ l->target_usec = usec;
+
+ LIST_PREPEND(device_latencies, c->io_device_latencies, l);
+ return 0;
+}
+
+/* Parses the IO bandwidth/IOPS limit settings (IOReadBandwidthMax= and friends), i.e.
+ * "<device-path> <limit>". Which of the limit types is being configured is derived from
+ * the setting name (lvalue). Malformed input is logged and ignored. */
+int config_parse_io_limit(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_free_ char *path = NULL, *resolved = NULL;
+        CGroupIODeviceLimit *l = NULL;
+        CGroupContext *c = data;
+        CGroupIOLimitType type;
+        const char *p = ASSERT_PTR(rvalue);
+        uint64_t num;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+
+        /* The setting name determines the limit type; the lookup can only fail if this
+         * parser was registered for an unknown setting, hence the assert. */
+        type = cgroup_io_limit_type_from_string(lvalue);
+        assert(type >= 0);
+
+        if (isempty(rvalue)) {
+                /* An empty assignment resets this limit type back to its default on all
+                 * configured devices, but keeps the device entries themselves. */
+                LIST_FOREACH(device_limits, t, c->io_device_limits)
+                        t->limits[type] = cgroup_io_limit_defaults[type];
+                return 0;
+        }
+
+        r = extract_first_word(&p, &path, NULL, EXTRACT_UNQUOTE);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to extract device node and bandwidth from '%s', ignoring.", rvalue);
+                return 0;
+        }
+        if (r == 0 || isempty(p)) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "Invalid device node or bandwidth specified in '%s', ignoring.", rvalue);
+                return 0;
+        }
+
+        r = unit_path_printf(userdata, path, &resolved);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to resolve unit specifiers in '%s', ignoring: %m", path);
+                return 0;
+        }
+
+        r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue);
+        if (r < 0)
+                return 0;
+
+        if (streq("infinity", p))
+                num = CGROUP_LIMIT_MAX;
+        else {
+                /* Sizes use SI (base-1000) suffixes; a zero limit is not accepted. */
+                r = parse_size(p, 1000, &num);
+                if (r < 0 || num <= 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid IO limit '%s', ignoring.", p);
+                        return 0;
+                }
+        }
+
+        /* Reuse an existing entry for the same device, if there is one. */
+        LIST_FOREACH(device_limits, t, c->io_device_limits)
+                if (path_equal(resolved, t->path)) {
+                        l = t;
+                        break;
+                }
+
+        if (!l) {
+                l = new0(CGroupIODeviceLimit, 1);
+                if (!l)
+                        return log_oom();
+
+                l->path = TAKE_PTR(resolved);
+                /* Initialize all limit types to their defaults; only one is overridden below. */
+                for (CGroupIOLimitType i = 0; i < _CGROUP_IO_LIMIT_TYPE_MAX; i++)
+                        l->limits[i] = cgroup_io_limit_defaults[i];
+
+                LIST_PREPEND(device_limits, c->io_device_limits, l);
+        }
+
+        l->limits[type] = num;
+
+        return 0;
+}
+
+/* Parses the deprecated BlockIODeviceWeight= setting, i.e. "<device-path> <weight>".
+ * Kept for compatibility only; users are pointed to the IO*= replacements. */
+int config_parse_blockio_device_weight(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_free_ char *path = NULL, *resolved = NULL;
+        CGroupBlockIODeviceWeight *w;
+        CGroupContext *c = data;
+        const char *p = ASSERT_PTR(rvalue);
+        uint64_t u;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+
+        /* Warn unconditionally: the setting as such is deprecated, independent of its value. */
+        log_syntax(unit, LOG_WARNING, filename, line, 0,
+                   "Unit uses %s=; please use IO*= settings instead. Support for %s= will be removed soon.",
+                   lvalue, lvalue);
+
+        if (isempty(rvalue)) {
+                /* An empty assignment resets the entire list. */
+                while (c->blockio_device_weights)
+                        cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
+
+                return 0;
+        }
+
+        r = extract_first_word(&p, &path, NULL, EXTRACT_UNQUOTE);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to extract device node and weight from '%s', ignoring.", rvalue);
+                return 0;
+        }
+        if (r == 0 || isempty(p)) {
+                /* Both the device node and the weight are mandatory. */
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "Invalid device node or weight specified in '%s', ignoring.", rvalue);
+                return 0;
+        }
+
+        r = unit_path_printf(userdata, path, &resolved);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to resolve unit specifiers in '%s', ignoring: %m", path);
+                return 0;
+        }
+
+        r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue);
+        if (r < 0)
+                return 0;
+
+        /* Validates the legacy blkio weight range. */
+        r = cg_blkio_weight_parse(p, &u);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid block IO weight '%s', ignoring: %m", p);
+                return 0;
+        }
+
+        assert(u != CGROUP_BLKIO_WEIGHT_INVALID);
+
+        w = new0(CGroupBlockIODeviceWeight, 1);
+        if (!w)
+                return log_oom();
+
+        /* The list entry takes ownership of the resolved path. */
+        w->path = TAKE_PTR(resolved);
+        w->weight = u;
+
+        LIST_PREPEND(device_weights, c->blockio_device_weights, w);
+        return 0;
+}
+
+/* Parses the deprecated BlockIOReadBandwidth=/BlockIOWriteBandwidth= settings, i.e.
+ * "<device-path> <bandwidth>". The read/write direction is derived from the setting
+ * name. Kept for compatibility only; users are pointed to the IO*= replacements. */
+int config_parse_blockio_bandwidth(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_free_ char *path = NULL, *resolved = NULL;
+        CGroupBlockIODeviceBandwidth *b = NULL;
+        CGroupContext *c = data;
+        const char *p = ASSERT_PTR(rvalue);
+        uint64_t bytes;
+        bool read;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+
+        /* Warn unconditionally: the setting as such is deprecated, independent of its value. */
+        log_syntax(unit, LOG_WARNING, filename, line, 0,
+                   "Unit uses %s=; please use IO*= settings instead. Support for %s= will be removed soon.",
+                   lvalue, lvalue);
+
+        read = streq("BlockIOReadBandwidth", lvalue);
+
+        if (isempty(rvalue)) {
+                /* An empty assignment lifts both limits on all configured devices, but keeps
+                 * the device entries themselves. */
+                LIST_FOREACH(device_bandwidths, t, c->blockio_device_bandwidths) {
+                        t->rbps = CGROUP_LIMIT_MAX;
+                        t->wbps = CGROUP_LIMIT_MAX;
+                }
+                return 0;
+        }
+
+        r = extract_first_word(&p, &path, NULL, EXTRACT_UNQUOTE);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to extract device node and bandwidth from '%s', ignoring.", rvalue);
+                return 0;
+        }
+        if (r == 0 || isempty(p)) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "Invalid device node or bandwidth specified in '%s', ignoring.", rvalue);
+                return 0;
+        }
+
+        r = unit_path_printf(userdata, path, &resolved);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to resolve unit specifiers in '%s', ignoring: %m", path);
+                return 0;
+        }
+
+        r = path_simplify_and_warn(resolved, 0, unit, filename, line, lvalue);
+        if (r < 0)
+                return 0;
+
+        /* Sizes use SI (base-1000) suffixes; a zero bandwidth is not accepted. */
+        r = parse_size(p, 1000, &bytes);
+        if (r < 0 || bytes <= 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid Block IO Bandwidth '%s', ignoring.", p);
+                return 0;
+        }
+
+        /* Reuse an existing entry for the same device, if there is one. */
+        LIST_FOREACH(device_bandwidths, t, c->blockio_device_bandwidths)
+                if (path_equal(resolved, t->path)) {
+                        b = t;
+                        break;
+                }
+
+        if (!b) {
+                b = new0(CGroupBlockIODeviceBandwidth, 1);
+                if (!b)
+                        return log_oom();
+
+                b->path = TAKE_PTR(resolved);
+                /* The direction not being set stays unlimited. */
+                b->rbps = CGROUP_LIMIT_MAX;
+                b->wbps = CGROUP_LIMIT_MAX;
+
+                LIST_PREPEND(device_bandwidths, c->blockio_device_bandwidths, b);
+        }
+
+        if (read)
+                b->rbps = bytes;
+        else
+                b->wbps = bytes;
+
+        return 0;
+}
+
+/* Parser for the deprecated boolean OnFailureIsolate=-style setting: maps "true" to
+ * JOB_ISOLATE and "false" to JOB_REPLACE, and nags the user towards OnFailureJobMode=.
+ * Unparsable values are logged and ignored. */
+int config_parse_job_mode_isolate(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        JobMode *mode = data;
+        int k;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        k = parse_boolean(rvalue);
+        if (k < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, k, "Failed to parse boolean, ignoring: %s", rvalue);
+                return 0;
+        }
+
+        log_notice("%s is deprecated. Please use OnFailureJobMode= instead", lvalue);
+
+        if (k)
+                *mode = JOB_ISOLATE;
+        else
+                *mode = JOB_REPLACE;
+
+        return 0;
+}
+
+/* Parses the *Directory= settings (e.g. StateDirectory=): a whitespace-separated list of
+ * "src[:dest]" tuples of relative paths, added to the ExecDirectory in "data". The
+ * optional destination (a symlink target) is rejected for ConfigurationDirectory=. */
+int config_parse_exec_directories(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        ExecDirectory *ed = ASSERT_PTR(data);
+        const Unit *u = userdata;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets the list */
+                exec_directory_done(ed);
+                return 0;
+        }
+
+        for (const char *p = rvalue;;) {
+                _cleanup_free_ char *tuple = NULL;
+
+                /* First pass: split on whitespace, but keep escape sequences intact so that
+                 * escaped ':' survives until the tuple is split below. */
+                r = extract_first_word(&p, &tuple, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Invalid syntax %s=%s, ignoring: %m", lvalue, rvalue);
+                        return 0;
+                }
+                if (r == 0)
+                        /* Input exhausted: this is the only successful loop exit. */
+                        return 0;
+
+                /* Second pass: split the tuple on ':' into source and optional destination,
+                 * now resolving the escapes retained above. */
+                _cleanup_free_ char *src = NULL, *dest = NULL;
+                const char *q = tuple;
+                r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &src, &dest, NULL);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r <= 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Invalid syntax in %s=, ignoring: %s", lvalue, tuple);
+                        return 0;
+                }
+
+                _cleanup_free_ char *sresolved = NULL;
+                r = unit_path_printf(u, src, &sresolved);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Failed to resolve unit specifiers in \"%s\", ignoring: %m", src);
+                        continue;
+                }
+
+                /* Directory paths are interpreted relative to the per-type root, hence must be relative. */
+                r = path_simplify_and_warn(sresolved, PATH_CHECK_RELATIVE, unit, filename, line, lvalue);
+                if (r < 0)
+                        continue;
+
+                /* "private" as the first path component is reserved for internal use. */
+                if (path_startswith(sresolved, "private")) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0,
+                                   "%s= path can't be 'private', ignoring assignment: %s", lvalue, tuple);
+                        continue;
+                }
+
+                /* For State and Runtime directories we support an optional destination parameter, which
+                 * will be used to create a symlink to the source. */
+                _cleanup_free_ char *dresolved = NULL;
+                if (!isempty(dest)) {
+                        if (streq(lvalue, "ConfigurationDirectory")) {
+                                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                                           "Destination parameter is not supported for ConfigurationDirectory, ignoring: %s", tuple);
+                                continue;
+                        }
+
+                        r = unit_path_printf(u, dest, &dresolved);
+                        if (r < 0) {
+                                log_syntax(unit, LOG_WARNING, filename, line, r,
+                                           "Failed to resolve unit specifiers in \"%s\", ignoring: %m", dest);
+                                continue;
+                        }
+
+                        r = path_simplify_and_warn(dresolved, PATH_CHECK_RELATIVE, unit, filename, line, lvalue);
+                        if (r < 0)
+                                continue;
+                }
+
+                /* dresolved may be NULL here, meaning "no symlink requested". */
+                r = exec_directory_add(ed, sresolved, dresolved);
+                if (r < 0)
+                        return log_oom();
+        }
+}
+
+/* Parses SetCredential=/SetCredentialEncrypted=, i.e. "ID:VALUE". Whether the payload is
+ * treated as base64-encoded ciphertext or as escaped plaintext is selected via ltype.
+ * The credential is stored in (or replaces an entry of) context->set_credentials. */
+int config_parse_set_credential(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_free_ char *word = NULL, *k = NULL;
+        _cleanup_free_ void *d = NULL;
+        ExecContext *context = ASSERT_PTR(data);
+        ExecSetCredential *old;
+        Unit *u = userdata;
+        bool encrypted = ltype;
+        const char *p = ASSERT_PTR(rvalue);
+        size_t size;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets the list */
+                context->set_credentials = hashmap_free(context->set_credentials);
+                return 0;
+        }
+
+        /* Split off the credential ID at the first ':'; the rest ("p") is the payload. */
+        r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to extract credential name, ignoring: %s", rvalue);
+                return 0;
+        }
+        if (r == 0 || isempty(p)) {
+                /* Both an ID and a non-empty payload are required. */
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid syntax, ignoring: %s", rvalue);
+                return 0;
+        }
+
+        /* Resolve credential-safe %-specifiers in the ID. */
+        r = unit_cred_printf(u, word, &k);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in \"%s\", ignoring: %m", word);
+                return 0;
+        }
+        if (!credential_name_valid(k)) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Credential name \"%s\" not valid, ignoring.", k);
+                return 0;
+        }
+
+        if (encrypted) {
+                /* Encrypted payloads are transported as base64. */
+                r = unbase64mem_full(p, SIZE_MAX, true, &d, &size);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Encrypted credential data not valid Base64 data, ignoring.");
+                        return 0;
+                }
+        } else {
+                char *unescaped;
+                ssize_t l;
+
+                /* We support escape codes here, so that users can insert trailing \n if they like */
+                l = cunescape(p, UNESCAPE_ACCEPT_NUL, &unescaped);
+                if (l < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, l, "Can't unescape \"%s\", ignoring: %m", p);
+                        return 0;
+                }
+
+                d = unescaped;
+                size = l;
+        }
+
+        /* A later assignment with the same ID replaces the earlier data in place. */
+        old = hashmap_get(context->set_credentials, k);
+        if (old) {
+                free_and_replace(old->data, d);
+                old->size = size;
+                old->encrypted = encrypted;
+        } else {
+                _cleanup_(exec_set_credential_freep) ExecSetCredential *sc = NULL;
+
+                sc = new(ExecSetCredential, 1);
+                if (!sc)
+                        return log_oom();
+
+                *sc = (ExecSetCredential) {
+                        .id = TAKE_PTR(k),
+                        .data = TAKE_PTR(d),
+                        .size = size,
+                        .encrypted = encrypted,
+                };
+
+                r = hashmap_ensure_put(&context->set_credentials, &exec_set_credential_hash_ops, sc->id, sc);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Duplicated credential value '%s', ignoring assignment: %s", sc->id, rvalue);
+                        return 0;
+                }
+
+                /* Ownership passed to the hashmap, disarm the cleanup handler. */
+                TAKE_PTR(sc);
+        }
+
+        return 0;
+}
+
+/* Inserts an ExecLoadCredential for "id"/"path" into *h, allocating the hashmap on
+ * demand. If an entry with the same ID already exists, its path and encrypted flag are
+ * updated in place instead. Returns 0 on success, negative errno on failure. */
+int hashmap_put_credential(Hashmap **h, const char *id, const char *path, bool encrypted) {
+        ExecLoadCredential *existing;
+        int r;
+
+        assert(h);
+        assert(id);
+        assert(path);
+
+        existing = hashmap_get(*h, id);
+        if (existing) {
+                /* Same ID seen before: refresh source path and encryption flag. */
+                r = free_and_strdup(&existing->path, path);
+                if (r < 0)
+                        return r;
+
+                existing->encrypted = encrypted;
+                return 0;
+        }
+
+        _cleanup_(exec_load_credential_freep) ExecLoadCredential *fresh = NULL;
+
+        fresh = new(ExecLoadCredential, 1);
+        if (!fresh)
+                return log_oom();
+
+        *fresh = (ExecLoadCredential) {
+                .id = strdup(id),
+                .path = strdup(path),
+                .encrypted = encrypted,
+        };
+        if (!fresh->id || !fresh->path)
+                return -ENOMEM;
+
+        r = hashmap_ensure_put(h, &exec_load_credential_hash_ops, fresh->id, fresh);
+        if (r < 0)
+                return r;
+
+        /* Ownership passed to the hashmap, disarm the cleanup handler. */
+        TAKE_PTR(fresh);
+        return 0;
+}
+
+/* Parses LoadCredential=/LoadCredentialEncrypted=, i.e. "ID[:PATH]". If PATH is omitted
+ * the credential of the same name is inherited from the service manager. Whether the
+ * source is treated as encrypted is selected via ltype. */
+int config_parse_load_credential(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_free_ char *word = NULL, *k = NULL, *q = NULL;
+        ExecContext *context = ASSERT_PTR(data);
+        bool encrypted = ltype;
+        Unit *u = userdata;
+        const char *p;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets the list */
+                context->load_credentials = hashmap_free(context->load_credentials);
+                return 0;
+        }
+
+        /* Split off the credential ID at the first ':'; the rest ("p") is the source. */
+        p = rvalue;
+        r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r <= 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue);
+                return 0;
+        }
+
+        /* Resolve credential-safe %-specifiers in the ID. */
+        r = unit_cred_printf(u, word, &k);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in \"%s\", ignoring: %m", word);
+                return 0;
+        }
+        if (!credential_name_valid(k)) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Credential name \"%s\" not valid, ignoring.", k);
+                return 0;
+        }
+
+        if (isempty(p)) {
+                /* If only one field is specified take it as shortcut for inheriting a credential named
+                 * the same way from our parent */
+                q = strdup(k);
+                if (!q)
+                        return log_oom();
+        } else {
+                r = unit_path_printf(u, p, &q);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in \"%s\", ignoring: %m", p);
+                        return 0;
+                }
+                /* The source is either an absolute (normalized) path, or the name of a
+                 * credential to inherit from our parent. */
+                if (path_is_absolute(q) ? !path_is_normalized(q) : !credential_name_valid(q)) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0, "Credential source \"%s\" not valid, ignoring.", q);
+                        return 0;
+                }
+        }
+
+        r = hashmap_put_credential(&context->load_credentials, k, q, encrypted);
+        if (r < 0)
+                return log_error_errno(r, "Failed to store load credential '%s': %m", rvalue);
+
+        return 0;
+}
+
+/* Parses ImportCredential=: a credential name or glob to import, resolved for unit
+ * %-specifiers and collected into the Set pointed to by "data". Invalid values are
+ * logged and ignored. */
+int config_parse_import_credential(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_free_ char *s = NULL;
+        Set** import_credentials = ASSERT_PTR(data);
+        Unit *u = userdata;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets the list */
+                *import_credentials = set_free_free(*import_credentials);
+                return 0;
+        }
+
+        r = unit_cred_printf(u, rvalue, &s);
+        if (r < 0) {
+                /* Fix: on failure "s" is still NULL (it is only set on success), so report
+                 * the original rvalue instead. */
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in \"%s\", ignoring: %m", rvalue);
+                return 0;
+        }
+        if (!credential_glob_valid(s)) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Credential name or glob \"%s\" not valid, ignoring.", s);
+                return 0;
+        }
+
+        r = set_put_strdup(import_credentials, s);
+        if (r < 0)
+                return log_error_errno(r, "Failed to store credential name '%s': %m", rvalue);
+
+        return 0;
+}
+
+/* Parses settings that take a list of exit statuses and/or signal names (e.g.
+ * SuccessExitStatus=, RestartPreventExitStatus=) into the two bitmaps of an
+ * ExitStatusSet. Unknown tokens are logged and skipped. */
+int config_parse_set_status(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        ExitStatusSet *status_set = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        /* Empty assignment resets the list */
+        if (isempty(rvalue)) {
+                exit_status_set_free(status_set);
+                return 0;
+        }
+
+        for (const char *p = rvalue;;) {
+                _cleanup_free_ char *word = NULL;
+                Bitmap *bitmap;
+
+                r = extract_first_word(&p, &word, NULL, 0);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Failed to parse %s=%s, ignoring: %m", lvalue, rvalue);
+                        return 0;
+                }
+                if (r == 0)
+                        /* Input exhausted: this is the only successful loop exit. */
+                        return 0;
+
+                /* We need to call exit_status_from_string() first, because we want
+                 * to parse numbers as exit statuses, not signals. */
+
+                r = exit_status_from_string(word);
+                if (r >= 0) {
+                        /* Exit statuses are confined to a single byte. */
+                        assert(r >= 0 && r < 256);
+                        bitmap = &status_set->status;
+                } else {
+                        r = signal_from_string(word);
+                        if (r < 0) {
+                                log_syntax(unit, LOG_WARNING, filename, line, r,
+                                           "Failed to parse value, ignoring: %s", word);
+                                continue;
+                        }
+                        bitmap = &status_set->signal;
+                }
+
+                /* "r" now holds either the exit status or the signal number. */
+                r = bitmap_set(bitmap, r);
+                if (r < 0)
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Failed to set signal or status %s, ignoring: %m", word);
+        }
+}
+
+/* Parses namespacing path lists (e.g. ReadOnlyPaths=, InaccessiblePaths=): a list of
+ * absolute paths, each optionally prefixed with "-" (ignore if missing) and/or "+"
+ * (prefix with the root directory). The prefixes are validated, the path is resolved
+ * and simplified, and the re-joined "[-][+]path" string is appended to the strv. */
+int config_parse_namespace_path_strv(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        const Unit *u = userdata;
+        char*** sv = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets the list */
+                *sv = strv_free(*sv);
+                return 0;
+        }
+
+        for (const char *p = rvalue;;) {
+                _cleanup_free_ char *word = NULL, *resolved = NULL, *joined = NULL;
+                const char *w;
+                bool ignore_enoent = false, shall_prefix = false;
+
+                r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to extract first word, ignoring: %s", rvalue);
+                        return 0;
+                }
+                if (r == 0)
+                        break;
+
+                /* Strip the optional prefixes; "-" must come before "+" if both are used. */
+                w = word;
+                if (startswith(w, "-")) {
+                        ignore_enoent = true;
+                        w++;
+                }
+                if (startswith(w, "+")) {
+                        shall_prefix = true;
+                        w++;
+                }
+
+                r = unit_path_printf(u, w, &resolved);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s: %m", w);
+                        continue;
+                }
+
+                r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+                if (r < 0)
+                        continue;
+
+                /* Re-attach the prefixes to the simplified path. */
+                joined = strjoin(ignore_enoent ? "-" : "",
+                                 shall_prefix ? "+" : "",
+                                 resolved);
+                /* Fix: previously a failed allocation here was silently dropped; treat it
+                 * as OOM like everywhere else. */
+                if (!joined)
+                        return log_oom();
+
+                r = strv_push(sv, joined);
+                if (r < 0)
+                        return log_oom();
+
+                /* Ownership moved into the strv, disarm the cleanup handler. */
+                joined = NULL;
+        }
+
+        return 0;
+}
+
+/* Parses TemporaryFileSystem=: a list of "path[:options]" entries, each describing a
+ * tmpfs to mount at an absolute path with the given mount options. */
+int config_parse_temporary_filesystems(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        const Unit *u = userdata;
+        ExecContext *c = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets the list */
+                temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
+                c->temporary_filesystems = NULL;
+                c->n_temporary_filesystems = 0;
+                return 0;
+        }
+
+        for (const char *p = rvalue;;) {
+                _cleanup_free_ char *word = NULL, *path = NULL, *resolved = NULL;
+                const char *w;
+
+                r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to extract first word, ignoring: %s", rvalue);
+                        return 0;
+                }
+                if (r == 0)
+                        /* Input exhausted: this is the only successful loop exit. */
+                        return 0;
+
+                /* Split each word into the mount path and the optional mount options: after
+                 * this call "w" points at the options part (possibly empty). */
+                w = word;
+                r = extract_first_word(&w, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to extract first word, ignoring: %s", word);
+                        continue;
+                }
+                if (r == 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid syntax, ignoring: %s", word);
+                        continue;
+                }
+
+                r = unit_path_printf(u, path, &resolved);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", path);
+                        continue;
+                }
+
+                r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+                if (r < 0)
+                        continue;
+
+                r = temporary_filesystem_add(&c->temporary_filesystems, &c->n_temporary_filesystems, resolved, w);
+                if (r < 0)
+                        return log_oom();
+        }
+}
+
+/* Parses BindPaths=/BindReadOnlyPaths=: a list of "source[:destination[:options]]"
+ * entries. A "-" prefix on the source makes a missing source non-fatal; options may be
+ * "rbind" (default) or "norbind". Whether the mount is read-only is derived from the
+ * setting name. */
+int config_parse_bind_paths(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        ExecContext *c = ASSERT_PTR(data);
+        const Unit *u = userdata;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets the list */
+                bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
+                c->bind_mounts = NULL;
+                c->n_bind_mounts = 0;
+                return 0;
+        }
+
+        for (const char *p = rvalue;;) {
+                _cleanup_free_ char *source = NULL, *destination = NULL;
+                _cleanup_free_ char *sresolved = NULL, *dresolved = NULL;
+                char *s = NULL, *d = NULL;
+                bool rbind = true, ignore_enoent = false;
+
+                /* Stop either at ':' (a destination follows) or at whitespace (next entry). */
+                r = extract_first_word(&p, &source, ":" WHITESPACE, EXTRACT_UNQUOTE|EXTRACT_DONT_COALESCE_SEPARATORS);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s, ignoring: %s", lvalue, rvalue);
+                        return 0;
+                }
+                if (r == 0)
+                        break;
+
+                r = unit_full_printf_full(u, source, PATH_MAX, &sresolved);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Failed to resolve unit specifiers in \"%s\", ignoring: %m", source);
+                        continue;
+                }
+
+                /* A leading "-" means: silently skip this mount if the source is missing. */
+                s = sresolved;
+                if (s[0] == '-') {
+                        ignore_enoent = true;
+                        s++;
+                }
+
+                r = path_simplify_and_warn(s, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+                if (r < 0)
+                        continue;
+
+                /* Optionally, the destination is specified. */
+                /* p points one past the separator extract_first_word() consumed, so p[-1]
+                 * tells us whether extraction stopped at ':' (destination follows) or at
+                 * whitespace (next entry). */
+                if (p && p[-1] == ':') {
+                        r = extract_first_word(&p, &destination, ":" WHITESPACE, EXTRACT_UNQUOTE|EXTRACT_DONT_COALESCE_SEPARATORS);
+                        if (r == -ENOMEM)
+                                return log_oom();
+                        if (r < 0) {
+                                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s, ignoring: %s", lvalue, rvalue);
+                                return 0;
+                        }
+                        if (r == 0) {
+                                log_syntax(unit, LOG_WARNING, filename, line, 0, "Missing argument after ':', ignoring: %s", s);
+                                continue;
+                        }
+
+                        r = unit_path_printf(u, destination, &dresolved);
+                        if (r < 0) {
+                                log_syntax(unit, LOG_WARNING, filename, line, r,
+                                           "Failed to resolve specifiers in \"%s\", ignoring: %m", destination);
+                                continue;
+                        }
+
+                        r = path_simplify_and_warn(dresolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+                        if (r < 0)
+                                continue;
+
+                        d = dresolved;
+
+                        /* Optionally, there's also a short option string specified */
+                        if (p && p[-1] == ':') {
+                                _cleanup_free_ char *options = NULL;
+
+                                r = extract_first_word(&p, &options, NULL, EXTRACT_UNQUOTE);
+                                if (r == -ENOMEM)
+                                        return log_oom();
+                                if (r < 0) {
+                                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse %s=, ignoring: %s", lvalue, rvalue);
+                                        return 0;
+                                }
+
+                                if (isempty(options) || streq(options, "rbind"))
+                                        rbind = true;
+                                else if (streq(options, "norbind"))
+                                        rbind = false;
+                                else {
+                                        log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid option string, ignoring setting: %s", options);
+                                        continue;
+                                }
+                        }
+                } else
+                        /* No destination given: mount the source onto the same path. */
+                        d = s;
+
+                /* s/d point into sresolved/dresolved, which are freed at the end of each
+                 * iteration — bind_mount_add() presumably deep-copies the template
+                 * (NOTE(review): confirm against its implementation). */
+                r = bind_mount_add(&c->bind_mounts, &c->n_bind_mounts,
+                                   &(BindMount) {
+                                           .source = s,
+                                           .destination = d,
+                                           .read_only = !!strstr(lvalue, "ReadOnly"),
+                                           .recursive = rbind,
+                                           .ignore_enoent = ignore_enoent,
+                                   });
+                if (r < 0)
+                        return log_oom();
+        }
+
+        return 0;
+}
+
+/* Parses MountImages=: a list of "source:destination[:partition:options...]" tuples. A
+ * "-" prefix on the source makes a missing image non-fatal. Per-partition mount options
+ * may follow as further ':'-separated "partition:options" pairs; a single trailing
+ * options field applies to the root partition. */
+int config_parse_mount_images(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        ExecContext *c = ASSERT_PTR(data);
+        const Unit *u = userdata;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets the list */
+                c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
+                return 0;
+        }
+
+        for (const char *p = rvalue;;) {
+                _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
+                _cleanup_free_ char *first = NULL, *second = NULL, *tuple = NULL;
+                _cleanup_free_ char *sresolved = NULL, *dresolved = NULL;
+                const char *q = NULL;
+                char *s = NULL;
+                bool permissive = false;
+
+                /* First pass: split on whitespace, keeping escapes for the ':' split below. */
+                r = extract_first_word(&p, &tuple, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Invalid syntax %s=%s, ignoring: %m", lvalue, rvalue);
+                        return 0;
+                }
+                if (r == 0)
+                        /* Input exhausted: this is the only successful loop exit. */
+                        return 0;
+
+                /* Second pass: source and destination; "q" is left pointing at the
+                 * remaining per-partition options, if any. */
+                q = tuple;
+                r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &first, &second, NULL);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Invalid syntax in %s=, ignoring: %s", lvalue, tuple);
+                        return 0;
+                }
+                if (r == 0)
+                        continue;
+
+                /* A leading "-" means: silently skip this mount if the image is missing. */
+                s = first;
+                if (s[0] == '-') {
+                        permissive = true;
+                        s++;
+                }
+
+                r = unit_path_printf(u, s, &sresolved);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Failed to resolve unit specifiers in \"%s\", ignoring: %m", s);
+                        continue;
+                }
+
+                r = path_simplify_and_warn(sresolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+                if (r < 0)
+                        continue;
+
+                if (isempty(second)) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0, "Missing destination in %s, ignoring: %s", lvalue, rvalue);
+                        continue;
+                }
+
+                r = unit_path_printf(u, second, &dresolved);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Failed to resolve specifiers in \"%s\", ignoring: %m", second);
+                        continue;
+                }
+
+                r = path_simplify_and_warn(dresolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+                if (r < 0)
+                        continue;
+
+                for (;;) {
+                        _cleanup_free_ char *partition = NULL, *mount_options = NULL, *mount_options_resolved = NULL;
+                        MountOptions *o = NULL;
+                        PartitionDesignator partition_designator;
+
+                        r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &partition, &mount_options, NULL);
+                        if (r == -ENOMEM)
+                                return log_oom();
+                        if (r < 0) {
+                                log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", q);
+                                return 0;
+                        }
+                        if (r == 0)
+                                break;
+                        /* Single set of options, applying to the root partition/single filesystem */
+                        if (r == 1) {
+                                r = unit_full_printf(u, partition, &mount_options_resolved);
+                                if (r < 0) {
+                                        /* Fix: report the string that failed to resolve
+                                         * ("partition"), not the source spec ("first"), matching
+                                         * config_parse_extension_images(). */
+                                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", partition);
+                                        continue;
+                                }
+
+                                o = new(MountOptions, 1);
+                                if (!o)
+                                        return log_oom();
+                                *o = (MountOptions) {
+                                        .partition_designator = PARTITION_ROOT,
+                                        .options = TAKE_PTR(mount_options_resolved),
+                                };
+                                LIST_APPEND(mount_options, options, o);
+
+                                break;
+                        }
+
+                        partition_designator = partition_designator_from_string(partition);
+                        if (partition_designator < 0) {
+                                log_syntax(unit, LOG_WARNING, filename, line, partition_designator,
+                                           "Invalid partition name %s, ignoring", partition);
+                                continue;
+                        }
+                        r = unit_full_printf(u, mount_options, &mount_options_resolved);
+                        if (r < 0) {
+                                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", mount_options);
+                                continue;
+                        }
+
+                        o = new(MountOptions, 1);
+                        if (!o)
+                                return log_oom();
+                        *o = (MountOptions) {
+                                .partition_designator = partition_designator,
+                                .options = TAKE_PTR(mount_options_resolved),
+                        };
+                        LIST_APPEND(mount_options, options, o);
+                }
+
+                r = mount_image_add(&c->mount_images, &c->n_mount_images,
+                                    &(MountImage) {
+                                            .source = sresolved,
+                                            .destination = dresolved,
+                                            .mount_options = options,
+                                            .ignore_enoent = permissive,
+                                            .type = MOUNT_IMAGE_DISCRETE,
+                                    });
+                if (r < 0)
+                        return log_oom();
+        }
+}
+
+/* Parses ExtensionImages=: a list of "source[:partition:options...]" tuples (no
+ * destination — extension images are overlaid, not mounted at a path). A "-" prefix on
+ * the source makes a missing image non-fatal; a single trailing options field applies to
+ * the root partition. */
+int config_parse_extension_images(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        ExecContext *c = ASSERT_PTR(data);
+        const Unit *u = userdata;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets the list */
+                c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
+                return 0;
+        }
+
+        for (const char *p = rvalue;;) {
+                _cleanup_free_ char *source = NULL, *tuple = NULL, *sresolved = NULL;
+                _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
+                bool permissive = false;
+                const char *q = NULL;
+                char *s = NULL;
+
+                /* First pass: split on whitespace, keeping escapes for the ':' split below. */
+                r = extract_first_word(&p, &tuple, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Invalid syntax %s=%s, ignoring: %m", lvalue, rvalue);
+                        return 0;
+                }
+                if (r == 0)
+                        /* Input exhausted: this is the only successful loop exit. */
+                        return 0;
+
+                /* Second pass: split off the source; "q" is left pointing at the remaining
+                 * per-partition options, if any. */
+                q = tuple;
+                r = extract_first_word(&q, &source, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Invalid syntax in %s=, ignoring: %s", lvalue, tuple);
+                        return 0;
+                }
+                if (r == 0)
+                        continue;
+
+                /* A leading "-" means: silently skip this image if it is missing. */
+                s = source;
+                if (s[0] == '-') {
+                        permissive = true;
+                        s++;
+                }
+
+                r = unit_path_printf(u, s, &sresolved);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Failed to resolve unit specifiers in \"%s\", ignoring: %m", s);
+                        continue;
+                }
+
+                r = path_simplify_and_warn(sresolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+                if (r < 0)
+                        continue;
+
+                for (;;) {
+                        _cleanup_free_ char *partition = NULL, *mount_options = NULL, *mount_options_resolved = NULL;
+                        MountOptions *o = NULL;
+                        PartitionDesignator partition_designator;
+
+                        r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &partition, &mount_options, NULL);
+                        if (r == -ENOMEM)
+                                return log_oom();
+                        if (r < 0) {
+                                log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", q);
+                                return 0;
+                        }
+                        if (r == 0)
+                                break;
+                        /* Single set of options, applying to the root partition/single filesystem */
+                        if (r == 1) {
+                                r = unit_full_printf(u, partition, &mount_options_resolved);
+                                if (r < 0) {
+                                        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", partition);
+                                        continue;
+                                }
+
+                                o = new(MountOptions, 1);
+                                if (!o)
+                                        return log_oom();
+                                *o = (MountOptions) {
+                                        .partition_designator = PARTITION_ROOT,
+                                        .options = TAKE_PTR(mount_options_resolved),
+                                };
+                                LIST_APPEND(mount_options, options, o);
+
+                                break;
+                        }
+
+                        partition_designator = partition_designator_from_string(partition);
+                        if (partition_designator < 0) {
+                                log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid partition name %s, ignoring", partition);
+                                continue;
+                        }
+                        r = unit_full_printf(u, mount_options, &mount_options_resolved);
+                        if (r < 0) {
+                                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in %s, ignoring: %m", mount_options);
+                                continue;
+                        }
+
+                        o = new(MountOptions, 1);
+                        if (!o)
+                                return log_oom();
+                        *o = (MountOptions) {
+                                .partition_designator = partition_designator,
+                                .options = TAKE_PTR(mount_options_resolved),
+                        };
+                        LIST_APPEND(mount_options, options, o);
+                }
+
+                r = mount_image_add(&c->extension_images, &c->n_extension_images,
+                                    &(MountImage) {
+                                            .source = sresolved,
+                                            .mount_options = options,
+                                            .ignore_enoent = permissive,
+                                            .type = MOUNT_IMAGE_EXTENSION,
+                                    });
+                if (r < 0)
+                        return log_oom();
+        }
+}
+
+/* Parser for JobTimeoutSec=. Invalid values are logged and ignored (returns 0)
+ * so that unit loading continues. */
+int config_parse_job_timeout_sec(
+ const char* unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ Unit *u = ASSERT_PTR(data);
+ usec_t usec;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ r = parse_sec_fix_0(rvalue, &usec);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse JobTimeoutSec= parameter, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ /* If the user explicitly changed JobTimeoutSec= also change JobRunningTimeoutSec=, for compatibility with old
+ * versions. If JobRunningTimeoutSec= was explicitly set, avoid this however as whatever the user picked should
+ * count. */
+
+ if (!u->job_running_timeout_set)
+ u->job_running_timeout = usec;
+
+ u->job_timeout = usec;
+
+ return 0;
+}
+
+/* Parser for JobRunningTimeoutSec=. Marks the value as explicitly set so that
+ * a later JobTimeoutSec= assignment will not overwrite it (see
+ * config_parse_job_timeout_sec()). */
+int config_parse_job_running_timeout_sec(
+ const char* unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ Unit *u = ASSERT_PTR(data);
+ usec_t usec;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ r = parse_sec_fix_0(rvalue, &usec);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse JobRunningTimeoutSec= parameter, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ u->job_running_timeout = usec;
+ u->job_running_timeout_set = true;
+
+ return 0;
+}
+
+/* Parser for emergency-action style settings (e.g. FailureAction=). The set of
+ * permitted actions depends on the runtime scope: when parsing a unit file the
+ * scope comes from the owning unit's manager; when parsing manager
+ * configuration no unit exists and the scope is passed in via ltype. */
+int config_parse_emergency_action(
+ const char* unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ EmergencyAction *x = ASSERT_PTR(data);
+ RuntimeScope runtime_scope;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ /* If we have a unit determine the scope based on it */
+ if (unit)
+ runtime_scope = ((Unit*) ASSERT_PTR(userdata))->manager->runtime_scope;
+ else
+ runtime_scope = ltype; /* otherwise, assume the scope is passed in via ltype */
+
+ r = parse_emergency_action(rvalue, runtime_scope, x);
+ if (r < 0) {
+ /* -EOPNOTSUPP: the action exists but is not allowed in this scope */
+ if (r == -EOPNOTSUPP)
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "%s= specified as %s mode action, ignoring: %s",
+ lvalue, runtime_scope_to_string(runtime_scope), rvalue);
+ else
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to parse %s=, ignoring: %s", lvalue, rvalue);
+ return 0;
+ }
+
+ return 0;
+}
+
+/* Parser for PIDFile=. Resolves unit specifiers, makes relative paths absolute
+ * under the runtime directory prefix (/run), and validates the result. An
+ * empty assignment clears a previously set value. */
+int config_parse_pid_file(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *k = NULL, *n = NULL;
+ const Unit *u = ASSERT_PTR(userdata);
+ char **s = data;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* An empty assignment removes already set value. */
+ *s = mfree(*s);
+ return 0;
+ }
+
+ r = unit_path_printf(u, rvalue, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
+ return 0;
+ }
+
+ /* If this is a relative path make it absolute by prefixing the /run */
+ n = path_make_absolute(k, u->manager->prefix[EXEC_DIRECTORY_RUNTIME]);
+ if (!n)
+ return log_oom();
+
+ /* Check that the result is a sensible path */
+ r = path_simplify_and_warn(n, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+ if (r < 0)
+ return r;
+
+ /* NOTE(review): presumably rewrites legacy /var/run prefixes to /run — confirm in patch_var_run() */
+ r = patch_var_run(unit, filename, line, lvalue, &n);
+ if (r < 0)
+ return r;
+
+ free_and_replace(*s, n);
+ return 0;
+}
+
+/* Parser for a single exit status in the 0..255 range. An empty assignment
+ * stores -1, i.e. "unset". */
+int config_parse_exit_status(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ int *exit_status = data, r;
+ uint8_t u;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+ assert(exit_status);
+
+ if (isempty(rvalue)) {
+ *exit_status = -1;
+ return 0;
+ }
+
+ /* uint8_t parse enforces the 0..255 range */
+ r = safe_atou8(rvalue, &u);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse exit status '%s', ignoring: %m", rvalue);
+ return 0;
+ }
+
+ *exit_status = u;
+ return 0;
+}
+
+/* Parser for DisableControllers=. Controller masks from repeated assignments
+ * are merged (OR-ed); an empty assignment resets the mask.
+ * NOTE(review): unlike sibling parsers this one has no assert(filename) etc.
+ * preamble and does not ASSERT_PTR(data) — harmless, but inconsistent. */
+int config_parse_disable_controllers(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ int r;
+ CGroupContext *c = data;
+ CGroupMask disabled_mask;
+
+ /* 1. If empty, make all controllers eligible for use again.
+ * 2. If non-empty, merge all listed controllers, space separated. */
+
+ if (isempty(rvalue)) {
+ c->disable_controllers = 0;
+ return 0;
+ }
+
+ r = cg_mask_from_string(rvalue, &disabled_mask);
+ if (r < 0 || disabled_mask <= 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid cgroup string: %s, ignoring", rvalue);
+ return 0;
+ }
+
+ c->disable_controllers |= disabled_mask;
+
+ return 0;
+}
+
+/* Parser for IPIngressFilterPath=/IPEgressFilterPath=. Appends the resolved
+ * pinned-BPF-program path to the string list (deduplicated), and warns once
+ * globally if the kernel lacks multi-program cgroup BPF support. */
+int config_parse_ip_filter_bpf_progs(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *resolved = NULL;
+ const Unit *u = userdata;
+ char ***paths = ASSERT_PTR(data);
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment resets the list */
+ *paths = strv_free(*paths);
+ return 0;
+ }
+
+ r = unit_path_printf(u, rvalue, &resolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue);
+ return 0;
+ }
+
+ r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+ if (r < 0)
+ return 0;
+
+ /* Silently skip duplicates */
+ if (strv_contains(*paths, resolved))
+ return 0;
+
+ r = strv_extend(paths, resolved);
+ if (r < 0)
+ return log_oom();
+
+ r = bpf_firewall_supported();
+ if (r < 0)
+ return r;
+ if (r != BPF_FIREWALL_SUPPORTED_WITH_MULTI) {
+ /* Warn loudly only once per manager run; later hits are logged at debug level */
+ static bool warned = false;
+
+ log_full(warned ? LOG_DEBUG : LOG_WARNING,
+ "File %s:%u configures an IP firewall with BPF programs (%s=%s), but the local system does not support BPF/cgroup based firewalling with multiple filters.\n"
+ "Starting this unit will fail! (This warning is only shown for the first loaded unit using IP firewalling.)", filename, line, lvalue, rvalue);
+
+ warned = true;
+ }
+
+ return 0;
+}
+
+/* Parser for BPFProgram=. Expects "<attach-type>:<path>", where the path
+ * points at a pinned BPF program; resolves specifiers in the path and attaches
+ * the entry to the cgroup context. Empty assignment drops all entries. */
+int config_parse_bpf_foreign_program(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+ _cleanup_free_ char *resolved = NULL, *word = NULL;
+ CGroupContext *c = data;
+ const char *p = ASSERT_PTR(rvalue);
+ Unit *u = userdata;
+ int attach_type, r;
+
+ assert(filename);
+ assert(lvalue);
+
+ if (isempty(rvalue)) {
+ while (c->bpf_foreign_programs)
+ cgroup_context_remove_bpf_foreign_program(c, c->bpf_foreign_programs);
+
+ return 0;
+ }
+
+ /* word = attach type, remainder p = program path */
+ r = extract_first_word(&p, &word, ":", 0);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse foreign BPF program, ignoring: %s", rvalue);
+ return 0;
+ }
+ if (r == 0 || isempty(p)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid syntax in %s=, ignoring: %s", lvalue, rvalue);
+ return 0;
+ }
+
+ attach_type = bpf_cgroup_attach_type_from_string(word);
+ if (attach_type < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Unknown BPF attach type=%s, ignoring: %s", word, rvalue);
+ return 0;
+ }
+
+ r = unit_path_printf(u, p, &resolved);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %s", p, rvalue);
+ return 0;
+ }
+
+ r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue);
+ if (r < 0)
+ return 0;
+
+ /* Unlike most parsers here, OOM/list failure at this point is fatal for loading */
+ r = cgroup_context_add_bpf_foreign_program(c, attach_type, resolved);
+ if (r < 0)
+ return log_error_errno(r, "Failed to add foreign BPF program to cgroup context: %m");
+
+ return 0;
+}
+
+int config_parse_cgroup_socket_bind(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+ _cleanup_free_ CGroupSocketBindItem *item = NULL;
+ CGroupSocketBindItem **head = data;
+ uint16_t nr_ports, port_min;
+ int af, ip_protocol, r;
+
+ if (isempty(rvalue)) {
+ cgroup_context_remove_socket_bind(head);
+ return 0;
+ }
+
+ r = parse_socket_bind_item(rvalue, &af, &ip_protocol, &nr_ports, &port_min);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Unable to parse %s= assignment, ignoring: %s", lvalue, rvalue);
+ return 0;
+ }
+
+ item = new(CGroupSocketBindItem, 1);
+ if (!item)
+ return log_oom();
+ *item = (CGroupSocketBindItem) {
+ .address_family = af,
+ .ip_protocol = ip_protocol,
+ .nr_ports = nr_ports,
+ .port_min = port_min,
+ };
+
+ LIST_PREPEND(socket_bind_items, *head, TAKE_PTR(item));
+
+ return 0;
+}
+
+/* Parser for RestrictNetworkInterfaces=. A leading '~' inverts the rule. The
+ * first non-empty assignment decides whether the set is an allow- or
+ * deny-list; later assignments with the opposite polarity remove entries
+ * instead of adding them. Empty assignment resets everything. */
+int config_parse_restrict_network_interfaces(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+ CGroupContext *c = ASSERT_PTR(data);
+ bool is_allow_rule = true;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ /* Empty assignment resets the list */
+ c->restrict_network_interfaces = set_free_free(c->restrict_network_interfaces);
+ return 0;
+ }
+
+ if (rvalue[0] == '~') {
+ is_allow_rule = false;
+ rvalue++;
+ }
+
+ if (set_isempty(c->restrict_network_interfaces))
+ /* Only initialize this when creating the set */
+ c->restrict_network_interfaces_is_allow_list = is_allow_rule;
+
+ for (const char *p = rvalue;;) {
+ _cleanup_free_ char *word = NULL;
+
+ r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE);
+ if (r == 0)
+ break;
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Trailing garbage in %s, ignoring: %s", lvalue, rvalue);
+ break;
+ }
+
+ if (!ifname_valid(word)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid interface name, ignoring: %s", word);
+ continue;
+ }
+
+ /* Opposite polarity: take the name out of the set rather than adding it */
+ if (c->restrict_network_interfaces_is_allow_list != is_allow_rule)
+ free(set_remove(c->restrict_network_interfaces, word));
+ else {
+ r = set_put_strdup(&c->restrict_network_interfaces, word);
+ if (r < 0)
+ return log_oom();
+ }
+ }
+
+ return 0;
+}
+
+/* Merges all alias names in 'names' into unit 'u' (consuming the set). If a
+ * name cannot be merged into u, but an existing unit by that name can absorb
+ * u, the roles flip and the remaining names are merged into that other unit
+ * via tail recursion (with no preferred id). If 'id' matches one of the
+ * merged names it becomes the unit's chosen id. */
+static int merge_by_names(Unit *u, Set *names, const char *id) {
+ char *k;
+ int r;
+
+ assert(u);
+
+ /* Let's try to add in all names that are aliases of this unit */
+ while ((k = set_steal_first(names))) {
+ _cleanup_free_ _unused_ char *free_k = k;
+
+ /* First try to merge in the other name into our unit */
+ r = unit_merge_by_name(u, k);
+ if (r < 0) {
+ Unit *other;
+
+ /* Hmm, we couldn't merge the other unit into ours? Then let's try it the other way
+ * round. */
+
+ other = manager_get_unit(u->manager, k);
+ if (!other)
+ return r; /* return previous failure */
+
+ r = unit_merge(other, u);
+ if (r < 0)
+ return r;
+
+ return merge_by_names(other, names, NULL);
+ }
+
+ if (streq_ptr(id, k))
+ unit_choose_id(u, id);
+ }
+
+ return 0;
+}
+
+/* Loads the configuration fragment for unit u: refreshes the unit-file name
+ * map, locates the fragment and its alias names, detects masking (empty
+ * file), caches the file's SELinux context, parses the file contents, and
+ * finally merges alias names into the unit. Transient units skip all of this
+ * and are marked loaded immediately. */
+int unit_load_fragment(Unit *u) {
+ const char *fragment;
+ _cleanup_set_free_free_ Set *names = NULL;
+ int r;
+
+ assert(u);
+ assert(u->load_state == UNIT_STUB);
+ assert(u->id);
+
+ if (u->transient) {
+ u->access_selinux_context = mfree(u->access_selinux_context);
+ u->load_state = UNIT_LOADED;
+ return 0;
+ }
+
+ /* Possibly rebuild the fragment map to catch new units */
+ r = unit_file_build_name_map(&u->manager->lookup_paths,
+ &u->manager->unit_cache_timestamp_hash,
+ &u->manager->unit_id_map,
+ &u->manager->unit_name_map,
+ &u->manager->unit_path_cache);
+ if (r < 0)
+ return log_error_errno(r, "Failed to rebuild name map: %m");
+
+ r = unit_file_find_fragment(u->manager->unit_id_map,
+ u->manager->unit_name_map,
+ u->id,
+ &fragment,
+ &names);
+ if (r < 0 && r != -ENOENT)
+ return r;
+
+ if (fragment) {
+ /* Open the file, check if this is a mask, otherwise read. */
+ _cleanup_fclose_ FILE *f = NULL;
+ struct stat st;
+
+ /* Try to open the file name. A symlink is OK, for example for linked files or masks. We
+ * expect that all symlinks within the lookup paths have been already resolved, but we don't
+ * verify this here. */
+ f = fopen(fragment, "re");
+ if (!f)
+ return log_unit_notice_errno(u, errno, "Failed to open %s: %m", fragment);
+
+ if (fstat(fileno(f), &st) < 0)
+ return -errno;
+
+ r = free_and_strdup(&u->fragment_path, fragment);
+ if (r < 0)
+ return r;
+
+ if (null_or_empty(&st)) {
+ /* Unit file is masked */
+
+ u->load_state = u->perpetual ? UNIT_LOADED : UNIT_MASKED; /* don't allow perpetual units to ever be masked */
+ u->fragment_mtime = 0;
+ u->access_selinux_context = mfree(u->access_selinux_context);
+ } else {
+#if HAVE_SELINUX
+ if (mac_selinux_use()) {
+ _cleanup_freecon_ char *selcon = NULL;
+
+ /* Cache the SELinux context of the unit file here. We'll make use of when checking access permissions to loaded units */
+ r = fgetfilecon_raw(fileno(f), &selcon);
+ if (r < 0)
+ log_unit_warning_errno(u, r, "Failed to read SELinux context of '%s', ignoring: %m", fragment);
+
+ r = free_and_strdup(&u->access_selinux_context, selcon);
+ if (r < 0)
+ return r;
+ } else
+#endif
+ u->access_selinux_context = mfree(u->access_selinux_context);
+
+ u->load_state = UNIT_LOADED;
+ u->fragment_mtime = timespec_load(&st.st_mtim);
+
+ /* Now, parse the file contents */
+ r = config_parse(u->id, fragment, f,
+ UNIT_VTABLE(u)->sections,
+ config_item_perf_lookup, load_fragment_gperf_lookup,
+ 0,
+ u,
+ NULL);
+ if (r == -ENOEXEC)
+ log_unit_notice_errno(u, r, "Unit configuration has fatal error, unit will not be started.");
+ if (r < 0)
+ return r;
+ }
+ }
+
+ /* Call merge_by_names with the name derived from the fragment path as the preferred name.
+ *
+ * We do the merge dance here because for some unit types, the unit might have aliases which are not
+ * declared in the file system. In particular, this is true (and frequent) for device and swap units.
+ */
+ const char *id = u->id;
+ _cleanup_free_ char *filename = NULL, *free_id = NULL;
+
+ if (fragment) {
+ r = path_extract_filename(fragment, &filename);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to extract filename from fragment '%s': %m", fragment);
+ id = filename;
+
+ if (unit_name_is_valid(id, UNIT_NAME_TEMPLATE)) {
+ assert(u->instance); /* If we're not trying to use a template for non-instanced unit,
+ * this must be set. */
+
+ r = unit_name_replace_instance(id, u->instance, &free_id);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to build id (%s + %s): %m", id, u->instance);
+ id = free_id;
+ }
+ }
+
+ return merge_by_names(u, names, id);
+}
+
+/* Dumps all supported unit-file directives to f, grouped by section, mapping
+ * each setting's parser callback to a symbolic value-type label from the
+ * static table below (unknown callbacks print as "OTHER"). Used to implement
+ * `systemd --dump-configuration-items`. */
+void unit_dump_config_items(FILE *f) {
+ static const struct {
+ const ConfigParserCallback callback;
+ const char *rvalue;
+ } table[] = {
+ { config_parse_warn_compat, "NOTSUPPORTED" },
+ { config_parse_int, "INTEGER" },
+ { config_parse_unsigned, "UNSIGNED" },
+ { config_parse_iec_size, "SIZE" },
+ { config_parse_iec_uint64, "SIZE" },
+ { config_parse_si_uint64, "SIZE" },
+ { config_parse_bool, "BOOLEAN" },
+ { config_parse_string, "STRING" },
+ { config_parse_path, "PATH" },
+ { config_parse_unit_path_printf, "PATH" },
+ { config_parse_colon_separated_paths, "PATH" },
+ { config_parse_strv, "STRING [...]" },
+ { config_parse_exec_nice, "NICE" },
+ { config_parse_exec_oom_score_adjust, "OOMSCOREADJUST" },
+ { config_parse_exec_io_class, "IOCLASS" },
+ { config_parse_exec_io_priority, "IOPRIORITY" },
+ { config_parse_exec_cpu_sched_policy, "CPUSCHEDPOLICY" },
+ { config_parse_exec_cpu_sched_prio, "CPUSCHEDPRIO" },
+ { config_parse_exec_cpu_affinity, "CPUAFFINITY" },
+ { config_parse_mode, "MODE" },
+ { config_parse_unit_env_file, "FILE" },
+ { config_parse_exec_output, "OUTPUT" },
+ { config_parse_exec_input, "INPUT" },
+ { config_parse_log_facility, "FACILITY" },
+ { config_parse_log_level, "LEVEL" },
+ { config_parse_exec_secure_bits, "SECUREBITS" },
+ { config_parse_capability_set, "BOUNDINGSET" },
+ { config_parse_rlimit, "LIMIT" },
+ { config_parse_unit_deps, "UNIT [...]" },
+ { config_parse_exec, "PATH [ARGUMENT [...]]" },
+ { config_parse_service_type, "SERVICETYPE" },
+ { config_parse_service_exit_type, "SERVICEEXITTYPE" },
+ { config_parse_service_restart, "SERVICERESTART" },
+ { config_parse_service_restart_mode, "SERVICERESTARTMODE" },
+ { config_parse_service_timeout_failure_mode, "TIMEOUTMODE" },
+ { config_parse_kill_mode, "KILLMODE" },
+ { config_parse_signal, "SIGNAL" },
+ { config_parse_socket_listen, "SOCKET [...]" },
+ { config_parse_socket_bind, "SOCKETBIND" },
+ { config_parse_socket_bindtodevice, "NETWORKINTERFACE" },
+ { config_parse_sec, "SECONDS" },
+ { config_parse_nsec, "NANOSECONDS" },
+ { config_parse_namespace_path_strv, "PATH [...]" },
+ { config_parse_bind_paths, "PATH[:PATH[:OPTIONS]] [...]" },
+ { config_parse_unit_requires_mounts_for,
+ "PATH [...]" },
+ { config_parse_exec_mount_propagation_flag,
+ "MOUNTFLAG" },
+ { config_parse_unit_string_printf, "STRING" },
+ { config_parse_trigger_unit, "UNIT" },
+ { config_parse_timer, "TIMER" },
+ { config_parse_path_spec, "PATH" },
+ { config_parse_notify_access, "ACCESS" },
+ { config_parse_ip_tos, "TOS" },
+ { config_parse_unit_condition_path, "CONDITION" },
+ { config_parse_unit_condition_string, "CONDITION" },
+ { config_parse_unit_slice, "SLICE" },
+ { config_parse_documentation, "URL" },
+ { config_parse_service_timeout, "SECONDS" },
+ { config_parse_emergency_action, "ACTION" },
+ { config_parse_set_status, "STATUS" },
+ { config_parse_service_sockets, "SOCKETS" },
+ { config_parse_environ, "ENVIRON" },
+#if HAVE_SECCOMP
+ { config_parse_syscall_filter, "SYSCALLS" },
+ { config_parse_syscall_archs, "ARCHS" },
+ { config_parse_syscall_errno, "ERRNO" },
+ { config_parse_syscall_log, "SYSCALLS" },
+ { config_parse_address_families, "FAMILIES" },
+ { config_parse_restrict_namespaces, "NAMESPACES" },
+#endif
+ { config_parse_restrict_filesystems, "FILESYSTEMS" },
+ { config_parse_cpu_shares, "SHARES" },
+ { config_parse_cg_weight, "WEIGHT" },
+ { config_parse_cg_cpu_weight, "CPUWEIGHT" },
+ { config_parse_memory_limit, "LIMIT" },
+ { config_parse_device_allow, "DEVICE" },
+ { config_parse_device_policy, "POLICY" },
+ { config_parse_io_limit, "LIMIT" },
+ { config_parse_io_device_weight, "DEVICEWEIGHT" },
+ { config_parse_io_device_latency, "DEVICELATENCY" },
+ { config_parse_blockio_bandwidth, "BANDWIDTH" },
+ { config_parse_blockio_weight, "WEIGHT" },
+ { config_parse_blockio_device_weight, "DEVICEWEIGHT" },
+ { config_parse_long, "LONG" },
+ { config_parse_socket_service, "SERVICE" },
+#if HAVE_SELINUX
+ { config_parse_exec_selinux_context, "LABEL" },
+#endif
+ { config_parse_job_mode, "MODE" },
+ { config_parse_job_mode_isolate, "BOOLEAN" },
+ { config_parse_personality, "PERSONALITY" },
+ { config_parse_log_filter_patterns, "REGEX" },
+ };
+
+ const char *prev = NULL;
+
+ assert(f);
+
+ /* Iterate over all known "Section.Setting" entries of the gperf lookup table */
+ NULSTR_FOREACH(i, load_fragment_gperf_nulstr) {
+ const char *rvalue = "OTHER", *lvalue;
+ const ConfigPerfItem *p;
+ const char *dot;
+
+ assert_se(p = load_fragment_gperf_lookup(i, strlen(i)));
+
+ /* Hide legacy settings */
+ if (p->parse == config_parse_warn_compat &&
+ p->ltype == DISABLED_LEGACY)
+ continue;
+
+ for (size_t j = 0; j < ELEMENTSOF(table); j++)
+ if (p->parse == table[j].callback) {
+ rvalue = table[j].rvalue;
+ break;
+ }
+
+ dot = strchr(i, '.');
+ lvalue = dot ? dot + 1 : i;
+
+ /* Emit a "[Section]" header whenever the section prefix changes */
+ if (dot) {
+ size_t prefix_len = dot - i;
+
+ if (!prev || !strneq(prev, i, prefix_len+1)) {
+ if (prev)
+ fputc('\n', f);
+
+ fprintf(f, "[%.*s]\n", (int) prefix_len, i);
+ }
+ }
+
+ fprintf(f, "%s=%s\n", lvalue, rvalue);
+ prev = i;
+ }
+}
+
+/* Parser for the manager's CPUAffinity= setting. Delegates entirely to
+ * parse_cpu_set_extend(), which does its own logging; the return value is
+ * deliberately ignored so parsing always continues. */
+int config_parse_cpu_affinity2(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ CPUSet *affinity = ASSERT_PTR(data);
+
+ (void) parse_cpu_set_extend(rvalue, affinity, true, unit, filename, line, lvalue);
+
+ return 0;
+}
+
+/* Parser for ShowStatus=. Invalid values are logged and the previous value is
+ * kept. */
+int config_parse_show_status(
+ const char* unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ int k;
+ ShowStatus *b = ASSERT_PTR(data);
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ k = parse_show_status(rvalue, b);
+ if (k < 0)
+ log_syntax(unit, LOG_WARNING, filename, line, k, "Failed to parse show status setting, ignoring: %s", rvalue);
+
+ return 0;
+}
+
+/* Parser for the manager-level DefaultStandardOutput=/DefaultStandardError=
+ * settings. Accepts the obsolete "syslog"/"syslog+console" spellings (mapped
+ * to their journal equivalents with a notice), and rejects output types that
+ * only make sense per-unit (socket, fd:, file:, append:, truncate:). */
+int config_parse_output_restricted(
+ const char* unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecOutput t, *eo = ASSERT_PTR(data);
+ bool obsolete = false;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (streq(rvalue, "syslog")) {
+ t = EXEC_OUTPUT_JOURNAL;
+ obsolete = true;
+ } else if (streq(rvalue, "syslog+console")) {
+ t = EXEC_OUTPUT_JOURNAL_AND_CONSOLE;
+ obsolete = true;
+ } else {
+ t = exec_output_from_string(rvalue);
+ if (t < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, t, "Failed to parse output type, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ if (IN_SET(t, EXEC_OUTPUT_SOCKET, EXEC_OUTPUT_NAMED_FD, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE)) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Standard output types socket, fd:, file:, append:, truncate: are not supported as defaults, ignoring: %s", rvalue);
+ return 0;
+ }
+ }
+
+ if (obsolete)
+ log_syntax(unit, LOG_NOTICE, filename, line, 0,
+ "Standard output type %s is obsolete, automatically updating to %s. Please update your configuration.",
+ rvalue, exec_output_to_string(t));
+
+ *eo = t;
+ return 0;
+}
+
+/* Parser for CrashChangeVT=. Delegates to parse_crash_chvt(); on failure the
+ * previous value is kept and a warning logged. */
+int config_parse_crash_chvt(
+ const char* unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+ assert(data);
+
+ r = parse_crash_chvt(rvalue, data);
+ if (r < 0)
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse CrashChangeVT= setting, ignoring: %s", rvalue);
+
+ return 0;
+}
+
+/* Parser for a swap unit's Priority=. Valid range is -1..32767; values below
+ * -1 are kernel-assigned only and rejected here. Empty assignment unsets the
+ * priority. Note that userdata (the Swap unit) is used, not data. */
+int config_parse_swap_priority(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ Swap *s = ASSERT_PTR(userdata);
+ int r, priority;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+ assert(data);
+
+ if (isempty(rvalue)) {
+ s->parameters_fragment.priority = -1;
+ s->parameters_fragment.priority_set = false;
+ return 0;
+ }
+
+ r = safe_atoi(rvalue, &priority);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid swap priority '%s', ignoring.", rvalue);
+ return 0;
+ }
+
+ if (priority < -1) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Sorry, swap priorities smaller than -1 may only be assigned by the kernel itself, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ if (priority > 32767) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Swap priority out of range, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ s->parameters_fragment.priority = priority;
+ s->parameters_fragment.priority_set = true;
+ return 0;
+}
+
+/* Parser for {Runtime,Reboot,KExec}WatchdogSec=. "default" maps to
+ * USEC_INFINITY, "off" to 0; anything else is handled by the generic
+ * seconds parser. */
+int config_parse_watchdog_sec(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ usec_t *usec = data;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ /* This is called for {Runtime,Reboot,KExec}WatchdogSec= where "default" maps to
+ * USEC_INFINITY internally. */
+
+ if (streq(rvalue, "default"))
+ *usec = USEC_INFINITY;
+ else if (streq(rvalue, "off"))
+ *usec = 0;
+ else
+ return config_parse_sec(unit, filename, line, section, section_line, lvalue, ltype, rvalue, data, userdata);
+
+ return 0;
+}
+
+/* Parser for TTY rows/columns settings. An empty assignment resets to
+ * UINT_MAX ("automatic"); otherwise the generic unsigned parser is used. */
+int config_parse_tty_size(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ unsigned *sz = data;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ *sz = UINT_MAX;
+ return 0;
+ }
+
+ return config_parse_unsigned(unit, filename, line, section, section_line, lvalue, ltype, rvalue, data, userdata);
+}
+
+/* Parser for LogFilterPatterns=. A leading '~' marks the pattern as a denial;
+ * the pattern is compiled once here (to validate it) and stored as a string in
+ * the allow or deny set. Empty assignment clears both sets. */
+int config_parse_log_filter_patterns(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ ExecContext *c = ASSERT_PTR(data);
+ const char *pattern = ASSERT_PTR(rvalue);
+ bool is_allowlist = true;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+
+ if (isempty(pattern)) {
+ /* Empty assignment resets the lists. */
+ c->log_filter_allowed_patterns = set_free_free(c->log_filter_allowed_patterns);
+ c->log_filter_denied_patterns = set_free_free(c->log_filter_denied_patterns);
+ return 0;
+ }
+
+ if (pattern[0] == '~') {
+ is_allowlist = false;
+ pattern++;
+ if (isempty(pattern))
+ /* LogFilterPatterns=~ is not considered a valid pattern. */
+ return log_syntax(unit, LOG_WARNING, filename, line, 0,
+ "Regex pattern invalid, ignoring: %s=%s", lvalue, rvalue);
+ }
+
+ /* Validate the regex; pattern_compile_and_log() logs on failure */
+ if (pattern_compile_and_log(pattern, 0, NULL) < 0)
+ return 0;
+
+ r = set_put_strdup(is_allowlist ? &c->log_filter_allowed_patterns : &c->log_filter_denied_patterns,
+ pattern);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to store log filtering pattern, ignoring: %s=%s", lvalue, rvalue);
+ return 0;
+ }
+
+ return 0;
+}
+
+/* Parser for OpenFile=. Appends one parsed OpenFile entry to the list; empty
+ * assignment frees all previously configured entries. */
+int config_parse_open_file(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_(open_file_freep) OpenFile *of = NULL;
+ OpenFile **head = ASSERT_PTR(data);
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue)) {
+ open_file_free_many(head);
+ return 0;
+ }
+
+ r = open_file_parse(rvalue, &of);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse OpenFile= setting, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ /* Ownership moves to the list */
+ LIST_APPEND(open_files, *head, TAKE_PTR(of));
+
+ return 0;
+}
+
+/* Parser for the cgroup NFTSet= setting; thin wrapper that forwards to the
+ * shared config_parse_nft_set() with the cgroup context's NFTSetContext. */
+int config_parse_cgroup_nft_set(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ CGroupContext *c = ASSERT_PTR(data);
+ Unit *u = ASSERT_PTR(userdata);
+
+ return config_parse_nft_set(unit, filename, line, section, section_line, lvalue, ltype, rvalue, &c->nft_set_context, u);
+}
diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h
new file mode 100644
index 0000000..6919805
--- /dev/null
+++ b/src/core/load-fragment.h
@@ -0,0 +1,165 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "conf-parser.h"
+#include "unit.h"
+
+/* These functions are declared in the header to make them accessible to unit tests. */
+bool contains_instance_specifier_superset(const char *s);
+int unit_is_likely_recursive_template_dependency(Unit *u, const char *name, const char *format);
+
+/* Config-parsing helpers relevant only for sources under src/core/ */
+int parse_crash_chvt(const char *value, int *data);
+int parse_confirm_spawn(const char *value, char **console);
+
+int hashmap_put_credential(Hashmap **h, const char *id, const char *path, bool encrypted);
+
+/* Read service data from .desktop file style configuration fragments */
+
+int unit_load_fragment(Unit *u);
+
+void unit_dump_config_items(FILE *f);
+
+CONFIG_PARSER_PROTOTYPE(config_parse_unit_deps);
+CONFIG_PARSER_PROTOTYPE(config_parse_obsolete_unit_deps);
+CONFIG_PARSER_PROTOTYPE(config_parse_unit_string_printf);
+CONFIG_PARSER_PROTOTYPE(config_parse_unit_strv_printf);
+CONFIG_PARSER_PROTOTYPE(config_parse_unit_path_printf);
+CONFIG_PARSER_PROTOTYPE(config_parse_colon_separated_paths);
+CONFIG_PARSER_PROTOTYPE(config_parse_unit_path_strv_printf);
+CONFIG_PARSER_PROTOTYPE(config_parse_documentation);
+CONFIG_PARSER_PROTOTYPE(config_parse_socket_listen);
+CONFIG_PARSER_PROTOTYPE(config_parse_socket_protocol);
+CONFIG_PARSER_PROTOTYPE(config_parse_socket_bind);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_nice);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_oom_score_adjust);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_coredump_filter);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec);
+CONFIG_PARSER_PROTOTYPE(config_parse_service_timeout);
+CONFIG_PARSER_PROTOTYPE(config_parse_service_timeout_abort);
+CONFIG_PARSER_PROTOTYPE(config_parse_service_timeout_failure_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_service_type);
+CONFIG_PARSER_PROTOTYPE(config_parse_service_exit_type);
+CONFIG_PARSER_PROTOTYPE(config_parse_service_restart);
+CONFIG_PARSER_PROTOTYPE(config_parse_service_restart_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_socket_bindtodevice);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_output);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_input);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_input_text);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_input_data);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_io_class);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_io_priority);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_cpu_sched_policy);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_cpu_sched_prio);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_cpu_affinity);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_mount_apivfs);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_secure_bits);
+CONFIG_PARSER_PROTOTYPE(config_parse_root_image_options);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_root_hash);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_root_hash_sig);
+CONFIG_PARSER_PROTOTYPE(config_parse_capability_set);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_mount_propagation_flag);
+CONFIG_PARSER_PROTOTYPE(config_parse_timer);
+CONFIG_PARSER_PROTOTYPE(config_parse_trigger_unit);
+CONFIG_PARSER_PROTOTYPE(config_parse_path_spec);
+CONFIG_PARSER_PROTOTYPE(config_parse_socket_service);
+CONFIG_PARSER_PROTOTYPE(config_parse_service_sockets);
+CONFIG_PARSER_PROTOTYPE(config_parse_unit_env_file);
+CONFIG_PARSER_PROTOTYPE(config_parse_ip_tos);
+CONFIG_PARSER_PROTOTYPE(config_parse_unit_condition_path);
+CONFIG_PARSER_PROTOTYPE(config_parse_unit_condition_string);
+CONFIG_PARSER_PROTOTYPE(config_parse_kill_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_notify_access);
+CONFIG_PARSER_PROTOTYPE(config_parse_emergency_action);
+CONFIG_PARSER_PROTOTYPE(config_parse_unit_requires_mounts_for);
+CONFIG_PARSER_PROTOTYPE(config_parse_syscall_filter);
+CONFIG_PARSER_PROTOTYPE(config_parse_syscall_archs);
+CONFIG_PARSER_PROTOTYPE(config_parse_syscall_errno);
+CONFIG_PARSER_PROTOTYPE(config_parse_syscall_log);
+CONFIG_PARSER_PROTOTYPE(config_parse_environ);
+CONFIG_PARSER_PROTOTYPE(config_parse_pass_environ);
+CONFIG_PARSER_PROTOTYPE(config_parse_unset_environ);
+CONFIG_PARSER_PROTOTYPE(config_parse_unit_slice);
+CONFIG_PARSER_PROTOTYPE(config_parse_cg_weight);
+CONFIG_PARSER_PROTOTYPE(config_parse_cg_cpu_weight);
+CONFIG_PARSER_PROTOTYPE(config_parse_cpu_shares);
+CONFIG_PARSER_PROTOTYPE(config_parse_memory_limit);
+CONFIG_PARSER_PROTOTYPE(config_parse_tasks_max);
+CONFIG_PARSER_PROTOTYPE(config_parse_delegate);
+CONFIG_PARSER_PROTOTYPE(config_parse_delegate_subgroup);
+CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mem_pressure_limit);
+CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_preference);
+CONFIG_PARSER_PROTOTYPE(config_parse_device_policy);
+CONFIG_PARSER_PROTOTYPE(config_parse_device_allow);
+CONFIG_PARSER_PROTOTYPE(config_parse_io_device_latency);
+CONFIG_PARSER_PROTOTYPE(config_parse_io_device_weight);
+CONFIG_PARSER_PROTOTYPE(config_parse_io_limit);
+CONFIG_PARSER_PROTOTYPE(config_parse_blockio_weight);
+CONFIG_PARSER_PROTOTYPE(config_parse_blockio_device_weight);
+CONFIG_PARSER_PROTOTYPE(config_parse_blockio_bandwidth);
+CONFIG_PARSER_PROTOTYPE(config_parse_job_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_job_mode_isolate);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_selinux_context);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_apparmor_profile);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_smack_process_label);
+CONFIG_PARSER_PROTOTYPE(config_parse_address_families);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_preserve_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_directories);
+CONFIG_PARSER_PROTOTYPE(config_parse_set_credential);
+CONFIG_PARSER_PROTOTYPE(config_parse_load_credential);
+CONFIG_PARSER_PROTOTYPE(config_parse_import_credential);
+CONFIG_PARSER_PROTOTYPE(config_parse_set_status);
+CONFIG_PARSER_PROTOTYPE(config_parse_namespace_path_strv);
+CONFIG_PARSER_PROTOTYPE(config_parse_temporary_filesystems);
+CONFIG_PARSER_PROTOTYPE(config_parse_cpu_quota);
+CONFIG_PARSER_PROTOTYPE(config_parse_allowed_cpuset);
+CONFIG_PARSER_PROTOTYPE(config_parse_protect_home);
+CONFIG_PARSER_PROTOTYPE(config_parse_protect_system);
+CONFIG_PARSER_PROTOTYPE(config_parse_bus_name);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_utmp_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_working_directory);
+CONFIG_PARSER_PROTOTYPE(config_parse_fdname);
+CONFIG_PARSER_PROTOTYPE(config_parse_user_group_compat);
+CONFIG_PARSER_PROTOTYPE(config_parse_user_group_strv_compat);
+CONFIG_PARSER_PROTOTYPE(config_parse_restrict_namespaces);
+CONFIG_PARSER_PROTOTYPE(config_parse_restrict_filesystems);
+CONFIG_PARSER_PROTOTYPE(config_parse_bind_paths);
+CONFIG_PARSER_PROTOTYPE(config_parse_exec_keyring_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_protect_proc);
+CONFIG_PARSER_PROTOTYPE(config_parse_proc_subset);
+CONFIG_PARSER_PROTOTYPE(config_parse_job_timeout_sec);
+CONFIG_PARSER_PROTOTYPE(config_parse_job_running_timeout_sec);
+CONFIG_PARSER_PROTOTYPE(config_parse_log_extra_fields);
+CONFIG_PARSER_PROTOTYPE(config_parse_log_namespace);
+CONFIG_PARSER_PROTOTYPE(config_parse_collect_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_pid_file);
+CONFIG_PARSER_PROTOTYPE(config_parse_exit_status);
+CONFIG_PARSER_PROTOTYPE(config_parse_disable_controllers);
+CONFIG_PARSER_PROTOTYPE(config_parse_oom_policy);
+CONFIG_PARSER_PROTOTYPE(config_parse_numa_policy);
+CONFIG_PARSER_PROTOTYPE(config_parse_numa_mask);
+CONFIG_PARSER_PROTOTYPE(config_parse_ip_filter_bpf_progs);
+CONFIG_PARSER_PROTOTYPE(config_parse_cpu_affinity2);
+CONFIG_PARSER_PROTOTYPE(config_parse_show_status);
+CONFIG_PARSER_PROTOTYPE(config_parse_status_unit_format);
+CONFIG_PARSER_PROTOTYPE(config_parse_output_restricted);
+CONFIG_PARSER_PROTOTYPE(config_parse_crash_chvt);
+CONFIG_PARSER_PROTOTYPE(config_parse_timeout_abort);
+CONFIG_PARSER_PROTOTYPE(config_parse_swap_priority);
+CONFIG_PARSER_PROTOTYPE(config_parse_mount_images);
+CONFIG_PARSER_PROTOTYPE(config_parse_socket_timestamping);
+CONFIG_PARSER_PROTOTYPE(config_parse_extension_images);
+CONFIG_PARSER_PROTOTYPE(config_parse_bpf_foreign_program);
+CONFIG_PARSER_PROTOTYPE(config_parse_cgroup_socket_bind);
+CONFIG_PARSER_PROTOTYPE(config_parse_restrict_network_interfaces);
+CONFIG_PARSER_PROTOTYPE(config_parse_watchdog_sec);
+CONFIG_PARSER_PROTOTYPE(config_parse_tty_size);
+CONFIG_PARSER_PROTOTYPE(config_parse_log_filter_patterns);
+CONFIG_PARSER_PROTOTYPE(config_parse_open_file);
+CONFIG_PARSER_PROTOTYPE(config_parse_memory_pressure_watch);
+CONFIG_PARSER_PROTOTYPE(config_parse_cgroup_nft_set);
+
+/* gperf prototypes */
+const struct ConfigPerfItem* load_fragment_gperf_lookup(const char *key, GPERF_LEN_TYPE length);
+extern const char load_fragment_gperf_nulstr[];
diff --git a/src/core/main.c b/src/core/main.c
new file mode 100644
index 0000000..3f71cc0
--- /dev/null
+++ b/src/core/main.c
@@ -0,0 +1,3227 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <linux/oom.h>
+#include <sys/mount.h>
+#include <sys/prctl.h>
+#include <sys/utsname.h>
+#include <unistd.h>
+#if HAVE_VALGRIND_VALGRIND_H
+# include <valgrind/valgrind.h>
+#endif
+
+#include "sd-bus.h"
+#include "sd-daemon.h"
+#include "sd-messages.h"
+
+#include "alloc-util.h"
+#include "apparmor-setup.h"
+#include "architecture.h"
+#include "argv-util.h"
+#if HAVE_LIBBPF
+#include "bpf-lsm.h"
+#endif
+#include "build.h"
+#include "bus-error.h"
+#include "bus-util.h"
+#include "capability-util.h"
+#include "cgroup-util.h"
+#include "chase.h"
+#include "clock-util.h"
+#include "conf-parser.h"
+#include "confidential-virt.h"
+#include "copy.h"
+#include "cpu-set-util.h"
+#include "crash-handler.h"
+#include "dbus-manager.h"
+#include "dbus.h"
+#include "constants.h"
+#include "dev-setup.h"
+#include "efi-random.h"
+#include "efivars.h"
+#include "emergency-action.h"
+#include "env-util.h"
+#include "exit-status.h"
+#include "fd-util.h"
+#include "fdset.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "fs-util.h"
+#include "getopt-defs.h"
+#include "hexdecoct.h"
+#include "hostname-setup.h"
+#include "ima-setup.h"
+#include "import-creds.h"
+#include "initrd-util.h"
+#include "killall.h"
+#include "kmod-setup.h"
+#include "limits-util.h"
+#include "load-fragment.h"
+#include "log.h"
+#include "loopback-setup.h"
+#include "machine-id-setup.h"
+#include "main.h"
+#include "manager.h"
+#include "manager-dump.h"
+#include "manager-serialize.h"
+#include "mkdir-label.h"
+#include "mount-setup.h"
+#include "os-util.h"
+#include "pager.h"
+#include "parse-argument.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "pretty-print.h"
+#include "proc-cmdline.h"
+#include "process-util.h"
+#include "psi-util.h"
+#include "random-util.h"
+#include "rlimit-util.h"
+#include "seccomp-util.h"
+#include "selinux-setup.h"
+#include "selinux-util.h"
+#include "signal-util.h"
+#include "smack-setup.h"
+#include "special.h"
+#include "stat-util.h"
+#include "stdio-util.h"
+#include "strv.h"
+#include "switch-root.h"
+#include "sysctl-util.h"
+#include "terminal-util.h"
+#include "time-util.h"
+#include "umask-util.h"
+#include "user-util.h"
+#include "version.h"
+#include "virt.h"
+#include "watchdog.h"
+
+#if HAS_FEATURE_ADDRESS_SANITIZER
+#include <sanitizer/lsan_interface.h>
+#endif
+
+/* Requested top-level operation, selected via the command line. */
+static enum {
+        ACTION_RUN,
+        ACTION_HELP,
+        ACTION_VERSION,
+        ACTION_TEST,
+        ACTION_DUMP_CONFIGURATION_ITEMS,
+        ACTION_DUMP_BUS_PROPERTIES,
+        ACTION_BUS_INTROSPECT,
+} arg_action = ACTION_RUN;
+
+/* Interface name to introspect when arg_action == ACTION_BUS_INTROSPECT. */
+static const char *arg_bus_introspect = NULL;
+
+/* Those variables are initialized to 0 automatically, so we avoid uninitialized memory access. Real
+ * defaults are assigned in reset_arguments() below. */
+static char *arg_default_unit;
+static RuntimeScope arg_runtime_scope;
+/* NOTE(review): the next four are deliberately non-static — presumably read by the
+ * crash handler; confirm against crash-handler.c before changing their linkage. */
+bool arg_dump_core;
+int arg_crash_chvt;
+bool arg_crash_shell;
+bool arg_crash_reboot;
+static char *arg_confirm_spawn;
+static ShowStatus arg_show_status;
+static StatusUnitFormat arg_status_unit_format;
+static bool arg_switched_root;
+static PagerFlags arg_pager_flags;
+static bool arg_service_watchdogs;
+static UnitDefaults arg_defaults;
+/* Watchdog timeouts for the various phases (run time, reboot, kexec, pre-timeout). */
+static usec_t arg_runtime_watchdog;
+static usec_t arg_reboot_watchdog;
+static usec_t arg_kexec_watchdog;
+static usec_t arg_pretimeout_watchdog;
+static char *arg_early_core_pattern;
+static char *arg_watchdog_pretimeout_governor;
+static char *arg_watchdog_device;
+static char **arg_default_environment;
+static char **arg_manager_environment;
+static uint64_t arg_capability_bounding_set;
+static bool arg_no_new_privs;
+static nsec_t arg_timer_slack_nsec;
+static Set* arg_syscall_archs;
+static FILE* arg_serialization;
+static sd_id128_t arg_machine_id;
+static EmergencyAction arg_cad_burst_action;
+static CPUSet arg_cpu_affinity;
+static NUMAPolicy arg_numa_policy;
+static usec_t arg_clock_usec;
+/* Seed material supplied via systemd.random_seed= (base64-decoded). */
+static void *arg_random_seed;
+static size_t arg_random_seed_size;
+static usec_t arg_reload_limit_interval_sec;
+static unsigned arg_reload_limit_burst;
+
+/* A copy of the original environment block */
+static char **saved_env = NULL;
+
+static int parse_configuration(const struct rlimit *saved_rlimit_nofile,
+ const struct rlimit *saved_rlimit_memlock);
+
+static int manager_find_user_config_paths(char ***ret_files, char ***ret_dirs) {
+        _cleanup_strv_free_ char **config_files = NULL, **config_dirs = NULL;
+        _cleanup_free_ char *xdg = NULL;
+        int r;
+
+        /* Builds the lists of configuration files and drop-in search directories
+         * consulted by a user-scope manager: the per-user XDG location first,
+         * then the package-wide defaults. Returns 0 on success, negative errno
+         * on failure; on success ownership of both lists passes to the caller. */
+
+        r = xdg_user_config_dir(&xdg, "/systemd");
+        if (r < 0)
+                return r;
+
+        r = strv_extendf(&config_files, "%s/user.conf", xdg);
+        if (r < 0)
+                return r;
+
+        r = strv_extend(&config_files, PKGSYSCONFDIR "/user.conf");
+        if (r < 0)
+                return r;
+
+        /* The XDG base dir itself doubles as the first drop-in directory; it is
+         * consumed here, hence must not be used afterwards. */
+        r = strv_consume(&config_dirs, TAKE_PTR(xdg));
+        if (r < 0)
+                return r;
+
+        r = strv_extend_strv(&config_dirs, CONF_PATHS_STRV("systemd"), false);
+        if (r < 0)
+                return r;
+
+        *ret_files = TAKE_PTR(config_files);
+        *ret_dirs = TAKE_PTR(config_dirs);
+        return 0;
+}
+
+static int console_setup(void) {
+        _cleanup_close_ int fd = -EBADF;
+        unsigned rows, cols;
+        int r;
+
+        /* Opens /dev/console, resets the terminal and applies the terminal size
+         * configured on the kernel command line (if any). Failures to open or
+         * reset are fatal; sizing problems are merely logged. */
+
+        fd = open_terminal("/dev/console", O_WRONLY|O_NOCTTY|O_CLOEXEC);
+        if (fd < 0)
+                return log_error_errno(fd, "Failed to open /dev/console: %m");
+
+        /* We don't want to force text mode. plymouth may be showing
+         * pictures already from initrd. */
+        r = reset_terminal_fd(fd, false);
+        if (r < 0)
+                return log_error_errno(r, "Failed to reset /dev/console: %m");
+
+        r = proc_cmdline_tty_size("/dev/console", &rows, &cols);
+        if (r >= 0) {
+                r = terminal_set_size_fd(fd, NULL, rows, cols);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to set terminal size, ignoring: %m");
+        } else
+                log_warning_errno(r, "Failed to get terminal size, ignoring: %m");
+
+        return 0;
+}
+
+/* Handles one key/value pair from the kernel command line, updating the matching
+ * arg_* global. Unknown keys are silently ignored; malformed values are logged
+ * and skipped so boot can proceed. Returns 0 in almost all cases — only memory
+ * allocation failures propagate an error. */
+static int parse_proc_cmdline_item(const char *key, const char *value, void *data) {
+        int r;
+
+        assert(key);
+
+        if (STR_IN_SET(key, "systemd.unit", "rd.systemd.unit")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                /* Honour the "rd." variant only inside the initrd, and the plain
+                 * variant only outside of it. */
+                if (!unit_name_is_valid(value, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
+                        log_warning("Unit name specified on %s= is not valid, ignoring: %s", key, value);
+                else if (in_initrd() == !!startswith(key, "rd."))
+                        return free_and_strdup_warn(&arg_default_unit, value);
+
+        } else if (proc_cmdline_key_streq(key, "systemd.dump_core")) {
+
+                /* A bare switch without "=value" counts as "true". */
+                r = value ? parse_boolean(value) : true;
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse dump core switch %s, ignoring: %m", value);
+                else
+                        arg_dump_core = r;
+
+        } else if (proc_cmdline_key_streq(key, "systemd.early_core_pattern")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                if (path_is_absolute(value))
+                        (void) parse_path_argument(value, false, &arg_early_core_pattern);
+                else
+                        log_warning("Specified core pattern '%s' is not an absolute path, ignoring.", value);
+
+        } else if (proc_cmdline_key_streq(key, "systemd.crash_chvt")) {
+
+                if (!value)
+                        arg_crash_chvt = 0; /* turn on */
+                else {
+                        r = parse_crash_chvt(value, &arg_crash_chvt);
+                        if (r < 0)
+                                log_warning_errno(r, "Failed to parse crash chvt switch %s, ignoring: %m", value);
+                }
+
+        } else if (proc_cmdline_key_streq(key, "systemd.crash_shell")) {
+
+                r = value ? parse_boolean(value) : true;
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse crash shell switch %s, ignoring: %m", value);
+                else
+                        arg_crash_shell = r;
+
+        } else if (proc_cmdline_key_streq(key, "systemd.crash_reboot")) {
+
+                r = value ? parse_boolean(value) : true;
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse crash reboot switch %s, ignoring: %m", value);
+                else
+                        arg_crash_reboot = r;
+
+        } else if (proc_cmdline_key_streq(key, "systemd.confirm_spawn")) {
+                char *s;
+
+                r = parse_confirm_spawn(value, &s);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse confirm_spawn switch %s, ignoring: %m", value);
+                else
+                        free_and_replace(arg_confirm_spawn, s);
+
+        } else if (proc_cmdline_key_streq(key, "systemd.service_watchdogs")) {
+
+                r = value ? parse_boolean(value) : true;
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse service watchdog switch %s, ignoring: %m", value);
+                else
+                        arg_service_watchdogs = r;
+
+        } else if (proc_cmdline_key_streq(key, "systemd.show_status")) {
+
+                if (value) {
+                        r = parse_show_status(value, &arg_show_status);
+                        if (r < 0)
+                                log_warning_errno(r, "Failed to parse show status switch %s, ignoring: %m", value);
+                } else
+                        arg_show_status = SHOW_STATUS_YES;
+
+        } else if (proc_cmdline_key_streq(key, "systemd.status_unit_format")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                r = status_unit_format_from_string(value);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse %s=%s, ignoring: %m", key, value);
+                else
+                        arg_status_unit_format = r;
+
+        } else if (proc_cmdline_key_streq(key, "systemd.default_standard_output")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                r = exec_output_from_string(value);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse default standard output switch %s, ignoring: %m", value);
+                else
+                        arg_defaults.std_output = r;
+
+        } else if (proc_cmdline_key_streq(key, "systemd.default_standard_error")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                r = exec_output_from_string(value);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse default standard error switch %s, ignoring: %m", value);
+                else
+                        arg_defaults.std_error = r;
+
+        } else if (streq(key, "systemd.setenv")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                if (!env_assignment_is_valid(value))
+                        log_warning("Environment variable assignment '%s' is not valid. Ignoring.", value);
+                else {
+                        r = strv_env_replace_strdup(&arg_default_environment, value);
+                        if (r < 0)
+                                return log_oom();
+                }
+
+        } else if (proc_cmdline_key_streq(key, "systemd.machine_id")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                r = id128_from_string_nonzero(value, &arg_machine_id);
+                if (r < 0)
+                        log_warning_errno(r, "MachineID '%s' is not valid, ignoring: %m", value);
+
+        } else if (proc_cmdline_key_streq(key, "systemd.default_timeout_start_sec")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                r = parse_sec(value, &arg_defaults.timeout_start_usec);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse default start timeout '%s', ignoring: %m", value);
+
+                /* Traditionally "0" disables the timeout; normalize to infinity. */
+                if (arg_defaults.timeout_start_usec <= 0)
+                        arg_defaults.timeout_start_usec = USEC_INFINITY;
+
+        } else if (proc_cmdline_key_streq(key, "systemd.default_device_timeout_sec")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                r = parse_sec(value, &arg_defaults.device_timeout_usec);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse default device timeout '%s', ignoring: %m", value);
+
+                if (arg_defaults.device_timeout_usec <= 0)
+                        arg_defaults.device_timeout_usec = USEC_INFINITY;
+
+        } else if (proc_cmdline_key_streq(key, "systemd.cpu_affinity")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                r = parse_cpu_set(value, &arg_cpu_affinity);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse CPU affinity mask '%s', ignoring: %m", value);
+
+        } else if (proc_cmdline_key_streq(key, "systemd.watchdog_device")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                (void) parse_path_argument(value, false, &arg_watchdog_device);
+
+        } else if (proc_cmdline_key_streq(key, "systemd.watchdog_sec")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                if (streq(value, "default"))
+                        arg_runtime_watchdog = USEC_INFINITY;
+                else if (streq(value, "off"))
+                        arg_runtime_watchdog = 0;
+                else {
+                        r = parse_sec(value, &arg_runtime_watchdog);
+                        if (r < 0) {
+                                log_warning_errno(r, "Failed to parse systemd.watchdog_sec= argument '%s', ignoring: %m", value);
+                                return 0;
+                        }
+                }
+
+                /* One switch configures all three watchdog phases. */
+                arg_kexec_watchdog = arg_reboot_watchdog = arg_runtime_watchdog;
+
+        } else if (proc_cmdline_key_streq(key, "systemd.watchdog_pre_sec")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                if (streq(value, "default"))
+                        arg_pretimeout_watchdog = USEC_INFINITY;
+                else if (streq(value, "off"))
+                        arg_pretimeout_watchdog = 0;
+                else {
+                        r = parse_sec(value, &arg_pretimeout_watchdog);
+                        if (r < 0) {
+                                log_warning_errno(r, "Failed to parse systemd.watchdog_pre_sec= argument '%s', ignoring: %m", value);
+                                return 0;
+                        }
+                }
+
+        } else if (proc_cmdline_key_streq(key, "systemd.watchdog_pretimeout_governor")) {
+
+                if (proc_cmdline_value_missing(key, value) || isempty(value)) {
+                        arg_watchdog_pretimeout_governor = mfree(arg_watchdog_pretimeout_governor);
+                        return 0;
+                }
+
+                if (!string_is_safe(value)) {
+                        log_warning("Watchdog pretimeout governor '%s' is not valid, ignoring.", value);
+                        return 0;
+                }
+
+                return free_and_strdup_warn(&arg_watchdog_pretimeout_governor, value);
+
+        } else if (proc_cmdline_key_streq(key, "systemd.clock_usec")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                r = safe_atou64(value, &arg_clock_usec);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse systemd.clock_usec= argument, ignoring: %s", value);
+
+        } else if (proc_cmdline_key_streq(key, "systemd.random_seed")) {
+                /* Zero-initialize: if unbase64mem() fails it presumably leaves its
+                 * output parameters untouched, and the assignment below would
+                 * otherwise read indeterminate values (undefined behavior). */
+                void *p = NULL;
+                size_t sz = 0;
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                r = unbase64mem(value, SIZE_MAX, &p, &sz);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse systemd.random_seed= argument, ignoring: %s", value);
+
+                free(arg_random_seed);
+                arg_random_seed = sz > 0 ? p : mfree(p);
+                arg_random_seed_size = sz;
+
+        } else if (proc_cmdline_key_streq(key, "systemd.reload_limit_interval_sec")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                r = parse_sec(value, &arg_reload_limit_interval_sec);
+                if (r < 0) {
+                        log_warning_errno(r, "Failed to parse systemd.reload_limit_interval_sec= argument '%s', ignoring: %m", value);
+                        return 0;
+                }
+
+        } else if (proc_cmdline_key_streq(key, "systemd.reload_limit_burst")) {
+
+                if (proc_cmdline_value_missing(key, value))
+                        return 0;
+
+                r = safe_atou(value, &arg_reload_limit_burst);
+                if (r < 0) {
+                        log_warning_errno(r, "Failed to parse systemd.reload_limit_burst= argument '%s', ignoring: %m", value);
+                        return 0;
+                }
+
+        } else if (streq(key, "quiet") && !value) {
+
+                if (arg_show_status == _SHOW_STATUS_INVALID)
+                        arg_show_status = SHOW_STATUS_ERROR;
+
+        } else if (streq(key, "debug") && !value) {
+
+                /* Note that log_parse_environment() handles 'debug'
+                 * too, and sets the log level to LOG_DEBUG. */
+
+                if (detect_container() > 0)
+                        log_set_target(LOG_TARGET_CONSOLE);
+
+        } else if (!value) {
+                const char *target;
+
+                /* Compatible with SysV, but supported independently even if SysV compatibility is disabled. */
+                target = runlevel_to_target(key);
+                if (target)
+                        return free_and_strdup_warn(&arg_default_unit, target);
+        }
+
+        return 0;
+}
+
+/* Generates a conf-parser callback named 'name' that simply feeds the raw value
+ * to 'func' and logs a syntax error (with 'descr' in the message) on failure.
+ * Used below for the log-related [Manager] settings. */
+#define DEFINE_SETTER(name, func, descr) \
+        static int name(const char *unit, \
+                        const char *filename, \
+                        unsigned line, \
+                        const char *section, \
+                        unsigned section_line, \
+                        const char *lvalue, \
+                        int ltype, \
+                        const char *rvalue, \
+                        void *data, \
+                        void *userdata) { \
+ \
+                int r; \
+ \
+                assert(filename); \
+                assert(lvalue); \
+                assert(rvalue); \
+ \
+                r = func(rvalue); \
+                if (r < 0) \
+                        log_syntax(unit, LOG_ERR, filename, line, r, \
+                                   "Invalid " descr "'%s': %m", \
+                                   rvalue); \
+ \
+                return 0; \
+        }
+
+DEFINE_SETTER(config_parse_level2, log_set_max_level_from_string, "log level");
+DEFINE_SETTER(config_parse_target, log_set_target_from_string, "target");
+DEFINE_SETTER(config_parse_color, log_show_color_from_string, "color");
+DEFINE_SETTER(config_parse_location, log_show_location_from_string, "location");
+DEFINE_SETTER(config_parse_time, log_show_time_from_string, "time");
+
+static int config_parse_default_timeout_abort(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+        int r;
+
+        /* Wrapper around the generic config_parse_timeout_abort() that targets the
+         * global defaults. On success the return value doubles as the "explicitly
+         * set" flag (r > 0: set, r == 0: reset to unset); failures were already
+         * logged by the callee and are ignored here. */
+        r = config_parse_timeout_abort(
+                        unit,
+                        filename,
+                        line,
+                        section,
+                        section_line,
+                        lvalue,
+                        ltype,
+                        rvalue,
+                        &arg_defaults.timeout_abort_usec,
+                        userdata);
+        if (r >= 0)
+                arg_defaults.timeout_abort_set = r;
+        return 0;
+}
+
+static int config_parse_oom_score_adjust(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Parses DefaultOOMScoreAdjust=. An empty assignment marks the value as
+         * unset again; invalid values are logged and ignored. */
+
+        int r, score;
+
+        if (isempty(rvalue)) {
+                arg_defaults.oom_score_adjust_set = false;
+                return 0;
+        }
+
+        r = parse_oom_score_adjust(rvalue, &score);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse the OOM score adjust value '%s', ignoring: %m", rvalue);
+                return 0;
+        }
+
+        arg_defaults.oom_score_adjust = score;
+        arg_defaults.oom_score_adjust_set = true;
+        return 0;
+}
+
+static int parse_config_file(void) {
+
+        /* Loads the manager's own configuration: system.conf for the system
+         * instance, otherwise user.conf plus XDG/vendor drop-in directories.
+         * Parse problems are logged (CONFIG_PARSE_WARN) but are never fatal. */
+
+        const ConfigTableItem items[] = {
+                { "Manager", "LogLevel", config_parse_level2, 0, NULL },
+                { "Manager", "LogTarget", config_parse_target, 0, NULL },
+                { "Manager", "LogColor", config_parse_color, 0, NULL },
+                { "Manager", "LogLocation", config_parse_location, 0, NULL },
+                { "Manager", "LogTime", config_parse_time, 0, NULL },
+                { "Manager", "DumpCore", config_parse_bool, 0, &arg_dump_core },
+                { "Manager", "CrashChVT", /* legacy */ config_parse_crash_chvt, 0, &arg_crash_chvt },
+                { "Manager", "CrashChangeVT", config_parse_crash_chvt, 0, &arg_crash_chvt },
+                { "Manager", "CrashShell", config_parse_bool, 0, &arg_crash_shell },
+                { "Manager", "CrashReboot", config_parse_bool, 0, &arg_crash_reboot },
+                { "Manager", "ShowStatus", config_parse_show_status, 0, &arg_show_status },
+                { "Manager", "StatusUnitFormat", config_parse_status_unit_format, 0, &arg_status_unit_format },
+                { "Manager", "CPUAffinity", config_parse_cpu_affinity2, 0, &arg_cpu_affinity },
+                { "Manager", "NUMAPolicy", config_parse_numa_policy, 0, &arg_numa_policy.type },
+                { "Manager", "NUMAMask", config_parse_numa_mask, 0, &arg_numa_policy },
+                { "Manager", "JoinControllers", config_parse_warn_compat, DISABLED_CONFIGURATION, NULL },
+                { "Manager", "RuntimeWatchdogSec", config_parse_watchdog_sec, 0, &arg_runtime_watchdog },
+                { "Manager", "RuntimeWatchdogPreSec", config_parse_watchdog_sec, 0, &arg_pretimeout_watchdog },
+                { "Manager", "RebootWatchdogSec", config_parse_watchdog_sec, 0, &arg_reboot_watchdog },
+                { "Manager", "ShutdownWatchdogSec", config_parse_watchdog_sec, 0, &arg_reboot_watchdog }, /* obsolete alias */
+                { "Manager", "KExecWatchdogSec", config_parse_watchdog_sec, 0, &arg_kexec_watchdog },
+                { "Manager", "WatchdogDevice", config_parse_path, 0, &arg_watchdog_device },
+                { "Manager", "RuntimeWatchdogPreGovernor", config_parse_string, CONFIG_PARSE_STRING_SAFE, &arg_watchdog_pretimeout_governor },
+                { "Manager", "CapabilityBoundingSet", config_parse_capability_set, 0, &arg_capability_bounding_set },
+                { "Manager", "NoNewPrivileges", config_parse_bool, 0, &arg_no_new_privs },
+#if HAVE_SECCOMP
+                { "Manager", "SystemCallArchitectures", config_parse_syscall_archs, 0, &arg_syscall_archs },
+#else
+                { "Manager", "SystemCallArchitectures", config_parse_warn_compat, DISABLED_CONFIGURATION, NULL },
+
+#endif
+                { "Manager", "TimerSlackNSec", config_parse_nsec, 0, &arg_timer_slack_nsec },
+                { "Manager", "DefaultTimerAccuracySec", config_parse_sec, 0, &arg_defaults.timer_accuracy_usec },
+                { "Manager", "DefaultStandardOutput", config_parse_output_restricted, 0, &arg_defaults.std_output },
+                { "Manager", "DefaultStandardError", config_parse_output_restricted, 0, &arg_defaults.std_error },
+                { "Manager", "DefaultTimeoutStartSec", config_parse_sec, 0, &arg_defaults.timeout_start_usec },
+                { "Manager", "DefaultTimeoutStopSec", config_parse_sec, 0, &arg_defaults.timeout_stop_usec },
+                { "Manager", "DefaultTimeoutAbortSec", config_parse_default_timeout_abort, 0, NULL },
+                { "Manager", "DefaultDeviceTimeoutSec", config_parse_sec, 0, &arg_defaults.device_timeout_usec },
+                { "Manager", "DefaultRestartSec", config_parse_sec, 0, &arg_defaults.restart_usec },
+                { "Manager", "DefaultStartLimitInterval", config_parse_sec, 0, &arg_defaults.start_limit_interval}, /* obsolete alias */
+                { "Manager", "DefaultStartLimitIntervalSec", config_parse_sec, 0, &arg_defaults.start_limit_interval},
+                { "Manager", "DefaultStartLimitBurst", config_parse_unsigned, 0, &arg_defaults.start_limit_burst },
+                { "Manager", "DefaultEnvironment", config_parse_environ, arg_runtime_scope, &arg_default_environment },
+                { "Manager", "ManagerEnvironment", config_parse_environ, arg_runtime_scope, &arg_manager_environment },
+                { "Manager", "DefaultLimitCPU", config_parse_rlimit, RLIMIT_CPU, arg_defaults.rlimit },
+                { "Manager", "DefaultLimitFSIZE", config_parse_rlimit, RLIMIT_FSIZE, arg_defaults.rlimit },
+                { "Manager", "DefaultLimitDATA", config_parse_rlimit, RLIMIT_DATA, arg_defaults.rlimit },
+                { "Manager", "DefaultLimitSTACK", config_parse_rlimit, RLIMIT_STACK, arg_defaults.rlimit },
+                { "Manager", "DefaultLimitCORE", config_parse_rlimit, RLIMIT_CORE, arg_defaults.rlimit },
+                { "Manager", "DefaultLimitRSS", config_parse_rlimit, RLIMIT_RSS, arg_defaults.rlimit },
+                { "Manager", "DefaultLimitNOFILE", config_parse_rlimit, RLIMIT_NOFILE, arg_defaults.rlimit },
+                { "Manager", "DefaultLimitAS", config_parse_rlimit, RLIMIT_AS, arg_defaults.rlimit },
+                { "Manager", "DefaultLimitNPROC", config_parse_rlimit, RLIMIT_NPROC, arg_defaults.rlimit },
+                { "Manager", "DefaultLimitMEMLOCK", config_parse_rlimit, RLIMIT_MEMLOCK, arg_defaults.rlimit },
+                { "Manager", "DefaultLimitLOCKS", config_parse_rlimit, RLIMIT_LOCKS, arg_defaults.rlimit },
+                { "Manager", "DefaultLimitSIGPENDING", config_parse_rlimit, RLIMIT_SIGPENDING, arg_defaults.rlimit },
+                { "Manager", "DefaultLimitMSGQUEUE", config_parse_rlimit, RLIMIT_MSGQUEUE, arg_defaults.rlimit },
+                { "Manager", "DefaultLimitNICE", config_parse_rlimit, RLIMIT_NICE, arg_defaults.rlimit },
+                { "Manager", "DefaultLimitRTPRIO", config_parse_rlimit, RLIMIT_RTPRIO, arg_defaults.rlimit },
+                { "Manager", "DefaultLimitRTTIME", config_parse_rlimit, RLIMIT_RTTIME, arg_defaults.rlimit },
+                { "Manager", "DefaultCPUAccounting", config_parse_bool, 0, &arg_defaults.cpu_accounting },
+                { "Manager", "DefaultIOAccounting", config_parse_bool, 0, &arg_defaults.io_accounting },
+                { "Manager", "DefaultIPAccounting", config_parse_bool, 0, &arg_defaults.ip_accounting },
+                { "Manager", "DefaultBlockIOAccounting", config_parse_bool, 0, &arg_defaults.blockio_accounting },
+                { "Manager", "DefaultMemoryAccounting", config_parse_bool, 0, &arg_defaults.memory_accounting },
+                { "Manager", "DefaultTasksAccounting", config_parse_bool, 0, &arg_defaults.tasks_accounting },
+                { "Manager", "DefaultTasksMax", config_parse_tasks_max, 0, &arg_defaults.tasks_max },
+                { "Manager", "DefaultMemoryPressureThresholdSec", config_parse_sec, 0, &arg_defaults.memory_pressure_threshold_usec },
+                { "Manager", "DefaultMemoryPressureWatch", config_parse_memory_pressure_watch, 0, &arg_defaults.memory_pressure_watch },
+                { "Manager", "CtrlAltDelBurstAction", config_parse_emergency_action, arg_runtime_scope, &arg_cad_burst_action },
+                { "Manager", "DefaultOOMPolicy", config_parse_oom_policy, 0, &arg_defaults.oom_policy },
+                { "Manager", "DefaultOOMScoreAdjust", config_parse_oom_score_adjust, 0, NULL },
+                { "Manager", "ReloadLimitIntervalSec", config_parse_sec, 0, &arg_reload_limit_interval_sec },
+                { "Manager", "ReloadLimitBurst", config_parse_unsigned, 0, &arg_reload_limit_burst },
+#if ENABLE_SMACK
+                { "Manager", "DefaultSmackProcessLabel", config_parse_string, 0, &arg_defaults.smack_process_label },
+#else
+                { "Manager", "DefaultSmackProcessLabel", config_parse_warn_compat, DISABLED_CONFIGURATION, NULL },
+#endif
+                {}
+        };
+
+        if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM)
+                (void) config_parse_config_file("system.conf",
+                                                "Manager\0",
+                                                config_item_table_lookup, items,
+                                                CONFIG_PARSE_WARN,
+                                                NULL);
+        else {
+                _cleanup_strv_free_ char **files = NULL, **dirs = NULL;
+                int r;
+
+                assert(arg_runtime_scope == RUNTIME_SCOPE_USER);
+
+                r = manager_find_user_config_paths(&files, &dirs);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to determine config file paths: %m");
+
+                (void) config_parse_many(
+                                (const char* const*) files,
+                                (const char* const*) dirs,
+                                "user.conf.d",
+                                /* root = */ NULL,
+                                "Manager\0",
+                                config_item_table_lookup, items,
+                                CONFIG_PARSE_WARN,
+                                NULL, NULL, NULL);
+        }
+
+        /* Traditionally "0" was used to turn off the default unit timeouts. Fix this up so that we use
+         * USEC_INFINITY like everywhere else. */
+        if (arg_defaults.timeout_start_usec <= 0)
+                arg_defaults.timeout_start_usec = USEC_INFINITY;
+        if (arg_defaults.timeout_stop_usec <= 0)
+                arg_defaults.timeout_stop_usec = USEC_INFINITY;
+
+        return 0;
+}
+
+/* Pushes the parsed per-unit default settings (arg_defaults & friends) into a freshly set-up or
+ * reloaded Manager object. All failures are logged as warnings and otherwise ignored: defaults are
+ * best-effort and must not abort manager startup. */
+static void set_manager_defaults(Manager *m) {
+ int r;
+
+ assert(m);
+
+ /* Propagates the various default unit property settings into the manager object, i.e. properties
+ * that do not affect the manager itself, but are just what newly allocated units will have set if
+ * they haven't set anything else. (Also see set_manager_settings() for the settings that affect the
+ * manager's own behaviour) */
+
+ r = manager_set_unit_defaults(m, &arg_defaults);
+ if (r < 0)
+ log_warning_errno(r, "Failed to set manager defaults, ignoring: %m");
+
+ /* Recompute the manager's base environment block. */
+ r = manager_default_environment(m);
+ if (r < 0)
+ log_warning_errno(r, "Failed to set manager default environment, ignoring: %m");
+
+ /* Fold the configured extra environment entries (arg_default_environment, presumably from
+ * DefaultEnvironment= — confirm at the parser) into the transient environment. */
+ r = manager_transient_environment_add(m, arg_default_environment);
+ if (r < 0)
+ log_warning_errno(r, "Failed to add to transient environment, ignoring: %m");
+}
+
+/* Pushes settings that concern the manager process itself (confirm-spawn, watchdogs, reload rate
+ * limiting, status output) into the Manager object. Counterpart of set_manager_defaults(). */
+static void set_manager_settings(Manager *m) {
+ int r;
+
+ assert(m);
+
+ /* Propagates the various manager settings into the manager object, i.e. properties that
+ * affect the manager itself (as opposed to just being inherited into newly allocated
+ * units, see set_manager_defaults() above). */
+
+ m->confirm_spawn = arg_confirm_spawn;
+ m->service_watchdogs = arg_service_watchdogs;
+ m->cad_burst_action = arg_cad_burst_action;
+ /* Note that we don't do structured initialization here, otherwise it will reset the rate limit
+ * counter on every daemon-reload. */
+ m->reload_ratelimit.interval = arg_reload_limit_interval_sec;
+ m->reload_ratelimit.burst = arg_reload_limit_burst;
+
+ /* Hand the four watchdog timeouts over to the watchdog logic. A failure to apply the
+ * pretimeout governor is logged but deliberately non-fatal. */
+ manager_set_watchdog(m, WATCHDOG_RUNTIME, arg_runtime_watchdog);
+ manager_set_watchdog(m, WATCHDOG_REBOOT, arg_reboot_watchdog);
+ manager_set_watchdog(m, WATCHDOG_KEXEC, arg_kexec_watchdog);
+ manager_set_watchdog(m, WATCHDOG_PRETIMEOUT, arg_pretimeout_watchdog);
+ r = manager_set_watchdog_pretimeout_governor(m, arg_watchdog_pretimeout_governor);
+ if (r < 0)
+ log_warning_errno(r, "Failed to set watchdog pretimeout governor to '%s', ignoring: %m", arg_watchdog_pretimeout_governor);
+
+ manager_set_show_status(m, arg_show_status, "command line");
+ m->status_unit_format = arg_status_unit_format;
+}
+
+/* Parses the process command line into the various arg_* globals. Returns a negative errno on a
+ * parse error when not running as PID 1; as PID 1 unknown options are tolerated (see the '?' case
+ * below) and 0 is returned. */
+static int parse_argv(int argc, char *argv[]) {
+ enum {
+ COMMON_GETOPT_ARGS,
+ SYSTEMD_GETOPT_ARGS,
+ };
+
+ static const struct option options[] = {
+ COMMON_GETOPT_OPTIONS,
+ SYSTEMD_GETOPT_OPTIONS,
+ {}
+ };
+
+ int c, r;
+ bool user_arg_seen = false;
+
+ assert(argc >= 1);
+ assert(argv);
+
+ /* As PID 1, suppress getopt()'s own error output: unrecognized switches may be kernel command
+ * line items not addressed to us (see the '?' case below). */
+ if (getpid_cached() == 1)
+ opterr = 0;
+
+ while ((c = getopt_long(argc, argv, SYSTEMD_GETOPT_SHORT_OPTIONS, options, NULL)) >= 0)
+
+ switch (c) {
+
+ case ARG_LOG_LEVEL:
+ r = log_set_max_level_from_string(optarg);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse log level \"%s\": %m", optarg);
+
+ break;
+
+ case ARG_LOG_TARGET:
+ r = log_set_target_from_string(optarg);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse log target \"%s\": %m", optarg);
+
+ break;
+
+ case ARG_LOG_COLOR:
+
+ /* The next three options take an optional boolean argument; absence means "on". */
+ if (optarg) {
+ r = log_show_color_from_string(optarg);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse log color setting \"%s\": %m",
+ optarg);
+ } else
+ log_show_color(true);
+
+ break;
+
+ case ARG_LOG_LOCATION:
+ if (optarg) {
+ r = log_show_location_from_string(optarg);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse log location setting \"%s\": %m",
+ optarg);
+ } else
+ log_show_location(true);
+
+ break;
+
+ case ARG_LOG_TIME:
+
+ if (optarg) {
+ r = log_show_time_from_string(optarg);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse log time setting \"%s\": %m",
+ optarg);
+ } else
+ log_show_time(true);
+
+ break;
+
+ case ARG_DEFAULT_STD_OUTPUT:
+ r = exec_output_from_string(optarg);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse default standard output setting \"%s\": %m",
+ optarg);
+ arg_defaults.std_output = r;
+ break;
+
+ case ARG_DEFAULT_STD_ERROR:
+ r = exec_output_from_string(optarg);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse default standard error output setting \"%s\": %m",
+ optarg);
+ arg_defaults.std_error = r;
+ break;
+
+ case ARG_UNIT:
+ r = free_and_strdup(&arg_default_unit, optarg);
+ if (r < 0)
+ return log_error_errno(r, "Failed to set default unit \"%s\": %m", optarg);
+
+ break;
+
+ case ARG_SYSTEM:
+ arg_runtime_scope = RUNTIME_SCOPE_SYSTEM;
+ break;
+
+ case ARG_USER:
+ arg_runtime_scope = RUNTIME_SCOPE_USER;
+ /* Remember that --user was explicit; checked at the bottom of this function. */
+ user_arg_seen = true;
+ break;
+
+ case ARG_TEST:
+ arg_action = ACTION_TEST;
+ break;
+
+ case ARG_NO_PAGER:
+ arg_pager_flags |= PAGER_DISABLE;
+ break;
+
+ case ARG_VERSION:
+ arg_action = ACTION_VERSION;
+ break;
+
+ case ARG_DUMP_CONFIGURATION_ITEMS:
+ arg_action = ACTION_DUMP_CONFIGURATION_ITEMS;
+ break;
+
+ case ARG_DUMP_BUS_PROPERTIES:
+ arg_action = ACTION_DUMP_BUS_PROPERTIES;
+ break;
+
+ case ARG_BUS_INTROSPECT:
+ arg_bus_introspect = optarg;
+ arg_action = ACTION_BUS_INTROSPECT;
+ break;
+
+ case ARG_DUMP_CORE:
+ r = parse_boolean_argument("--dump-core", optarg, &arg_dump_core);
+ if (r < 0)
+ return r;
+ break;
+
+ case ARG_CRASH_CHVT:
+ r = parse_crash_chvt(optarg, &arg_crash_chvt);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse crash virtual terminal index: \"%s\": %m",
+ optarg);
+ break;
+
+ case ARG_CRASH_SHELL:
+ r = parse_boolean_argument("--crash-shell", optarg, &arg_crash_shell);
+ if (r < 0)
+ return r;
+ break;
+
+ case ARG_CRASH_REBOOT:
+ r = parse_boolean_argument("--crash-reboot", optarg, &arg_crash_reboot);
+ if (r < 0)
+ return r;
+ break;
+
+ case ARG_CONFIRM_SPAWN:
+ /* Drop any previously parsed value before re-parsing, so repeated use of the
+ * option doesn't leak. */
+ arg_confirm_spawn = mfree(arg_confirm_spawn);
+
+ r = parse_confirm_spawn(optarg, &arg_confirm_spawn);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse confirm spawn option: \"%s\": %m",
+ optarg);
+ break;
+
+ case ARG_SERVICE_WATCHDOGS:
+ r = parse_boolean_argument("--service-watchdogs=", optarg, &arg_service_watchdogs);
+ if (r < 0)
+ return r;
+ break;
+
+ case ARG_SHOW_STATUS:
+ if (optarg) {
+ r = parse_show_status(optarg, &arg_show_status);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse show status boolean: \"%s\": %m",
+ optarg);
+ } else
+ arg_show_status = SHOW_STATUS_YES;
+ break;
+
+ case ARG_DESERIALIZE: {
+ /* Take possession of the serialization fd handed over by the previous manager
+ * instance, and wrap it in a FILE for later deserialization. */
+ int fd;
+ FILE *f;
+
+ fd = parse_fd(optarg);
+ if (fd < 0)
+ return log_error_errno(fd, "Failed to parse serialization fd \"%s\": %m", optarg);
+
+ (void) fd_cloexec(fd, true);
+
+ f = fdopen(fd, "r");
+ if (!f)
+ return log_error_errno(errno, "Failed to open serialization fd %d: %m", fd);
+
+ /* Replace any previously passed serialization stream. */
+ safe_fclose(arg_serialization);
+ arg_serialization = f;
+
+ break;
+ }
+
+ case ARG_SWITCHED_ROOT:
+ arg_switched_root = true;
+ break;
+
+ case ARG_MACHINE_ID:
+ r = id128_from_string_nonzero(optarg, &arg_machine_id);
+ if (r < 0)
+ return log_error_errno(r, "MachineID '%s' is not valid: %m", optarg);
+ break;
+
+ case 'h':
+ arg_action = ACTION_HELP;
+ break;
+
+ case 'D':
+ log_set_max_level(LOG_DEBUG);
+ break;
+
+ case 'b':
+ case 's':
+ case 'z':
+ /* Just to eat away the sysvinit kernel cmdline args that we'll parse in
+ * parse_proc_cmdline_item() or ignore, without any getopt() error messages.
+ */
+ case '?':
+ if (getpid_cached() != 1)
+ return -EINVAL;
+ else
+ return 0;
+
+ default:
+ assert_not_reached();
+ }
+
+ if (optind < argc && getpid_cached() != 1)
+ /* Hmm, when we aren't run as init system let's complain about excess arguments */
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Excess arguments.");
+
+ if (arg_action == ACTION_RUN && arg_runtime_scope == RUNTIME_SCOPE_USER && !user_arg_seen)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Explicit --user argument required to run as user manager.");
+
+ return 0;
+}
+
+/* Prints the --help text. The man page reference is turned into a clickable hyperlink via
+ * terminal_urlify_man() where supported. Returns 0, or -ENOMEM via log_oom(). */
+static int help(void) {
+ _cleanup_free_ char *link = NULL;
+ int r;
+
+ r = terminal_urlify_man("systemd", "1", &link);
+ if (r < 0)
+ return log_oom();
+
+ /* Note: the trailing argument list must stay in sync with the %s placeholders in the format
+ * string below. */
+ printf("%s [OPTIONS...]\n\n"
+ "%sStarts and monitors system and user services.%s\n\n"
+ "This program takes no positional arguments.\n\n"
+ "%sOptions%s:\n"
+ " -h --help Show this help\n"
+ " --version Show version\n"
+ " --test Determine initial transaction, dump it and exit\n"
+ " --system Combined with --test: operate in system mode\n"
+ " --user Combined with --test: operate in user mode\n"
+ " --dump-configuration-items Dump understood unit configuration items\n"
+ " --dump-bus-properties Dump exposed bus properties\n"
+ " --bus-introspect=PATH Write XML introspection data\n"
+ " --unit=UNIT Set default unit\n"
+ " --dump-core[=BOOL] Dump core on crash\n"
+ " --crash-vt=NR Change to specified VT on crash\n"
+ " --crash-reboot[=BOOL] Reboot on crash\n"
+ " --crash-shell[=BOOL] Run shell on crash\n"
+ " --confirm-spawn[=BOOL] Ask for confirmation when spawning processes\n"
+ " --show-status[=BOOL] Show status updates on the console during boot\n"
+ " --log-target=TARGET Set log target (console, journal, kmsg,\n"
+ " journal-or-kmsg, null)\n"
+ " --log-level=LEVEL Set log level (debug, info, notice, warning,\n"
+ " err, crit, alert, emerg)\n"
+ " --log-color[=BOOL] Highlight important log messages\n"
+ " --log-location[=BOOL] Include code location in log messages\n"
+ " --log-time[=BOOL] Prefix log messages with current time\n"
+ " --default-standard-output= Set default standard output for services\n"
+ " --default-standard-error= Set default standard error output for services\n"
+ " --no-pager Do not pipe output into a pager\n"
+ "\nSee the %s for details.\n",
+ program_invocation_short_name,
+ ansi_highlight(),
+ ansi_normal(),
+ ansi_underline(),
+ ansi_normal(),
+ link);
+
+ return 0;
+}
+
+/* Serializes the manager's state into a FILE and an FDSet so it can survive re-execution or
+ * switch-root. On success returns 0 and transfers ownership of the rewound stream and the fd set
+ * to the caller via ret_f/ret_fds; both have O_CLOEXEC cleared so they survive the execve(). */
+static int prepare_reexecute(
+ Manager *m,
+ FILE **ret_f,
+ FDSet **ret_fds,
+ bool switching_root) {
+
+ _cleanup_fdset_free_ FDSet *fds = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+ int r;
+
+ assert(m);
+ assert(ret_f);
+ assert(ret_fds);
+
+ r = manager_open_serialization(m, &f);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create serialization file: %m");
+
+ /* Make sure nothing is really destructed when we shut down */
+ m->n_reloading++;
+ bus_manager_send_reloading(m, true);
+
+ fds = fdset_new();
+ if (!fds)
+ return log_oom();
+
+ r = manager_serialize(m, f, fds, switching_root);
+ if (r < 0)
+ return r;
+
+ /* Rewind so the next instance reads the serialization from the beginning. */
+ if (fseeko(f, 0, SEEK_SET) < 0)
+ return log_error_errno(errno, "Failed to rewind serialization fd: %m");
+
+ /* These fds must be inherited across the upcoming execve(), hence clear O_CLOEXEC. */
+ r = fd_cloexec(fileno(f), false);
+ if (r < 0)
+ return log_error_errno(r, "Failed to disable O_CLOEXEC for serialization: %m");
+
+ r = fdset_cloexec(fds, false);
+ if (r < 0)
+ return log_error_errno(r, "Failed to disable O_CLOEXEC for serialization fds: %m");
+
+ *ret_f = TAKE_PTR(f);
+ *ret_fds = TAKE_PTR(fds);
+
+ return 0;
+}
+
+/* Best-effort: raise the system-wide fs.file-max and fs.nr_open sysctls to their maximums.
+ * All failures are logged and ignored. */
+static void bump_file_max_and_nr_open(void) {
+
+ /* Let's bump fs.file-max and fs.nr_open to their respective maximums. On current kernels large
+ * numbers of file descriptors are no longer a performance problem and their memory is properly
+ * tracked by memcg, thus counting them and limiting them in another two layers of limits is
+ * unnecessary and just complicates things. This function hence turns off 2 of the 4 levels of limits
+ * on file descriptors, and makes RLIMIT_NOFILE (soft + hard) the only ones that really matter. */
+
+#if BUMP_PROC_SYS_FS_FILE_MAX || BUMP_PROC_SYS_FS_NR_OPEN
+ int r;
+#endif
+
+#if BUMP_PROC_SYS_FS_FILE_MAX
+ /* The maximum the kernel allows for this since 5.2 is LONG_MAX, use that. (Previously things were
+ * different, but the operation would fail silently.) */
+ r = sysctl_write("fs/file-max", LONG_MAX_STR);
+ if (r < 0)
+ /* Read-only /proc or missing privileges are expected in containers, hence only debug level. */
+ log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING,
+ r, "Failed to bump fs.file-max, ignoring: %m");
+#endif
+
+#if BUMP_PROC_SYS_FS_NR_OPEN
+ int v = INT_MAX;
+
+ /* Argh! The kernel enforces maximum and minimum values on the fs.nr_open, but we don't really know
+ * what they are. The expression by which the maximum is determined is dependent on the architecture,
+ * and is something we don't really want to copy to userspace, as it is dependent on implementation
+ * details of the kernel. Since the kernel doesn't expose the maximum value to us, we can only try
+ * and hope. Hence, let's start with INT_MAX, and then keep halving the value until we find one that
+ * works. Ugly? Yes, absolutely, but kernel APIs are kernel APIs, so what do can we do... 🤯 */
+
+ for (;;) {
+ int k;
+
+ v &= ~(__SIZEOF_POINTER__ - 1); /* Round down to next multiple of the pointer size */
+ if (v < 1024) {
+ log_warning("Can't bump fs.nr_open, value too small.");
+ break;
+ }
+
+ k = read_nr_open();
+ if (k < 0) {
+ log_error_errno(k, "Failed to read fs.nr_open: %m");
+ break;
+ }
+ if (k >= v) { /* Already larger */
+ log_debug("Skipping bump, value is already larger.");
+ break;
+ }
+
+ /* -EINVAL means the kernel rejected this value as above its internal maximum: halve and retry. */
+ r = sysctl_writef("fs/nr_open", "%i", v);
+ if (r == -EINVAL) {
+ log_debug("Couldn't write fs.nr_open as %i, halving it.", v);
+ v /= 2;
+ continue;
+ }
+ if (r < 0) {
+ log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, "Failed to bump fs.nr_open, ignoring: %m");
+ break;
+ }
+
+ log_debug("Successfully bumped fs.nr_open to %i", v);
+ break;
+ }
+#endif
+}
+
+/* Raises our own RLIMIT_NOFILE (soft and hard) to the kernel's fs.nr_open ceiling, never lowering
+ * below the inherited limits in *saved_rlimit. Failures are logged and reported but non-fatal. */
+static int bump_rlimit_nofile(const struct rlimit *saved_rlimit) {
+ struct rlimit new_rlimit;
+ int r, nr;
+
+ /* Get the underlying absolute limit the kernel enforces */
+ nr = read_nr_open();
+ /* NOTE(review): this assumes read_nr_open() never returns a negative error code here — a
+ * negative value cast to rlim_t below would yield a huge bogus limit. Confirm at its definition. */
+
+ /* Calculate the new limits to use for us. Never lower from what we inherited. */
+ new_rlimit = (struct rlimit) {
+ .rlim_cur = MAX((rlim_t) nr, saved_rlimit->rlim_cur),
+ .rlim_max = MAX((rlim_t) nr, saved_rlimit->rlim_max),
+ };
+
+ /* Shortcut if nothing changes. */
+ if (saved_rlimit->rlim_max >= new_rlimit.rlim_max &&
+ saved_rlimit->rlim_cur >= new_rlimit.rlim_cur) {
+ log_debug("RLIMIT_NOFILE is already as high or higher than we need it, not bumping.");
+ return 0;
+ }
+
+ /* Bump up the resource limit for ourselves substantially, all the way to the maximum the kernel allows, for
+ * both hard and soft. */
+ r = setrlimit_closest(RLIMIT_NOFILE, &new_rlimit);
+ if (r < 0)
+ return log_warning_errno(r, "Setting RLIMIT_NOFILE failed, ignoring: %m");
+
+ return 0;
+}
+
+/* Raises our own RLIMIT_MEMLOCK (soft and hard) so that BPF map creation works, never lowering
+ * below the inherited limits in *saved_rlimit. Failures are logged and reported but non-fatal. */
+static int bump_rlimit_memlock(const struct rlimit *saved_rlimit) {
+ struct rlimit new_rlimit;
+ uint64_t mm;
+ int r;
+
+ assert(saved_rlimit);
+
+ /* BPF_MAP_TYPE_LPM_TRIE bpf maps are charged against RLIMIT_MEMLOCK, even if we have CAP_IPC_LOCK
+ * which should normally disable such checks. We need them to implement IPAddressAllow= and
+ * IPAddressDeny=, hence let's bump the value high enough for our user. */
+
+ /* Using MAX() on resource limits only is safe if RLIM_INFINITY is > 0. POSIX declares that rlim_t
+ * must be unsigned, hence this is a given, but let's make this clear here. */
+ assert_cc(RLIM_INFINITY > 0);
+
+ mm = physical_memory_scale(1, 8); /* Let's scale how much we allow to be locked by the amount of
+ * physical RAM. We allow an eighth to be locked by us, just to
+ * pick a value. */
+
+ new_rlimit = (struct rlimit) {
+ .rlim_cur = MAX3(HIGH_RLIMIT_MEMLOCK, saved_rlimit->rlim_cur, mm),
+ .rlim_max = MAX3(HIGH_RLIMIT_MEMLOCK, saved_rlimit->rlim_max, mm),
+ };
+
+ /* Shortcut if nothing changes. (Fixed: compare cur with cur and max with max, matching
+ * bump_rlimit_nofile() — the previous crossed comparison checked rlim_max against the new
+ * rlim_cur and vice versa.) */
+ if (saved_rlimit->rlim_max >= new_rlimit.rlim_max &&
+ saved_rlimit->rlim_cur >= new_rlimit.rlim_cur) {
+ log_debug("RLIMIT_MEMLOCK is already as high or higher than we need it, not bumping.");
+ return 0;
+ }
+
+ r = setrlimit_closest(RLIMIT_MEMLOCK, &new_rlimit);
+ if (r < 0)
+ return log_warning_errno(r, "Setting RLIMIT_MEMLOCK failed, ignoring: %m");
+
+ return 0;
+}
+
+/* Sanity check: /usr must either live on the root file system or have been mounted already (e.g.
+ * by the initrd). A non-empty /usr passes; an empty or unreadable one triggers a loud warning, as
+ * split-usr without pre-mounting is unsupported. */
+static void test_usr(void) {
+
+ if (dir_is_empty("/usr", /* ignore_hidden_or_backup= */ false) > 0)
+ log_warning("/usr appears to be on its own filesystem and is not already mounted. This is not a supported setup. "
+ "Some things will probably break (sometimes even silently) in mysterious ways. "
+ "Consult https://www.freedesktop.org/wiki/Software/systemd/separate-usr-is-broken for more information.");
+}
+
+/* Restricts the set of permitted system call architectures to 'archs' via seccomp. No-op when
+ * seccomp is unavailable or not compiled in. Returns 0 on success, negative errno on failure.
+ * (Fixed: actually use the 'archs' parameter — it was previously ignored in favor of the global
+ * arg_syscall_archs, which the caller passes in anyway — and the "restrication" typo.) */
+static int enforce_syscall_archs(Set *archs) {
+#if HAVE_SECCOMP
+ int r;
+
+ if (!is_seccomp_available())
+ return 0;
+
+ r = seccomp_restrict_archs(archs);
+ if (r < 0)
+ return log_error_errno(r, "Failed to enforce system call architecture restriction: %m");
+#endif
+ return 0;
+}
+
+/* Prints the "Welcome to <OS>!" banner on the console (honouring show-status and color settings)
+ * and warns when os-release declares the OS past its SUPPORT_END date. Failure to read os-release
+ * is logged and reported, everything else returns 0. */
+static int os_release_status(void) {
+ _cleanup_free_ char *pretty_name = NULL, *name = NULL, *version = NULL,
+ *ansi_color = NULL, *support_end = NULL;
+ int r;
+
+ r = parse_os_release(NULL,
+ "PRETTY_NAME", &pretty_name,
+ "NAME", &name,
+ "VERSION", &version,
+ "ANSI_COLOR", &ansi_color,
+ "SUPPORT_END", &support_end);
+ if (r < 0)
+ return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to read os-release file, ignoring: %m");
+
+ /* Prefer PRETTY_NAME, fall back to NAME (helper semantics — see os_release_pretty_name()). */
+ const char *label = os_release_pretty_name(pretty_name, name);
+
+ if (show_status_on(arg_show_status)) {
+ if (log_get_show_color())
+ status_printf(NULL, 0,
+ "\nWelcome to \x1B[%sm%s\x1B[0m!\n",
+ empty_to_null(ansi_color) ?: "1",
+ label);
+ else
+ status_printf(NULL, 0,
+ "\nWelcome to %s!\n",
+ label);
+ }
+
+ if (support_end && os_release_support_ended(support_end, /* quiet */ false, NULL) > 0)
+ /* pretty_name may include the version already, so we'll print the version only if we
+ * have it and we're not using pretty_name. */
+ status_printf(ANSI_HIGHLIGHT_RED " !! " ANSI_NORMAL, 0,
+ "This OS version (%s%s%s) is past its end-of-support date (%s)",
+ label,
+ (pretty_name || !version) ? "" : " version ",
+ (pretty_name || !version) ? "" : version,
+ support_end);
+
+ return 0;
+}
+
+/* Copies the host's os-release file into the propagate directory (system- or user-scoped) so it
+ * can be handed to containers/portables. Falls back from /etc/os-release to /usr/lib/os-release
+ * when the former is absent. All errors are logged at debug level and returned. */
+static int setup_os_release(RuntimeScope scope) {
+ _cleanup_free_ char *os_release_dst = NULL;
+ const char *os_release_src = "/etc/os-release";
+ int r;
+
+ /* Fixed: probe the path via os_release_src instead of repeating the literal, so the probe and
+ * the copy below cannot diverge. */
+ if (access(os_release_src, F_OK) < 0) {
+ if (errno != ENOENT)
+ log_debug_errno(errno, "Failed to check if /etc/os-release exists, ignoring: %m");
+
+ /* ENOENT (or any failure) → fall back to the vendor copy. */
+ os_release_src = "/usr/lib/os-release";
+ }
+
+ if (scope == RUNTIME_SCOPE_SYSTEM) {
+ os_release_dst = strdup("/run/systemd/propagate/.os-release-stage/os-release");
+ if (!os_release_dst)
+ return log_oom_debug();
+ } else {
+ if (asprintf(&os_release_dst, "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage/os-release", geteuid()) < 0)
+ return log_oom_debug();
+ }
+
+ r = mkdir_parents_label(os_release_dst, 0755);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to create parent directory of %s, ignoring: %m", os_release_dst);
+
+ /* COPY_REPLACE: overwrite any stale copy from a previous boot/reload atomically. */
+ r = copy_file_atomic(os_release_src, os_release_dst, 0644, COPY_MAC_CREATE|COPY_REPLACE);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to create %s, ignoring: %m", os_release_dst);
+
+ return 0;
+}
+
+/* Publishes the $container environment variable (set by container managers) to
+ * /run/systemd/container for other programs to consult. Returns 0 when not in a container,
+ * 1 on success, negative errno on write failure (logged as a warning). */
+static int write_container_id(void) {
+ int r = 0; /* explicit initialization silences a spurious maybe-uninitialized warning */
+
+ const char *e = getenv("container");
+ if (isempty(e))
+ return 0;
+
+ WITH_UMASK(0022)
+ r = write_string_file("/run/systemd/container", e, WRITE_STRING_FILE_CREATE);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to write /run/systemd/container, ignoring: %m");
+
+ return 1;
+}
+
+/* Raises net.unix.max_dgram_qlen to DEFAULT_UNIX_MAX_DGRAM_QLEN if the current value is lower.
+ * Returns 0 if already high enough, 1 after bumping, negative errno (logged) on failure. */
+static int bump_unix_max_dgram_qlen(void) {
+ _cleanup_free_ char *qlen = NULL;
+ unsigned long v;
+ int r;
+
+ /* Let's bump the net.unix.max_dgram_qlen sysctl. The kernel default of 16 is simply too low. We set
+ * the value really really early during boot, so that it is actually applied to all our sockets,
+ * including the $NOTIFY_SOCKET one. */
+
+ r = read_one_line_file("/proc/sys/net/unix/max_dgram_qlen", &qlen);
+ if (r < 0)
+ /* Missing file (e.g. AF_UNIX compiled out) is only worth a debug message. */
+ return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to read AF_UNIX datagram queue length, ignoring: %m");
+
+ r = safe_atolu(qlen, &v);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to parse AF_UNIX datagram queue length '%s', ignoring: %m", qlen);
+
+ if (v >= DEFAULT_UNIX_MAX_DGRAM_QLEN)
+ return 0;
+
+ r = sysctl_write("net/unix/max_dgram_qlen", STRINGIFY(DEFAULT_UNIX_MAX_DGRAM_QLEN))
;
+ if (r < 0)
+ return log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to bump AF_UNIX datagram queue length, ignoring: %m");
+
+ return 1;
+}
+
+/* Fixes up the environment inherited from the kernel when running as PID 1 on bare metal:
+ * replaces the kernel's generic TERM=linux with a better guess for the console (unless TERM or
+ * systemd.tty.term.console was given on the kernel command line) and drops the kernel's HOME=/.
+ * Returns 0 on success or when nothing needs fixing, negative errno on failure. */
+static int fixup_environment(void) {
+ _cleanup_free_ char *term = NULL;
+ const char *t;
+ int r;
+
+ /* Only fix up the environment when we are started as PID 1 */
+ if (getpid_cached() != 1)
+ return 0;
+
+ /* We expect the environment to be set correctly if run inside a container. */
+ if (detect_container() > 0)
+ return 0;
+
+ /* When started as PID1, the kernel uses /dev/console for our stdios and uses TERM=linux whatever the
+ * backend device used by the console. We try to make a better guess here since some consoles might
+ * not have support for color mode for example.
+ *
+ * However if TERM was configured through the kernel command line then leave it alone. */
+ r = proc_cmdline_get_key("TERM", 0, &term);
+ if (r < 0)
+ return r;
+
+ /* r == 0: TERM= was not on the kernel command line, try the dedicated systemd switch next. */
+ if (r == 0) {
+ r = proc_cmdline_get_key("systemd.tty.term.console", 0, &term);
+ if (r < 0)
+ return r;
+ }
+
+ t = term ?: default_term_for_tty("/dev/console");
+
+ if (setenv("TERM", t, 1) < 0)
+ return -errno;
+
+ /* The kernels sets HOME=/ for init. Let's undo this. */
+ if (path_equal_ptr(getenv("HOME"), "/"))
+ assert_se(unsetenv("HOME") == 0);
+
+ return 0;
+}
+
+/* SysV compatibility shim: when a regular (non-PID-1) process invokes us under the name "init",
+ * behave like telinit by replacing ourselves with systemctl. Does not return on that path;
+ * otherwise (or without SysV compat) it is a no-op. */
+static void redirect_telinit(int argc, char *argv[]) {
+
+#if HAVE_SYSV_COMPAT
+ if (getpid_cached() == 1 || !invoked_as(argv, "init"))
+ return;
+
+ execv(SYSTEMCTL_BINARY_PATH, argv);
+ log_error_errno(errno, "Failed to exec " SYSTEMCTL_BINARY_PATH ": %m");
+ exit(EXIT_FAILURE);
+#endif
+}
+
+/* Replaces the current process with the systemd-shutdown binary, passing the requested objective
+ * (exit/reboot/poweroff/halt/kexec), logging options, timeout, exit code and the watchdog
+ * configuration (via environment). Only returns on failure, with a negative errno from execve(). */
+static int become_shutdown(int objective, int retval) {
+ static const char* const table[_MANAGER_OBJECTIVE_MAX] = {
+ [MANAGER_EXIT] = "exit",
+ [MANAGER_REBOOT] = "reboot",
+ [MANAGER_POWEROFF] = "poweroff",
+ [MANAGER_HALT] = "halt",
+ [MANAGER_KEXEC] = "kexec",
+ };
+
+ /* Stack buffers sized for the longest possible formatted option strings. */
+ char log_level[STRLEN("--log-level=") + DECIMAL_STR_MAX(int)],
+ timeout[STRLEN("--timeout=") + DECIMAL_STR_MAX(usec_t) + STRLEN("us")],
+ exit_code[STRLEN("--exit-code=") + DECIMAL_STR_MAX(uint8_t)];
+
+ _cleanup_strv_free_ char **env_block = NULL;
+ usec_t watchdog_timer = 0;
+ int r;
+
+ assert(objective >= 0 && objective < _MANAGER_OBJECTIVE_MAX);
+ assert(table[objective]);
+
+ xsprintf(log_level, "--log-level=%d", log_get_max_level());
+ xsprintf(timeout, "--timeout=%" PRI_USEC "us", arg_defaults.timeout_stop_usec);
+
+ /* At most 9 entries are filled below (4 fixed + log-target + color + location + time +
+ * exit-code), so 10 slots always leave room for the NULL terminator. */
+ const char* command_line[10] = {
+ SYSTEMD_SHUTDOWN_BINARY_PATH,
+ table[objective],
+ log_level,
+ timeout,
+ /* Note that the last position is a terminator and must contain NULL. */
+ };
+ size_t pos = 4;
+
+ assert(command_line[pos-1]);
+ assert(!command_line[pos]);
+
+ /* Map our current log target onto the closest target systemd-shutdown supports. */
+ switch (log_get_target()) {
+
+ case LOG_TARGET_KMSG:
+ case LOG_TARGET_JOURNAL_OR_KMSG:
+ case LOG_TARGET_SYSLOG_OR_KMSG:
+ command_line[pos++] = "--log-target=kmsg";
+ break;
+
+ case LOG_TARGET_NULL:
+ command_line[pos++] = "--log-target=null";
+ break;
+
+ case LOG_TARGET_CONSOLE:
+ default:
+ command_line[pos++] = "--log-target=console";
+ break;
+ }; /* NOTE(review): stray ';' after the switch brace — a harmless empty statement. */
+
+ if (log_get_show_color())
+ command_line[pos++] = "--log-color";
+
+ if (log_get_show_location())
+ command_line[pos++] = "--log-location";
+
+ if (log_get_show_time())
+ command_line[pos++] = "--log-time";
+
+ xsprintf(exit_code, "--exit-code=%d", retval);
+ command_line[pos++] = exit_code;
+
+ assert(pos < ELEMENTSOF(command_line));
+
+ /* The watchdog: */
+
+ if (objective == MANAGER_REBOOT)
+ watchdog_timer = arg_reboot_watchdog;
+ else if (objective == MANAGER_KEXEC)
+ watchdog_timer = arg_kexec_watchdog;
+
+ /* If we reboot or kexec let's set the shutdown watchdog and tell the
+ * shutdown binary to repeatedly ping it.
+ * Disable the pretimeout watchdog, as we do not support it from the shutdown binary. */
+ (void) watchdog_setup_pretimeout(0);
+ (void) watchdog_setup_pretimeout_governor(NULL);
+ r = watchdog_setup(watchdog_timer);
+ watchdog_close(r < 0);
+
+ /* The environment block: */
+
+ env_block = strv_copy(environ);
+
+ /* Tell the binary how often to ping, ignore failure */
+ (void) strv_extendf(&env_block, "WATCHDOG_USEC="USEC_FMT, watchdog_timer);
+
+ if (arg_watchdog_device)
+ (void) strv_extendf(&env_block, "WATCHDOG_DEVICE=%s", arg_watchdog_device);
+
+ /* Avoid the creation of new processes forked by the kernel; at this
+ * point, we will not listen to the signals anyway */
+ if (detect_container() <= 0)
+ (void) cg_uninstall_release_agent(SYSTEMD_CGROUP_CONTROLLER);
+
+ execve(SYSTEMD_SHUTDOWN_BINARY_PATH, (char **) command_line, env_block);
+ return -errno;
+}
+
+/* Very early clock setup: handles RTC-in-localtime adjustment (or seals the kernel's one-shot
+ * time warp when the RTC runs in UTC and we're not in the initrd), then clamps the system clock
+ * to the build-time epoch via clock_apply_epoch(). All failures are logged and ignored. */
+static void initialize_clock(void) {
+ int r;
+
+ /* This is called very early on, before we parse the kernel command line or otherwise figure out why
+ * we are running, but only once. */
+
+ if (clock_is_localtime(NULL) > 0) {
+ int min;
+
+ /* The very first call of settimeofday() also does a time warp in the kernel.
+ *
+ * In the rtc-in-local time mode, we set the kernel's timezone, and rely on external tools to
+ * take care of maintaining the RTC and do all adjustments. This matches the behavior of
+ * Windows, which leaves the RTC alone if the registry tells that the RTC runs in UTC.
+ */
+ r = clock_set_timezone(&min);
+ if (r < 0)
+ log_error_errno(r, "Failed to apply local time delta, ignoring: %m");
+ else
+ log_info("RTC configured in localtime, applying delta of %i minutes to system time.", min);
+
+ } else if (!in_initrd())
+ /*
+ * Do a dummy very first call to seal the kernel's time warp magic.
+ *
+ * Do not call this from inside the initrd. The initrd might not carry /etc/adjtime with
+ * LOCAL, but the real system could be set up that way. In such case, we need to delay the
+ * time-warp or the sealing until we reach the real system.
+ *
+ * Do no set the kernel's timezone. The concept of local time cannot be supported reliably,
+ * the time will jump or be incorrect at every daylight saving time change. All kernel local
+ * time concepts will be treated as UTC that way.
+ */
+ (void) clock_reset_timewarp();
+
+ /* Report what (if anything) clock_apply_epoch() did: r > 0 means the clock was adjusted,
+ * r < 0 means an adjustment was needed but failed. */
+ ClockChangeDirection change_dir;
+ r = clock_apply_epoch(&change_dir);
+ if (r > 0 && change_dir == CLOCK_CHANGE_FORWARD)
+ log_info("System time before build time, advancing clock.");
+ else if (r > 0 && change_dir == CLOCK_CHANGE_BACKWARD)
+ log_info("System time is further ahead than %s after build time, resetting clock to build time.",
+ FORMAT_TIMESPAN(CLOCK_VALID_RANGE_USEC_MAX, USEC_PER_DAY));
+ else if (r < 0 && change_dir == CLOCK_CHANGE_FORWARD)
+ log_error_errno(r, "Current system time is before build time, but cannot correct: %m");
+ else if (r < 0 && change_dir == CLOCK_CHANGE_BACKWARD)
+ log_error_errno(r, "Current system time is further ahead %s after build time, but cannot correct: %m",
+ FORMAT_TIMESPAN(CLOCK_VALID_RANGE_USEC_MAX, USEC_PER_DAY));
+}
+
+/* Applies a clock value requested on the kernel command line (arg_clock_usec). Runs later than
+ * initialize_clock(), i.e. after configuration and kernel command line parsing; a no-op unless we
+ * are PID 1 and a non-zero time was requested. Failure is logged and ignored. */
+static void apply_clock_update(void) {
+
+ if (arg_clock_usec == 0 || getpid_cached() != 1)
+ return;
+
+ if (clock_settime(CLOCK_REALTIME, TIMESPEC_STORE(arg_clock_usec)) < 0)
+ log_error_errno(errno, "Failed to set system clock to time specified on kernel command line: %m");
+ else
+ log_info("Set system clock to %s, as specified on the kernel command line.",
+ FORMAT_TIMESTAMP(arg_clock_usec));
+}
+
+/* Credits a random seed passed on the kernel command line to the kernel entropy pool. Only acts
+ * as PID 1 and only when a seed was supplied; warns loudly because such a seed is world-readable
+ * and therefore only suitable for test environments. Failures are logged and ignored. */
+static void cmdline_take_random_seed(void) {
+ size_t suggested;
+ int r;
+
+ if (arg_random_seed_size == 0)
+ return;
+
+ if (getpid_cached() != 1)
+ return;
+
+ assert(arg_random_seed);
+ suggested = random_pool_size();
+
+ if (arg_random_seed_size < suggested)
+ log_warning("Random seed specified on kernel command line has size %zu, but %zu bytes required to fill entropy pool.",
+ arg_random_seed_size, suggested);
+
+ /* The 'true' argument asks for the entropy to be credited, not just mixed in. */
+ r = random_write_entropy(-1, arg_random_seed, arg_random_seed_size, true);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to credit entropy specified on kernel command line, ignoring: %m");
+ return;
+ }
+
+ log_notice("Successfully credited entropy passed on kernel command line.\n"
+ "Note that the seed provided this way is accessible to unprivileged programs. "
+ "This functionality should not be used outside of testing environments.");
+}
+
+/* PID-1-only coredump setup: lift RLIMIT_CORE to infinity so coredump handlers see service core
+ * dumps, while (unless skip_setup) neutering the kernel's core_pattern so nothing is stored until
+ * systemd-coredump is enabled via sysctl. */
+static void initialize_coredump(bool skip_setup) {
+ if (getpid_cached() != 1)
+ return;
+
+ /* Don't limit the core dump size, so that coredump handlers such as systemd-coredump (which honour
+ * the limit) will process core dumps for system services by default. */
+ if (setrlimit(RLIMIT_CORE, &RLIMIT_MAKE_CONST(RLIM_INFINITY)) < 0)
+ log_warning_errno(errno, "Failed to set RLIMIT_CORE: %m");
+
+ if (skip_setup)
+ return;
+
+ /* But at the same time, turn off the core_pattern logic by default, so that no coredumps are stored
+ * until the systemd-coredump tool is enabled via sysctl. However it can be changed via the kernel
+ * command line later so core dumps can still be generated during early startup and in initrd. */
+ disable_coredumps();
+}
+
+/* Writes the early core pattern (arg_early_core_pattern) into the kernel, but only when we are
+ * PID 1, setup isn't skipped and a pattern was actually configured. Failure is logged as a
+ * warning and ignored. */
+static void initialize_core_pattern(bool skip_setup) {
+ int r;
+
+ if (skip_setup || !arg_early_core_pattern || getpid_cached() != 1)
+ return;
+
+ r = write_string_file("/proc/sys/kernel/core_pattern", arg_early_core_pattern, WRITE_STRING_FILE_DISABLE_BUFFER);
+ if (r < 0)
+ log_warning_errno(r, "Failed to write '%s' to /proc/sys/kernel/core_pattern, ignoring: %m",
+ arg_early_core_pattern);
+}
+
+/* Applies the configured CPU affinity mask (arg_cpu_affinity) to ourselves, unless setup is
+ * skipped or no mask was configured. Failure is logged as a warning and ignored. */
+static void update_cpu_affinity(bool skip_setup) {
+ _cleanup_free_ char *range = NULL;
+
+ if (skip_setup || !arg_cpu_affinity.set)
+ return;
+
+ /* A non-NULL .set implies a non-zero allocation size. */
+ assert(arg_cpu_affinity.allocated > 0);
+
+ range = cpu_set_to_range_string(&arg_cpu_affinity);
+ log_debug("Setting CPU affinity to {%s}.", strnull(range));
+
+ if (sched_setaffinity(0, arg_cpu_affinity.allocated, arg_cpu_affinity.set) < 0)
+ log_warning_errno(errno, "Failed to set CPU affinity, ignoring: %m");
+}
+
+/* Applies the configured NUMA memory policy (arg_numa_policy) to ourselves, unless setup is
+ * skipped or no valid policy type was configured. Missing kernel NUMA support (-EOPNOTSUPP) is
+ * only a debug message; other failures are warnings. */
+static void update_numa_policy(bool skip_setup) {
+ _cleanup_free_ char *nodes_str = NULL;
+ int r;
+
+ if (skip_setup || !mpol_is_valid(numa_policy_get_type(&arg_numa_policy)))
+ return;
+
+ if (DEBUG_LOGGING) {
+ const char *policy_str = mpol_to_string(numa_policy_get_type(&arg_numa_policy));
+ nodes_str = cpu_set_to_range_string(&arg_numa_policy.nodes);
+ log_debug("Setting NUMA policy to %s, with nodes {%s}.", strnull(policy_str), strnull(nodes_str));
+ }
+
+ r = apply_numa_policy(&arg_numa_policy);
+ if (r == -EOPNOTSUPP)
+ log_debug_errno(r, "NUMA support not available, ignoring.");
+ else if (r < 0)
+ log_warning_errno(r, "Failed to set NUMA memory policy, ignoring: %m");
+}
+
+static void filter_args(
+                const char* dst[],
+                size_t *dst_index,
+                char **src,
+                int argc) {
+
+        assert(dst);
+        assert(dst_index);
+
+        /* Copy arguments from src[] into dst[] (starting at *dst_index, which is advanced),
+         * dropping options that must not survive a reexecution. src[0] is skipped. */
+        for (int i = 1; i < argc; i++) {
+                const char *a = src[i];
+
+                /* Scope/switched-root markers are recomputed and re-added by the caller. */
+                if (STR_IN_SET(a,
+                               "--switched-root",
+                               "--system",
+                               "--user"))
+                        continue;
+
+                /* Drop any stale serialization fd option, in both "--deserialize=N" and
+                 * "--deserialize N" forms. */
+                if (startswith(a, "--deserialize="))
+                        continue;
+                if (streq(a, "--deserialize")) {
+                        i++; /* also skip the option's separate argument */
+                        continue;
+                }
+
+                /* Skip target unit designators. We already acted upon this information and have queued
+                 * appropriate jobs. We don't want to redo all this after reexecution. */
+                if (startswith(a, "--unit="))
+                        continue;
+                if (streq(a, "--unit")) {
+                        i++; /* also skip the option's separate argument */
+                        continue;
+                }
+
+                /* An ordinary option — pass it over to the new instance. */
+                dst[(*dst_index)++] = a;
+        }
+}
+
+static void finish_remaining_processes(ManagerObjective objective) {
+        assert(objective >= 0 && objective < _MANAGER_OBJECTIVE_MAX);
+
+        /* Only relevant when we are about to replace the whole userspace. */
+        if (!IN_SET(objective, MANAGER_SWITCH_ROOT, MANAGER_SOFT_REBOOT))
+                return;
+
+        /* Kill all remaining processes from the initrd, but don't wait for them, so that we can handle the
+         * SIGCHLD for them after deserializing. */
+        broadcast_signal(SIGTERM, /* wait_for_exit= */ false, /* send_sighup= */ true, arg_defaults.timeout_stop_usec);
+
+        /* On soft reboot really make sure nothing is left. Note that this will skip cgroups
+         * of units that were configured with SurviveFinalKillSignal=yes. */
+        if (objective == MANAGER_SOFT_REBOOT)
+                broadcast_signal(SIGKILL, /* wait_for_exit= */ false, /* send_sighup= */ false, arg_defaults.timeout_stop_usec);
+}
+
+/* Performs the actual process transition for MANAGER_REEXECUTE, MANAGER_SWITCH_ROOT and
+ * MANAGER_SOFT_REBOOT: optionally switches the root directory, rebuilds a command line and
+ * execve()s the next init (ourselves with serialization, a configured init, /sbin/init, or
+ * /bin/sh as last resort). On success this function does not return; it returns a negative
+ * errno-style error only on failure, with *ret_error_message set for the caller. */
+static int do_reexecute(
+                ManagerObjective objective,
+                int argc,
+                char* argv[],
+                const struct rlimit *saved_rlimit_nofile,
+                const struct rlimit *saved_rlimit_memlock,
+                FDSet *fds,
+                const char *switch_root_dir,
+                const char *switch_root_init,
+                const char **ret_error_message) {
+
+        size_t i, args_size;
+        const char **args;
+        int r;
+
+        assert(IN_SET(objective, MANAGER_REEXECUTE, MANAGER_SWITCH_ROOT, MANAGER_SOFT_REBOOT));
+        assert(argc >= 0);
+        assert(saved_rlimit_nofile);
+        assert(saved_rlimit_memlock);
+        assert(ret_error_message);
+
+        /* Verify that a suitable init binary exists (resolved relative to the new root, if any)
+         * before we start tearing down the old state. */
+        if (switch_root_init) {
+                r = chase(switch_root_init, switch_root_dir, CHASE_PREFIX_ROOT, NULL, NULL);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to chase configured init %s/%s: %m",
+                                          strempty(switch_root_dir), switch_root_init);
+        } else {
+                r = chase(SYSTEMD_BINARY_PATH, switch_root_dir, CHASE_PREFIX_ROOT, NULL, NULL);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to chase our own binary %s/%s: %m",
+                                        strempty(switch_root_dir), SYSTEMD_BINARY_PATH);
+        }
+
+        if (r < 0) {
+                r = chase("/sbin/init", switch_root_dir, CHASE_PREFIX_ROOT, NULL, NULL);
+                if (r < 0)
+                        /* NOTE(review): unlike the sibling log calls above, this message lacks a
+                         * ": %m" suffix, so the error itself is not printed. */
+                        return log_error_errno(r, "Failed to chase %s/sbin/init", strempty(switch_root_dir));
+        }
+
+        /* Close and disarm the watchdog, so that the new instance can reinitialize it, but doesn't get
+         * rebooted while we do that */
+        watchdog_close(true);
+
+        /* Reset RLIMIT_NOFILE + RLIMIT_MEMLOCK back to the kernel defaults, so that the new systemd can pass
+         * the kernel default to its child processes */
+        if (saved_rlimit_nofile->rlim_cur != 0)
+                (void) setrlimit(RLIMIT_NOFILE, saved_rlimit_nofile);
+        if (saved_rlimit_memlock->rlim_cur != RLIM_INFINITY)
+                (void) setrlimit(RLIMIT_MEMLOCK, saved_rlimit_memlock);
+
+        finish_remaining_processes(objective);
+
+        if (!switch_root_dir && objective == MANAGER_SOFT_REBOOT) {
+                /* If no switch root dir is specified, then check if /run/nextroot/ qualifies and use that */
+                r = path_is_os_tree("/run/nextroot");
+                if (r < 0 && r != -ENOENT)
+                        log_debug_errno(r, "Failed to determine if /run/nextroot/ is a valid OS tree, ignoring: %m");
+                else if (r > 0)
+                        switch_root_dir = "/run/nextroot";
+        }
+
+        if (switch_root_dir) {
+                r = switch_root(/* new_root= */ switch_root_dir,
+                                /* old_root_after= */ NULL,
+                                /* flags= */ (objective == MANAGER_SWITCH_ROOT ? SWITCH_ROOT_DESTROY_OLD_ROOT : 0) |
+                                             (objective == MANAGER_SOFT_REBOOT ? 0 : SWITCH_ROOT_RECURSIVE_RUN));
+                if (r < 0)
+                        log_error_errno(r, "Failed to switch root, trying to continue: %m");
+        }
+
+        /* Room for argv[1..argc-1] plus args[0], "--switched-root", the scope option, the
+         * "--deserialize=" option and a terminating NULL. */
+        args_size = argc + 5;
+        args = newa(const char*, args_size);
+
+        if (!switch_root_init) {
+                char sfd[STRLEN("--deserialize=") + DECIMAL_STR_MAX(int)];
+
+                /* First try to spawn ourselves with the right path, and with full serialization. We do this
+                 * only if the user didn't specify an explicit init to spawn. */
+
+                assert(arg_serialization);
+                assert(fds);
+
+                xsprintf(sfd, "--deserialize=%i", fileno(arg_serialization));
+
+                i = 1; /* Leave args[0] empty for now. */
+
+                /* Put our stuff first to make sure it always gets parsed in case
+                 * we get weird stuff from the kernel cmdline (like --) */
+                if (IN_SET(objective, MANAGER_SWITCH_ROOT, MANAGER_SOFT_REBOOT))
+                        args[i++] = "--switched-root";
+                args[i++] = runtime_scope_cmdline_option_to_string(arg_runtime_scope);
+                args[i++] = sfd;
+
+                filter_args(args, &i, argv, argc);
+
+                args[i++] = NULL;
+
+                assert(i <= args_size);
+
+                /*
+                 * We want valgrind to print its memory usage summary before reexecution. Valgrind won't do
+                 * this is on its own on exec(), but it will do it on exit(). Hence, to ensure we get a
+                 * summary here, fork() off a child, let it exit() cleanly, so that it prints the summary,
+                 * and wait() for it in the parent, before proceeding into the exec().
+                 */
+                valgrind_summary_hack();
+
+                args[0] = SYSTEMD_BINARY_PATH;
+                (void) execv(args[0], (char* const*) args);
+
+                if (objective == MANAGER_REEXECUTE) {
+                        *ret_error_message = "Failed to execute our own binary";
+                        return log_error_errno(errno, "Failed to execute our own binary %s: %m", args[0]);
+                }
+
+                log_debug_errno(errno, "Failed to execute our own binary %s, trying fallback: %m", args[0]);
+        }
+
+        /* Try the fallback, if there is any, without any serialization. We pass the original argv[] and
+         * envp[]. (Well, modulo the ordering changes due to getopt() in argv[], and some cleanups in envp[],
+         * but let's hope that doesn't matter.) */
+
+        arg_serialization = safe_fclose(arg_serialization);
+        fds = fdset_free(fds);
+
+        /* Reopen the console */
+        (void) make_console_stdio();
+
+        i = 1; /* Leave args[0] empty for now. */
+        /* Note: "j <= argc" (not "<") deliberately copies argv[]'s terminating NULL along, too. */
+        for (int j = 1; j <= argc; j++)
+                args[i++] = argv[j];
+        assert(i <= args_size);
+
+        /* Re-enable any blocked signals, especially important if we switch from initrd to init=... */
+        (void) reset_all_signal_handlers();
+        (void) reset_signal_mask();
+        (void) rlimit_nofile_safe();
+
+        if (switch_root_init) {
+                args[0] = switch_root_init;
+                (void) execve(args[0], (char* const*) args, saved_env);
+                log_warning_errno(errno, "Failed to execute configured init %s, trying fallback: %m", args[0]);
+        }
+
+        args[0] = "/sbin/init";
+        (void) execv(args[0], (char* const*) args);
+        r = -errno;
+
+        manager_status_printf(NULL, STATUS_TYPE_EMERGENCY,
+                              ANSI_HIGHLIGHT_RED " !! " ANSI_NORMAL,
+                              "Failed to execute /sbin/init");
+
+        *ret_error_message = "Failed to execute fallback shell";
+        if (r == -ENOENT) {
+                log_warning("No /sbin/init, trying fallback");
+
+                args[0] = "/bin/sh";
+                args[1] = NULL;
+                (void) execve(args[0], (char* const*) args, saved_env);
+                return log_error_errno(errno, "Failed to execute /bin/sh, giving up: %m");
+        } else
+                return log_error_errno(r, "Failed to execute /sbin/init, giving up: %m");
+}
+
+/* Runs the manager's main loop until it produces an objective other than MANAGER_OK/MANAGER_RELOAD,
+ * then prepares the corresponding transition. Reloads are handled entirely inside this function;
+ * all other objectives are returned to the caller together with the parameters it needs for
+ * shutdown (*ret_retval) or reexecution (*ret_fds, *ret_switch_root_dir, *ret_switch_root_init).
+ * Returns the objective on success, a negative errno-style error (with *ret_error_message set)
+ * on failure. */
+static int invoke_main_loop(
+                Manager *m,
+                const struct rlimit *saved_rlimit_nofile,
+                const struct rlimit *saved_rlimit_memlock,
+                int *ret_retval,                        /* Return parameters relevant for shutting down */
+                FDSet **ret_fds,                        /* Return parameters for reexecuting */
+                char **ret_switch_root_dir,             /* … */
+                char **ret_switch_root_init,            /* … */
+                const char **ret_error_message) {
+
+        int r;
+
+        assert(m);
+        assert(saved_rlimit_nofile);
+        assert(saved_rlimit_memlock);
+        assert(ret_retval);
+        assert(ret_fds);
+        assert(ret_switch_root_dir);
+        assert(ret_switch_root_init);
+        assert(ret_error_message);
+
+        for (;;) {
+                int objective = manager_loop(m);
+                if (objective < 0) {
+                        *ret_error_message = "Failed to run main loop";
+                        return log_struct_errno(LOG_EMERG, objective,
+                                                LOG_MESSAGE("Failed to run main loop: %m"),
+                                                "MESSAGE_ID=" SD_MESSAGE_CORE_MAINLOOP_FAILED_STR);
+                }
+
+                switch (objective) {
+
+                case MANAGER_RELOAD: {
+                        LogTarget saved_log_target;
+                        int saved_log_level;
+
+                        manager_send_reloading(m);
+
+                        log_info("Reloading...");
+
+                        /* First, save any overridden log level/target, then parse the configuration file,
+                         * which might change the log level to new settings. */
+
+                        saved_log_level = m->log_level_overridden ? log_get_max_level() : -1;
+                        saved_log_target = m->log_target_overridden ? log_get_target() : _LOG_TARGET_INVALID;
+
+                        (void) parse_configuration(saved_rlimit_nofile, saved_rlimit_memlock);
+
+                        set_manager_defaults(m);
+                        set_manager_settings(m);
+
+                        /* Re-apply affinity/NUMA settings, which the configuration reload may have
+                         * changed. */
+                        update_cpu_affinity(false);
+                        update_numa_policy(false);
+
+                        /* Restore any runtime log overrides that should win over configuration. */
+                        if (saved_log_level >= 0)
+                                manager_override_log_level(m, saved_log_level);
+                        if (saved_log_target >= 0)
+                                manager_override_log_target(m, saved_log_target);
+
+                        if (manager_reload(m) < 0)
+                                /* Reloading failed before the point of no return.
+                                 * Let's continue running as if nothing happened. */
+                                m->objective = MANAGER_OK;
+                        else
+                                log_info("Reloading finished in " USEC_FMT " ms.",
+                                         usec_sub_unsigned(now(CLOCK_MONOTONIC), m->timestamps[MANAGER_TIMESTAMP_UNITS_LOAD].monotonic) / USEC_PER_MSEC);
+
+                        continue;
+                }
+
+                case MANAGER_REEXECUTE:
+
+                        manager_send_reloading(m); /* From the perspective of the manager calling us this is
+                                                    * pretty much the same as a reload */
+
+                        r = prepare_reexecute(m, &arg_serialization, ret_fds, false);
+                        if (r < 0) {
+                                *ret_error_message = "Failed to prepare for reexecution";
+                                return r;
+                        }
+
+                        log_notice("Reexecuting.");
+
+                        *ret_retval = EXIT_SUCCESS;
+                        *ret_switch_root_dir = *ret_switch_root_init = NULL;
+
+                        return objective;
+
+                case MANAGER_SWITCH_ROOT:
+
+                        manager_send_reloading(m); /* From the perspective of the manager calling us this is
+                                                    * pretty much the same as a reload */
+
+                        manager_set_switching_root(m, true);
+
+                        /* If an explicit init was configured we don't serialize: the new init is a
+                         * foreign program that won't understand our state. */
+                        if (!m->switch_root_init) {
+                                r = prepare_reexecute(m, &arg_serialization, ret_fds, true);
+                                if (r < 0) {
+                                        *ret_error_message = "Failed to prepare for reexecution";
+                                        return r;
+                                }
+                        } else
+                                *ret_fds = NULL;
+
+                        log_notice("Switching root.");
+
+                        *ret_retval = EXIT_SUCCESS;
+
+                        /* Steal the switch root parameters */
+                        *ret_switch_root_dir = TAKE_PTR(m->switch_root);
+                        *ret_switch_root_init = TAKE_PTR(m->switch_root_init);
+
+                        return objective;
+
+                case MANAGER_SOFT_REBOOT:
+                        manager_send_reloading(m);
+                        manager_set_switching_root(m, true);
+
+                        r = prepare_reexecute(m, &arg_serialization, ret_fds, /* switching_root= */ true);
+                        if (r < 0) {
+                                *ret_error_message = "Failed to prepare for reexecution";
+                                return r;
+                        }
+
+                        log_notice("Soft-rebooting.");
+
+                        *ret_retval = EXIT_SUCCESS;
+                        *ret_switch_root_dir = TAKE_PTR(m->switch_root);
+                        *ret_switch_root_init = NULL;
+
+                        return objective;
+
+                case MANAGER_EXIT:
+                        /* Only a user manager can simply exit; for the system instance MANAGER_EXIT
+                         * falls through and is treated like a shutdown request. */
+                        if (MANAGER_IS_USER(m)) {
+                                log_debug("Exit.");
+
+                                *ret_retval = m->return_value;
+                                *ret_fds = NULL;
+                                *ret_switch_root_dir = *ret_switch_root_init = NULL;
+
+                                return objective;
+                        }
+
+                        _fallthrough_;
+                case MANAGER_REBOOT:
+                case MANAGER_POWEROFF:
+                case MANAGER_HALT:
+                case MANAGER_KEXEC: {
+                        log_notice("Shutting down.");
+
+                        *ret_retval = m->return_value;
+                        *ret_fds = NULL;
+                        *ret_switch_root_dir = *ret_switch_root_init = NULL;
+
+                        return objective;
+                }
+
+                default:
+                        assert_not_reached();
+                }
+        }
+}
+
+/* Logs a banner describing the mode we run in (system vs. user, virtualization, architecture,
+ * kernel version) and — for the system instance outside the initrd — determines whether this is
+ * the machine's first boot, returned in *ret_first_boot (always false for user instances and in
+ * the initrd). */
+static void log_execution_mode(bool *ret_first_boot) {
+        bool first_boot = false;
+        int r;
+
+        assert(ret_first_boot);
+
+        switch (arg_runtime_scope) {
+
+        case RUNTIME_SCOPE_SYSTEM: {
+                struct utsname uts;
+                int v;
+
+                log_info("systemd " GIT_VERSION " running in %ssystem mode (%s)",
+                         arg_action == ACTION_TEST ? "test " : "",
+                         systemd_features);
+
+                v = detect_virtualization();
+                if (v > 0)
+                        log_info("Detected virtualization %s.", virtualization_to_string(v));
+
+                v = detect_confidential_virtualization();
+                if (v > 0)
+                        log_info("Detected confidential virtualization %s.", confidential_virtualization_to_string(v));
+
+                log_info("Detected architecture %s.", architecture_to_string(uname_architecture()));
+
+                if (in_initrd())
+                        log_info("Running in initrd.");
+                else {
+                        _cleanup_free_ char *id_text = NULL;
+
+                        /* Let's check whether we are in first boot. First, check if an override was
+                         * specified on the kernel command line. If yes, we honour that. */
+
+                        r = proc_cmdline_get_bool("systemd.condition-first-boot", /* flags = */ 0, &first_boot);
+                        if (r < 0)
+                                log_debug_errno(r, "Failed to parse systemd.condition-first-boot= kernel command line argument, ignoring: %m");
+
+                        if (r > 0)
+                                log_full(first_boot ? LOG_INFO : LOG_DEBUG,
+                                         "Kernel command line argument says we are %s first boot.",
+                                         first_boot ? "in" : "not in");
+                        else {
+                                /* Second, perform autodetection. We use /etc/machine-id as flag file for
+                                 * this: If it is missing or contains the value "uninitialized", this is the
+                                 * first boot. In other cases, it is not. This allows container managers and
+                                 * installers to provision a couple of files in /etc but still permit the
+                                 * first-boot initialization to occur. If the container manager wants to
+                                 * provision the machine ID it should pass $container_uuid to PID 1. */
+
+                                r = read_one_line_file("/etc/machine-id", &id_text);
+                                if (r < 0 || streq(id_text, "uninitialized")) {
+                                        /* Missing file (-ENOENT) is the expected first-boot signal;
+                                         * other read errors are surprising, hence the warning. */
+                                        if (r < 0 && r != -ENOENT)
+                                                log_warning_errno(r, "Unexpected error while reading /etc/machine-id, assuming first boot: %m");
+
+                                        first_boot = true;
+                                        log_info("Detected first boot.");
+                                } else
+                                        log_debug("Detected initialized system, this is not the first boot.");
+                        }
+                }
+
+                assert_se(uname(&uts) >= 0);
+
+                if (strverscmp_improved(uts.release, KERNEL_BASELINE_VERSION) < 0)
+                        log_warning("Warning! Reported kernel version %s is older than systemd's required baseline kernel version %s. "
+                                    "Your mileage may vary.", uts.release, KERNEL_BASELINE_VERSION);
+                else
+                        log_debug("Kernel version %s, our baseline is %s", uts.release, KERNEL_BASELINE_VERSION);
+
+                break;
+        }
+
+        case RUNTIME_SCOPE_USER:
+                if (DEBUG_LOGGING) {
+                        _cleanup_free_ char *t = NULL;
+
+                        t = uid_to_name(getuid());
+                        log_debug("systemd " GIT_VERSION " running in %suser mode for user " UID_FMT "/%s. (%s)",
+                                  arg_action == ACTION_TEST ? " test" : "",
+                                  getuid(), strna(t), systemd_features);
+                }
+
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        *ret_first_boot = first_boot;
+}
+
+/* Sets up the runtime environment of the manager process itself: CPU/NUMA policy, cgroup
+ * hierarchy, machine ID, capabilities, rlimits etc. Only has an effect for ACTION_RUN. Returns 0
+ * on success (or when skipped); on fatal error returns a negative errno-style code with
+ * *ret_error_message set for the caller's freeze path. Non-fatal problems are logged and
+ * ignored. */
+static int initialize_runtime(
+                bool skip_setup,
+                bool first_boot,
+                struct rlimit *saved_rlimit_nofile,
+                struct rlimit *saved_rlimit_memlock,
+                const char **ret_error_message) {
+        int r;
+
+        assert(ret_error_message);
+
+        /* Sets up various runtime parameters. Many of these initializations are conditionalized:
+         *
+         * - Some only apply to --system instances
+         * - Some only apply to --user instances
+         * - Some only apply when we first start up, but not when we reexecute
+         */
+
+        if (arg_action != ACTION_RUN)
+                return 0;
+
+        update_cpu_affinity(skip_setup);
+        update_numa_policy(skip_setup);
+
+        switch (arg_runtime_scope) {
+
+        case RUNTIME_SCOPE_SYSTEM:
+                /* Make sure we leave a core dump without panicking the kernel. */
+                install_crash_handler();
+
+                if (!skip_setup) {
+                        r = mount_cgroup_controllers();
+                        if (r < 0) {
+                                *ret_error_message = "Failed to mount cgroup hierarchies";
+                                return r;
+                        }
+
+                        /* Pull credentials from various sources into a common credential directory (we do
+                         * this here, before setting up the machine ID, so that we can use credential info
+                         * for setting up the machine ID) */
+                        (void) import_credentials();
+
+                        (void) os_release_status();
+                        (void) hostname_setup(true);
+                        /* Force transient machine-id on first boot. */
+                        machine_id_setup(/* root= */ NULL, /* force_transient= */ first_boot, arg_machine_id, /* ret_machine_id */ NULL);
+                        (void) loopback_setup();
+                        bump_unix_max_dgram_qlen();
+                        bump_file_max_and_nr_open();
+                        test_usr();
+                        write_container_id();
+
+                        /* Copy os-release to the propagate directory, so that we update it for services running
+                         * under RootDirectory=/RootImage= when we do a soft reboot. */
+                        r = setup_os_release(RUNTIME_SCOPE_SYSTEM);
+                        if (r < 0)
+                                log_warning_errno(r, "Failed to copy os-release for propagation, ignoring: %m");
+                }
+
+                r = watchdog_set_device(arg_watchdog_device);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to set watchdog device to %s, ignoring: %m", arg_watchdog_device);
+
+                break;
+
+        case RUNTIME_SCOPE_USER: {
+                _cleanup_free_ char *p = NULL;
+
+                /* Create the runtime directory and place the inaccessible device nodes there, if we run in
+                 * user mode. In system mode mount_setup() already did that. */
+
+                r = xdg_user_runtime_dir(&p, "/systemd");
+                if (r < 0) {
+                        *ret_error_message = "$XDG_RUNTIME_DIR is not set";
+                        return log_struct_errno(LOG_EMERG, r,
+                                                LOG_MESSAGE("Failed to determine $XDG_RUNTIME_DIR path: %m"),
+                                                "MESSAGE_ID=" SD_MESSAGE_CORE_NO_XDGDIR_PATH_STR);
+                }
+
+                (void) mkdir_p_label(p, 0755);
+                (void) make_inaccessible_nodes(p, UID_INVALID, GID_INVALID);
+                r = setup_os_release(RUNTIME_SCOPE_USER);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to copy os-release for propagation, ignoring: %m");
+                break;
+        }
+
+        default:
+                assert_not_reached();
+        }
+
+        /* Apply the configured TimerSlackNSec=; inherited by all our children. */
+        if (arg_timer_slack_nsec != NSEC_INFINITY)
+                if (prctl(PR_SET_TIMERSLACK, arg_timer_slack_nsec) < 0)
+                        log_warning_errno(errno, "Failed to adjust timer slack, ignoring: %m");
+
+        if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM) {
+
+                /* Drop the capability bounding set, first for usermode helpers spawned by the
+                 * kernel, then for ourselves and everything we fork off. */
+                if (!cap_test_all(arg_capability_bounding_set)) {
+                        r = capability_bounding_set_drop_usermode(arg_capability_bounding_set);
+                        if (r < 0) {
+                                *ret_error_message = "Failed to drop capability bounding set of usermode helpers";
+                                return log_struct_errno(LOG_EMERG, r,
+                                                        LOG_MESSAGE("Failed to drop capability bounding set of usermode helpers: %m"),
+                                                        "MESSAGE_ID=" SD_MESSAGE_CORE_CAPABILITY_BOUNDING_USER_STR);
+                        }
+
+                        r = capability_bounding_set_drop(arg_capability_bounding_set, true);
+                        if (r < 0) {
+                                *ret_error_message = "Failed to drop capability bounding set";
+                                return log_struct_errno(LOG_EMERG, r,
+                                                        LOG_MESSAGE("Failed to drop capability bounding set: %m"),
+                                                        "MESSAGE_ID=" SD_MESSAGE_CORE_CAPABILITY_BOUNDING_STR);
+                        }
+                }
+
+                if (arg_no_new_privs) {
+                        if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
+                                *ret_error_message = "Failed to disable new privileges";
+                                return log_struct_errno(LOG_EMERG, errno,
+                                                        LOG_MESSAGE("Failed to disable new privileges: %m"),
+                                                        "MESSAGE_ID=" SD_MESSAGE_CORE_DISABLE_PRIVILEGES_STR);
+                        }
+                }
+        }
+
+        if (arg_syscall_archs) {
+                r = enforce_syscall_archs(arg_syscall_archs);
+                if (r < 0) {
+                        *ret_error_message = "Failed to set syscall architectures";
+                        return r;
+                }
+        }
+
+        r = make_reaper_process(true);
+        if (r < 0)
+                log_warning_errno(r, "Failed to make us a subreaper, ignoring: %m");
+
+        /* Bump up RLIMIT_NOFILE for systemd itself */
+        (void) bump_rlimit_nofile(saved_rlimit_nofile);
+        (void) bump_rlimit_memlock(saved_rlimit_memlock);
+
+        return 0;
+}
+
+/* Loads the boot target (DefaultUnit=, initrd.target or default.target) and queues a start job
+ * for it, falling back to default.target and finally rescue.target if loading fails, and from an
+ * isolate job to a plain replace job if isolation is refused. Stores the queued job's id in
+ * m->default_unit_job_id. Returns 0 on success, a negative errno-style error (with
+ * *ret_error_message set) on failure. */
+static int do_queue_default_job(
+                Manager *m,
+                const char **ret_error_message) {
+
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        const char *unit;
+        Job *job;
+        Unit *target;
+        int r;
+
+        if (arg_default_unit)
+                unit = arg_default_unit;
+        else if (in_initrd())
+                unit = SPECIAL_INITRD_TARGET;
+        else
+                unit = SPECIAL_DEFAULT_TARGET;
+
+        log_debug("Activating default unit: %s", unit);
+
+        r = manager_load_startable_unit_or_warn(m, unit, NULL, &target);
+        if (r < 0 && in_initrd() && !arg_default_unit) {
+                /* Fall back to default.target, which we used to always use by default. Only do this if no
+                 * explicit configuration was given. */
+
+                log_info("Falling back to " SPECIAL_DEFAULT_TARGET ".");
+
+                r = manager_load_startable_unit_or_warn(m, SPECIAL_DEFAULT_TARGET, NULL, &target);
+        }
+        if (r < 0) {
+                /* Last resort: the rescue target. */
+                log_info("Falling back to " SPECIAL_RESCUE_TARGET ".");
+
+                r = manager_load_startable_unit_or_warn(m, SPECIAL_RESCUE_TARGET, NULL, &target);
+                if (r < 0) {
+                        *ret_error_message = r == -ERFKILL ? SPECIAL_RESCUE_TARGET " masked"
+                                                           : "Failed to load " SPECIAL_RESCUE_TARGET;
+                        return r;
+                }
+        }
+
+        assert(target->load_state == UNIT_LOADED);
+
+        r = manager_add_job(m, JOB_START, target, JOB_ISOLATE, NULL, &error, &job);
+        if (r == -EPERM) {
+                /* Isolation refused (-EPERM): retry as an ordinary replace job. */
+                log_debug_errno(r, "Default target could not be isolated, starting instead: %s", bus_error_message(&error, r));
+
+                sd_bus_error_free(&error);
+
+                r = manager_add_job(m, JOB_START, target, JOB_REPLACE, NULL, &error, &job);
+                if (r < 0) {
+                        *ret_error_message = "Failed to start default target";
+                        return log_struct_errno(LOG_EMERG, r,
+                                                LOG_MESSAGE("Failed to start default target: %s", bus_error_message(&error, r)),
+                                                "MESSAGE_ID=" SD_MESSAGE_CORE_START_TARGET_FAILED_STR);
+                }
+
+        } else if (r < 0) {
+                *ret_error_message = "Failed to isolate default target";
+                return log_struct_errno(LOG_EMERG, r,
+                                        LOG_MESSAGE("Failed to isolate default target: %s", bus_error_message(&error, r)),
+                                        "MESSAGE_ID=" SD_MESSAGE_CORE_ISOLATE_TARGET_FAILED_STR);
+        } else
+                log_info("Queued %s job for default target %s.",
+                         job_type_to_string(job->type),
+                         unit_status_string(job->unit, NULL));
+
+        m->default_unit_job_id = job->id;
+
+        return 0;
+}
+
+/* Records the RLIMIT_NOFILE and RLIMIT_MEMLOCK values we inherited, so that they can later be
+ * restored on reexecution and used as the basis for service defaults. Read failures are logged
+ * and ignored, leaving the caller-provided initial values in place. */
+static void save_rlimits(struct rlimit *saved_rlimit_nofile,
+                         struct rlimit *saved_rlimit_memlock) {
+
+        assert(saved_rlimit_nofile);
+        assert(saved_rlimit_memlock);
+
+        if (getrlimit(RLIMIT_NOFILE, saved_rlimit_nofile) < 0)
+                log_warning_errno(errno, "Reading RLIMIT_NOFILE failed, ignoring: %m");
+
+        if (getrlimit(RLIMIT_MEMLOCK, saved_rlimit_memlock) < 0)
+                log_warning_errno(errno, "Reading RLIMIT_MEMLOCK failed, ignoring: %m");
+}
+
+/* Derives the DefaultLimitNOFILE= fallback for services from the rlimit we inherited, unless an
+ * explicit default was configured. Allocation failure is logged and ignored (the field simply
+ * stays unset). */
+static void fallback_rlimit_nofile(const struct rlimit *saved_rlimit_nofile) {
+        struct rlimit *rl;
+
+        if (arg_defaults.rlimit[RLIMIT_NOFILE])
+                return;
+
+        /* Make sure forked processes get limits based on the original kernel setting */
+
+        rl = newdup(struct rlimit, saved_rlimit_nofile, 1);
+        if (!rl) {
+                log_oom();
+                return;
+        }
+
+        /* Bump the hard limit for system services to a substantially higher value. The default
+         * hard limit current kernels set is pretty low (4K), mostly for historical
+         * reasons. According to kernel developers, the fd handling in recent kernels has been
+         * optimized substantially enough, so that we can bump the limit now, without paying too
+         * high a price in memory or performance. Note however that we only bump the hard limit,
+         * not the soft limit. That's because select() works the way it works, and chokes on fds
+         * >= 1024. If we'd bump the soft limit globally, it might accidentally happen to
+         * unexpecting programs that they get fds higher than what they can process using
+         * select(). By only bumping the hard limit but leaving the low limit as it is we avoid
+         * this pitfall: programs that are written by folks aware of the select() problem in mind
+         * (and thus use poll()/epoll instead of select(), the way everybody should) can
+         * explicitly opt into high fds by bumping their soft limit beyond 1024, to the hard limit
+         * we pass. */
+        if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM) {
+                int nr;
+
+                /* Get the underlying absolute limit the kernel enforces */
+                nr = read_nr_open();
+
+                /* Raise the hard limit to HIGH_RLIMIT_NOFILE (or keep it if already higher), but
+                 * never beyond the kernel's fs.nr_open ceiling. */
+                rl->rlim_max = MIN((rlim_t) nr, MAX(rl->rlim_max, (rlim_t) HIGH_RLIMIT_NOFILE));
+        }
+
+        /* If for some reason we were invoked with a soft limit above 1024 (which should never
+         * happen!, but who knows what we get passed in from pam_limit when invoked as --user
+         * instance), then lower what we pass on to not confuse our children */
+        rl->rlim_cur = MIN(rl->rlim_cur, (rlim_t) FD_SETSIZE);
+
+        arg_defaults.rlimit[RLIMIT_NOFILE] = rl;
+}
+
+static void fallback_rlimit_memlock(const struct rlimit *saved_rlimit_memlock) {
+        struct rlimit *rl;
+
+        /* Derive the DefaultLimitMEMLOCK= fallback for services from the rlimit we inherited,
+         * unless an explicit default was configured. Allocation failure is logged and ignored. */
+
+        if (arg_defaults.rlimit[RLIMIT_MEMLOCK])
+                return;
+
+        rl = newdup(struct rlimit, saved_rlimit_memlock, 1);
+        if (!rl)
+                return (void) log_oom();
+
+        if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM) {
+                /* Raise the default limit to 8M also on old kernels and in containers (8M is the kernel
+                 * default for this since kernel 5.16) */
+                rl->rlim_cur = MAX(rl->rlim_cur, (rlim_t) DEFAULT_RLIMIT_MEMLOCK);
+                rl->rlim_max = MAX(rl->rlim_max, (rlim_t) DEFAULT_RLIMIT_MEMLOCK);
+        }
+
+        arg_defaults.rlimit[RLIMIT_MEMLOCK] = rl;
+}
+
+/* Imports every ManagerEnvironment= assignment into our own environment block, so that it takes
+ * effect for the manager process itself (and anything it forks). Failures are logged and
+ * ignored. */
+static void setenv_manager_environment(void) {
+        int r;
+
+        STRV_FOREACH(p, arg_manager_environment) {
+                log_debug("Setting '%s' in our own environment.", *p);
+
+                r = putenv_dup(*p, true);
+                if (r < 0)
+                        /* putenv_dup() returns a negative errno-style code; log that rather than
+                         * the global errno, which may be stale or clobbered at this point. */
+                        log_warning_errno(r, "Failed to setenv \"%s\", ignoring: %m", *p);
+        }
+}
+
+/* Frees/resets arg_* variables, with a few exceptions commented below. Called before every
+ * configuration (re)parse so that removed configuration entries fall back to their built-in
+ * defaults; any new arg_* variable must be reset here, too. */
+static void reset_arguments(void) {
+
+        arg_default_unit = mfree(arg_default_unit);
+
+        /* arg_runtime_scope — ignore */
+
+        arg_dump_core = true;
+        arg_crash_chvt = -1;
+        arg_crash_shell = false;
+        arg_crash_reboot = false;
+        arg_confirm_spawn = mfree(arg_confirm_spawn);
+        arg_show_status = _SHOW_STATUS_INVALID;
+        arg_status_unit_format = STATUS_UNIT_FORMAT_DEFAULT;
+        arg_switched_root = false;
+        arg_pager_flags = 0;
+        arg_service_watchdogs = true;
+
+        /* Reinitialize the per-unit default settings for the current scope. */
+        unit_defaults_done(&arg_defaults);
+        unit_defaults_init(&arg_defaults, arg_runtime_scope);
+
+        arg_runtime_watchdog = 0;
+        arg_reboot_watchdog = 10 * USEC_PER_MINUTE;
+        arg_kexec_watchdog = 0;
+        arg_pretimeout_watchdog = 0;
+        arg_early_core_pattern = mfree(arg_early_core_pattern);
+        arg_watchdog_device = mfree(arg_watchdog_device);
+        arg_watchdog_pretimeout_governor = mfree(arg_watchdog_pretimeout_governor);
+
+        arg_default_environment = strv_free(arg_default_environment);
+        arg_manager_environment = strv_free(arg_manager_environment);
+
+        arg_capability_bounding_set = CAP_MASK_UNSET;
+        arg_no_new_privs = false;
+        arg_timer_slack_nsec = NSEC_INFINITY;
+
+        arg_syscall_archs = set_free(arg_syscall_archs);
+
+        /* arg_serialization — ignore */
+
+        arg_machine_id = (sd_id128_t) {};
+        arg_cad_burst_action = EMERGENCY_ACTION_REBOOT_FORCE;
+
+        cpu_set_reset(&arg_cpu_affinity);
+        numa_policy_reset(&arg_numa_policy);
+
+        arg_random_seed = mfree(arg_random_seed);
+        arg_random_seed_size = 0;
+        arg_clock_usec = 0;
+
+        arg_reload_limit_interval_sec = 0;
+        arg_reload_limit_burst = 0;
+}
+
+static void determine_default_oom_score_adjust(void) {
+        int r, current, bumped;
+
+        /* Run our services at slightly higher OOM score than ourselves. But let's be conservative here, and
+         * do this only if we don't run as root (i.e. only if we are run in user mode, for an unprivileged
+         * user). */
+
+        if (arg_defaults.oom_score_adjust_set)
+                return;
+        if (getuid() == 0)
+                return;
+
+        r = get_oom_score_adjust(&current);
+        if (r < 0)
+                return (void) log_warning_errno(r, "Failed to determine current OOM score adjustment value, ignoring: %m");
+
+        /* Bump by 100, saturating at the kernel's maximum. */
+        assert_cc(100 <= OOM_SCORE_ADJ_MAX);
+        bumped = current >= OOM_SCORE_ADJ_MAX - 100 ? OOM_SCORE_ADJ_MAX : current + 100;
+
+        if (bumped == current)
+                return;
+
+        arg_defaults.oom_score_adjust = bumped;
+        arg_defaults.oom_score_adjust_set = true;
+}
+
+/* (Re)parses the manager's configuration: resets all arg_* variables to built-in defaults, then
+ * applies the config file and (for the system instance) the kernel command line, and derives the
+ * remaining defaults. Parse errors are logged and ignored; always returns 0. */
+static int parse_configuration(const struct rlimit *saved_rlimit_nofile,
+                               const struct rlimit *saved_rlimit_memlock) {
+        int r;
+
+        assert(saved_rlimit_nofile);
+        assert(saved_rlimit_memlock);
+
+        /* Assign configuration defaults */
+        reset_arguments();
+
+        r = parse_config_file();
+        if (r < 0)
+                log_warning_errno(r, "Failed to parse config file, ignoring: %m");
+
+        /* The kernel command line overrides the config file, hence parsed second. */
+        if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM) {
+                r = proc_cmdline_parse(parse_proc_cmdline_item, NULL, 0);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse kernel command line, ignoring: %m");
+        }
+
+        /* Initialize some default rlimits for services if they haven't been configured */
+        fallback_rlimit_nofile(saved_rlimit_nofile);
+        fallback_rlimit_memlock(saved_rlimit_memlock);
+
+        /* Note that this also parses bits from the kernel command line, including "debug". */
+        log_parse_environment();
+
+        /* Initialize the show status setting if it hasn't been set explicitly yet */
+        if (arg_show_status == _SHOW_STATUS_INVALID)
+                arg_show_status = SHOW_STATUS_YES;
+
+        /* Slightly raise the OOM score for our services if we are running for unprivileged users. */
+        determine_default_oom_score_adjust();
+
+        /* Push variables into the manager environment block */
+        setenv_manager_environment();
+
+        /* Parse log environment variables again to take into account any new environment variables. */
+        log_parse_environment();
+
+        return 0;
+}
+
+static int safety_checks(void) {
+
+        /* Refuse combinations of action, scope and privilege that cannot possibly work. Checks are
+         * ordered so that the most fundamental mismatch is the one reported. Returns 0 if the
+         * invocation is sane, a negative errno-style error otherwise. */
+
+        if (getpid_cached() == 1 && arg_action != ACTION_RUN)
+                return log_error_errno(SYNTHETIC_ERRNO(EPERM),
+                                       "Unsupported execution mode while PID 1.");
+
+        if (getpid_cached() == 1 && arg_runtime_scope == RUNTIME_SCOPE_USER)
+                return log_error_errno(SYNTHETIC_ERRNO(EPERM),
+                                       "Can't run --user mode as PID 1.");
+
+        if (arg_action == ACTION_RUN && arg_runtime_scope == RUNTIME_SCOPE_SYSTEM && getpid_cached() != 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EPERM),
+                                       "Can't run system mode unless PID 1.");
+
+        if (arg_action == ACTION_TEST && geteuid() == 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EPERM),
+                                       "Don't run test mode as root.");
+
+        switch (arg_runtime_scope) {
+
+        case RUNTIME_SCOPE_USER:
+                if (arg_action == ACTION_RUN && sd_booted() <= 0)
+                        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                               "Trying to run as user instance, but the system has not been booted with systemd.");
+
+                if (arg_action == ACTION_RUN && !getenv("XDG_RUNTIME_DIR"))
+                        return log_error_errno(SYNTHETIC_ERRNO(EUNATCH),
+                                               "Trying to run as user instance, but $XDG_RUNTIME_DIR is not set.");
+
+                break;
+
+        case RUNTIME_SCOPE_SYSTEM:
+                if (arg_action == ACTION_RUN && running_in_chroot() > 0)
+                        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                               "Cannot be run in a chroot() environment.");
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        return 0;
+}
+
+/* Loads the MAC and integrity policies (SELinux, SMACK, AppArmor, IMA) as early as possible,
+ * recording timestamps around the whole procedure for boot-time accounting. *loaded_policy is set
+ * by the individual setup helpers. Returns 0 on success; on failure returns the helper's negative
+ * error with *ret_error_message set. */
+static int initialize_security(
+                bool *loaded_policy,
+                dual_timestamp *security_start_timestamp,
+                dual_timestamp *security_finish_timestamp,
+                const char **ret_error_message) {
+
+        int r;
+
+        assert(loaded_policy);
+        assert(security_start_timestamp);
+        assert(security_finish_timestamp);
+        assert(ret_error_message);
+
+        dual_timestamp_now(security_start_timestamp);
+
+        r = mac_selinux_setup(loaded_policy);
+        if (r < 0) {
+                *ret_error_message = "Failed to load SELinux policy";
+                return r;
+        }
+
+        r = mac_smack_setup(loaded_policy);
+        if (r < 0) {
+                *ret_error_message = "Failed to load SMACK policy";
+                return r;
+        }
+
+        r = mac_apparmor_setup();
+        if (r < 0) {
+                *ret_error_message = "Failed to load AppArmor policy";
+                return r;
+        }
+
+        r = ima_setup();
+        if (r < 0) {
+                *ret_error_message = "Failed to load IMA policy";
+                return r;
+        }
+
+        dual_timestamp_now(security_finish_timestamp);
+        return 0;
+}
+
+/* Collects all file descriptors passed to us into *ret_fds, taking only those with O_CLOEXEC
+ * unset (fdset_new_fill() with filter_cloexec=0). Returns 0 on success; on failure returns a
+ * negative errno-style error with *ret_error_message set. */
+static int collect_fds(FDSet **ret_fds, const char **ret_error_message) {
+        int r;
+
+        assert(ret_fds);
+        assert(ret_error_message);
+
+        /* Pick up all fds passed to us. We apply a filter here: we only take the fds that have O_CLOEXEC
+         * off. All fds passed via execve() to us must have O_CLOEXEC off, and our own code and dependencies
+         * should be clean enough to set O_CLOEXEC universally. Thus checking the bit should be a safe
+         * mechanism to distinguish passed in fds from our own.
+         *
+         * Why bother? Some subsystems we initialize early, specifically selinux might keep fds open in our
+         * process behind our back. We should not take possession of that (and then accidentally close
+         * it). SELinux thankfully sets O_CLOEXEC on its fds, so this test should work. */
+        r = fdset_new_fill(/* filter_cloexec= */ 0, ret_fds);
+        if (r < 0) {
+                *ret_error_message = "Failed to allocate fd set";
+                return log_struct_errno(LOG_EMERG, r,
+                                        LOG_MESSAGE("Failed to allocate fd set: %m"),
+                                        "MESSAGE_ID=" SD_MESSAGE_CORE_FD_SET_FAILED_STR);
+        }
+
+        /* The serialization fd should have O_CLOEXEC turned on already, let's verify that we didn't pick it up here */
+        assert_se(!arg_serialization || !fdset_contains(*ret_fds, fileno(arg_serialization)));
+
+        return 0;
+}
+
+static void setup_console_terminal(bool skip_setup) {
+
+ if (arg_runtime_scope != RUNTIME_SCOPE_SYSTEM)
+ return;
+
+ /* Become a session leader if we aren't one yet. */
+ (void) setsid();
+
+ /* If we are init, we connect stdin/stdout/stderr to /dev/null and make sure we don't have a
+ * controlling tty. */
+ (void) release_terminal();
+
+ /* Reset the console, but only if this is really init and we are freshly booted */
+ if (getpid_cached() == 1 && !skip_setup)
+ (void) console_setup();
+}
+
+static bool early_skip_setup_check(int argc, char *argv[]) {
+ bool found_deserialize = false;
+
+ /* Determine if this is a reexecution or normal bootup. We do the full command line parsing much
+ * later, so let's just have a quick peek here. Note that if we have switched root, do all the
+ * special setup things anyway, even if in that case we also do deserialization. */
+
+ for (int i = 1; i < argc; i++)
+ if (streq(argv[i], "--switched-root"))
+ return false; /* If we switched root, don't skip the setup. */
+ else if (startswith(argv[i], "--deserialize=") || streq(argv[i], "--deserialize"))
+ found_deserialize = true;
+
+ return found_deserialize; /* When we are deserializing, then we are reexecuting, hence avoid the extensive setup */
+}
+
+static int save_env(void) {
+ char **l;
+
+ l = strv_copy(environ);
+ if (!l)
+ return -ENOMEM;
+
+ strv_free_and_replace(saved_env, l);
+ return 0;
+}
+
+int main(int argc, char *argv[]) {
+ dual_timestamp
+ initrd_timestamp = DUAL_TIMESTAMP_NULL,
+ userspace_timestamp = DUAL_TIMESTAMP_NULL,
+ kernel_timestamp = DUAL_TIMESTAMP_NULL,
+ security_start_timestamp = DUAL_TIMESTAMP_NULL,
+ security_finish_timestamp = DUAL_TIMESTAMP_NULL;
+ struct rlimit saved_rlimit_nofile = RLIMIT_MAKE_CONST(0),
+ saved_rlimit_memlock = RLIMIT_MAKE_CONST(RLIM_INFINITY); /* The original rlimits we passed
+ * in. Note we use different values
+ * for the two that indicate whether
+ * these fields are initialized! */
+ bool skip_setup, loaded_policy = false, queue_default_job = false, first_boot = false;
+ char *switch_root_dir = NULL, *switch_root_init = NULL;
+ usec_t before_startup, after_startup;
+ static char systemd[] = "systemd";
+ const char *error_message = NULL;
+ int r, retval = EXIT_FAILURE;
+ Manager *m = NULL;
+ FDSet *fds = NULL;
+
+ assert_se(argc > 0 && !isempty(argv[0]));
+
+ /* SysV compatibility: redirect init → telinit */
+ redirect_telinit(argc, argv);
+
+ /* Take timestamps early on */
+ dual_timestamp_from_monotonic(&kernel_timestamp, 0);
+ dual_timestamp_now(&userspace_timestamp);
+
+ /* Figure out whether we need to initialize the system, or if we already did that because we are
+ * reexecuting. */
+ skip_setup = early_skip_setup_check(argc, argv);
+
+ /* If we get started via the /sbin/init symlink then we are called 'init'. After a subsequent
+ * reexecution we are then called 'systemd'. That is confusing, hence let's call us systemd
+ * right-away. */
+ program_invocation_short_name = systemd;
+ (void) prctl(PR_SET_NAME, systemd);
+
+ /* Save the original command line */
+ save_argc_argv(argc, argv);
+
+ /* Save the original environment as we might need to restore it if we're requested to execute another
+ * system manager later. */
+ r = save_env();
+ if (r < 0) {
+ error_message = "Failed to copy environment block";
+ goto finish;
+ }
+
+ /* Make sure that if the user says "syslog" we actually log to the journal. */
+ log_set_upgrade_syslog_to_journal(true);
+
+ if (getpid_cached() == 1) {
+ /* When we run as PID 1 force system mode */
+ arg_runtime_scope = RUNTIME_SCOPE_SYSTEM;
+
+ /* Disable the umask logic */
+ umask(0);
+
+ /* Make sure that at least initially we do not ever log to journald/syslogd, because it might
+ * not be activated yet (even though the log socket for it exists). */
+ log_set_prohibit_ipc(true);
+
+ /* Always reopen /dev/console when running as PID 1 or one of its pre-execve() children. This
+ * is important so that we never end up logging to any foreign stderr, for example if we have
+ * to log in a child process right before execve()'ing the actual binary, at a point in time
+ * where socket activation stderr/stdout are already set up. */
+ log_set_always_reopen_console(true);
+
+ if (detect_container() <= 0) {
+
+ /* Running outside of a container as PID 1 */
+ log_set_target_and_open(LOG_TARGET_KMSG);
+
+ if (in_initrd())
+ initrd_timestamp = userspace_timestamp;
+
+ if (!skip_setup) {
+ r = mount_setup_early();
+ if (r < 0) {
+ error_message = "Failed to mount early API filesystems";
+ goto finish;
+ }
+ }
+
+ /* We might have just mounted /proc, so let's try to parse the kernel
+ * command line log arguments immediately. */
+ log_parse_environment();
+
+ /* Let's open the log backend a second time, in case the first time didn't
+ * work. Quite possibly we have mounted /dev just now, so /dev/kmsg became
+ * available, and it previously wasn't. */
+ log_open();
+
+ if (!skip_setup) {
+ disable_printk_ratelimit();
+
+ r = initialize_security(
+ &loaded_policy,
+ &security_start_timestamp,
+ &security_finish_timestamp,
+ &error_message);
+ if (r < 0)
+ goto finish;
+ }
+
+ if (mac_init() < 0) {
+ error_message = "Failed to initialize MAC support";
+ goto finish;
+ }
+
+ if (!skip_setup)
+ initialize_clock();
+
+ /* Set the default for later on, but don't actually open the logs like this for
+ * now. Note that if we are transitioning from the initrd there might still be
+ * journal fd open, and we shouldn't attempt opening that before we parsed
+ * /proc/cmdline which might redirect output elsewhere. */
+ log_set_target(LOG_TARGET_JOURNAL_OR_KMSG);
+
+ } else {
+ /* Running inside a container, as PID 1 */
+ log_set_target_and_open(LOG_TARGET_CONSOLE);
+
+ /* For later on, see above... */
+ log_set_target(LOG_TARGET_JOURNAL);
+
+ /* clear the kernel timestamp, because we are in a container */
+ kernel_timestamp = DUAL_TIMESTAMP_NULL;
+ }
+
+ initialize_coredump(skip_setup);
+
+ r = fixup_environment();
+ if (r < 0) {
+ log_struct_errno(LOG_EMERG, r,
+ LOG_MESSAGE("Failed to fix up PID 1 environment: %m"),
+ "MESSAGE_ID=" SD_MESSAGE_CORE_PID1_ENVIRONMENT_STR);
+ error_message = "Failed to fix up PID1 environment";
+ goto finish;
+ }
+
+ /* Try to figure out if we can use colors with the console. No need to do that for user
+ * instances since they never log into the console. */
+ log_show_color(colors_enabled());
+
+ r = make_null_stdio();
+ if (r < 0)
+ log_warning_errno(r, "Failed to redirect standard streams to /dev/null, ignoring: %m");
+
+ /* Load the kernel modules early. */
+ if (!skip_setup)
+ (void) kmod_setup();
+
+ /* Mount /proc, /sys and friends, so that /proc/cmdline and /proc/$PID/fd is available. */
+ r = mount_setup(loaded_policy, skip_setup);
+ if (r < 0) {
+ error_message = "Failed to mount API filesystems";
+ goto finish;
+ }
+
+ /* The efivarfs is now mounted, let's lock down the system token. */
+ lock_down_efi_variables();
+
+ /* Cache command-line options passed from EFI variables */
+ if (!skip_setup)
+ (void) cache_efi_options_variable();
+ } else {
+ /* Running as user instance */
+ arg_runtime_scope = RUNTIME_SCOPE_USER;
+ log_set_always_reopen_console(true);
+ log_set_target_and_open(LOG_TARGET_AUTO);
+
+ /* clear the kernel timestamp, because we are not PID 1 */
+ kernel_timestamp = DUAL_TIMESTAMP_NULL;
+
+ /* Clear ambient capabilities, so services do not inherit them implicitly. Dropping them does
+ * not affect the permitted and effective sets which are important for the manager itself to
+ * operate. */
+ capability_ambient_set_apply(0, /* also_inherit= */ false);
+
+ if (mac_init() < 0) {
+ error_message = "Failed to initialize MAC support";
+ goto finish;
+ }
+ }
+
+ /* Save the original RLIMIT_NOFILE/RLIMIT_MEMLOCK so that we can reset it later when
+ * transitioning from the initrd to the main systemd or suchlike. */
+ save_rlimits(&saved_rlimit_nofile, &saved_rlimit_memlock);
+
+ /* Reset all signal handlers. */
+ (void) reset_all_signal_handlers();
+ (void) ignore_signals(SIGNALS_IGNORE);
+
+ (void) parse_configuration(&saved_rlimit_nofile, &saved_rlimit_memlock);
+
+ r = parse_argv(argc, argv);
+ if (r < 0) {
+ error_message = "Failed to parse command line arguments";
+ goto finish;
+ }
+
+ r = safety_checks();
+ if (r < 0)
+ goto finish;
+
+ if (IN_SET(arg_action, ACTION_TEST, ACTION_HELP, ACTION_DUMP_CONFIGURATION_ITEMS, ACTION_DUMP_BUS_PROPERTIES, ACTION_BUS_INTROSPECT))
+ pager_open(arg_pager_flags);
+
+ if (arg_action != ACTION_RUN)
+ skip_setup = true;
+
+ if (arg_action == ACTION_HELP) {
+ retval = help() < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
+ goto finish;
+ } else if (arg_action == ACTION_VERSION) {
+ retval = version();
+ goto finish;
+ } else if (arg_action == ACTION_DUMP_CONFIGURATION_ITEMS) {
+ unit_dump_config_items(stdout);
+ retval = EXIT_SUCCESS;
+ goto finish;
+ } else if (arg_action == ACTION_DUMP_BUS_PROPERTIES) {
+ dump_bus_properties(stdout);
+ retval = EXIT_SUCCESS;
+ goto finish;
+ } else if (arg_action == ACTION_BUS_INTROSPECT) {
+ r = bus_manager_introspect_implementations(stdout, arg_bus_introspect);
+ retval = r >= 0 ? EXIT_SUCCESS : EXIT_FAILURE;
+ goto finish;
+ }
+
+ assert_se(IN_SET(arg_action, ACTION_RUN, ACTION_TEST));
+
+ /* Move out of the way, so that we won't block unmounts */
+ assert_se(chdir("/") == 0);
+
+ if (arg_action == ACTION_RUN) {
+ if (!skip_setup) {
+ /* Apply the systemd.clock_usec= kernel command line switch */
+ apply_clock_update();
+
+ /* Apply random seed from kernel command line */
+ cmdline_take_random_seed();
+ }
+
+ /* A core pattern might have been specified via the cmdline. */
+ initialize_core_pattern(skip_setup);
+
+ /* Close logging fds, in order not to confuse collecting passed fds and terminal logic below */
+ log_close();
+
+ /* Remember open file descriptors for later deserialization */
+ r = collect_fds(&fds, &error_message);
+ if (r < 0)
+ goto finish;
+
+ /* Give up any control of the console, but make sure it's initialized. */
+ setup_console_terminal(skip_setup);
+
+ /* Open the logging devices, if possible and necessary */
+ log_open();
+ }
+
+ log_execution_mode(&first_boot);
+
+ r = initialize_runtime(skip_setup,
+ first_boot,
+ &saved_rlimit_nofile,
+ &saved_rlimit_memlock,
+ &error_message);
+ if (r < 0)
+ goto finish;
+
+ r = manager_new(arg_runtime_scope,
+ arg_action == ACTION_TEST ? MANAGER_TEST_FULL : 0,
+ &m);
+ if (r < 0) {
+ log_struct_errno(LOG_EMERG, r,
+ LOG_MESSAGE("Failed to allocate manager object: %m"),
+ "MESSAGE_ID=" SD_MESSAGE_CORE_MANAGER_ALLOCATE_STR);
+ error_message = "Failed to allocate manager object";
+ goto finish;
+ }
+
+ m->timestamps[MANAGER_TIMESTAMP_KERNEL] = kernel_timestamp;
+ m->timestamps[MANAGER_TIMESTAMP_INITRD] = initrd_timestamp;
+ m->timestamps[MANAGER_TIMESTAMP_USERSPACE] = userspace_timestamp;
+ m->timestamps[manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_SECURITY_START)] = security_start_timestamp;
+ m->timestamps[manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_SECURITY_FINISH)] = security_finish_timestamp;
+
+ set_manager_defaults(m);
+ set_manager_settings(m);
+ manager_set_first_boot(m, first_boot);
+ manager_set_switching_root(m, arg_switched_root);
+
+ /* Remember whether we should queue the default job */
+ queue_default_job = !arg_serialization || arg_switched_root;
+
+ before_startup = now(CLOCK_MONOTONIC);
+
+ r = manager_startup(m, arg_serialization, fds, /* root= */ NULL);
+ if (r < 0) {
+ error_message = "Failed to start up manager";
+ goto finish;
+ }
+
+ /* This will close all file descriptors that were opened, but not claimed by any unit. */
+ fds = fdset_free(fds);
+ arg_serialization = safe_fclose(arg_serialization);
+
+ if (queue_default_job) {
+ r = do_queue_default_job(m, &error_message);
+ if (r < 0)
+ goto finish;
+ }
+
+ after_startup = now(CLOCK_MONOTONIC);
+
+ log_full(arg_action == ACTION_TEST ? LOG_INFO : LOG_DEBUG,
+ "Loaded units and determined initial transaction in %s.",
+ FORMAT_TIMESPAN(after_startup - before_startup, 100 * USEC_PER_MSEC));
+
+ if (arg_action == ACTION_TEST) {
+ manager_test_summary(m);
+ retval = EXIT_SUCCESS;
+ goto finish;
+ }
+
+ r = invoke_main_loop(m,
+ &saved_rlimit_nofile,
+ &saved_rlimit_memlock,
+ &retval,
+ &fds,
+ &switch_root_dir,
+ &switch_root_init,
+ &error_message);
+ assert(r < 0 || IN_SET(r, MANAGER_EXIT, /* MANAGER_OK is not expected here. */
+ MANAGER_RELOAD,
+ MANAGER_REEXECUTE,
+ MANAGER_REBOOT,
+ MANAGER_SOFT_REBOOT,
+ MANAGER_POWEROFF,
+ MANAGER_HALT,
+ MANAGER_KEXEC,
+ MANAGER_SWITCH_ROOT));
+
+finish:
+ pager_close();
+
+ if (m) {
+ arg_reboot_watchdog = manager_get_watchdog(m, WATCHDOG_REBOOT);
+ arg_kexec_watchdog = manager_get_watchdog(m, WATCHDOG_KEXEC);
+ m = manager_free(m);
+ }
+
+ mac_selinux_finish();
+
+ if (IN_SET(r, MANAGER_REEXECUTE, MANAGER_SWITCH_ROOT, MANAGER_SOFT_REBOOT))
+ r = do_reexecute(r,
+ argc, argv,
+ &saved_rlimit_nofile,
+ &saved_rlimit_memlock,
+ fds,
+ switch_root_dir,
+ switch_root_init,
+ &error_message); /* This only returns if reexecution failed */
+
+ arg_serialization = safe_fclose(arg_serialization);
+ fds = fdset_free(fds);
+
+ saved_env = strv_free(saved_env);
+
+#if HAVE_VALGRIND_VALGRIND_H
+ /* If we are PID 1 and running under valgrind, then let's exit
+ * here explicitly. valgrind will only generate nice output on
+ * exit(), not on exec(), hence let's do the former not the
+ * latter here. */
+ if (getpid_cached() == 1 && RUNNING_ON_VALGRIND) {
+ /* Cleanup watchdog_device strings for valgrind. We need them
+ * in become_shutdown() so normally we cannot free them yet. */
+ watchdog_free_device();
+ reset_arguments();
+ return retval;
+ }
+#endif
+
+#if HAS_FEATURE_ADDRESS_SANITIZER
+ /* At this stage we most likely don't have stdio/stderr open, so the following
+ * LSan check would not print any actionable information and would just crash
+ * PID 1. To make this a bit more helpful, let's try to open /dev/console,
+ * and if we succeed redirect LSan's report there. */
+ if (getpid_cached() == 1) {
+ _cleanup_close_ int tty_fd = -EBADF;
+
+ tty_fd = open_terminal("/dev/console", O_WRONLY|O_NOCTTY|O_CLOEXEC);
+ if (tty_fd >= 0)
+ __sanitizer_set_report_fd((void*) (intptr_t) tty_fd);
+
+ __lsan_do_leak_check();
+ }
+#endif
+
+ if (r < 0)
+ (void) sd_notifyf(0, "ERRNO=%i", -r);
+
+ /* Try to invoke the shutdown binary unless we already failed.
+ * If we failed above, we want to freeze after finishing cleanup. */
+ if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM &&
+ IN_SET(r, MANAGER_EXIT, MANAGER_REBOOT, MANAGER_POWEROFF, MANAGER_HALT, MANAGER_KEXEC)) {
+ r = become_shutdown(r, retval);
+ log_error_errno(r, "Failed to execute shutdown binary, %s: %m", getpid_cached() == 1 ? "freezing" : "quitting");
+ error_message = "Failed to execute shutdown binary";
+ }
+
+ /* This is primarily useful when running systemd in a VM, as it provides the user running the VM with
+ * a mechanism to pick up systemd's exit status in the VM. */
+ (void) sd_notifyf(0, "EXIT_STATUS=%i", retval);
+
+ watchdog_free_device();
+ arg_watchdog_device = mfree(arg_watchdog_device);
+
+ if (getpid_cached() == 1) {
+ if (error_message)
+ manager_status_printf(NULL, STATUS_TYPE_EMERGENCY,
+ ANSI_HIGHLIGHT_RED "!!!!!!" ANSI_NORMAL,
+ "%s.", error_message);
+ freeze_or_exit_or_reboot();
+ }
+
+ reset_arguments();
+ return retval;
+}
diff --git a/src/core/main.h b/src/core/main.h
new file mode 100644
index 0000000..b12a1cc
--- /dev/null
+++ b/src/core/main.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+extern bool arg_dump_core;
+extern int arg_crash_chvt;
+extern bool arg_crash_shell;
+extern bool arg_crash_reboot;
diff --git a/src/core/manager-dump.c b/src/core/manager-dump.c
new file mode 100644
index 0000000..6c32d78
--- /dev/null
+++ b/src/core/manager-dump.c
@@ -0,0 +1,119 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "build.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "hashmap.h"
+#include "manager-dump.h"
+#include "memstream-util.h"
+#include "unit-serialize.h"
+#include "version.h"
+
+void manager_dump_jobs(Manager *s, FILE *f, char **patterns, const char *prefix) {
+ Job *j;
+
+ assert(s);
+ assert(f);
+
+ HASHMAP_FOREACH(j, s->jobs) {
+
+ if (!strv_fnmatch_or_empty(patterns, j->unit->id, FNM_NOESCAPE))
+ continue;
+
+ job_dump(j, f, prefix);
+ }
+}
+
+int manager_get_dump_jobs_string(Manager *m, char **patterns, const char *prefix, char **ret) {
+ _cleanup_(memstream_done) MemStream ms = {};
+ FILE *f;
+
+ assert(m);
+ assert(ret);
+
+ f = memstream_init(&ms);
+ if (!f)
+ return -errno;
+
+ manager_dump_jobs(m, f, patterns, prefix);
+
+ return memstream_finalize(&ms, ret, NULL);
+}
+
+void manager_dump_units(Manager *s, FILE *f, char **patterns, const char *prefix) {
+ Unit *u;
+ const char *t;
+
+ assert(s);
+ assert(f);
+
+ HASHMAP_FOREACH_KEY(u, t, s->units) {
+ if (u->id != t)
+ continue;
+
+ if (!strv_fnmatch_or_empty(patterns, u->id, FNM_NOESCAPE))
+ continue;
+
+ unit_dump(u, f, prefix);
+ }
+}
+
+static void manager_dump_header(Manager *m, FILE *f, const char *prefix) {
+
+ /* NB: this is a debug interface for developers. It's not supposed to be machine readable or be
+ * stable between versions. We take the liberty to restructure it entirely between versions and
+ * add/remove fields at will. */
+
+ fprintf(f, "%sManager: systemd " STRINGIFY(PROJECT_VERSION) " (" GIT_VERSION ")\n", strempty(prefix));
+ fprintf(f, "%sFeatures: %s\n", strempty(prefix), systemd_features);
+
+ for (ManagerTimestamp q = 0; q < _MANAGER_TIMESTAMP_MAX; q++) {
+ const dual_timestamp *t = m->timestamps + q;
+
+ if (dual_timestamp_is_set(t))
+ fprintf(f, "%sTimestamp %s: %s\n",
+ strempty(prefix),
+ manager_timestamp_to_string(q),
+ timestamp_is_set(t->realtime) ? FORMAT_TIMESTAMP(t->realtime) :
+ FORMAT_TIMESPAN(t->monotonic, 1));
+ }
+}
+
+void manager_dump(Manager *m, FILE *f, char **patterns, const char *prefix) {
+ assert(m);
+ assert(f);
+
+ /* If no pattern is provided, dump the full manager state including the manager version, features and
+ * so on. Otherwise limit the dump to the units/jobs matching the specified patterns. */
+ if (!patterns)
+ manager_dump_header(m, f, prefix);
+
+ manager_dump_units(m, f, patterns, prefix);
+ manager_dump_jobs(m, f, patterns, prefix);
+}
+
+int manager_get_dump_string(Manager *m, char **patterns, char **ret) {
+ _cleanup_(memstream_done) MemStream ms = {};
+ FILE *f;
+
+ assert(m);
+ assert(ret);
+
+ f = memstream_init(&ms);
+ if (!f)
+ return -errno;
+
+ manager_dump(m, f, patterns, NULL);
+
+ return memstream_finalize(&ms, ret, NULL);
+}
+
+void manager_test_summary(Manager *m) {
+ assert(m);
+
+ printf("-> By units:\n");
+ manager_dump_units(m, stdout, /* patterns= */ NULL, "\t");
+
+ printf("-> By jobs:\n");
+ manager_dump_jobs(m, stdout, /* patterns= */ NULL, "\t");
+}
diff --git a/src/core/manager-dump.h b/src/core/manager-dump.h
new file mode 100644
index 0000000..5b96f26
--- /dev/null
+++ b/src/core/manager-dump.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdio.h>
+
+#include "manager.h"
+
+void manager_dump_jobs(Manager *s, FILE *f, char **patterns, const char *prefix);
+int manager_get_dump_jobs_string(Manager *m, char **patterns, const char *prefix, char **ret);
+void manager_dump_units(Manager *s, FILE *f, char **patterns, const char *prefix);
+void manager_dump(Manager *s, FILE *f, char **patterns, const char *prefix);
+int manager_get_dump_string(Manager *m, char **patterns, char **ret);
+void manager_test_summary(Manager *m);
diff --git a/src/core/manager-serialize.c b/src/core/manager-serialize.c
new file mode 100644
index 0000000..e9d567a
--- /dev/null
+++ b/src/core/manager-serialize.c
@@ -0,0 +1,539 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "clean-ipc.h"
+#include "core-varlink.h"
+#include "dbus.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "initrd-util.h"
+#include "macro.h"
+#include "manager-serialize.h"
+#include "manager.h"
+#include "parse-util.h"
+#include "serialize.h"
+#include "syslog-util.h"
+#include "unit-serialize.h"
+#include "user-util.h"
+#include "varlink-internal.h"
+
+int manager_open_serialization(Manager *m, FILE **ret_f) {
+ assert(ret_f);
+
+ return open_serialization_file("systemd-state", ret_f);
+}
+
+static bool manager_timestamp_shall_serialize(ManagerTimestamp t) {
+ if (!in_initrd())
+ return true;
+
+ /* The following timestamps only apply to the host system, hence only serialize them there */
+ return !IN_SET(t,
+ MANAGER_TIMESTAMP_USERSPACE, MANAGER_TIMESTAMP_FINISH,
+ MANAGER_TIMESTAMP_SECURITY_START, MANAGER_TIMESTAMP_SECURITY_FINISH,
+ MANAGER_TIMESTAMP_GENERATORS_START, MANAGER_TIMESTAMP_GENERATORS_FINISH,
+ MANAGER_TIMESTAMP_UNITS_LOAD_START, MANAGER_TIMESTAMP_UNITS_LOAD_FINISH);
+}
+
+static void manager_serialize_uid_refs_internal(
+ FILE *f,
+ Hashmap *uid_refs,
+ const char *field_name) {
+
+ void *p, *k;
+
+ assert(f);
+ assert(field_name);
+
+ /* Serialize the UID reference table. Or actually, just the IPC destruction flag of it, as
+ * the actual counter of it is better rebuilt after a reload/reexec. */
+
+ HASHMAP_FOREACH_KEY(p, k, uid_refs) {
+ uint32_t c;
+ uid_t uid;
+
+ uid = PTR_TO_UID(k);
+ c = PTR_TO_UINT32(p);
+
+ if (!(c & DESTROY_IPC_FLAG))
+ continue;
+
+ (void) serialize_item_format(f, field_name, UID_FMT, uid);
+ }
+}
+
+static void manager_serialize_uid_refs(Manager *m, FILE *f) {
+ manager_serialize_uid_refs_internal(f, m->uid_refs, "destroy-ipc-uid");
+}
+
+static void manager_serialize_gid_refs(Manager *m, FILE *f) {
+ manager_serialize_uid_refs_internal(f, m->gid_refs, "destroy-ipc-gid");
+}
+
+int manager_serialize(
+ Manager *m,
+ FILE *f,
+ FDSet *fds,
+ bool switching_root) {
+
+ const char *t;
+ Unit *u;
+ int r;
+
+ assert(m);
+ assert(f);
+ assert(fds);
+
+ _cleanup_(manager_reloading_stopp) _unused_ Manager *reloading = manager_reloading_start(m);
+
+ (void) serialize_item_format(f, "current-job-id", "%" PRIu32, m->current_job_id);
+ (void) serialize_item_format(f, "n-installed-jobs", "%u", m->n_installed_jobs);
+ (void) serialize_item_format(f, "n-failed-jobs", "%u", m->n_failed_jobs);
+ (void) serialize_bool(f, "ready-sent", m->ready_sent);
+ (void) serialize_bool(f, "taint-logged", m->taint_logged);
+ (void) serialize_bool(f, "service-watchdogs", m->service_watchdogs);
+
+ if (m->show_status_overridden != _SHOW_STATUS_INVALID)
+ (void) serialize_item(f, "show-status-overridden",
+ show_status_to_string(m->show_status_overridden));
+
+ if (m->log_level_overridden)
+ (void) serialize_item_format(f, "log-level-override", "%i", log_get_max_level());
+ if (m->log_target_overridden)
+ (void) serialize_item(f, "log-target-override", log_target_to_string(log_get_target()));
+
+ (void) serialize_usec(f, "runtime-watchdog-overridden", m->watchdog_overridden[WATCHDOG_RUNTIME]);
+ (void) serialize_usec(f, "reboot-watchdog-overridden", m->watchdog_overridden[WATCHDOG_REBOOT]);
+ (void) serialize_usec(f, "kexec-watchdog-overridden", m->watchdog_overridden[WATCHDOG_KEXEC]);
+ (void) serialize_usec(f, "pretimeout-watchdog-overridden", m->watchdog_overridden[WATCHDOG_PRETIMEOUT]);
+ (void) serialize_item(f, "pretimeout-watchdog-governor-overridden", m->watchdog_pretimeout_governor_overridden);
+
+ for (ManagerTimestamp q = 0; q < _MANAGER_TIMESTAMP_MAX; q++) {
+ _cleanup_free_ char *joined = NULL;
+
+ if (!manager_timestamp_shall_serialize(q))
+ continue;
+
+ joined = strjoin(manager_timestamp_to_string(q), "-timestamp");
+ if (!joined)
+ return log_oom();
+
+ (void) serialize_dual_timestamp(f, joined, m->timestamps + q);
+ }
+
+ if (!switching_root)
+ (void) serialize_strv(f, "env", m->client_environment);
+
+ if (m->notify_fd >= 0) {
+ r = serialize_fd(f, fds, "notify-fd", m->notify_fd);
+ if (r < 0)
+ return r;
+
+ (void) serialize_item(f, "notify-socket", m->notify_socket);
+ }
+
+ if (m->cgroups_agent_fd >= 0) {
+ r = serialize_fd(f, fds, "cgroups-agent-fd", m->cgroups_agent_fd);
+ if (r < 0)
+ return r;
+ }
+
+ if (m->user_lookup_fds[0] >= 0) {
+ int copy0, copy1;
+
+ copy0 = fdset_put_dup(fds, m->user_lookup_fds[0]);
+ if (copy0 < 0)
+ return log_error_errno(copy0, "Failed to add user lookup fd to serialization: %m");
+
+ copy1 = fdset_put_dup(fds, m->user_lookup_fds[1]);
+ if (copy1 < 0)
+ return log_error_errno(copy1, "Failed to add user lookup fd to serialization: %m");
+
+ (void) serialize_item_format(f, "user-lookup", "%i %i", copy0, copy1);
+ }
+
+ (void) serialize_ratelimit(f, "dump-ratelimit", &m->dump_ratelimit);
+
+ bus_track_serialize(m->subscribed, f, "subscribed");
+
+ r = dynamic_user_serialize(m, f, fds);
+ if (r < 0)
+ return r;
+
+ manager_serialize_uid_refs(m, f);
+ manager_serialize_gid_refs(m, f);
+
+ r = exec_shared_runtime_serialize(m, f, fds);
+ if (r < 0)
+ return r;
+
+ r = varlink_server_serialize(m->varlink_server, f, fds);
+ if (r < 0)
+ return r;
+
+ (void) fputc('\n', f);
+
+ HASHMAP_FOREACH_KEY(u, t, m->units) {
+ if (u->id != t)
+ continue;
+
+ r = unit_serialize_state(u, f, fds, switching_root);
+ if (r < 0)
+ return r;
+ }
+
+ r = fflush_and_check(f);
+ if (r < 0)
+ return log_error_errno(r, "Failed to flush serialization: %m");
+
+ r = bus_fdset_add_all(m, fds);
+ if (r < 0)
+ return log_error_errno(r, "Failed to add bus sockets to serialization: %m");
+
+ return 0;
+}
+
+static int manager_deserialize_one_unit(Manager *m, const char *name, FILE *f, FDSet *fds) {
+ Unit *u;
+ int r;
+
+ r = manager_load_unit(m, name, NULL, NULL, &u);
+ if (r < 0) {
+ if (r == -ENOMEM)
+ return r;
+ return log_notice_errno(r, "Failed to load unit \"%s\", skipping deserialization: %m", name);
+ }
+
+ r = unit_deserialize_state(u, f, fds);
+ if (r < 0) {
+ if (r == -ENOMEM)
+ return r;
+ return log_notice_errno(r, "Failed to deserialize unit \"%s\", skipping: %m", name);
+ }
+
+ return 0;
+}
+
+static int manager_deserialize_units(Manager *m, FILE *f, FDSet *fds) {
+ int r;
+
+ for (;;) {
+ _cleanup_free_ char *line = NULL;
+
+ /* Start marker */
+ r = read_stripped_line(f, LONG_LINE_MAX, &line);
+ if (r < 0)
+ return log_error_errno(r, "Failed to read serialization line: %m");
+ if (r == 0)
+ break;
+
+ r = manager_deserialize_one_unit(m, line, f, fds);
+ if (r == -ENOMEM)
+ return r;
+ if (r < 0) {
+ r = unit_deserialize_state_skip(f);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ return 0;
+}
+
+static void manager_deserialize_uid_refs_one_internal(
+ Hashmap** uid_refs,
+ const char *value) {
+
+ uid_t uid;
+ uint32_t c;
+ int r;
+
+ assert(uid_refs);
+ assert(value);
+
+ r = parse_uid(value, &uid);
+ if (r < 0 || uid == 0) {
+ log_debug("Unable to parse UID/GID reference serialization: %s", value);
+ return;
+ }
+
+ if (hashmap_ensure_allocated(uid_refs, &trivial_hash_ops) < 0) {
+ log_oom();
+ return;
+ }
+
+ c = PTR_TO_UINT32(hashmap_get(*uid_refs, UID_TO_PTR(uid)));
+ if (c & DESTROY_IPC_FLAG)
+ return;
+
+ c |= DESTROY_IPC_FLAG;
+
+ r = hashmap_replace(*uid_refs, UID_TO_PTR(uid), UINT32_TO_PTR(c));
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add UID/GID reference entry: %m");
+ return;
+ }
+}
+
+static void manager_deserialize_uid_refs_one(Manager *m, const char *value) {
+ manager_deserialize_uid_refs_one_internal(&m->uid_refs, value);
+}
+
+static void manager_deserialize_gid_refs_one(Manager *m, const char *value) {
+ manager_deserialize_uid_refs_one_internal(&m->gid_refs, value);
+}
+
+int manager_deserialize(Manager *m, FILE *f, FDSet *fds) {
+ bool deserialize_varlink_sockets = false;
+ int r = 0;
+
+ assert(m);
+ assert(f);
+
+ if (DEBUG_LOGGING) {
+ if (fdset_isempty(fds))
+ log_debug("No file descriptors passed");
+ else {
+ int fd;
+
+ FDSET_FOREACH(fd, fds) {
+ _cleanup_free_ char *fn = NULL;
+
+ r = fd_get_path(fd, &fn);
+ if (r < 0)
+ log_debug_errno(r, "Received serialized fd %i %s %m",
+ fd, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT));
+ else
+ log_debug("Received serialized fd %i %s %s",
+ fd, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), strna(fn));
+ }
+ }
+ }
+
+ log_debug("Deserializing state...");
+
+ /* If we are not in reload mode yet, enter it now. Note that this is recursive, a caller might already have
+ * increased it to non-zero, which is why we just increase it by one here and down again at the end of this
+ * call. */
+ _cleanup_(manager_reloading_stopp) _unused_ Manager *reloading = manager_reloading_start(m);
+
+ for (;;) {
+ _cleanup_free_ char *l = NULL;
+ const char *val;
+
+ r = deserialize_read_line(f, &l);
+ if (r < 0)
+ return r;
+ if (r == 0) /* eof or end marker */
+ break;
+
+ if ((val = startswith(l, "current-job-id="))) {
+ uint32_t id;
+
+ if (safe_atou32(val, &id) < 0)
+ log_notice("Failed to parse current job id value '%s', ignoring.", val);
+ else
+ m->current_job_id = MAX(m->current_job_id, id);
+
+ } else if ((val = startswith(l, "n-installed-jobs="))) {
+ uint32_t n;
+
+ if (safe_atou32(val, &n) < 0)
+ log_notice("Failed to parse installed jobs counter '%s', ignoring.", val);
+ else
+ m->n_installed_jobs += n;
+
+ } else if ((val = startswith(l, "n-failed-jobs="))) {
+ uint32_t n;
+
+ if (safe_atou32(val, &n) < 0)
+ log_notice("Failed to parse failed jobs counter '%s', ignoring.", val);
+ else
+ m->n_failed_jobs += n;
+
+ } else if ((val = startswith(l, "ready-sent="))) {
+ int b;
+
+ b = parse_boolean(val);
+ if (b < 0)
+ log_notice("Failed to parse ready-sent flag '%s', ignoring.", val);
+ else
+ m->ready_sent = m->ready_sent || b;
+
+ } else if ((val = startswith(l, "taint-logged="))) {
+ int b;
+
+ b = parse_boolean(val);
+ if (b < 0)
+ log_notice("Failed to parse taint-logged flag '%s', ignoring.", val);
+ else
+ m->taint_logged = m->taint_logged || b;
+
+ } else if ((val = startswith(l, "service-watchdogs="))) {
+ int b;
+
+ b = parse_boolean(val);
+ if (b < 0)
+ log_notice("Failed to parse service-watchdogs flag '%s', ignoring.", val);
+ else
+ m->service_watchdogs = b;
+
+ } else if ((val = startswith(l, "show-status-overridden="))) {
+ ShowStatus s;
+
+ s = show_status_from_string(val);
+ if (s < 0)
+ log_notice("Failed to parse show-status-overridden flag '%s', ignoring.", val);
+ else
+ manager_override_show_status(m, s, "deserialize");
+
+ } else if ((val = startswith(l, "log-level-override="))) {
+ int level;
+
+ level = log_level_from_string(val);
+ if (level < 0)
+ log_notice("Failed to parse log-level-override value '%s', ignoring.", val);
+ else
+ manager_override_log_level(m, level);
+
+ } else if ((val = startswith(l, "log-target-override="))) {
+ LogTarget target;
+
+ target = log_target_from_string(val);
+ if (target < 0)
+ log_notice("Failed to parse log-target-override value '%s', ignoring.", val);
+ else
+ manager_override_log_target(m, target);
+
+ } else if ((val = startswith(l, "runtime-watchdog-overridden="))) {
+ usec_t t;
+
+ if (deserialize_usec(val, &t) < 0)
+ log_notice("Failed to parse runtime-watchdog-overridden value '%s', ignoring.", val);
+ else
+ manager_override_watchdog(m, WATCHDOG_RUNTIME, t);
+
+ } else if ((val = startswith(l, "reboot-watchdog-overridden="))) {
+ usec_t t;
+
+ if (deserialize_usec(val, &t) < 0)
+ log_notice("Failed to parse reboot-watchdog-overridden value '%s', ignoring.", val);
+ else
+ manager_override_watchdog(m, WATCHDOG_REBOOT, t);
+
+ } else if ((val = startswith(l, "kexec-watchdog-overridden="))) {
+ usec_t t;
+
+ if (deserialize_usec(val, &t) < 0)
+ log_notice("Failed to parse kexec-watchdog-overridden value '%s', ignoring.", val);
+ else
+ manager_override_watchdog(m, WATCHDOG_KEXEC, t);
+
+ } else if ((val = startswith(l, "pretimeout-watchdog-overridden="))) {
+ usec_t t;
+
+ if (deserialize_usec(val, &t) < 0)
+ log_notice("Failed to parse pretimeout-watchdog-overridden value '%s', ignoring.", val);
+ else
+ manager_override_watchdog(m, WATCHDOG_PRETIMEOUT, t);
+
+ } else if ((val = startswith(l, "pretimeout-watchdog-governor-overridden="))) {
+ r = free_and_strdup(&m->watchdog_pretimeout_governor_overridden, val);
+ if (r < 0)
+ return r;
+
+ } else if (startswith(l, "env=")) {
+ r = deserialize_environment(l + 4, &m->client_environment);
+ if (r < 0)
+ log_notice_errno(r, "Failed to parse environment entry: \"%s\", ignoring: %m", l);
+
+ } else if ((val = startswith(l, "notify-fd="))) {
+ int fd;
+
+ fd = deserialize_fd(fds, val);
+ if (fd >= 0) {
+ m->notify_event_source = sd_event_source_disable_unref(m->notify_event_source);
+ safe_close(m->notify_fd);
+ m->notify_fd = fd;
+ }
+
+ } else if ((val = startswith(l, "notify-socket="))) {
+ r = free_and_strdup(&m->notify_socket, val);
+ if (r < 0)
+ return r;
+
+ } else if ((val = startswith(l, "cgroups-agent-fd="))) {
+ int fd;
+
+ fd = deserialize_fd(fds, val);
+ if (fd >= 0) {
+ m->cgroups_agent_event_source = sd_event_source_disable_unref(m->cgroups_agent_event_source);
+ safe_close(m->cgroups_agent_fd);
+ m->cgroups_agent_fd = fd;
+ }
+
+ } else if ((val = startswith(l, "user-lookup="))) {
+ int fd0, fd1;
+
+ if (sscanf(val, "%i %i", &fd0, &fd1) != 2 || fd0 < 0 || fd1 < 0 || fd0 == fd1 || !fdset_contains(fds, fd0) || !fdset_contains(fds, fd1))
+ log_notice("Failed to parse user lookup fd, ignoring: %s", val);
+ else {
+ m->user_lookup_event_source = sd_event_source_disable_unref(m->user_lookup_event_source);
+ safe_close_pair(m->user_lookup_fds);
+ m->user_lookup_fds[0] = fdset_remove(fds, fd0);
+ m->user_lookup_fds[1] = fdset_remove(fds, fd1);
+ }
+
+ } else if ((val = startswith(l, "dynamic-user=")))
+ dynamic_user_deserialize_one(m, val, fds, NULL);
+ else if ((val = startswith(l, "destroy-ipc-uid=")))
+ manager_deserialize_uid_refs_one(m, val);
+ else if ((val = startswith(l, "destroy-ipc-gid=")))
+ manager_deserialize_gid_refs_one(m, val);
+ else if ((val = startswith(l, "exec-runtime=")))
+ (void) exec_shared_runtime_deserialize_one(m, val, fds);
+ else if ((val = startswith(l, "subscribed="))) {
+
+ if (strv_extend(&m->deserialized_subscribed, val) < 0)
+ return -ENOMEM;
+ } else if ((val = startswith(l, "varlink-server-socket-address="))) {
+ if (!m->varlink_server && MANAGER_IS_SYSTEM(m)) {
+ r = manager_varlink_init(m);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to setup varlink server, ignoring: %m");
+ continue;
+ }
+
+ deserialize_varlink_sockets = true;
+ }
+
+ /* To avoid unnecessary deserialization (i.e. during reload vs. reexec) we only deserialize
+ * the FDs if we had to create a new m->varlink_server. The deserialize_varlink_sockets flag
+ * is initialized outside of the loop, is flipped after the VarlinkServer is setup, and
+ * remains set until all serialized contents are handled. */
+ if (deserialize_varlink_sockets)
+ (void) varlink_server_deserialize_one(m->varlink_server, val, fds);
+ } else if ((val = startswith(l, "dump-ratelimit=")))
+ deserialize_ratelimit(&m->dump_ratelimit, "dump-ratelimit", val);
+ else {
+ ManagerTimestamp q;
+
+ for (q = 0; q < _MANAGER_TIMESTAMP_MAX; q++) {
+ val = startswith(l, manager_timestamp_to_string(q));
+ if (!val)
+ continue;
+
+ val = startswith(val, "-timestamp=");
+ if (val)
+ break;
+ }
+
+ if (q < _MANAGER_TIMESTAMP_MAX) /* found it */
+ (void) deserialize_dual_timestamp(val, m->timestamps + q);
+ else if (!STARTSWITH_SET(l, "kdbus-fd=", "honor-device-enumeration=")) /* ignore deprecated values */
+ log_notice("Unknown serialization item '%s', ignoring.", l);
+ }
+ }
+
+ return manager_deserialize_units(m, f, fds);
+}
diff --git a/src/core/manager-serialize.h b/src/core/manager-serialize.h
new file mode 100644
index 0000000..c52261e
--- /dev/null
+++ b/src/core/manager-serialize.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "manager.h"
+#include "fdset.h"
+
+#define DESTROY_IPC_FLAG (UINT32_C(1) << 31)
+
+int manager_open_serialization(Manager *m, FILE **ret_f);
+int manager_serialize(Manager *m, FILE *f, FDSet *fds, bool switching_root);
+int manager_deserialize(Manager *m, FILE *f, FDSet *fds);
diff --git a/src/core/manager.c b/src/core/manager.c
new file mode 100644
index 0000000..88eebfc
--- /dev/null
+++ b/src/core/manager.c
@@ -0,0 +1,5039 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/kd.h>
+#include <sys/epoll.h>
+#include <sys/inotify.h>
+#include <sys/ioctl.h>
+#include <sys/mount.h>
+#include <sys/reboot.h>
+#include <sys/timerfd.h>
+#include <sys/utsname.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#if HAVE_AUDIT
+#include <libaudit.h>
+#endif
+
+#include "sd-daemon.h"
+#include "sd-messages.h"
+#include "sd-path.h"
+
+#include "all-units.h"
+#include "alloc-util.h"
+#include "audit-fd.h"
+#include "boot-timestamps.h"
+#include "bus-common-errors.h"
+#include "bus-error.h"
+#include "bus-kernel.h"
+#include "bus-util.h"
+#include "clean-ipc.h"
+#include "clock-util.h"
+#include "common-signal.h"
+#include "confidential-virt.h"
+#include "constants.h"
+#include "core-varlink.h"
+#include "creds-util.h"
+#include "dbus-job.h"
+#include "dbus-manager.h"
+#include "dbus-unit.h"
+#include "dbus.h"
+#include "dirent-util.h"
+#include "env-util.h"
+#include "escape.h"
+#include "event-util.h"
+#include "exec-util.h"
+#include "execute.h"
+#include "exit-status.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "generator-setup.h"
+#include "hashmap.h"
+#include "initrd-util.h"
+#include "inotify-util.h"
+#include "install.h"
+#include "io-util.h"
+#include "label-util.h"
+#include "load-fragment.h"
+#include "locale-setup.h"
+#include "log.h"
+#include "macro.h"
+#include "manager.h"
+#include "manager-dump.h"
+#include "manager-serialize.h"
+#include "memory-util.h"
+#include "mkdir-label.h"
+#include "mount-util.h"
+#include "os-util.h"
+#include "parse-util.h"
+#include "path-lookup.h"
+#include "path-util.h"
+#include "plymouth-util.h"
+#include "pretty-print.h"
+#include "process-util.h"
+#include "psi-util.h"
+#include "ratelimit.h"
+#include "rlimit-util.h"
+#include "rm-rf.h"
+#include "selinux-util.h"
+#include "signal-util.h"
+#include "socket-util.h"
+#include "special.h"
+#include "stat-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "strxcpyx.h"
+#include "sysctl-util.h"
+#include "syslog-util.h"
+#include "terminal-util.h"
+#include "time-util.h"
+#include "transaction.h"
+#include "uid-range.h"
+#include "umask-util.h"
+#include "unit-name.h"
+#include "user-util.h"
+#include "virt.h"
+#include "watchdog.h"
+
+#define NOTIFY_RCVBUF_SIZE (8*1024*1024)
+#define CGROUPS_AGENT_RCVBUF_SIZE (8*1024*1024)
+
+/* Initial delay and the interval for printing status messages about running jobs */
+#define JOBS_IN_PROGRESS_WAIT_USEC (2*USEC_PER_SEC)
+#define JOBS_IN_PROGRESS_QUIET_WAIT_USEC (25*USEC_PER_SEC)
+#define JOBS_IN_PROGRESS_PERIOD_USEC (USEC_PER_SEC / 3)
+#define JOBS_IN_PROGRESS_PERIOD_DIVISOR 3
+
+/* If there are more than 1K bus messages queue across our API and direct buses, then let's not add more on top until
+ * the queue gets more empty. */
+#define MANAGER_BUS_BUSY_THRESHOLD 1024LU
+
+/* How many units and jobs to process of the bus queue before returning to the event loop. */
+#define MANAGER_BUS_MESSAGE_BUDGET 100U
+
+#define DEFAULT_TASKS_MAX ((CGroupTasksMax) { 15U, 100U }) /* 15% */
+
+static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static int manager_dispatch_cgroups_agent_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static int manager_dispatch_signal_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static int manager_dispatch_time_change_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static int manager_dispatch_idle_pipe_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static int manager_dispatch_jobs_in_progress(sd_event_source *source, usec_t usec, void *userdata);
+static int manager_dispatch_run_queue(sd_event_source *source, void *userdata);
+static int manager_dispatch_sigchld(sd_event_source *source, void *userdata);
+static int manager_dispatch_timezone_change(sd_event_source *source, const struct inotify_event *event, void *userdata);
+static int manager_run_environment_generators(Manager *m);
+static int manager_run_generators(Manager *m);
+static void manager_vacuum(Manager *m);
+
+/* Compute the absolute CLOCK_MONOTONIC time at which the next "job in progress" status
+ * line should be printed. */
+static usec_t manager_watch_jobs_next_time(Manager *m) {
+        usec_t delay;
+
+        /* A user manager reports early (2/3 of the usual delay) so the system manager can
+         * make use of its status, if it wants to; otherwise the delay depends on whether
+         * status output is currently enabled. */
+        if (MANAGER_IS_USER(m))
+                delay = JOBS_IN_PROGRESS_WAIT_USEC * 2 / 3;
+        else
+                delay = show_status_on(m->show_status) ? JOBS_IN_PROGRESS_WAIT_USEC
+                                                       : JOBS_IN_PROGRESS_QUIET_WAIT_USEC;
+
+        return usec_add(now(CLOCK_MONOTONIC), delay);
+}
+
+/* Returns true when confirmation prompts for spawned processes are effectively off:
+ * either they were never configured, or the runtime disable flag file exists. */
+static bool manager_is_confirm_spawn_disabled(Manager *m) {
+        assert(m);
+
+        return !m->confirm_spawn ||
+                access("/run/systemd/confirm_spawn_disabled", F_OK) >= 0;
+}
+
+/* Arm the one-shot timer that triggers printing of "job in progress" status lines. */
+static void manager_watch_jobs_in_progress(Manager *m) {
+        int r;
+
+        assert(m);
+
+        /* Do not show the cylon animation while spawn confirmation prompts are active —
+         * the animation would garble them. Also nothing to do if the timer is armed already. */
+        if (!manager_is_confirm_spawn_disabled(m) || m->jobs_in_progress_event_source)
+                return;
+
+        r = sd_event_add_time(
+                        m->event,
+                        &m->jobs_in_progress_event_source,
+                        CLOCK_MONOTONIC,
+                        manager_watch_jobs_next_time(m), 0,
+                        manager_dispatch_jobs_in_progress, m);
+        if (r < 0)
+                return; /* best-effort: status output is cosmetic */
+
+        (void) sd_event_source_set_description(m->jobs_in_progress_event_source, "manager-jobs-in-progress");
+}
+
+/* Toggle between SHOW_STATUS_AUTO and SHOW_STATUS_TEMPORARY. Only ever moves away from
+ * the state the requested direction starts from, so explicit user settings stay untouched. */
+static void manager_flip_auto_status(Manager *m, bool enable, const char *reason) {
+        assert(m);
+
+        if (enable && m->show_status == SHOW_STATUS_AUTO)
+                manager_set_show_status(m, SHOW_STATUS_TEMPORARY, reason);
+        else if (!enable && m->show_status == SHOW_STATUS_TEMPORARY)
+                manager_set_show_status(m, SHOW_STATUS_AUTO, reason);
+}
+
+/* Print one ephemeral status line about a currently running job, cycling through the
+ * running jobs on successive invocations, and mirror it upstream via sd_notify(). */
+static void manager_print_jobs_in_progress(Manager *m) {
+        Job *j;
+        unsigned counter = 0, print_nr;
+        char cylon[6 + CYLON_BUFFER_EXTRA + 1];
+        unsigned cylon_pos;
+        uint64_t timeout = 0;
+
+        assert(m);
+        assert(m->n_running_jobs > 0);
+
+        manager_flip_auto_status(m, true, "delay");
+
+        /* Pick which of the running jobs to show this round. */
+        print_nr = (m->jobs_in_progress_iteration / JOBS_IN_PROGRESS_PERIOD_DIVISOR) % m->n_running_jobs;
+
+        HASHMAP_FOREACH(j, m->jobs)
+                if (j->state == JOB_RUNNING && counter++ == print_nr)
+                        break;
+
+        /* m->n_running_jobs must be consistent with the contents of m->jobs,
+         * so the above loop must have succeeded in finding j. */
+        assert(counter == print_nr + 1);
+        assert(j);
+
+        /* The animation position bounces back and forth across 14 steps. */
+        cylon_pos = m->jobs_in_progress_iteration % 14;
+        if (cylon_pos >= 8)
+                cylon_pos = 14 - cylon_pos;
+        draw_cylon(cylon, sizeof(cylon), 6, cylon_pos);
+
+        m->jobs_in_progress_iteration++;
+
+        char job_of_n[STRLEN("( of ) ") + DECIMAL_STR_MAX(unsigned)*2] = "";
+        if (m->n_running_jobs > 1)
+                xsprintf(job_of_n, "(%u of %u) ", counter, m->n_running_jobs);
+
+        (void) job_get_timeout(j, &timeout);
+
+        /* We want to use enough information for the user to identify previous lines talking about the same
+         * unit, but keep the message as short as possible. So if 'Starting foo.service' or 'Starting
+         * foo.service - Description' were used, 'foo.service' is enough here. On the other hand, if we used
+         * 'Starting Description' before, then we shall also use 'Description' here. So we pass NULL as the
+         * second argument to unit_status_string(). */
+        const char *ident = unit_status_string(j->unit, NULL);
+
+        const char *time = FORMAT_TIMESPAN(now(CLOCK_MONOTONIC) - j->begin_usec, 1*USEC_PER_SEC);
+        const char *limit = timeout > 0 ? FORMAT_TIMESPAN(timeout - j->begin_usec, 1*USEC_PER_SEC) : "no limit";
+
+        if (m->status_unit_format == STATUS_UNIT_FORMAT_DESCRIPTION)
+                /* When using 'Description', we effectively don't have enough space to show the nested status
+                 * without ellipsization, so let's not even try. */
+                manager_status_printf(m, STATUS_TYPE_EPHEMERAL, cylon,
+                                      "%sA %s job is running for %s (%s / %s)",
+                                      job_of_n,
+                                      job_type_to_string(j->type),
+                                      ident,
+                                      time, limit);
+        else {
+                const char *status_text = unit_status_text(j->unit);
+
+                manager_status_printf(m, STATUS_TYPE_EPHEMERAL, cylon,
+                                      "%sJob %s/%s running (%s / %s)%s%s",
+                                      job_of_n,
+                                      ident,
+                                      job_type_to_string(j->type),
+                                      time, limit,
+                                      status_text ? ": " : "",
+                                      strempty(status_text));
+        }
+
+        /* Also report our status upstream, e.g. to a supervising manager. */
+        sd_notifyf(false,
+                   "STATUS=%sUser job %s/%s running (%s / %s)...",
+                   job_of_n,
+                   ident,
+                   job_type_to_string(j->type),
+                   time, limit);
+        m->status_ready = false;
+}
+
+/* Returns > 0 if at least one "ask.*" password query file currently exists, 0 if none,
+ * or a negative errno if the directory could not be enumerated. A missing directory
+ * counts as "no queries". */
+static int have_ask_password(void) {
+        _cleanup_closedir_ DIR *dir = NULL;
+
+        dir = opendir("/run/systemd/ask-password");
+        if (!dir)
+                return errno == ENOENT ? false : -errno;
+
+        FOREACH_DIRENT_ALL(de, dir, return -errno)
+                if (startswith(de->d_name, "ask."))
+                        return true;
+
+        return false;
+}
+
+/* IO callback for the ask-password inotify fd: drain the fd and refresh the cached
+ * "are there pending password queries?" state. */
+static int manager_dispatch_ask_password_fd(sd_event_source *source,
+                                            int fd, uint32_t revents, void *userdata) {
+        Manager *m = ASSERT_PTR(userdata);
+
+        (void) flush_fd(fd);
+
+        m->have_ask_password = have_ask_password();
+        if (m->have_ask_password < 0)
+                /* Log error but continue. Negative have_ask_password
+                 * is treated as unknown status. */
+                log_error_errno(m->have_ask_password, "Failed to list /run/systemd/ask-password: %m");
+
+        return 0;
+}
+
+/* Tear down the ask-password inotify watch and mark the pending-queries state unknown. */
+static void manager_close_ask_password(Manager *m) {
+        assert(m);
+
+        m->ask_password_event_source = sd_event_source_disable_unref(m->ask_password_event_source);
+        m->ask_password_inotify_fd = safe_close(m->ask_password_inotify_fd);
+        m->have_ask_password = -EINVAL; /* "we don't know" */
+}
+
+/* Lazily set up the inotify watch on /run/systemd/ask-password, then return the cached
+ * pending-queries state (> 0 queries pending, 0 none, < 0 unknown/error). */
+static int manager_check_ask_password(Manager *m) {
+        int r;
+
+        assert(m);
+
+        if (!m->ask_password_event_source) {
+                assert(m->ask_password_inotify_fd < 0);
+
+                (void) mkdir_p_label("/run/systemd/ask-password", 0755);
+
+                m->ask_password_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
+                if (m->ask_password_inotify_fd < 0)
+                        return log_error_errno(errno, "Failed to create inotify object: %m");
+
+                r = inotify_add_watch_and_warn(m->ask_password_inotify_fd,
+                                               "/run/systemd/ask-password",
+                                               IN_CREATE|IN_DELETE|IN_MOVE);
+                if (r < 0) {
+                        /* Roll back the half-initialized watch on failure. */
+                        manager_close_ask_password(m);
+                        return r;
+                }
+
+                r = sd_event_add_io(m->event, &m->ask_password_event_source,
+                                    m->ask_password_inotify_fd, EPOLLIN,
+                                    manager_dispatch_ask_password_fd, m);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to add event source for /run/systemd/ask-password: %m");
+                        manager_close_ask_password(m);
+                        return r;
+                }
+
+                (void) sd_event_source_set_description(m->ask_password_event_source, "manager-ask-password");
+
+                /* Queries might have been added meanwhile... */
+                manager_dispatch_ask_password_fd(m->ask_password_event_source,
+                                                 m->ask_password_inotify_fd, EPOLLIN, m);
+        }
+
+        return m->have_ask_password;
+}
+
+/* Begin watching the read end of the idle pipe, unless we already do or there is no
+ * pipe to watch. */
+static int manager_watch_idle_pipe(Manager *m) {
+        int r;
+
+        assert(m);
+
+        if (m->idle_pipe_event_source || m->idle_pipe[2] < 0)
+                return 0;
+
+        r = sd_event_add_io(m->event, &m->idle_pipe_event_source, m->idle_pipe[2], EPOLLIN, manager_dispatch_idle_pipe_fd, m);
+        if (r < 0)
+                return log_error_errno(r, "Failed to watch idle pipe: %m");
+
+        (void) sd_event_source_set_description(m->idle_pipe_event_source, "manager-idle-pipe");
+        return 0;
+}
+
+/* Stop watching the idle pipe and close all four fds (both fd pairs). */
+static void manager_close_idle_pipe(Manager *m) {
+        assert(m);
+
+        m->idle_pipe_event_source = sd_event_source_disable_unref(m->idle_pipe_event_source);
+
+        safe_close_pair(m->idle_pipe);
+        safe_close_pair(m->idle_pipe + 2);
+}
+
+/* (Re-)establish the event source that fires when the system clock jumps, so .timer
+ * units can be re-evaluated. Replaces any previously installed source. */
+static int manager_setup_time_change(Manager *m) {
+        int r;
+
+        assert(m);
+
+        if (MANAGER_IS_TEST_RUN(m))
+                return 0;
+
+        m->time_change_event_source = sd_event_source_disable_unref(m->time_change_event_source);
+
+        r = event_add_time_change(m->event, &m->time_change_event_source, manager_dispatch_time_change_fd, m);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create time change event source: %m");
+
+        /* Schedule this slightly earlier than the .timer event sources */
+        r = sd_event_source_set_priority(m->time_change_event_source, SD_EVENT_PRIORITY_NORMAL-1);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set priority of time change event sources: %m");
+
+        log_debug("Set up TFD_TIMER_CANCEL_ON_SET timerfd.");
+
+        return 0;
+}
+
+/* Refresh the cached mtime/accessibility of /etc/localtime. Returns true when the file's
+ * state differs from what we previously recorded, i.e. the timezone may have changed. */
+static int manager_read_timezone_stat(Manager *m) {
+        struct stat st;
+        bool changed;
+
+        assert(m);
+
+        if (lstat("/etc/localtime", &st) < 0) {
+                log_debug_errno(errno, "Failed to stat /etc/localtime, ignoring: %m");
+
+                /* Becoming inaccessible counts as a change only if it was accessible before. */
+                changed = m->etc_localtime_accessible;
+                m->etc_localtime_accessible = false;
+                return changed;
+        }
+
+        usec_t mtime = timespec_load(&st.st_mtim);
+        changed = !m->etc_localtime_accessible || mtime != m->etc_localtime_mtime;
+
+        m->etc_localtime_mtime = mtime;
+        m->etc_localtime_accessible = true;
+
+        return changed;
+}
+
+/* (Re-)install the inotify watch that detects changes of /etc/localtime, falling back to
+ * watching /etc itself while the file does not exist yet. */
+static int manager_setup_timezone_change(Manager *m) {
+        _cleanup_(sd_event_source_unrefp) sd_event_source *new_event = NULL;
+        int r;
+
+        assert(m);
+
+        if (MANAGER_IS_TEST_RUN(m))
+                return 0;
+
+        /* We watch /etc/localtime for three events: change of the link count (which might mean removal from /etc even
+         * though another link might be kept), renames, and file close operations after writing. Note we don't bother
+         * with IN_DELETE_SELF, as that would just report when the inode is removed entirely, i.e. after the link count
+         * went to zero and all fds to it are closed.
+         *
+         * Note that we never follow symlinks here. This is a simplification, but should cover almost all cases
+         * correctly.
+         *
+         * Note that we create the new event source first here, before releasing the old one. This should optimize
+         * behaviour as this way sd-event can reuse the old watch in case the inode didn't change. */
+
+        r = sd_event_add_inotify(m->event, &new_event, "/etc/localtime",
+                                 IN_ATTRIB|IN_MOVE_SELF|IN_CLOSE_WRITE|IN_DONT_FOLLOW, manager_dispatch_timezone_change, m);
+        if (r == -ENOENT) {
+                /* If the file doesn't exist yet, subscribe to /etc instead, and wait until it is created either by
+                 * O_CREATE or by rename() */
+
+                log_debug_errno(r, "/etc/localtime doesn't exist yet, watching /etc instead.");
+                r = sd_event_add_inotify(m->event, &new_event, "/etc",
+                                         IN_CREATE|IN_MOVED_TO|IN_ONLYDIR, manager_dispatch_timezone_change, m);
+        }
+        if (r < 0)
+                return log_error_errno(r, "Failed to create timezone change event source: %m");
+
+        /* Schedule this slightly earlier than the .timer event sources */
+        r = sd_event_source_set_priority(new_event, SD_EVENT_PRIORITY_NORMAL-1);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set priority of timezone change event sources: %m");
+
+        /* Only now drop the old source, so the watch could be reused above. */
+        sd_event_source_unref(m->timezone_change_event_source);
+        m->timezone_change_event_source = TAKE_PTR(new_event);
+
+        return 0;
+}
+
+/* Hook up the special PID 1 signal sources: SIGINT on ctrl-alt-del and SIGWINCH on
+ * keyboard request. All failures are logged but non-fatal; always returns 0. */
+static int enable_special_signals(Manager *m) {
+        _cleanup_close_ int fd = -EBADF;
+
+        assert(m);
+
+        if (MANAGER_IS_TEST_RUN(m))
+                return 0;
+
+        /* Enable that we get SIGINT on control-alt-del. In containers
+         * this will fail with EPERM (older) or EINVAL (newer), so
+         * ignore that. */
+        if (reboot(RB_DISABLE_CAD) < 0 && !IN_SET(errno, EPERM, EINVAL))
+                log_warning_errno(errno, "Failed to enable ctrl-alt-del handling: %m");
+
+        fd = open_terminal("/dev/tty0", O_RDWR|O_NOCTTY|O_CLOEXEC);
+        if (fd < 0) {
+                /* Support systems without virtual console */
+                if (fd != -ENOENT)
+                        /* open_terminal() returns a negative errno-style code in 'fd'; 'errno' may
+                         * have been clobbered since, so log the return value rather than errno. */
+                        log_warning_errno(fd, "Failed to open /dev/tty0: %m");
+        } else {
+                /* Enable that we get SIGWINCH on kbrequest */
+                if (ioctl(fd, KDSIGACCEPT, SIGWINCH) < 0)
+                        log_warning_errno(errno, "Failed to enable kbrequest handling: %m");
+        }
+
+        return 0;
+}
+
+#define RTSIG_IF_AVAILABLE(signum) (signum <= SIGRTMAX ? signum : -1)
+
+/* Block all signals the manager handles itself, route them through a signalfd-based event
+ * source, and (for the system manager) enable the special console signals. */
+static int manager_setup_signals(Manager *m) {
+        struct sigaction sa = {
+                .sa_handler = SIG_DFL,
+                .sa_flags = SA_NOCLDSTOP|SA_RESTART,
+        };
+        sigset_t mask;
+        int r;
+
+        assert(m);
+
+        /* Make sure SIGCHLD is not ignored, otherwise waitid() could not reap children. */
+        assert_se(sigaction(SIGCHLD, &sa, NULL) == 0);
+
+        /* We make liberal use of realtime signals here. On
+         * Linux/glibc we have 30 of them (with the exception of Linux
+         * on hppa, see below), between SIGRTMIN+0 ... SIGRTMIN+30
+         * (aka SIGRTMAX). */
+
+        assert_se(sigemptyset(&mask) == 0);
+        sigset_add_many(&mask,
+                        SIGCHLD,     /* Child died */
+                        SIGTERM,     /* Reexecute daemon */
+                        SIGHUP,      /* Reload configuration */
+                        SIGUSR1,     /* systemd: reconnect to D-Bus */
+                        SIGUSR2,     /* systemd: dump status */
+                        SIGINT,      /* Kernel sends us this on control-alt-del */
+                        SIGWINCH,    /* Kernel sends us this on kbrequest (alt-arrowup) */
+                        SIGPWR,      /* Some kernel drivers and upsd send us this on power failure */
+
+                        SIGRTMIN+0,  /* systemd: start default.target */
+                        SIGRTMIN+1,  /* systemd: isolate rescue.target */
+                        SIGRTMIN+2,  /* systemd: isolate emergency.target */
+                        SIGRTMIN+3,  /* systemd: start halt.target */
+                        SIGRTMIN+4,  /* systemd: start poweroff.target */
+                        SIGRTMIN+5,  /* systemd: start reboot.target */
+                        SIGRTMIN+6,  /* systemd: start kexec.target */
+                        SIGRTMIN+7,  /* systemd: start soft-reboot.target */
+
+                        /* ... space for more special targets ... */
+
+                        SIGRTMIN+13, /* systemd: Immediate halt */
+                        SIGRTMIN+14, /* systemd: Immediate poweroff */
+                        SIGRTMIN+15, /* systemd: Immediate reboot */
+                        SIGRTMIN+16, /* systemd: Immediate kexec */
+                        SIGRTMIN+17, /* systemd: Immediate soft-reboot */
+                        SIGRTMIN+18, /* systemd: control command */
+
+                        /* ... space ... */
+
+                        SIGRTMIN+20, /* systemd: enable status messages */
+                        SIGRTMIN+21, /* systemd: disable status messages */
+                        SIGRTMIN+22, /* systemd: set log level to LOG_DEBUG */
+                        SIGRTMIN+23, /* systemd: set log level to LOG_INFO */
+                        SIGRTMIN+24, /* systemd: Immediate exit (--user only) */
+                        SIGRTMIN+25, /* systemd: reexecute manager */
+
+                        /* Apparently Linux on hppa had fewer RT signals until v3.18,
+                         * SIGRTMAX was SIGRTMIN+25, and then SIGRTMIN was lowered,
+                         * see commit v3.17-7614-g1f25df2eff.
+                         *
+                         * We cannot unconditionally make use of those signals here,
+                         * so let's use a runtime check. Since these commands are
+                         * accessible by different means and only really a safety
+                         * net, the missing functionality on hppa shouldn't matter.
+                         */
+
+                        RTSIG_IF_AVAILABLE(SIGRTMIN+26), /* systemd: set log target to journal-or-kmsg */
+                        RTSIG_IF_AVAILABLE(SIGRTMIN+27), /* systemd: set log target to console */
+                        RTSIG_IF_AVAILABLE(SIGRTMIN+28), /* systemd: set log target to kmsg */
+                        RTSIG_IF_AVAILABLE(SIGRTMIN+29), /* systemd: set log target to syslog-or-kmsg (obsolete) */
+
+                        /* ... one free signal here SIGRTMIN+30 ... */
+                        -1);
+        assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
+
+        m->signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC);
+        if (m->signal_fd < 0)
+                return -errno;
+
+        r = sd_event_add_io(m->event, &m->signal_event_source, m->signal_fd, EPOLLIN, manager_dispatch_signal_fd, m);
+        if (r < 0)
+                return r;
+
+        (void) sd_event_source_set_description(m->signal_event_source, "manager-signal");
+
+        /* Process signals a bit earlier than the rest of things, but later than notify_fd processing, so that the
+         * notify processing can still figure out to which process/service a message belongs, before we reap the
+         * process. Also, process this before handling cgroup notifications, so that we always collect child exit
+         * status information before detecting that there's no process in a cgroup. */
+        r = sd_event_source_set_priority(m->signal_event_source, SD_EVENT_PRIORITY_NORMAL-6);
+        if (r < 0)
+                return r;
+
+        if (MANAGER_IS_SYSTEM(m))
+                return enable_special_signals(m);
+
+        return 0;
+}
+
+/* Strip manager-to-service protocol variables from 'l' in place and sort it. Returns 'l'. */
+static char** sanitize_environment(char **l) {
+
+        /* Let's remove some environment variables that we need ourselves to communicate with our clients */
+        strv_env_unset_many(
+                        l,
+                        "CACHE_DIRECTORY",
+                        "CONFIGURATION_DIRECTORY",
+                        "CREDENTIALS_DIRECTORY",
+                        "EXIT_CODE",
+                        "EXIT_STATUS",
+                        "INVOCATION_ID",
+                        "JOURNAL_STREAM",
+                        "LISTEN_FDNAMES",
+                        "LISTEN_FDS",
+                        "LISTEN_PID",
+                        "LOGS_DIRECTORY",
+                        "LOG_NAMESPACE",
+                        "MAINPID",
+                        "MANAGERPID",
+                        "MEMORY_PRESSURE_WATCH",
+                        "MEMORY_PRESSURE_WRITE",
+                        "MONITOR_EXIT_CODE",
+                        "MONITOR_EXIT_STATUS",
+                        "MONITOR_INVOCATION_ID",
+                        "MONITOR_SERVICE_RESULT",
+                        "MONITOR_UNIT",
+                        "NOTIFY_SOCKET",
+                        "PIDFILE",
+                        "REMOTE_ADDR",
+                        "REMOTE_PORT",
+                        "RUNTIME_DIRECTORY",
+                        "SERVICE_RESULT",
+                        "STATE_DIRECTORY",
+                        "SYSTEMD_EXEC_PID",
+                        "TRIGGER_PATH",
+                        "TRIGGER_TIMER_MONOTONIC_USEC",
+                        "TRIGGER_TIMER_REALTIME_USEC",
+                        "TRIGGER_UNIT",
+                        "WATCHDOG_PID",
+                        "WATCHDOG_USEC",
+                        NULL);
+
+        /* Let's order the environment alphabetically, just to make it pretty */
+        return strv_sort(l);
+}
+
+/* Rebuild m->transient_environment from scratch: a minimal clean environment for the
+ * system manager, or a sanitized copy of our own environment for a user manager. */
+int manager_default_environment(Manager *m) {
+        int r;
+
+        assert(m);
+
+        m->transient_environment = strv_free(m->transient_environment);
+
+        if (MANAGER_IS_SYSTEM(m)) {
+                /* The system manager always starts with a clean environment for its children. It does not
+                 * import the kernel's or the parents' exported variables.
+                 *
+                 * The initial passed environment is untouched to keep /proc/self/environ valid; it is used
+                 * for tagging the init process inside containers. */
+                m->transient_environment = strv_new("PATH=" DEFAULT_PATH);
+                if (!m->transient_environment)
+                        return log_oom();
+
+                /* Import locale variables LC_*= from configuration */
+                (void) locale_setup(&m->transient_environment);
+        } else {
+                /* The user manager passes its own environment along to its children, except for $PATH. */
+                m->transient_environment = strv_copy(environ);
+                if (!m->transient_environment)
+                        return log_oom();
+
+                r = strv_env_replace_strdup(&m->transient_environment, "PATH=" DEFAULT_USER_PATH);
+                if (r < 0)
+                        return log_oom();
+        }
+
+        /* Drop protocol variables and sort, see sanitize_environment(). */
+        sanitize_environment(m->transient_environment);
+        return 0;
+}
+
+/* Resolve the base directory for each ExecDirectoryType (runtime/state/cache/logs/config)
+ * into m->prefix[], using the system or per-user sd-path lookups as appropriate. */
+static int manager_setup_prefix(Manager *m) {
+        struct table_entry {
+                uint64_t type;     /* SD_PATH_* identifier passed to sd_path_lookup() */
+                const char *suffix; /* optional sub-path appended to the looked-up path */
+        };
+
+        static const struct table_entry paths_system[_EXEC_DIRECTORY_TYPE_MAX] = {
+                [EXEC_DIRECTORY_RUNTIME] = { SD_PATH_SYSTEM_RUNTIME, NULL },
+                [EXEC_DIRECTORY_STATE] = { SD_PATH_SYSTEM_STATE_PRIVATE, NULL },
+                [EXEC_DIRECTORY_CACHE] = { SD_PATH_SYSTEM_STATE_CACHE, NULL },
+                [EXEC_DIRECTORY_LOGS] = { SD_PATH_SYSTEM_STATE_LOGS, NULL },
+                [EXEC_DIRECTORY_CONFIGURATION] = { SD_PATH_SYSTEM_CONFIGURATION, NULL },
+        };
+
+        static const struct table_entry paths_user[_EXEC_DIRECTORY_TYPE_MAX] = {
+                [EXEC_DIRECTORY_RUNTIME] = { SD_PATH_USER_RUNTIME, NULL },
+                [EXEC_DIRECTORY_STATE] = { SD_PATH_USER_STATE_PRIVATE, NULL },
+                [EXEC_DIRECTORY_CACHE] = { SD_PATH_USER_STATE_CACHE, NULL },
+                [EXEC_DIRECTORY_LOGS] = { SD_PATH_USER_STATE_PRIVATE, "log" },
+                [EXEC_DIRECTORY_CONFIGURATION] = { SD_PATH_USER_CONFIGURATION, NULL },
+        };
+
+        assert(m);
+
+        const struct table_entry *p = MANAGER_IS_SYSTEM(m) ? paths_system : paths_user;
+        int r;
+
+        for (ExecDirectoryType i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++) {
+                r = sd_path_lookup(p[i].type, p[i].suffix, &m->prefix[i]);
+                if (r < 0)
+                        return log_warning_errno(r, "Failed to lookup %s path: %m",
+                                                 exec_directory_type_to_string(i));
+        }
+
+        return 0;
+}
+
+/* Flush all cached unit-name lookup maps, forcing a rebuild on next use. */
+static void manager_free_unit_name_maps(Manager *m) {
+        m->unit_id_map = hashmap_free(m->unit_id_map);
+        m->unit_name_map = hashmap_free(m->unit_name_map);
+        m->unit_path_cache = set_free(m->unit_path_cache);
+        m->unit_cache_timestamp_hash = 0; /* invalidate the cache timestamp too */
+}
+
+/* Create the deferred event source that dispatches the job run queue. It runs at idle
+ * priority (only when nothing more urgent is pending) and starts out disabled. */
+static int manager_setup_run_queue(Manager *m) {
+        int r;
+
+        assert(m);
+        assert(!m->run_queue_event_source);
+
+        r = sd_event_add_defer(m->event, &m->run_queue_event_source, manager_dispatch_run_queue, m);
+        if (r < 0)
+                return r;
+
+        r = sd_event_source_set_priority(m->run_queue_event_source, SD_EVENT_PRIORITY_IDLE);
+        if (r < 0)
+                return r;
+
+        r = sd_event_source_set_enabled(m->run_queue_event_source, SD_EVENT_OFF);
+        if (r < 0)
+                return r;
+
+        (void) sd_event_source_set_description(m->run_queue_event_source, "manager-run-queue");
+
+        return 0;
+}
+
+/* Create the deferred event source used to process SIGCHLD/child exits. Runs at a high
+ * priority (NORMAL-7, just above signal handling) and starts out disabled. */
+static int manager_setup_sigchld_event_source(Manager *m) {
+        int r;
+
+        assert(m);
+        assert(!m->sigchld_event_source);
+
+        r = sd_event_add_defer(m->event, &m->sigchld_event_source, manager_dispatch_sigchld, m);
+        if (r < 0)
+                return r;
+
+        r = sd_event_source_set_priority(m->sigchld_event_source, SD_EVENT_PRIORITY_NORMAL-7);
+        if (r < 0)
+                return r;
+
+        r = sd_event_source_set_enabled(m->sigchld_event_source, SD_EVENT_OFF);
+        if (r < 0)
+                return r;
+
+        (void) sd_event_source_set_description(m->sigchld_event_source, "manager-sigchld");
+
+        return 0;
+}
+
+/* (Re-)establish the PSI memory pressure event source for the manager itself. Failure to
+ * set it up is non-fatal; always returns 0. */
+int manager_setup_memory_pressure_event_source(Manager *m) {
+        int r;
+
+        assert(m);
+
+        m->memory_pressure_event_source = sd_event_source_disable_unref(m->memory_pressure_event_source);
+
+        r = sd_event_add_memory_pressure(m->event, &m->memory_pressure_event_source, NULL, NULL);
+        if (r < 0)
+                /* Expected on kernels/cgroup setups without PSI or without privileges; log quietly then. */
+                log_full_errno(ERRNO_IS_NOT_SUPPORTED(r) || ERRNO_IS_PRIVILEGE(r) || (r == -EHOSTDOWN) ? LOG_DEBUG : LOG_NOTICE, r,
+                               "Failed to establish memory pressure event source, ignoring: %m");
+        else if (m->defaults.memory_pressure_threshold_usec != USEC_INFINITY) {
+
+                /* If there's a default memory pressure threshold set, also apply it to the service manager itself */
+                r = sd_event_source_set_memory_pressure_period(
+                                m->memory_pressure_event_source,
+                                m->defaults.memory_pressure_threshold_usec,
+                                MEMORY_PRESSURE_DEFAULT_WINDOW_USEC);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to adjust memory pressure threshold, ignoring: %m");
+        }
+
+        return 0;
+}
+
+/* Cache the plain and encrypted credentials directories passed to us by our invoking
+ * environment. -ENXIO simply means none was set; other errors are logged and ignored. */
+static int manager_find_credentials_dirs(Manager *m) {
+        const char *path;
+        int r;
+
+        assert(m);
+
+        r = get_credentials_dir(&path);
+        if (r >= 0) {
+                m->received_credentials_directory = strdup(path);
+                if (!m->received_credentials_directory)
+                        return -ENOMEM;
+        } else if (r != -ENXIO)
+                log_debug_errno(r, "Failed to determine credentials directory, ignoring: %m");
+
+        r = get_encrypted_credentials_dir(&path);
+        if (r >= 0) {
+                m->received_encrypted_credentials_directory = strdup(path);
+                if (!m->received_encrypted_credentials_directory)
+                        return -ENOMEM;
+        } else if (r != -ENXIO)
+                log_debug_errno(r, "Failed to determine encrypted credentials directory, ignoring: %m");
+
+        return 0;
+}
+
+/* Record whether we are currently switching root; only ever true for the system manager. */
+void manager_set_switching_root(Manager *m, bool switching_root) {
+        assert(m);
+
+        m->switching_root = MANAGER_IS_SYSTEM(m) && switching_root;
+}
+
+/* Returns startup progress as a fraction in [0.0, 1.0]: the share of installed jobs that
+ * have already left the job table, or 1.0 once startup is finished. */
+double manager_get_progress(Manager *m) {
+        assert(m);
+
+        if (MANAGER_IS_FINISHED(m) || m->n_installed_jobs == 0)
+                return 1.0;
+
+        double pending = (double) hashmap_size(m->jobs);
+        return 1.0 - pending / (double) m->n_installed_jobs;
+}
+
+/* Priority-queue comparator for the run queue: delegates to the units' relative priority. */
+static int compare_job_priority(const void *a, const void *b) {
+        const Job *x = a, *y = b;
+
+        return unit_compare_priority(x->unit, y->unit);
+}
+
+/* Allocate and initialize a new Manager for the given runtime scope (system or user).
+ * Sets up the event loop, core hashmaps, signal/cgroup/time-change infrastructure (unless
+ * running in a minimal test mode) and locates the systemd-executor binary. On success
+ * stores the new manager in *ret and returns 0; returns a negative errno on failure. */
+int manager_new(RuntimeScope runtime_scope, ManagerTestRunFlags test_run_flags, Manager **ret) {
+        _cleanup_(manager_freep) Manager *m = NULL;
+        int r;
+
+        assert(IN_SET(runtime_scope, RUNTIME_SCOPE_SYSTEM, RUNTIME_SCOPE_USER));
+        assert(ret);
+
+        m = new(Manager, 1);
+        if (!m)
+                return -ENOMEM;
+
+        /* Initialize all fds to "closed" and all special values to their invalid markers,
+         * so that manager_free() on the error path does the right thing. */
+        *m = (Manager) {
+                .runtime_scope = runtime_scope,
+                .objective = _MANAGER_OBJECTIVE_INVALID,
+
+                .status_unit_format = STATUS_UNIT_FORMAT_DEFAULT,
+
+                .original_log_level = -1,
+                .original_log_target = _LOG_TARGET_INVALID,
+
+                .watchdog_overridden[WATCHDOG_RUNTIME] = USEC_INFINITY,
+                .watchdog_overridden[WATCHDOG_REBOOT] = USEC_INFINITY,
+                .watchdog_overridden[WATCHDOG_KEXEC] = USEC_INFINITY,
+                .watchdog_overridden[WATCHDOG_PRETIMEOUT] = USEC_INFINITY,
+
+                .show_status_overridden = _SHOW_STATUS_INVALID,
+
+                .notify_fd = -EBADF,
+                .cgroups_agent_fd = -EBADF,
+                .signal_fd = -EBADF,
+                .user_lookup_fds = EBADF_PAIR,
+                .private_listen_fd = -EBADF,
+                .dev_autofs_fd = -EBADF,
+                .cgroup_inotify_fd = -EBADF,
+                .pin_cgroupfs_fd = -EBADF,
+                .ask_password_inotify_fd = -EBADF,
+                .idle_pipe = { -EBADF, -EBADF, -EBADF, -EBADF},
+
+                /* start as id #1, so that we can leave #0 around as "null-like" value */
+                .current_job_id = 1,
+
+                .have_ask_password = -EINVAL, /* we don't know */
+                .first_boot = -1,
+                .test_run_flags = test_run_flags,
+
+                .dump_ratelimit = (const RateLimit) { .interval = 10 * USEC_PER_MINUTE, .burst = 10 },
+
+                .executor_fd = -EBADF,
+        };
+
+        unit_defaults_init(&m->defaults, runtime_scope);
+
+#if ENABLE_EFI
+        /* On bare-metal EFI systems, pull firmware/loader timestamps for boot analysis. */
+        if (MANAGER_IS_SYSTEM(m) && detect_container() <= 0)
+                boot_timestamps(m->timestamps + MANAGER_TIMESTAMP_USERSPACE,
+                                m->timestamps + MANAGER_TIMESTAMP_FIRMWARE,
+                                m->timestamps + MANAGER_TIMESTAMP_LOADER);
+#endif
+
+        /* Prepare log fields we can use for structured logging */
+        if (MANAGER_IS_SYSTEM(m)) {
+                m->unit_log_field = "UNIT=";
+                m->unit_log_format_string = "UNIT=%s";
+
+                m->invocation_log_field = "INVOCATION_ID=";
+                m->invocation_log_format_string = "INVOCATION_ID=%s";
+        } else {
+                m->unit_log_field = "USER_UNIT=";
+                m->unit_log_format_string = "USER_UNIT=%s";
+
+                m->invocation_log_field = "USER_INVOCATION_ID=";
+                m->invocation_log_format_string = "USER_INVOCATION_ID=%s";
+        }
+
+        /* Reboot immediately if the user hits C-A-D more often than 7x per 2s */
+        m->ctrl_alt_del_ratelimit = (const RateLimit) { .interval = 2 * USEC_PER_SEC, .burst = 7 };
+
+        r = manager_default_environment(m);
+        if (r < 0)
+                return r;
+
+        r = hashmap_ensure_allocated(&m->units, &string_hash_ops);
+        if (r < 0)
+                return r;
+
+        r = hashmap_ensure_allocated(&m->cgroup_unit, &path_hash_ops);
+        if (r < 0)
+                return r;
+
+        r = hashmap_ensure_allocated(&m->watch_bus, &string_hash_ops);
+        if (r < 0)
+                return r;
+
+        r = prioq_ensure_allocated(&m->run_queue, compare_job_priority);
+        if (r < 0)
+                return r;
+
+        r = manager_setup_prefix(m);
+        if (r < 0)
+                return r;
+
+        r = manager_find_credentials_dirs(m);
+        if (r < 0)
+                return r;
+
+        r = sd_event_default(&m->event);
+        if (r < 0)
+                return r;
+
+        r = manager_setup_run_queue(m);
+        if (r < 0)
+                return r;
+
+        if (FLAGS_SET(test_run_flags, MANAGER_TEST_RUN_MINIMAL)) {
+                /* Minimal test runs skip all OS-level setup below; fake an empty cgroup root. */
+                m->cgroup_root = strdup("");
+                if (!m->cgroup_root)
+                        return -ENOMEM;
+        } else {
+                r = manager_setup_signals(m);
+                if (r < 0)
+                        return r;
+
+                r = manager_setup_cgroup(m);
+                if (r < 0)
+                        return r;
+
+                r = manager_setup_time_change(m);
+                if (r < 0)
+                        return r;
+
+                r = manager_read_timezone_stat(m);
+                if (r < 0)
+                        return r;
+
+                (void) manager_setup_timezone_change(m);
+
+                r = manager_setup_sigchld_event_source(m);
+                if (r < 0)
+                        return r;
+
+                r = manager_setup_memory_pressure_event_source(m);
+                if (r < 0)
+                        return r;
+
+#if HAVE_LIBBPF
+                /* LSM BPF setup is best-effort: failure is logged but not fatal. */
+                if (MANAGER_IS_SYSTEM(m) && lsm_bpf_supported(/* initialize = */ true)) {
+                        r = lsm_bpf_setup(m);
+                        if (r < 0)
+                                log_warning_errno(r, "Failed to setup LSM BPF, ignoring: %m");
+                }
+#endif
+        }
+
+        if (test_run_flags == 0) {
+                /* Normal (non-test) operation: create the unit state directory and pin the
+                 * installed executor binary. */
+                if (MANAGER_IS_SYSTEM(m))
+                        r = mkdir_label("/run/systemd/units", 0755);
+                else {
+                        _cleanup_free_ char *units_path = NULL;
+                        r = xdg_user_runtime_dir(&units_path, "/systemd/units");
+                        if (r < 0)
+                                return r;
+                        r = mkdir_p_label(units_path, 0755);
+                }
+
+                if (r < 0 && r != -EEXIST)
+                        return r;
+
+                m->executor_fd = open(SYSTEMD_EXECUTOR_BINARY_PATH, O_CLOEXEC|O_PATH);
+                if (m->executor_fd < 0)
+                        return log_emergency_errno(errno,
+                                                   "Failed to open executor binary '%s': %m",
+                                                   SYSTEMD_EXECUTOR_BINARY_PATH);
+        } else if (!FLAGS_SET(test_run_flags, MANAGER_TEST_DONT_OPEN_EXECUTOR)) {
+                _cleanup_free_ char *self_exe = NULL, *executor_path = NULL;
+                _cleanup_close_ int self_dir_fd = -EBADF;
+                int level = LOG_DEBUG;
+
+                /* Prefer sd-executor from the same directory as the test, e.g.: when running unit tests from the
+                 * build directory. Fallback to working directory and then the installation path. */
+                r = readlink_and_make_absolute("/proc/self/exe", &self_exe);
+                if (r < 0)
+                        return r;
+
+                self_dir_fd = open_parent(self_exe, O_CLOEXEC|O_PATH|O_DIRECTORY, 0);
+                if (self_dir_fd < 0)
+                        return self_dir_fd;
+
+                m->executor_fd = RET_NERRNO(openat(self_dir_fd, "systemd-executor", O_CLOEXEC|O_PATH));
+                if (m->executor_fd == -ENOENT)
+                        m->executor_fd = RET_NERRNO(openat(AT_FDCWD, "systemd-executor", O_CLOEXEC|O_PATH));
+                if (m->executor_fd == -ENOENT) {
+                        m->executor_fd = RET_NERRNO(open(SYSTEMD_EXECUTOR_BINARY_PATH, O_CLOEXEC|O_PATH));
+                        level = LOG_WARNING; /* Tests should normally use local builds */
+                }
+                if (m->executor_fd < 0)
+                        return m->executor_fd;
+
+                r = fd_get_path(m->executor_fd, &executor_path);
+                if (r < 0)
+                        return r;
+
+                log_full(level, "Using systemd-executor binary from '%s'.", executor_path);
+        }
+
+        /* Note that we do not set up the notify fd here. We do that after deserialization,
+         * since they might have gotten serialized across the reexec. */
+
+        *ret = TAKE_PTR(m);
+
+        return 0;
+}
+
+/* Sets up the manager's notification socket (the AF_UNIX datagram socket services use for
+ * sd_notify() messages) and the event source that reads it. May run again after
+ * deserialization: if the fd survived a reexec only the event source is recreated.
+ * Returns 0 on success (and for test runs, where this is a no-op), negative errno on failure. */
+static int manager_setup_notify(Manager *m) {
+ int r;
+
+ if (MANAGER_IS_TEST_RUN(m))
+ return 0;
+
+ if (m->notify_fd < 0) {
+ _cleanup_close_ int fd = -EBADF;
+ union sockaddr_union sa;
+ socklen_t sa_len;
+
+ /* First free all secondary fields */
+ m->notify_socket = mfree(m->notify_socket);
+ m->notify_event_source = sd_event_source_disable_unref(m->notify_event_source);
+
+ fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
+ if (fd < 0)
+ return log_error_errno(errno, "Failed to allocate notification socket: %m");
+
+ /* Enlarge the receive buffer so bursts of notifications are less likely dropped */
+ fd_increase_rxbuf(fd, NOTIFY_RCVBUF_SIZE);
+
+ m->notify_socket = path_join(m->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/notify");
+ if (!m->notify_socket)
+ return log_oom();
+
+ r = sockaddr_un_set_path(&sa.un, m->notify_socket);
+ if (r < 0)
+ return log_error_errno(r, "Notify socket '%s' not valid for AF_UNIX socket address, refusing.",
+ m->notify_socket);
+ /* On success sockaddr_un_set_path() returns the address length to pass to bind() */
+ sa_len = r;
+
+ (void) mkdir_parents_label(m->notify_socket, 0755);
+ /* Remove any stale socket file left over from a previous run */
+ (void) sockaddr_un_unlink(&sa.un);
+
+ r = mac_selinux_bind(fd, &sa.sa, sa_len);
+ if (r < 0)
+ return log_error_errno(r, "bind(%s) failed: %m", m->notify_socket);
+
+ /* Needed so we can attribute incoming datagrams to a sender PID/UID */
+ r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
+ if (r < 0)
+ return log_error_errno(r, "SO_PASSCRED failed: %m");
+
+ m->notify_fd = TAKE_FD(fd);
+
+ log_debug("Using notification socket %s", m->notify_socket);
+ }
+
+ if (!m->notify_event_source) {
+ r = sd_event_add_io(m->event, &m->notify_event_source, m->notify_fd, EPOLLIN, manager_dispatch_notify_fd, m);
+ if (r < 0)
+ return log_error_errno(r, "Failed to allocate notify event source: %m");
+
+ /* Process notification messages a bit earlier than SIGCHLD, so that we can still identify to which
+ * service an exit message belongs. */
+ r = sd_event_source_set_priority(m->notify_event_source, SD_EVENT_PRIORITY_NORMAL-8);
+ if (r < 0)
+ return log_error_errno(r, "Failed to set priority of notify event source: %m");
+
+ (void) sd_event_source_set_description(m->notify_event_source, "manager-notify");
+ }
+
+ return 0;
+}
+
+/* Sets up the private AF_UNIX datagram socket on which the system instance receives messages from
+ * the short-lived cgroups agent binary (legacy/non-unified cgroup hierarchy only), plus its event
+ * source. Returns 0 on success or when not applicable, negative errno on failure. */
+static int manager_setup_cgroups_agent(Manager *m) {
+
+ static const union sockaddr_union sa = {
+ .un.sun_family = AF_UNIX,
+ .un.sun_path = "/run/systemd/cgroups-agent",
+ };
+ int r;
+
+ /* This creates a listening socket we receive cgroups agent messages on. We do not use D-Bus for delivering
+ * these messages from the cgroups agent binary to PID 1, as the cgroups agent binary is very short-living, and
+ * each instance of it needs a new D-Bus connection. Since D-Bus connections are SOCK_STREAM/AF_UNIX, on
+ * overloaded systems the backlog of the D-Bus socket becomes relevant, as not more than the configured number
+ * of D-Bus connections may be queued until the kernel will start dropping further incoming connections,
+ * possibly resulting in lost cgroups agent messages. To avoid this, we'll use a private SOCK_DGRAM/AF_UNIX
+ * socket, where no backlog is relevant as communication may take place without an actual connect() cycle, and
+ * we thus won't lose messages.
+ *
+ * Note that PID 1 will forward the agent message to system bus, so that the user systemd instance may listen
+ * to it. The system instance hence listens on this special socket, but the user instances listen on the system
+ * bus for these messages. */
+
+ if (MANAGER_IS_TEST_RUN(m))
+ return 0;
+
+ if (!MANAGER_IS_SYSTEM(m))
+ return 0;
+
+ r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine whether unified cgroups hierarchy is used: %m");
+ if (r > 0) /* We don't need this anymore on the unified hierarchy */
+ return 0;
+
+ if (m->cgroups_agent_fd < 0) {
+ _cleanup_close_ int fd = -EBADF;
+
+ /* First free all secondary fields */
+ m->cgroups_agent_event_source = sd_event_source_disable_unref(m->cgroups_agent_event_source);
+
+ fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
+ if (fd < 0)
+ return log_error_errno(errno, "Failed to allocate cgroups agent socket: %m");
+
+ /* Enlarge the receive buffer so bursts of agent messages are less likely dropped */
+ fd_increase_rxbuf(fd, CGROUPS_AGENT_RCVBUF_SIZE);
+
+ /* Remove any stale socket file left over from a previous run */
+ (void) sockaddr_un_unlink(&sa.un);
+
+ /* Only allow root to connect to this socket */
+ WITH_UMASK(0077)
+ r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
+ if (r < 0)
+ return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path);
+
+ m->cgroups_agent_fd = TAKE_FD(fd);
+ }
+
+ if (!m->cgroups_agent_event_source) {
+ r = sd_event_add_io(m->event, &m->cgroups_agent_event_source, m->cgroups_agent_fd, EPOLLIN, manager_dispatch_cgroups_agent_fd, m);
+ if (r < 0)
+ return log_error_errno(r, "Failed to allocate cgroups agent event source: %m");
+
+ /* Process cgroups notifications early. Note that when the agent notification is received
+ * we'll just enqueue the unit in the cgroup empty queue, hence pick a higher priority than
+ * that. Also see handling of cgroup inotify for the unified cgroup stuff. */
+ r = sd_event_source_set_priority(m->cgroups_agent_event_source, SD_EVENT_PRIORITY_NORMAL-9);
+ if (r < 0)
+ return log_error_errno(r, "Failed to set priority of cgroups agent event source: %m");
+
+ (void) sd_event_source_set_description(m->cgroups_agent_event_source, "manager-cgroups-agent");
+ }
+
+ return 0;
+}
+
+/* Sets up the datagram socket pair over which forked-off children report the UID/GID they
+ * resolved (needed e.g. for RemoveIPC=) back to PID 1, plus the event source reading it.
+ * Returns 0 on success, negative errno on failure. */
+static int manager_setup_user_lookup_fd(Manager *m) {
+ int r;
+
+ assert(m);
+
+ /* Set up the socket pair used for passing UID/GID resolution results from forked off processes to PID
+ * 1. Background: we can't do name lookups (NSS) from PID 1, since it might involve IPC and thus activation,
+ * and we might hence deadlock on ourselves. Hence we do all user/group lookups asynchronously from the forked
+ * off processes right before executing the binaries to start. In order to be able to clean up any IPC objects
+ * created by a unit (see RemoveIPC=) we need to know in PID 1 the used UID/GID of the executed processes,
+ * hence we establish this communication channel so that forked off processes can pass their UID/GID
+ * information back to PID 1. The forked off processes send their resolved UID/GID to PID 1 in a simple
+ * datagram, along with their unit name, so that we can share one communication socket pair among all units for
+ * this purpose.
+ *
+ * You might wonder why we need a communication channel for this that is independent of the usual notification
+ * socket scheme (i.e. $NOTIFY_SOCKET). The primary difference is about trust: data sent via the $NOTIFY_SOCKET
+ * channel is only accepted if it originates from the right unit and if reception was enabled for it. The user
+ * lookup socket OTOH is only accessible by PID 1 and its children until they exec(), and always available.
+ *
+ * Note that this function is called under two circumstances: when we first initialize (in which case we
+ * allocate both the socket pair and the event source to listen on it), and when we deserialize after a reload
+ * (in which case the socket pair already exists but we still need to allocate the event source for it). */
+
+ if (m->user_lookup_fds[0] < 0) {
+
+ /* Free all secondary fields */
+ safe_close_pair(m->user_lookup_fds);
+ m->user_lookup_event_source = sd_event_source_disable_unref(m->user_lookup_event_source);
+
+ if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, m->user_lookup_fds) < 0)
+ return log_error_errno(errno, "Failed to allocate user lookup socket: %m");
+
+ (void) fd_increase_rxbuf(m->user_lookup_fds[0], NOTIFY_RCVBUF_SIZE);
+ }
+
+ if (!m->user_lookup_event_source) {
+ r = sd_event_add_io(m->event, &m->user_lookup_event_source, m->user_lookup_fds[0], EPOLLIN, manager_dispatch_user_lookup_fd, m);
+ if (r < 0)
+ /* sd_event_add_io() returns the error in 'r'; errno is not meaningful here */
+ return log_error_errno(r, "Failed to allocate user lookup event source: %m");
+
+ /* Process even earlier than the notify event source, so that we always know first about valid UID/GID
+ * resolutions */
+ r = sd_event_source_set_priority(m->user_lookup_event_source, SD_EVENT_PRIORITY_NORMAL-11);
+ if (r < 0)
+ /* Likewise, the error is in 'r', not errno */
+ return log_error_errno(r, "Failed to set priority of user lookup event source: %m");
+
+ (void) sd_event_source_set_description(m->user_lookup_event_source, "user-lookup");
+ }
+
+ return 0;
+}
+
+/* Frees every unit currently sitting in the cleanup queue. unit_free() removes the unit from the
+ * queue itself, hence simply draining the queue head until it is empty. Returns the number of
+ * units freed. */
+static unsigned manager_dispatch_cleanup_queue(Manager *m) {
+ unsigned dispatched = 0;
+
+ assert(m);
+
+ for (Unit *u; (u = m->cleanup_queue); dispatched++) {
+ assert(u->in_cleanup_queue);
+ unit_free(u);
+ }
+
+ return dispatched;
+}
+
+/* Pops each unit off the release-resources queue, clears its queue membership flag and asks it to
+ * release its runtime resources. Returns the number of units processed. */
+static unsigned manager_dispatch_release_resources_queue(Manager *m) {
+ unsigned dispatched = 0;
+
+ assert(m);
+
+ for (Unit *u; (u = LIST_POP(release_resources_queue, m->release_resources_queue)); dispatched++) {
+ assert(u->in_release_resources_queue);
+ u->in_release_resources_queue = false;
+
+ unit_release_resources(u);
+ }
+
+ return dispatched;
+}
+
+/* Offsets added to the manager's current gc_marker to record the per-unit verdict of one
+ * mark-and-sweep GC cycle. A unit's absolute gc_marker value is only meaningful relative to the
+ * gc_marker of the cycle that wrote it (see unit_gc_sweep() below). */
+enum {
+ GC_OFFSET_IN_PATH, /* This one is on the path we were traveling */
+ GC_OFFSET_UNSURE, /* No clue */
+ GC_OFFSET_GOOD, /* We still need this unit */
+ GC_OFFSET_BAD, /* We don't need this unit anymore */
+ _GC_OFFSET_MAX
+};
+
+/* Marks 'u' as GOOD for the GC cycle identified by 'gc_marker', and transitively marks every unit
+ * it references that is still UNSURE. The UNSURE guard also terminates recursion on cycles. */
+static void unit_gc_mark_good(Unit *u, unsigned gc_marker) {
+ Unit *other;
+
+ u->gc_marker = gc_marker + GC_OFFSET_GOOD;
+
+ /* Recursively mark referenced units as GOOD as well */
+ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_REFERENCES)
+ if (other->gc_marker == gc_marker + GC_OFFSET_UNSURE)
+ unit_gc_mark_good(other, gc_marker);
+}
+
+/* Decides whether 'u' is still needed in the GC cycle identified by 'gc_marker'. A unit is GOOD if
+ * it may not be GC'd itself or if any unit referencing it turns out GOOD; BAD if it is already
+ * queued for cleanup or every referencing unit is BAD; otherwise UNSURE (requeued for a later
+ * pass). IN_PATH marks units currently on the recursion path so cycles don't recurse forever. */
+static void unit_gc_sweep(Unit *u, unsigned gc_marker) {
+ Unit *other;
+ bool is_bad;
+
+ assert(u);
+
+ /* Already visited in this cycle (or currently on the path)? Then nothing to do. */
+ if (IN_SET(u->gc_marker - gc_marker,
+ GC_OFFSET_GOOD, GC_OFFSET_BAD, GC_OFFSET_UNSURE, GC_OFFSET_IN_PATH))
+ return;
+
+ if (u->in_cleanup_queue)
+ goto bad;
+
+ if (!unit_may_gc(u))
+ goto good;
+
+ u->gc_marker = gc_marker + GC_OFFSET_IN_PATH;
+
+ is_bad = true;
+
+ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_REFERENCED_BY) {
+ unit_gc_sweep(other, gc_marker);
+
+ if (other->gc_marker == gc_marker + GC_OFFSET_GOOD)
+ goto good;
+
+ /* At least one referencing unit is not definitely BAD, so we can't be sure either */
+ if (other->gc_marker != gc_marker + GC_OFFSET_BAD)
+ is_bad = false;
+ }
+
+ LIST_FOREACH(refs_by_target, ref, u->refs_by_target) {
+ unit_gc_sweep(ref->source, gc_marker);
+
+ if (ref->source->gc_marker == gc_marker + GC_OFFSET_GOOD)
+ goto good;
+
+ if (ref->source->gc_marker != gc_marker + GC_OFFSET_BAD)
+ is_bad = false;
+ }
+
+ if (is_bad)
+ goto bad;
+
+ /* We were unable to find anything out about this entry, so
+ * let's investigate it later */
+ u->gc_marker = gc_marker + GC_OFFSET_UNSURE;
+ unit_add_to_gc_queue(u);
+ return;
+
+bad:
+ /* We definitely know that this one is not useful anymore, so
+ * let's mark it for deletion */
+ u->gc_marker = gc_marker + GC_OFFSET_BAD;
+ unit_add_to_cleanup_queue(u);
+ return;
+
+good:
+ unit_gc_mark_good(u, gc_marker);
+}
+
+/* Runs one GC cycle over all units currently in the GC queue: sweeps each unit, and moves those
+ * that end up BAD or still UNSURE to the cleanup queue. Returns the number of units processed. */
+static unsigned manager_dispatch_gc_unit_queue(Manager *m) {
+ unsigned n = 0, gc_marker;
+ Unit *u;
+
+ assert(m);
+
+ /* log_debug("Running GC..."); */
+
+ /* Advance the marker by a full offset range so previous cycles' values can't collide;
+ * restart at 1 on wraparound. */
+ m->gc_marker += _GC_OFFSET_MAX;
+ if (m->gc_marker + _GC_OFFSET_MAX <= _GC_OFFSET_MAX)
+ m->gc_marker = 1;
+
+ gc_marker = m->gc_marker;
+
+ while ((u = LIST_POP(gc_queue, m->gc_unit_queue))) {
+ assert(u->in_gc_queue);
+
+ unit_gc_sweep(u, gc_marker);
+
+ u->in_gc_queue = false;
+
+ n++;
+
+ /* UNSURE after a full sweep is treated like BAD: nothing keeps this unit around */
+ if (IN_SET(u->gc_marker - gc_marker,
+ GC_OFFSET_BAD, GC_OFFSET_UNSURE)) {
+ if (u->id)
+ log_unit_debug(u, "Collecting.");
+ u->gc_marker = gc_marker + GC_OFFSET_BAD;
+ unit_add_to_cleanup_queue(u);
+ }
+ }
+
+ return n;
+}
+
+/* Drains the job GC queue, finishing (as JOB_COLLECTED) every job that may be garbage collected.
+ * Returns the number of jobs examined. */
+static unsigned manager_dispatch_gc_job_queue(Manager *m) {
+ unsigned n = 0;
+ Job *j;
+
+ assert(m);
+
+ while ((j = LIST_POP(gc_queue, m->gc_job_queue))) {
+ assert(j->in_gc_queue);
+ j->in_gc_queue = false;
+
+ n++;
+
+ if (!job_may_gc(j))
+ continue;
+
+ log_unit_debug(j->unit, "Collecting job.");
+ (void) job_finish_and_invalidate(j, JOB_COLLECTED, false, false);
+ }
+
+ return n;
+}
+
+/* Timer callback fired once a unit's auto start/stop rate limit interval has passed: drops the
+ * one-shot timer and requeues the unit to all three auto start/stop queues, since we can't tell
+ * which of them was throttled originally. */
+static int manager_ratelimit_requeue(sd_event_source *s, uint64_t usec, void *userdata) {
+ Unit *u = userdata;
+
+ assert(u);
+ assert(s == u->auto_start_stop_event_source);
+
+ u->auto_start_stop_event_source = sd_event_source_unref(u->auto_start_stop_event_source);
+
+ /* Re-queue to all queues, if the rate limit hit we might have been throttled on any of them. */
+ unit_submit_to_stop_when_unneeded_queue(u);
+ unit_submit_to_start_when_upheld_queue(u);
+ unit_submit_to_stop_when_bound_queue(u);
+
+ return 0;
+}
+
+/* Checks the unit's auto start/stop rate limit.
+ * Returns 1 if below the limit (caller may act now), 0 if throttled (a retry timer has been armed
+ * or is already pending), or a negative errno if arming the timer failed. */
+static int manager_ratelimit_check_and_queue(Unit *u) {
+ int r;
+
+ assert(u);
+
+ if (ratelimit_below(&u->auto_start_stop_ratelimit))
+ return 1;
+
+ /* Already queued, no need to requeue */
+ if (u->auto_start_stop_event_source)
+ return 0;
+
+ /* Arm a one-shot timer that fires when the rate limit interval ends, see
+ * manager_ratelimit_requeue() */
+ r = sd_event_add_time(
+ u->manager->event,
+ &u->auto_start_stop_event_source,
+ CLOCK_MONOTONIC,
+ ratelimit_end(&u->auto_start_stop_ratelimit),
+ 0,
+ manager_ratelimit_requeue,
+ u);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to queue timer on event loop: %m");
+
+ return 0;
+}
+
+/* Drains the StopWhenUnneeded= queue: for each queued unit that really is unneeded (subject to the
+ * auto start/stop rate limit) a stop job is enqueued. Returns the number of units examined. */
+static unsigned manager_dispatch_stop_when_unneeded_queue(Manager *m) {
+ unsigned n = 0;
+ Unit *u;
+ int r;
+
+ assert(m);
+
+ while ((u = LIST_POP(stop_when_unneeded_queue, m->stop_when_unneeded_queue))) {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+
+ assert(u->in_stop_when_unneeded_queue);
+ u->in_stop_when_unneeded_queue = false;
+
+ n++;
+
+ if (!unit_is_unneeded(u))
+ continue;
+
+ log_unit_debug(u, "Unit is not needed anymore.");
+
+ /* If stopping a unit fails continuously we might enter a stop loop here, hence stop acting on the
+ * service being unnecessary after a while. */
+
+ r = manager_ratelimit_check_and_queue(u);
+ if (r <= 0) {
+ log_unit_warning(u,
+ "Unit not needed anymore, but not stopping since we tried this too often recently.%s",
+ r == 0 ? " Will retry later." : "");
+ continue;
+ }
+
+ /* Ok, nobody needs us anymore. Sniff. Then let's commit suicide */
+ r = manager_add_job(u->manager, JOB_STOP, u, JOB_FAIL, NULL, &error, NULL);
+ if (r < 0)
+ log_unit_warning_errno(u, r, "Failed to enqueue stop job, ignoring: %s", bus_error_message(&error, r));
+ }
+
+ return n;
+}
+
+/* Drains the Upholds= queue: for each queued unit that is upheld by some active unit (subject to
+ * the auto start/stop rate limit) a start job is enqueued. Returns the number of units examined. */
+static unsigned manager_dispatch_start_when_upheld_queue(Manager *m) {
+ unsigned n = 0;
+ Unit *u;
+ int r;
+
+ assert(m);
+
+ while ((u = LIST_POP(start_when_upheld_queue, m->start_when_upheld_queue))) {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ Unit *culprit = NULL;
+
+ assert(u->in_start_when_upheld_queue);
+ u->in_start_when_upheld_queue = false;
+
+ n++;
+
+ if (!unit_is_upheld_by_active(u, &culprit))
+ continue;
+
+ log_unit_debug(u, "Unit is started because upheld by active unit %s.", culprit->id);
+
+ /* If starting a unit fails continuously we might enter a start loop here, hence stop acting
+ * on the unit needing to be started after a while. */
+
+ r = manager_ratelimit_check_and_queue(u);
+ if (r <= 0) {
+ log_unit_warning(u,
+ "Unit needs to be started because active unit %s upholds it, but not starting since we tried this too often recently.%s",
+ culprit->id,
+ r == 0 ? " Will retry later." : "");
+ continue;
+ }
+
+ r = manager_add_job(u->manager, JOB_START, u, JOB_FAIL, NULL, &error, NULL);
+ if (r < 0)
+ log_unit_warning_errno(u, r, "Failed to enqueue start job, ignoring: %s", bus_error_message(&error, r));
+ }
+
+ return n;
+}
+
+/* Drains the BindsTo= queue: for each queued unit bound to an inactive unit (subject to the auto
+ * start/stop rate limit) a stop job is enqueued. Returns the number of units examined. */
+static unsigned manager_dispatch_stop_when_bound_queue(Manager *m) {
+ unsigned n = 0;
+ Unit *u;
+ int r;
+
+ assert(m);
+
+ while ((u = LIST_POP(stop_when_bound_queue, m->stop_when_bound_queue))) {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ Unit *culprit = NULL;
+
+ assert(u->in_stop_when_bound_queue);
+ u->in_stop_when_bound_queue = false;
+
+ n++;
+
+ if (!unit_is_bound_by_inactive(u, &culprit))
+ continue;
+
+ log_unit_debug(u, "Unit is stopped because bound to inactive unit %s.", culprit->id);
+
+ /* If stopping a unit fails continuously we might enter a stop loop here, hence stop acting on the
+ * service being unnecessary after a while. */
+
+ r = manager_ratelimit_check_and_queue(u);
+ if (r <= 0) {
+ /* Log message fixed: previously read "… unit %s it, but not stopping …" with a
+ * stray "it" left over from an edit. */
+ log_unit_warning(u,
+ "Unit needs to be stopped because it is bound to inactive unit %s, but not stopping since we tried this too often recently.%s",
+ culprit->id,
+ r == 0 ? " Will retry later." : "");
+ continue;
+ }
+
+ r = manager_add_job(u->manager, JOB_STOP, u, JOB_REPLACE, NULL, &error, NULL);
+ if (r < 0)
+ log_unit_warning_errno(u, r, "Failed to enqueue stop job, ignoring: %s", bus_error_message(&error, r));
+ }
+
+ return n;
+}
+
+/* Frees all units (which also takes down their jobs), flushes the cleanup queue, and then asserts
+ * that every dispatch queue is empty. Resets the job/unit counters. Used from manager_free(). */
+static void manager_clear_jobs_and_units(Manager *m) {
+ Unit *u;
+
+ assert(m);
+
+ while ((u = hashmap_first(m->units)))
+ unit_free(u);
+
+ manager_dispatch_cleanup_queue(m);
+
+ /* Freeing all units must have emptied every queue; anything left indicates a leak */
+ assert(!m->load_queue);
+ assert(prioq_isempty(m->run_queue));
+ assert(!m->dbus_unit_queue);
+ assert(!m->dbus_job_queue);
+ assert(!m->cleanup_queue);
+ assert(!m->gc_unit_queue);
+ assert(!m->gc_job_queue);
+ assert(!m->cgroup_realize_queue);
+ assert(!m->cgroup_empty_queue);
+ assert(!m->cgroup_oom_queue);
+ assert(!m->target_deps_queue);
+ assert(!m->stop_when_unneeded_queue);
+ assert(!m->start_when_upheld_queue);
+ assert(!m->stop_when_bound_queue);
+ assert(!m->release_resources_queue);
+
+ assert(hashmap_isempty(m->jobs));
+ assert(hashmap_isempty(m->units));
+
+ m->n_on_console = 0;
+ m->n_running_jobs = 0;
+ m->n_installed_jobs = 0;
+ m->n_failed_jobs = 0;
+}
+
+/* Tears down the manager and everything it owns: units/jobs first, then per-type shutdown hooks,
+ * cgroup hierarchy (kept in place unless we are going down for good), buses, event sources, fds
+ * and all remaining allocations. NULL-safe; always returns NULL so callers can reset pointers. */
+Manager* manager_free(Manager *m) {
+ if (!m)
+ return NULL;
+
+ manager_clear_jobs_and_units(m);
+
+ for (UnitType c = 0; c < _UNIT_TYPE_MAX; c++)
+ if (unit_vtable[c]->shutdown)
+ unit_vtable[c]->shutdown(m);
+
+ /* Keep the cgroup hierarchy in place except when we know we are going down for good */
+ manager_shutdown_cgroup(m, /* delete= */ IN_SET(m->objective, MANAGER_EXIT, MANAGER_REBOOT, MANAGER_POWEROFF, MANAGER_HALT, MANAGER_KEXEC));
+
+ lookup_paths_flush_generator(&m->lookup_paths);
+
+ bus_done(m);
+ manager_varlink_done(m);
+
+ exec_shared_runtime_vacuum(m);
+ hashmap_free(m->exec_shared_runtime_by_id);
+
+ dynamic_user_vacuum(m, false);
+ hashmap_free(m->dynamic_users);
+
+ hashmap_free(m->units);
+ hashmap_free(m->units_by_invocation_id);
+ hashmap_free(m->jobs);
+ hashmap_free(m->watch_pids);
+ hashmap_free(m->watch_pids_more);
+ hashmap_free(m->watch_bus);
+
+ prioq_free(m->run_queue);
+
+ set_free(m->startup_units);
+ set_free(m->failed_units);
+
+ sd_event_source_unref(m->signal_event_source);
+ sd_event_source_unref(m->sigchld_event_source);
+ sd_event_source_unref(m->notify_event_source);
+ sd_event_source_unref(m->cgroups_agent_event_source);
+ sd_event_source_unref(m->time_change_event_source);
+ sd_event_source_unref(m->timezone_change_event_source);
+ sd_event_source_unref(m->jobs_in_progress_event_source);
+ sd_event_source_unref(m->run_queue_event_source);
+ sd_event_source_unref(m->user_lookup_event_source);
+ sd_event_source_unref(m->memory_pressure_event_source);
+
+ safe_close(m->signal_fd);
+ safe_close(m->notify_fd);
+ safe_close(m->cgroups_agent_fd);
+ safe_close_pair(m->user_lookup_fds);
+
+ manager_close_ask_password(m);
+
+ manager_close_idle_pipe(m);
+
+ /* Event sources were released above, so the loop itself may go now */
+ sd_event_unref(m->event);
+
+ free(m->notify_socket);
+
+ lookup_paths_free(&m->lookup_paths);
+ strv_free(m->transient_environment);
+ strv_free(m->client_environment);
+
+ hashmap_free(m->cgroup_unit);
+ manager_free_unit_name_maps(m);
+
+ free(m->switch_root);
+ free(m->switch_root_init);
+
+ unit_defaults_done(&m->defaults);
+
+ assert(hashmap_isempty(m->units_requiring_mounts_for));
+ hashmap_free(m->units_requiring_mounts_for);
+
+ hashmap_free(m->uid_refs);
+ hashmap_free(m->gid_refs);
+
+ for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++)
+ m->prefix[dt] = mfree(m->prefix[dt]);
+ free(m->received_credentials_directory);
+ free(m->received_encrypted_credentials_directory);
+
+ free(m->watchdog_pretimeout_governor);
+ free(m->watchdog_pretimeout_governor_overridden);
+
+ m->fw_ctx = fw_ctx_free(m->fw_ctx);
+
+#if BPF_FRAMEWORK
+ lsm_bpf_destroy(m->restrict_fs);
+#endif
+
+ safe_close(m->executor_fd);
+
+ return mfree(m);
+}
+
+/* Invokes each unit type's enumerate_perpetual() hook so perpetual units (ones that always exist)
+ * get instantiated. Skipped entirely for minimal test runs and for unsupported unit types. */
+static void manager_enumerate_perpetual(Manager *m) {
+ assert(m);
+
+ if (FLAGS_SET(m->test_run_flags, MANAGER_TEST_RUN_MINIMAL))
+ return;
+
+ /* Let's ask every type to load all units from disk/kernel that it might know */
+ for (UnitType c = 0; c < _UNIT_TYPE_MAX; c++) {
+ if (!unit_type_supported(c)) {
+ log_debug("Unit type .%s is not supported on this system.", unit_type_to_string(c));
+ continue;
+ }
+
+ if (unit_vtable[c]->enumerate_perpetual)
+ unit_vtable[c]->enumerate_perpetual(m);
+ }
+}
+
+/* Invokes each supported unit type's enumerate() hook to discover units from disk/kernel state,
+ * then processes the resulting load queue. Skipped for minimal test runs. */
+static void manager_enumerate(Manager *m) {
+ assert(m);
+
+ if (FLAGS_SET(m->test_run_flags, MANAGER_TEST_RUN_MINIMAL))
+ return;
+
+ /* Let's ask every type to load all units from disk/kernel that it might know */
+ for (UnitType c = 0; c < _UNIT_TYPE_MAX; c++) {
+ if (!unit_type_supported(c)) {
+ log_debug("Unit type .%s is not supported on this system.", unit_type_to_string(c));
+ continue;
+ }
+
+ if (unit_vtable[c]->enumerate)
+ unit_vtable[c]->enumerate(m);
+ }
+
+ /* Enumeration may have queued units for loading, dispatch that now */
+ manager_dispatch_load_queue(m);
+}
+
+/* Calls unit_coldplug() on every (non-alias) unit to move it from its deserialized state back into
+ * a live state. Failures are logged but do not abort the pass. */
+static void manager_coldplug(Manager *m) {
+ Unit *u;
+ char *k;
+ int r;
+
+ assert(m);
+
+ log_debug("Invoking unit coldplug() handlers%s", special_glyph(SPECIAL_GLYPH_ELLIPSIS));
+
+ /* Let's place the units back into their deserialized state */
+ HASHMAP_FOREACH_KEY(u, k, m->units) {
+
+ /* ignore aliases: only process the entry whose key is the unit's primary id */
+ if (u->id != k)
+ continue;
+
+ r = unit_coldplug(u);
+ if (r < 0)
+ log_warning_errno(r, "We couldn't coldplug %s, proceeding anyway: %m", u->id);
+ }
+}
+
+/* Calls unit_catchup() on every (non-alias) unit so each can pick up external state changes that
+ * happened while we were reloading/reexecing. */
+static void manager_catchup(Manager *m) {
+ Unit *u;
+ char *k;
+
+ assert(m);
+
+ log_debug("Invoking unit catchup() handlers%s", special_glyph(SPECIAL_GLYPH_ELLIPSIS));
+
+ /* Let's catch up on any state changes that happened while we were reloading/reexecing */
+ HASHMAP_FOREACH_KEY(u, k, m->units) {
+
+ /* ignore aliases: only process the entry whose key is the unit's primary id */
+ if (u->id != k)
+ continue;
+
+ unit_catchup(u);
+ }
+}
+
+/* Offers the remaining file descriptors in 'fds' to every unit type implementing
+ * distribute_fds(), stopping early once the set is exhausted. Used to hand pre-initialized fds
+ * (e.g. from a container manager) to socket-activatable units. */
+static void manager_distribute_fds(Manager *m, FDSet *fds) {
+ Unit *u;
+
+ assert(m);
+
+ HASHMAP_FOREACH(u, m->units) {
+
+ if (fdset_size(fds) <= 0)
+ break;
+
+ if (!UNIT_VTABLE(u)->distribute_fds)
+ continue;
+
+ UNIT_VTABLE(u)->distribute_fds(u, fds);
+ }
+}
+
+/* Returns true if both the D-Bus socket unit and service unit are up (socket RUNNING, service in
+ * one of its running/reloading states). With 'deserialized' set, the deserialized states are
+ * consulted instead of the live ones. Always false in test runs. */
+static bool manager_dbus_is_running(Manager *m, bool deserialized) {
+ Unit *u;
+
+ assert(m);
+
+ /* This checks whether the dbus instance we are supposed to expose our APIs on is up. We check both the socket
+ * and the service unit. If the 'deserialized' parameter is true we'll check the deserialized state of the unit
+ * rather than the current one. */
+
+ if (MANAGER_IS_TEST_RUN(m))
+ return false;
+
+ u = manager_get_unit(m, SPECIAL_DBUS_SOCKET);
+ if (!u)
+ return false;
+ if ((deserialized ? SOCKET(u)->deserialized_state : SOCKET(u)->state) != SOCKET_RUNNING)
+ return false;
+
+ u = manager_get_unit(m, SPECIAL_DBUS_SERVICE);
+ if (!u)
+ return false;
+ if (!IN_SET((deserialized ? SERVICE(u)->deserialized_state : SERVICE(u)->state),
+ SERVICE_RUNNING,
+ SERVICE_RELOAD,
+ SERVICE_RELOAD_NOTIFY,
+ SERVICE_RELOAD_SIGNAL))
+ return false;
+
+ return true;
+}
+
+/* Establishes the manager's bus connections: the private bus always, the system bus for user
+ * managers, and the API (and, for the system manager, system) bus only once D-Bus itself is up.
+ * All failures are best-effort and ignored. */
+static void manager_setup_bus(Manager *m) {
+ assert(m);
+
+ /* Let's set up our private bus connection now, unconditionally */
+ (void) bus_init_private(m);
+
+ /* If we are in --user mode also connect to the system bus now */
+ if (MANAGER_IS_USER(m))
+ (void) bus_init_system(m);
+
+ /* Let's connect to the bus now, but only if the unit is supposed to be up */
+ if (manager_dbus_is_running(m, MANAGER_IS_RELOADING(m))) {
+ (void) bus_init_api(m);
+
+ if (MANAGER_IS_SYSTEM(m))
+ (void) bus_init_system(m);
+ }
+}
+
+/* On the first boot of a system instance (and only then), applies unit file presets so /etc is
+ * populated with the distribution's enable/disable policy. Best-effort: failures are only logged.
+ * NOTE(review): m->first_boot <= 0 presumably covers both "not first boot" and "unknown" — verify
+ * against where first_boot is assigned. */
+static void manager_preset_all(Manager *m) {
+ int r;
+
+ assert(m);
+
+ if (m->first_boot <= 0)
+ return;
+
+ if (!MANAGER_IS_SYSTEM(m))
+ return;
+
+ if (MANAGER_IS_TEST_RUN(m))
+ return;
+
+ /* If this is the first boot, and we are in the host system, then preset everything */
+ UnitFilePresetMode mode =
+ ENABLE_FIRST_BOOT_FULL_PRESET ? UNIT_FILE_PRESET_FULL : UNIT_FILE_PRESET_ENABLE_ONLY;
+
+ r = unit_file_preset_all(RUNTIME_SCOPE_SYSTEM, 0, NULL, mode, NULL, 0);
+ if (r < 0)
+ log_full_errno(r == -EEXIST ? LOG_NOTICE : LOG_WARNING, r,
+ "Failed to populate /etc with preset unit settings, ignoring: %m");
+ else
+ log_info("Populated /etc with preset unit settings.");
+}
+
+/* Final step of startup/reload: flips the objective to MANAGER_OK, rechecks journal and D-Bus
+ * availability, runs unit catchup, and (system instance only) touches the units-load stamp file. */
+static void manager_ready(Manager *m) {
+ assert(m);
+
+ /* After having loaded everything, do the final round of catching up with what might have changed */
+
+ m->objective = MANAGER_OK; /* Tell everyone we are up now */
+
+ /* It might be safe to log to the journal now and connect to dbus */
+ manager_recheck_journal(m);
+ manager_recheck_dbus(m);
+
+ /* Let's finally catch up with any changes that took place while we were reloading/reexecing */
+ manager_catchup(m);
+
+ /* Create a file which will indicate when the manager started loading units the last time. */
+ if (MANAGER_IS_SYSTEM(m))
+ (void) touch_file("/run/systemd/systemd-units-load", false,
+ m->timestamps[MANAGER_TIMESTAMP_UNITS_LOAD].realtime ?: now(CLOCK_REALTIME),
+ UID_INVALID, GID_INVALID, 0444);
+}
+
+/* Bumps the reload counter and records the units-load timestamp. Returns 'm' so callers can
+ * assign the result to a _cleanup_(manager_reloading_stopp) variable. */
+Manager* manager_reloading_start(Manager *m) {
+ m->n_reloading++;
+ dual_timestamp_now(m->timestamps + MANAGER_TIMESTAMP_UNITS_LOAD);
+ return m;
+}
+
+/* Counterpart to manager_reloading_start(). The double-pointer signature (and the deliberate
+ * "stopp" spelling) make it usable as a _cleanup_ handler; a NULL *m is a no-op. */
+void manager_reloading_stopp(Manager **m) {
+ if (*m) {
+ assert((*m)->n_reloading > 0);
+ (*m)->n_reloading--;
+ }
+}
+
+/* Brings the manager fully up: runs generators, presets (first boot), enumerates and/or
+ * deserializes units, distributes leftover fds, sets up notify/cgroups-agent/user-lookup
+ * channels and bus/varlink connections, coldplugs units and declares the manager ready.
+ * 'serialization' is non-NULL when resuming from a reload/reexec, in which case the reload
+ * counter stays bumped for the duration of the deserialization block.
+ * Returns 0 on success, negative errno on fatal failure. */
+int manager_startup(Manager *m, FILE *serialization, FDSet *fds, const char *root) {
+ int r;
+
+ assert(m);
+
+ /* If we are running in test mode, we still want to run the generators,
+ * but we should not touch the real generator directories. */
+ r = lookup_paths_init_or_warn(&m->lookup_paths, m->runtime_scope,
+ MANAGER_IS_TEST_RUN(m) ? LOOKUP_PATHS_TEMPORARY_GENERATED : 0,
+ root);
+ if (r < 0)
+ return r;
+
+ dual_timestamp_now(m->timestamps + manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_GENERATORS_START));
+ r = manager_run_environment_generators(m);
+ if (r >= 0)
+ r = manager_run_generators(m);
+ dual_timestamp_now(m->timestamps + manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_GENERATORS_FINISH));
+ if (r < 0)
+ return r;
+
+ manager_preset_all(m);
+
+ lookup_paths_log(&m->lookup_paths);
+
+ {
+ /* This block is (optionally) done with the reloading counter bumped */
+ _unused_ _cleanup_(manager_reloading_stopp) Manager *reloading = NULL;
+
+ /* Make sure we don't have a left-over from a previous run */
+ if (!serialization)
+ (void) rm_rf(m->lookup_paths.transient, 0);
+
+ /* If we will deserialize make sure that during enumeration this is already known, so we increase the
+ * counter here already */
+ if (serialization)
+ reloading = manager_reloading_start(m);
+
+ /* First, enumerate what we can from all config files */
+ dual_timestamp_now(m->timestamps + manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_UNITS_LOAD_START));
+ manager_enumerate_perpetual(m);
+ manager_enumerate(m);
+ dual_timestamp_now(m->timestamps + manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_UNITS_LOAD_FINISH));
+
+ /* Second, deserialize if there is something to deserialize */
+ if (serialization) {
+ r = manager_deserialize(m, serialization, fds);
+ if (r < 0)
+ return log_error_errno(r, "Deserialization failed: %m");
+ }
+
+ /* Any fds left? Find some unit which wants them. This is useful to allow container managers to pass
+ * some file descriptors to us pre-initialized. This enables socket-based activation of entire
+ * containers. */
+ manager_distribute_fds(m, fds);
+
+ /* We might have deserialized the notify fd, but if we didn't then let's create the bus now */
+ r = manager_setup_notify(m);
+ if (r < 0)
+ /* No sense to continue without notifications, our children would fail anyway. */
+ return r;
+
+ r = manager_setup_cgroups_agent(m);
+ if (r < 0)
+ /* Likewise, no sense to continue without empty cgroup notifications. */
+ return r;
+
+ r = manager_setup_user_lookup_fd(m);
+ if (r < 0)
+ /* This shouldn't fail, except if things are really broken. */
+ return r;
+
+ /* Connect to the bus if we are good for it */
+ manager_setup_bus(m);
+
+ /* Now that we are connected to all possible buses, let's deserialize who is tracking us. */
+ r = bus_track_coldplug(m, &m->subscribed, false, m->deserialized_subscribed);
+ if (r < 0)
+ log_warning_errno(r, "Failed to deserialized tracked clients, ignoring: %m");
+ m->deserialized_subscribed = strv_free(m->deserialized_subscribed);
+
+ r = manager_varlink_init(m);
+ if (r < 0)
+ log_warning_errno(r, "Failed to set up Varlink, ignoring: %m");
+
+ /* Third, fire things up! */
+ manager_coldplug(m);
+
+ /* Clean up runtime objects */
+ manager_vacuum(m);
+
+ if (serialization)
+ /* Let's wait for the UnitNew/JobNew messages being sent, before we notify that the
+ * reload is finished */
+ m->send_reloading_done = true;
+ }
+
+ manager_ready(m);
+
+ manager_set_switching_root(m, false);
+
+ return 0;
+}
+
+/* Enqueues a job of the given type for 'unit': validates the mode/type combination, builds a
+ * transaction with all dependent jobs (plus isolate/triggering jobs where requested) and
+ * activates it. On success optionally returns the anchor job in *ret (owned by the manager, do
+ * not free). Returns 0 on success, negative errno / sd_bus_error on failure. */
+int manager_add_job(
+ Manager *m,
+ JobType type,
+ Unit *unit,
+ JobMode mode,
+ Set *affected_jobs,
+ sd_bus_error *error,
+ Job **ret) {
+
+ _cleanup_(transaction_abort_and_freep) Transaction *tr = NULL;
+ int r;
+
+ assert(m);
+ assert(type < _JOB_TYPE_MAX);
+ assert(unit);
+ assert(mode < _JOB_MODE_MAX);
+
+ /* Reject mode/type combinations that make no sense before building any transaction */
+ if (mode == JOB_ISOLATE && type != JOB_START)
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Isolate is only valid for start.");
+
+ if (mode == JOB_ISOLATE && !unit->allow_isolate)
+ return sd_bus_error_set(error, BUS_ERROR_NO_ISOLATION, "Operation refused, unit may not be isolated.");
+
+ if (mode == JOB_TRIGGERING && type != JOB_STOP)
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "--job-mode=triggering is only valid for stop.");
+
+ if (mode == JOB_RESTART_DEPENDENCIES && type != JOB_START)
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "--job-mode=restart-dependencies is only valid for start.");
+
+ log_unit_debug(unit, "Trying to enqueue job %s/%s/%s", unit->id, job_type_to_string(type), job_mode_to_string(mode));
+
+ type = job_type_collapse(type, unit);
+
+ tr = transaction_new(mode == JOB_REPLACE_IRREVERSIBLY);
+ if (!tr)
+ return -ENOMEM;
+
+ r = transaction_add_job_and_dependencies(
+ tr,
+ type,
+ unit,
+ /* by= */ NULL,
+ TRANSACTION_MATTERS |
+ (IN_SET(mode, JOB_IGNORE_DEPENDENCIES, JOB_IGNORE_REQUIREMENTS) ? TRANSACTION_IGNORE_REQUIREMENTS : 0) |
+ (mode == JOB_IGNORE_DEPENDENCIES ? TRANSACTION_IGNORE_ORDER : 0) |
+ (mode == JOB_RESTART_DEPENDENCIES ? TRANSACTION_PROPAGATE_START_AS_RESTART : 0),
+ error);
+ if (r < 0)
+ return r;
+
+ if (mode == JOB_ISOLATE) {
+ r = transaction_add_isolate_jobs(tr, m);
+ if (r < 0)
+ return r;
+ }
+
+ if (mode == JOB_TRIGGERING) {
+ r = transaction_add_triggering_jobs(tr, unit);
+ if (r < 0)
+ return r;
+ }
+
+ r = transaction_activate(tr, m, mode, affected_jobs, error);
+ if (r < 0)
+ return r;
+
+ log_unit_debug(unit,
+ "Enqueued job %s/%s as %u", unit->id,
+ job_type_to_string(type), (unsigned) tr->anchor_job->id);
+
+ if (ret)
+ *ret = tr->anchor_job;
+
+ /* Activation succeeded: free the transaction without aborting the installed jobs */
+ tr = transaction_free(tr);
+ return 0;
+}
+
+/* Like manager_add_job(), but loads the unit by name first. Returns 0 on success, negative errno
+ * on failure (including unit load failures). */
+int manager_add_job_by_name(Manager *m, JobType type, const char *name, JobMode mode, Set *affected_jobs, sd_bus_error *e, Job **ret) {
+ Unit *unit = NULL; /* just to appease gcc, initialization is not really necessary */
+ int r;
+
+ assert(m);
+ assert(type < _JOB_TYPE_MAX);
+ assert(name);
+ assert(mode < _JOB_MODE_MAX);
+
+ r = manager_load_unit(m, name, NULL, NULL, &unit);
+ if (r < 0)
+ return r;
+ assert(unit);
+
+ return manager_add_job(m, type, unit, mode, affected_jobs, e, ret);
+}
+
+int manager_add_job_by_name_and_warn(Manager *m, JobType type, const char *name, JobMode mode, Set *affected_jobs, Job **ret) {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ int r;
+
+ assert(m);
+ assert(type < _JOB_TYPE_MAX);
+ assert(name);
+ assert(mode < _JOB_MODE_MAX);
+
+ r = manager_add_job_by_name(m, type, name, mode, affected_jobs, &error, ret);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to enqueue %s job for %s: %s", job_mode_to_string(mode), name, bus_error_message(&error, r));
+
+ return r;
+}
+
+/* Build and activate a transaction that propagates a reload of 'unit' to its
+ * dependencies. A JOB_NOP job on 'unit' serves as the anchor; failures while adding
+ * the propagated reload jobs (but not the anchor) are ignored. */
+int manager_propagate_reload(Manager *m, Unit *unit, JobMode mode, sd_bus_error *e) {
+ int r;
+ _cleanup_(transaction_abort_and_freep) Transaction *tr = NULL;
+
+ assert(m);
+ assert(unit);
+ assert(mode < _JOB_MODE_MAX);
+ assert(mode != JOB_ISOLATE); /* Isolate is only valid for start */
+
+ tr = transaction_new(mode == JOB_REPLACE_IRREVERSIBLY);
+ if (!tr)
+ return -ENOMEM;
+
+ /* We need an anchor job */
+ r = transaction_add_job_and_dependencies(tr, JOB_NOP, unit, NULL, TRANSACTION_IGNORE_REQUIREMENTS|TRANSACTION_IGNORE_ORDER, e);
+ if (r < 0)
+ return r;
+
+ /* Failure in adding individual dependencies is ignored, so this always succeeds. */
+ transaction_add_propagate_reload_jobs(
+ tr,
+ unit,
+ tr->anchor_job,
+ mode == JOB_IGNORE_DEPENDENCIES ? TRANSACTION_IGNORE_ORDER : 0);
+
+ r = transaction_activate(tr, m, mode, NULL, e);
+ if (r < 0)
+ return r;
+
+ /* Success: free the transaction ourselves, disarming the abort-and-free cleanup. */
+ tr = transaction_free(tr);
+ return 0;
+}
+
+/* Look up a job by its numeric ID; returns NULL if no such job exists. */
+Job *manager_get_job(Manager *m, uint32_t id) {
+ assert(m);
+
+ Job *j = hashmap_get(m->jobs, UINT32_TO_PTR(id));
+ return j;
+}
+
+/* Look up a unit by name in the manager's unit table; returns NULL if not present. */
+Unit *manager_get_unit(Manager *m, const char *name) {
+ assert(m);
+ assert(name);
+
+ Unit *found = hashmap_get(m->units, name);
+ return found;
+}
+
+/* Drain the target-deps queue: for every queued unit, snapshot its default-target
+ * dependency set and add the default dependency towards each collected target.
+ * Returns 0 on success, negative errno-style on the first failure. */
+static int manager_dispatch_target_deps_queue(Manager *m) {
+ Unit *u;
+ int r = 0;
+
+ assert(m);
+
+ while ((u = LIST_POP(target_deps_queue, m->target_deps_queue))) {
+ _cleanup_free_ Unit **targets = NULL;
+ int n_targets;
+
+ assert(u->in_target_deps_queue);
+
+ u->in_target_deps_queue = false;
+
+ /* Take an "atomic" snapshot of dependencies here, as the call below will likely modify the
+ * dependencies, and we can't have it that hash tables we iterate through are modified while
+ * we are iterating through them. */
+ n_targets = unit_get_dependency_array(u, UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES, &targets);
+ if (n_targets < 0)
+ return n_targets;
+
+ for (int i = 0; i < n_targets; i++) {
+ r = unit_add_default_target_dependency(u, targets[i]);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ return r;
+}
+
+/* Load every unit currently on the load queue, returning the number of units
+ * dispatched. Guarded against recursive invocation: a nested call returns 0
+ * immediately and lets the outer invocation finish the queue. */
+unsigned manager_dispatch_load_queue(Manager *m) {
+ Unit *u;
+ unsigned n = 0;
+
+ assert(m);
+
+ /* Make sure we are not run recursively */
+ if (m->dispatching_load_queue)
+ return 0;
+
+ m->dispatching_load_queue = true;
+
+ /* Dispatches the load queue. Takes a unit from the queue and
+ * tries to load its data until the queue is empty */
+
+ while ((u = m->load_queue)) {
+ assert(u->in_load_queue);
+
+ /* unit_load() is expected to remove u from the queue; we just count it. */
+ unit_load(u);
+ n++;
+ }
+
+ m->dispatching_load_queue = false;
+
+ /* Dispatch the units waiting for their target dependencies to be added now, as all targets that we know about
+ * should be loaded and have aliases resolved */
+ (void) manager_dispatch_target_deps_queue(m);
+
+ return n;
+}
+
+/* Decide whether a previously not-found unit is worth another load attempt because
+ * the unit path cache has been (or needs to be) refreshed since the last attempt. */
+bool manager_unit_cache_should_retry_load(Unit *u) {
+ assert(u);
+
+ /* Automatic reloading from disk only applies to units which were not found sometime in the past, and
+ * the not-found stub is kept pinned in the unit graph by dependencies. For units that were
+ * previously loaded, we don't do automatic reloading, and daemon-reload is necessary to update. */
+ if (u->load_state != UNIT_NOT_FOUND)
+ return false;
+
+ /* The cache has been updated since the last time we tried to load the unit. There might be new
+ * fragment paths to read. */
+ if (u->manager->unit_cache_timestamp_hash != u->fragment_not_found_timestamp_hash)
+ return true;
+
+ /* The cache needs to be updated because there are modifications on disk. */
+ return !lookup_paths_timestamp_hash_same(&u->manager->lookup_paths, u->manager->unit_cache_timestamp_hash, NULL);
+}
+
+/* Prepare a unit for loading by name and/or absolute path, without reading anything
+ * from disk yet. Returns 1 if the unit was (newly or again) added to the load queue,
+ * 0 if it was already loaded, negative errno-style (with 'e' set where applicable)
+ * on error. */
+int manager_load_unit_prepare(
+ Manager *m,
+ const char *name,
+ const char *path,
+ sd_bus_error *e,
+ Unit **ret) {
+
+ _cleanup_(unit_freep) Unit *cleanup_unit = NULL;
+ _cleanup_free_ char *nbuf = NULL;
+ int r;
+
+ assert(m);
+ assert(ret);
+ assert(name || path);
+
+ /* This will prepare the unit for loading, but not actually load anything from disk. */
+
+ if (path && !path_is_absolute(path))
+ return sd_bus_error_setf(e, SD_BUS_ERROR_INVALID_ARGS, "Path %s is not absolute.", path);
+
+ if (!name) {
+ /* Derive the unit name from the path's last component. */
+ r = path_extract_filename(path, &nbuf);
+ if (r < 0)
+ return r;
+ /* path_extract_filename() signals a directory path with O_DIRECTORY. */
+ if (r == O_DIRECTORY)
+ return sd_bus_error_setf(e, SD_BUS_ERROR_INVALID_ARGS, "Path '%s' refers to directory, refusing.", path);
+
+ name = nbuf;
+ }
+
+ UnitType t = unit_name_to_type(name);
+
+ if (t == _UNIT_TYPE_INVALID || !unit_name_is_valid(name, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE)) {
+ if (unit_name_is_valid(name, UNIT_NAME_TEMPLATE))
+ return sd_bus_error_setf(e, SD_BUS_ERROR_INVALID_ARGS, "Unit name %s is missing the instance name.", name);
+
+ return sd_bus_error_setf(e, SD_BUS_ERROR_INVALID_ARGS, "Unit name %s is not valid.", name);
+ }
+
+ Unit *unit = manager_get_unit(m, name);
+ if (unit) {
+ /* The time-based cache allows to start new units without daemon-reload,
+ * but if they are already referenced (because of dependencies or ordering)
+ * then we have to force a load of the fragment. As an optimization, check
+ * first if anything in the usual paths was modified since the last time
+ * the cache was loaded. Also check if the last time an attempt to load the
+ * unit was made was before the most recent cache refresh, so that we know
+ * we need to try again — even if the cache is current, it might have been
+ * updated in a different context before we had a chance to retry loading
+ * this particular unit. */
+ if (manager_unit_cache_should_retry_load(unit))
+ unit->load_state = UNIT_STUB;
+ else {
+ *ret = unit;
+ return 0; /* The unit was already loaded */
+ }
+ } else {
+ /* Freshly allocated unit; freed automatically unless we TAKE_PTR() below. */
+ unit = cleanup_unit = unit_new(m, unit_vtable[t]->object_size);
+ if (!unit)
+ return -ENOMEM;
+ }
+
+ if (path) {
+ r = free_and_strdup(&unit->fragment_path, path);
+ if (r < 0)
+ return r;
+ }
+
+ r = unit_add_name(unit, name);
+ if (r < 0)
+ return r;
+
+ unit_add_to_load_queue(unit);
+ unit_add_to_dbus_queue(unit);
+ unit_add_to_gc_queue(unit);
+
+ *ret = unit;
+ TAKE_PTR(cleanup_unit);
+
+ return 1; /* The unit was added the load queue */
+}
+
+/* Load the unit configuration for 'name' and/or 'path' (but start nothing). On
+ * success *ret points to the (possibly merged) unit. Returns 0 on success,
+ * negative errno-style (with 'e' set where applicable) on error. */
+int manager_load_unit(
+ Manager *m,
+ const char *name,
+ const char *path,
+ sd_bus_error *e,
+ Unit **ret) {
+ int r;
+
+ assert(m);
+ assert(ret);
+
+ /* This will load the unit config, but not actually start any services or anything. */
+
+ r = manager_load_unit_prepare(m, name, path, e, ret);
+ if (r <= 0)
+ return r;
+
+ /* Unit was newly loaded */
+ manager_dispatch_load_queue(m);
+ /* Follow merges, in case loading aliased this unit into another one. */
+ *ret = unit_follow_merge(*ret);
+ return 0;
+}
+
+/* Like manager_load_unit(), but additionally validates the resulting load state
+ * (e.g. rejects masked/broken units) and logs errors instead of returning bus
+ * errors to the caller. */
+int manager_load_startable_unit_or_warn(
+ Manager *m,
+ const char *name,
+ const char *path,
+ Unit **ret) {
+
+ /* Load a unit, make sure it loaded fully and is not masked. */
+
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ Unit *unit;
+ int r;
+
+ r = manager_load_unit(m, name, path, &error, &unit);
+ if (r < 0)
+ return log_error_errno(r, "Failed to load %s %s: %s",
+ name ? "unit" : "unit file", name ?: path,
+ bus_error_message(&error, r));
+
+ r = bus_unit_validate_load_state(unit, &error);
+ if (r < 0)
+ return log_error_errno(r, "%s", bus_error_message(&error, r));
+
+ *ret = unit;
+ return 0;
+}
+
+/* Cancel and remove every pending job. */
+void manager_clear_jobs(Manager *m) {
+ assert(m);
+
+ for (;;) {
+ Job *j = hashmap_first(m->jobs);
+ if (!j)
+ break;
+
+ /* No need to recurse. We're cancelling all jobs. */
+ job_finish_and_invalidate(j, JOB_CANCELED, false, false);
+ }
+}
+
+/* Remove the given PID from the watch lists of every unit currently watching it. */
+void manager_unwatch_pidref(Manager *m, PidRef *pid) {
+ Unit *u;
+
+ assert(m);
+
+ while ((u = manager_get_unit_by_pidref_watching(m, pid)))
+ unit_unwatch_pidref(u, pid);
+}
+
+/* Event source callback: run every installed job currently on the run queue, then
+ * (re)arm the jobs-in-progress display and the idle pipe watch as needed. */
+static int manager_dispatch_run_queue(sd_event_source *source, void *userdata) {
+ Manager *m = ASSERT_PTR(userdata);
+ Job *j;
+
+ assert(source);
+
+ while ((j = prioq_peek(m->run_queue))) {
+ assert(j->installed);
+ assert(j->in_run_queue);
+
+ /* NOTE(review): job_run_and_invalidate() presumably removes j from the
+ * queue, otherwise this loop would not terminate — confirm in job.c. */
+ (void) job_run_and_invalidate(j);
+ }
+
+ if (m->n_running_jobs > 0)
+ manager_watch_jobs_in_progress(m);
+
+ if (m->n_on_console > 0)
+ manager_watch_idle_pipe(m);
+
+ return 1;
+}
+
+/* Arm the run queue event source one-shot if any jobs are queued, otherwise turn
+ * it off. Failures are logged and otherwise ignored. */
+void manager_trigger_run_queue(Manager *m) {
+ int r;
+
+ assert(m);
+
+ r = sd_event_source_set_enabled(
+ m->run_queue_event_source,
+ prioq_isempty(m->run_queue) ? SD_EVENT_OFF : SD_EVENT_ONESHOT);
+ if (r < 0)
+ log_warning_errno(r, "Failed to enable job run queue event source, ignoring: %m");
+}
+
+/* Generate queued unit/job change signals on the bus, plus any pending
+ * reloading-done / reload-reply messages. Work is limited by a per-iteration
+ * message budget (unlimited while reloading). Returns the number of messages
+ * generated. */
+static unsigned manager_dispatch_dbus_queue(Manager *m) {
+ unsigned n = 0, budget;
+ Unit *u;
+ Job *j;
+
+ assert(m);
+
+ /* When we are reloading, let's not wait with generating signals, since we need to exit the manager as quickly
+ * as we can. There's no point in throttling generation of signals in that case. */
+ if (MANAGER_IS_RELOADING(m) || m->send_reloading_done || m->pending_reload_message)
+ budget = UINT_MAX; /* infinite budget in this case */
+ else {
+ /* Anything to do at all? */
+ if (!m->dbus_unit_queue && !m->dbus_job_queue)
+ return 0;
+
+ /* Do we have overly many messages queued at the moment? If so, let's not enqueue more on top, let's
+ * sit this cycle out, and process things in a later cycle when the queues got a bit emptier. */
+ if (manager_bus_n_queued_write(m) > MANAGER_BUS_BUSY_THRESHOLD)
+ return 0;
+
+ /* Only process a certain number of units/jobs per event loop iteration. Even if the bus queue wasn't
+ * overly full before this call we shouldn't increase it in size too wildly in one step, and we
+ * shouldn't monopolize CPU time with generating these messages. Note the difference in counting of
+ * this "budget" and the "threshold" above: the "budget" is decreased only once per generated message,
+ * regardless how many buses/direct connections it is enqueued on, while the "threshold" is applied to
+ * each queued instance of bus message, i.e. if the same message is enqueued to five buses/direct
+ * connections it will be counted five times. This difference in counting ("references"
+ * vs. "instances") is primarily a result of the fact that it's easier to implement it this way,
+ * however it also reflects the thinking that the "threshold" should put a limit on used queue memory,
+ * i.e. space, while the "budget" should put a limit on time. Also note that the "threshold" is
+ * currently chosen much higher than the "budget". */
+ budget = MANAGER_BUS_MESSAGE_BUDGET;
+ }
+
+ /* Units first, then jobs; both loops stop when the budget is exhausted. */
+ while (budget != 0 && (u = m->dbus_unit_queue)) {
+
+ assert(u->in_dbus_queue);
+
+ bus_unit_send_change_signal(u);
+ n++;
+
+ if (budget != UINT_MAX)
+ budget--;
+ }
+
+ while (budget != 0 && (j = m->dbus_job_queue)) {
+ assert(j->in_dbus_queue);
+
+ bus_job_send_change_signal(j);
+ n++;
+
+ if (budget != UINT_MAX)
+ budget--;
+ }
+
+ if (m->send_reloading_done) {
+ m->send_reloading_done = false;
+ bus_manager_send_reloading(m, false);
+ n++;
+ }
+
+ if (m->pending_reload_message) {
+ bus_send_pending_reload_message(m);
+ n++;
+ }
+
+ return n;
+}
+
+/* Datagram callback for the cgroups agent socket: reads one cgroup path, validates
+ * it (non-empty, not truncated, no embedded NUL), then notifies the manager of the
+ * possibly-empty cgroup and forwards the notification to the bus agent. */
+static int manager_dispatch_cgroups_agent_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+ Manager *m = userdata;
+ char buf[PATH_MAX];
+ ssize_t n;
+
+ n = recv(fd, buf, sizeof(buf), 0);
+ if (n < 0)
+ return log_error_errno(errno, "Failed to read cgroups agent message: %m");
+ if (n == 0) {
+ log_error("Got zero-length cgroups agent message, ignoring.");
+ return 0;
+ }
+ /* A full buffer means the datagram was possibly truncated; refuse it. This
+ * also guarantees buf[n] below is in bounds. */
+ if ((size_t) n >= sizeof(buf)) {
+ log_error("Got overly long cgroups agent message, ignoring.");
+ return 0;
+ }
+
+ if (memchr(buf, 0, n)) {
+ log_error("Got cgroups agent message with embedded NUL byte, ignoring.");
+ return 0;
+ }
+ buf[n] = 0;
+
+ manager_notify_cgroup_empty(m, buf);
+ (void) bus_forward_agent_released(m, buf);
+
+ return 0;
+}
+
+/* Returns true (i.e. "drop this message") if the notification carries BARRIER=1,
+ * warning about any extra tags or an unexpected fd count. */
+static bool manager_process_barrier_fd(char * const *tags, FDSet *fds) {
+
+ /* nothing else must be sent when using BARRIER=1 */
+ if (!strv_contains(tags, "BARRIER=1"))
+ return false;
+
+ if (strv_length(tags) != 1)
+ log_warning("Extra notification messages sent with BARRIER=1, ignoring everything.");
+ else if (fdset_size(fds) != 1)
+ log_warning("Got incorrect number of fds with BARRIER=1, closing them.");
+
+ /* Drop the message if BARRIER=1 was found */
+ return true;
+}
+
+/* Deliver one parsed notification message to unit 'u'. The per-manager notifygen
+ * counter ensures each unit sees a given message at most once, even if it is
+ * reachable through several lookup paths. */
+static void manager_invoke_notify_message(
+ Manager *m,
+ Unit *u,
+ const struct ucred *ucred,
+ char * const *tags,
+ FDSet *fds) {
+
+ assert(m);
+ assert(u);
+ assert(ucred);
+ assert(tags);
+
+ if (u->notifygen == m->notifygen) /* Already invoked on this same unit in this same iteration? */
+ return;
+ u->notifygen = m->notifygen;
+
+ if (UNIT_VTABLE(u)->notify_message)
+ UNIT_VTABLE(u)->notify_message(u, ucred, tags, fds);
+
+ else if (DEBUG_LOGGING) {
+ /* Unit type doesn't handle notifications: log (a shortened, escaped
+ * rendering of) the tags for debugging. */
+ _cleanup_free_ char *buf = NULL, *x = NULL, *y = NULL;
+
+ buf = strv_join(tags, ", ");
+ if (buf)
+ x = ellipsize(buf, 20, 90);
+ if (x)
+ y = cescape(x);
+
+ log_unit_debug(u, "Got notification message \"%s\", ignoring.", strnull(y));
+ }
+}
+
+/* Event callback for the notification socket: receives one sd_notify()-style
+ * datagram (with SCM_CREDENTIALS and optionally SCM_RIGHTS fds), validates it,
+ * and dispatches it to every unit associated with the sending PID. */
+static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+
+ _cleanup_fdset_free_ FDSet *fds = NULL;
+ Manager *m = ASSERT_PTR(userdata);
+ char buf[NOTIFY_BUFFER_MAX+1];
+ struct iovec iovec = {
+ .iov_base = buf,
+ /* Leave one byte for NUL termination below. */
+ .iov_len = sizeof(buf)-1,
+ };
+ CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) +
+ CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control;
+ struct msghdr msghdr = {
+ .msg_iov = &iovec,
+ .msg_iovlen = 1,
+ .msg_control = &control,
+ .msg_controllen = sizeof(control),
+ };
+
+ struct cmsghdr *cmsg;
+ struct ucred *ucred = NULL;
+ _cleanup_free_ Unit **array_copy = NULL;
+ _cleanup_strv_free_ char **tags = NULL;
+ Unit *u1, *u2, **array;
+ int r, *fd_array = NULL;
+ size_t n_fds = 0;
+ bool found = false;
+ ssize_t n;
+
+ assert(m->notify_fd == fd);
+
+ if (revents != EPOLLIN) {
+ log_warning("Got unexpected poll event for notify fd.");
+ return 0;
+ }
+
+ n = recvmsg_safe(m->notify_fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC|MSG_TRUNC);
+ if (ERRNO_IS_NEG_TRANSIENT(n))
+ return 0; /* Spurious wakeup, try again */
+ if (n == -EXFULL) {
+ log_warning("Got message with truncated control data (too many fds sent?), ignoring.");
+ return 0;
+ }
+ if (n < 0)
+ /* If this is any other, real error, then stop processing this socket. This of course means
+ * we won't take notification messages anymore, but that's still better than busy looping:
+ * being woken up over and over again, but being unable to actually read the message from the
+ * socket. */
+ return log_error_errno(n, "Failed to receive notification message: %m");
+
+ /* Walk the control messages: collect the fd array and the sender credentials. */
+ CMSG_FOREACH(cmsg, &msghdr)
+ if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
+
+ assert(!fd_array);
+ fd_array = CMSG_TYPED_DATA(cmsg, int);
+ n_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
+
+ } else if (cmsg->cmsg_level == SOL_SOCKET &&
+ cmsg->cmsg_type == SCM_CREDENTIALS &&
+ cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
+
+ assert(!ucred);
+ ucred = CMSG_TYPED_DATA(cmsg, struct ucred);
+ }
+
+ if (n_fds > 0) {
+ assert(fd_array);
+
+ /* Transfer ownership of the received fds into the FDSet; on failure close
+ * them all so nothing leaks. */
+ r = fdset_new_array(&fds, fd_array, n_fds);
+ if (r < 0) {
+ close_many(fd_array, n_fds);
+ log_oom();
+ return 0;
+ }
+ }
+
+ if (!ucred || !pid_is_valid(ucred->pid)) {
+ log_warning("Received notify message without valid credentials. Ignoring.");
+ return 0;
+ }
+
+ if ((size_t) n >= sizeof(buf) || (msghdr.msg_flags & MSG_TRUNC)) {
+ log_warning("Received notify message exceeded maximum size. Ignoring.");
+ return 0;
+ }
+
+ /* As extra safety check, let's make sure the string we get doesn't contain embedded NUL bytes.
+ * We permit one trailing NUL byte in the message, but don't expect it. */
+ if (n > 1 && memchr(buf, 0, n-1)) {
+ log_warning("Received notify message with embedded NUL bytes. Ignoring.");
+ return 0;
+ }
+
+ /* Make sure it's NUL-terminated, then parse it to obtain the tags list. */
+ buf[n] = 0;
+ tags = strv_split_newlines(buf);
+ if (!tags) {
+ log_oom();
+ return 0;
+ }
+
+ /* Possibly a barrier fd, let's see. */
+ if (manager_process_barrier_fd(tags, fds)) {
+ log_debug("Received barrier notification message from PID " PID_FMT ".", ucred->pid);
+ return 0;
+ }
+
+ /* Increase the generation counter used for filtering out duplicate unit invocations. */
+ m->notifygen++;
+
+ /* Generate lookup key from the PID (we have no pidfd here, after all) */
+ PidRef pidref = PIDREF_MAKE_FROM_PID(ucred->pid);
+
+ /* Notify every unit that might be interested, which might be multiple. */
+ u1 = manager_get_unit_by_pidref_cgroup(m, &pidref);
+ u2 = hashmap_get(m->watch_pids, &pidref);
+ array = hashmap_get(m->watch_pids_more, &pidref);
+ if (array) {
+ size_t k = 0;
+
+ /* The array is NULL-terminated; copy it so callbacks can't mutate it
+ * under us while we iterate. */
+ while (array[k])
+ k++;
+
+ array_copy = newdup(Unit*, array, k+1);
+ if (!array_copy)
+ log_oom();
+ }
+ /* And now invoke the per-unit callbacks. Note that manager_invoke_notify_message() will handle
+ * duplicate units make sure we only invoke each unit's handler once. */
+ if (u1) {
+ manager_invoke_notify_message(m, u1, ucred, tags, fds);
+ found = true;
+ }
+ if (u2) {
+ manager_invoke_notify_message(m, u2, ucred, tags, fds);
+ found = true;
+ }
+ if (array_copy)
+ for (size_t i = 0; array_copy[i]; i++) {
+ manager_invoke_notify_message(m, array_copy[i], ucred, tags, fds);
+ found = true;
+ }
+
+ if (!found)
+ log_warning("Cannot find unit for notify message of PID "PID_FMT", ignoring.", ucred->pid);
+
+ if (fdset_size(fds) > 0)
+ log_warning("Got extra auxiliary fds with notification message, closing them.");
+
+ return 0;
+}
+
+/* Forward one SIGCHLD siginfo to unit 'u': stop watching the PID and invoke the
+ * unit type's sigchld handler. The per-manager sigchldgen counter ensures each
+ * unit handles a given child exit at most once per dispatch iteration. */
+static void manager_invoke_sigchld_event(
+ Manager *m,
+ Unit *u,
+ const siginfo_t *si) {
+
+ assert(m);
+ assert(u);
+ assert(si);
+
+ /* Already invoked the handler of this unit in this iteration? Then don't process this again */
+ if (u->sigchldgen == m->sigchldgen)
+ return;
+ u->sigchldgen = m->sigchldgen;
+
+ log_unit_debug(u, "Child "PID_FMT" belongs to %s.", si->si_pid, u->id);
+ unit_unwatch_pid(u, si->si_pid);
+
+ if (UNIT_VTABLE(u)->sigchld_event)
+ UNIT_VTABLE(u)->sigchld_event(u, si->si_pid, si->si_code, si->si_status);
+}
+
+/* SIGCHLD event source callback: peek one exited child with WNOWAIT (so /proc/$PID
+ * is still available while we dispatch), hand the event to every unit watching
+ * that PID, then reap the zombie. Disables itself when no children are left. */
+static int manager_dispatch_sigchld(sd_event_source *source, void *userdata) {
+ Manager *m = ASSERT_PTR(userdata);
+ siginfo_t si = {};
+ int r;
+
+ assert(source);
+
+ /* First we call waitid() for a PID and do not reap the zombie. That way we can still access
+ * /proc/$PID for it while it is a zombie. */
+
+ if (waitid(P_ALL, 0, &si, WEXITED|WNOHANG|WNOWAIT) < 0) {
+
+ if (errno != ECHILD)
+ log_error_errno(errno, "Failed to peek for child with waitid(), ignoring: %m");
+
+ goto turn_off;
+ }
+
+ if (si.si_pid <= 0)
+ goto turn_off;
+
+ if (IN_SET(si.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED)) {
+ _cleanup_free_ Unit **array_copy = NULL;
+ _cleanup_free_ char *name = NULL;
+ Unit *u1, *u2, **array;
+
+ /* Best effort: comm is only for the log message below. */
+ (void) pid_get_comm(si.si_pid, &name);
+
+ log_debug("Child "PID_FMT" (%s) died (code=%s, status=%i/%s)",
+ si.si_pid, strna(name),
+ sigchld_code_to_string(si.si_code),
+ si.si_status,
+ strna(si.si_code == CLD_EXITED
+ ? exit_status_to_string(si.si_status, EXIT_STATUS_FULL)
+ : signal_to_string(si.si_status)));
+
+ /* Increase the generation counter used for filtering out duplicate unit invocations */
+ m->sigchldgen++;
+
+ /* We look this up by a PidRef that only consists of the PID. After all we couldn't create a
+ * pidfd here any more even if we wanted (since the process just exited). */
+ PidRef pidref = PIDREF_MAKE_FROM_PID(si.si_pid);
+
+ /* And now figure out the unit this belongs to, it might be multiple... */
+ u1 = manager_get_unit_by_pidref_cgroup(m, &pidref);
+ u2 = hashmap_get(m->watch_pids, &pidref);
+ array = hashmap_get(m->watch_pids_more, &pidref);
+ if (array) {
+ size_t n = 0;
+
+ /* Count how many entries the array has */
+ while (array[n])
+ n++;
+
+ /* Make a copy of the array so that we don't trip up on the array changing beneath us */
+ array_copy = newdup(Unit*, array, n+1);
+ if (!array_copy)
+ log_oom();
+ }
+
+ /* Finally, execute them all. Note that u1, u2 and the array might contain duplicates, but
+ * that's fine, manager_invoke_sigchld_event() will ensure we only invoke the handlers once for
+ * each iteration. */
+ if (u1) {
+ /* We check for oom condition, in case we got SIGCHLD before the oom notification.
+ * We only do this for the cgroup the PID belonged to. */
+ (void) unit_check_oom(u1);
+
+ /* We check if systemd-oomd performed a kill so that we log and notify appropriately */
+ (void) unit_check_oomd_kill(u1);
+
+ manager_invoke_sigchld_event(m, u1, &si);
+ }
+ if (u2)
+ manager_invoke_sigchld_event(m, u2, &si);
+ if (array_copy)
+ for (size_t i = 0; array_copy[i]; i++)
+ manager_invoke_sigchld_event(m, array_copy[i], &si);
+ }
+
+ /* And now, we actually reap the zombie. */
+ if (waitid(P_PID, si.si_pid, &si, WEXITED) < 0) {
+ log_error_errno(errno, "Failed to dequeue child, ignoring: %m");
+ return 0;
+ }
+
+ return 0;
+
+turn_off:
+ /* All children processed for now, turn off event source */
+
+ r = sd_event_source_set_enabled(m->sigchld_event_source, SD_EVENT_OFF);
+ if (r < 0)
+ return log_error_errno(r, "Failed to disable SIGCHLD event source: %m");
+
+ return 0;
+}
+
+/* Enqueue a start job for the named special unit in the given mode, updating the
+ * status line. Failures are logged by the callee and otherwise ignored. */
+static void manager_start_special(Manager *m, const char *name, JobMode mode) {
+ Job *job;
+
+ if (manager_add_job_by_name_and_warn(m, JOB_START, name, mode, NULL, &job) < 0)
+ return;
+
+ const char *s = unit_status_string(job->unit, NULL);
+
+ log_info("Activating special unit %s...", s);
+
+ sd_notifyf(false,
+ "STATUS=Activating special unit %s...", s);
+ m->status_ready = false;
+}
+
+/* React to Ctrl-Alt-Del: normally start ctrl-alt-del.target irreversibly; if the
+ * key is hammered past the rate limit and a burst action is configured, take the
+ * configured emergency action immediately instead. */
+static void manager_handle_ctrl_alt_del(Manager *m) {
+ /* If the user presses C-A-D more than
+ * 7 times within 2s, we reboot/shutdown immediately,
+ * unless it was disabled in system.conf */
+
+ if (ratelimit_below(&m->ctrl_alt_del_ratelimit) || m->cad_burst_action == EMERGENCY_ACTION_NONE)
+ manager_start_special(m, SPECIAL_CTRL_ALT_DEL_TARGET, JOB_REPLACE_IRREVERSIBLY);
+ else
+ emergency_action(m, m->cad_burst_action, EMERGENCY_ACTION_WARN, NULL, -1,
+ "Ctrl-Alt-Del was pressed more than 7 times within 2s");
+}
+
+/* Event callback for the signalfd: reads one struct signalfd_siginfo and acts on
+ * the signal — SIGCHLD/SIGTERM/SIGINT/SIGUSR1/SIGUSR2/SIGHUP/SIGWINCH/SIGPWR get
+ * dedicated handling, SIGRTMIN+n signals map to special targets, manager
+ * objectives and logging/status overrides. Returns 0 (or an error to detach the
+ * event source on unrecoverable read failures).
+ *
+ * Fix: the two manager dump failure paths below previously logged with 'errno'
+ * even though the error is returned in 'r' (errno is unspecified there); they now
+ * pass 'r' so the logged message and %m reflect the actual failure. */
+static int manager_dispatch_signal_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+ Manager *m = ASSERT_PTR(userdata);
+ ssize_t n;
+ struct signalfd_siginfo sfsi;
+ int r;
+
+ assert(m->signal_fd == fd);
+
+ if (revents != EPOLLIN) {
+ log_warning("Got unexpected events from signal file descriptor.");
+ return 0;
+ }
+
+ n = read(m->signal_fd, &sfsi, sizeof(sfsi));
+ if (n < 0) {
+ if (ERRNO_IS_TRANSIENT(errno))
+ return 0;
+
+ /* We return an error here, which will kill this handler,
+ * to avoid a busy loop on read error. */
+ return log_error_errno(errno, "Reading from signal fd failed: %m");
+ }
+ if (n != sizeof(sfsi)) {
+ log_warning("Truncated read from signal fd (%zi bytes), ignoring!", n);
+ return 0;
+ }
+
+ log_received_signal(sfsi.ssi_signo == SIGCHLD ||
+ (sfsi.ssi_signo == SIGTERM && MANAGER_IS_USER(m))
+ ? LOG_DEBUG : LOG_INFO,
+ &sfsi);
+
+ switch (sfsi.ssi_signo) {
+
+ case SIGCHLD:
+ /* Defer child handling to the SIGCHLD event source. */
+ r = sd_event_source_set_enabled(m->sigchld_event_source, SD_EVENT_ON);
+ if (r < 0)
+ log_warning_errno(r, "Failed to enable SIGCHLD event source, ignoring: %m");
+
+ break;
+
+ case SIGTERM:
+ if (MANAGER_IS_SYSTEM(m)) {
+ /* This is for compatibility with the original sysvinit */
+ if (verify_run_space_and_log("Refusing to reexecute") < 0)
+ break;
+
+ m->objective = MANAGER_REEXECUTE;
+ break;
+ }
+
+ _fallthrough_;
+ case SIGINT:
+ if (MANAGER_IS_SYSTEM(m))
+ manager_handle_ctrl_alt_del(m);
+ else
+ manager_start_special(m, SPECIAL_EXIT_TARGET, JOB_REPLACE_IRREVERSIBLY);
+ break;
+
+ case SIGWINCH:
+ /* This is a nop on non-init */
+ if (MANAGER_IS_SYSTEM(m))
+ manager_start_special(m, SPECIAL_KBREQUEST_TARGET, JOB_REPLACE);
+
+ break;
+
+ case SIGPWR:
+ /* This is a nop on non-init */
+ if (MANAGER_IS_SYSTEM(m))
+ manager_start_special(m, SPECIAL_SIGPWR_TARGET, JOB_REPLACE);
+
+ break;
+
+ case SIGUSR1:
+ if (manager_dbus_is_running(m, false)) {
+ log_info("Trying to reconnect to bus...");
+
+ (void) bus_init_api(m);
+
+ if (MANAGER_IS_SYSTEM(m))
+ (void) bus_init_system(m);
+ } else
+ manager_start_special(m, SPECIAL_DBUS_SERVICE, JOB_REPLACE);
+
+ break;
+
+ case SIGUSR2: {
+ _cleanup_free_ char *dump = NULL;
+
+ r = manager_get_dump_string(m, /* patterns= */ NULL, &dump);
+ if (r < 0) {
+ /* Log 'r', not errno: the error is returned, errno is unspecified here. */
+ log_warning_errno(r, "Failed to acquire manager dump: %m");
+ break;
+ }
+
+ log_dump(LOG_INFO, dump);
+ break;
+ }
+
+ case SIGHUP:
+ if (verify_run_space_and_log("Refusing to reload") < 0)
+ break;
+
+ m->objective = MANAGER_RELOAD;
+ break;
+
+ default: {
+
+ /* Starting SIGRTMIN+0 */
+ static const struct {
+ const char *target;
+ JobMode mode;
+ } target_table[] = {
+ [0] = { SPECIAL_DEFAULT_TARGET, JOB_ISOLATE },
+ [1] = { SPECIAL_RESCUE_TARGET, JOB_ISOLATE },
+ [2] = { SPECIAL_EMERGENCY_TARGET, JOB_ISOLATE },
+ [3] = { SPECIAL_HALT_TARGET, JOB_REPLACE_IRREVERSIBLY },
+ [4] = { SPECIAL_POWEROFF_TARGET, JOB_REPLACE_IRREVERSIBLY },
+ [5] = { SPECIAL_REBOOT_TARGET, JOB_REPLACE_IRREVERSIBLY },
+ [6] = { SPECIAL_KEXEC_TARGET, JOB_REPLACE_IRREVERSIBLY },
+ [7] = { SPECIAL_SOFT_REBOOT_TARGET, JOB_REPLACE_IRREVERSIBLY },
+ };
+
+ /* Starting SIGRTMIN+13, so that target halt and system halt are 10 apart */
+ static const ManagerObjective objective_table[] = {
+ [0] = MANAGER_HALT,
+ [1] = MANAGER_POWEROFF,
+ [2] = MANAGER_REBOOT,
+ [3] = MANAGER_KEXEC,
+ [4] = MANAGER_SOFT_REBOOT,
+ };
+
+ if ((int) sfsi.ssi_signo >= SIGRTMIN+0 &&
+ (int) sfsi.ssi_signo < SIGRTMIN+(int) ELEMENTSOF(target_table)) {
+ int idx = (int) sfsi.ssi_signo - SIGRTMIN;
+ manager_start_special(m, target_table[idx].target, target_table[idx].mode);
+ break;
+ }
+
+ if ((int) sfsi.ssi_signo >= SIGRTMIN+13 &&
+ (int) sfsi.ssi_signo < SIGRTMIN+13+(int) ELEMENTSOF(objective_table)) {
+ m->objective = objective_table[sfsi.ssi_signo - SIGRTMIN - 13];
+ break;
+ }
+
+ switch (sfsi.ssi_signo - SIGRTMIN) {
+
+ case 18: {
+ bool generic = false;
+
+ if (sfsi.ssi_code != SI_QUEUE)
+ generic = true;
+ else {
+ /* Override a few select commands by our own PID1-specific logic */
+
+ switch (sfsi.ssi_int) {
+
+ case _COMMON_SIGNAL_COMMAND_LOG_LEVEL_BASE..._COMMON_SIGNAL_COMMAND_LOG_LEVEL_END:
+ manager_override_log_level(m, sfsi.ssi_int - _COMMON_SIGNAL_COMMAND_LOG_LEVEL_BASE);
+ break;
+
+ case COMMON_SIGNAL_COMMAND_CONSOLE:
+ manager_override_log_target(m, LOG_TARGET_CONSOLE);
+ break;
+
+ case COMMON_SIGNAL_COMMAND_JOURNAL:
+ manager_override_log_target(m, LOG_TARGET_JOURNAL);
+ break;
+
+ case COMMON_SIGNAL_COMMAND_KMSG:
+ manager_override_log_target(m, LOG_TARGET_KMSG);
+ break;
+
+ case COMMON_SIGNAL_COMMAND_NULL:
+ manager_override_log_target(m, LOG_TARGET_NULL);
+ break;
+
+ case MANAGER_SIGNAL_COMMAND_DUMP_JOBS: {
+ _cleanup_free_ char *dump_jobs = NULL;
+
+ r = manager_get_dump_jobs_string(m, /* patterns= */ NULL, " ", &dump_jobs);
+ if (r < 0) {
+ /* Log 'r', not errno: the error is returned, errno is unspecified here. */
+ log_warning_errno(r, "Failed to acquire manager jobs dump: %m");
+ break;
+ }
+
+ log_dump(LOG_INFO, dump_jobs);
+ break;
+ }
+
+ default:
+ generic = true;
+ }
+ }
+
+ if (generic)
+ return sigrtmin18_handler(source, &sfsi, NULL);
+
+ break;
+ }
+
+ case 20:
+ manager_override_show_status(m, SHOW_STATUS_YES, "signal");
+ break;
+
+ case 21:
+ manager_override_show_status(m, SHOW_STATUS_NO, "signal");
+ break;
+
+ case 22:
+ manager_override_log_level(m, LOG_DEBUG);
+ break;
+
+ case 23:
+ manager_restore_original_log_level(m);
+ break;
+
+ case 24:
+ if (MANAGER_IS_USER(m)) {
+ m->objective = MANAGER_EXIT;
+ return 0;
+ }
+
+ /* This is a nop on init */
+ break;
+
+ case 25:
+ m->objective = MANAGER_REEXECUTE;
+ break;
+
+ case 26:
+ case 29: /* compatibility: used to be mapped to LOG_TARGET_SYSLOG_OR_KMSG */
+ manager_restore_original_log_target(m);
+ break;
+
+ case 27:
+ manager_override_log_target(m, LOG_TARGET_CONSOLE);
+ break;
+
+ case 28:
+ manager_override_log_target(m, LOG_TARGET_KMSG);
+ break;
+
+ default:
+ log_warning("Got unhandled signal <%s>.", signal_to_string(sfsi.ssi_signo));
+ }
+ }}
+
+ return 0;
+}
+
+/* Callback for the clock-change watch: log the change, re-arm the watch, and let
+ * every unit type with a time_change hook react. */
+static int manager_dispatch_time_change_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+ Manager *m = ASSERT_PTR(userdata);
+ Unit *u;
+
+ log_struct(LOG_DEBUG,
+ "MESSAGE_ID=" SD_MESSAGE_TIME_CHANGE_STR,
+ LOG_MESSAGE("Time has been changed"));
+
+ /* Restart the watch */
+ (void) manager_setup_time_change(m);
+
+ HASHMAP_FOREACH(u, m->units)
+ if (UNIT_VTABLE(u)->time_change)
+ UNIT_VTABLE(u)->time_change(u);
+
+ return 0;
+}
+
+/* Inotify callback for /etc/localtime: if the timezone actually changed, re-arm the
+ * watch, re-read the timezone via tzset(), and notify unit types that care. */
+static int manager_dispatch_timezone_change(
+ sd_event_source *source,
+ const struct inotify_event *e,
+ void *userdata) {
+
+ Manager *m = ASSERT_PTR(userdata);
+ int changed;
+ Unit *u;
+
+ log_debug("inotify event for /etc/localtime");
+
+ changed = manager_read_timezone_stat(m);
+ if (changed <= 0)
+ return changed;
+
+ /* Something changed, restart the watch, to ensure we watch the new /etc/localtime if it changed */
+ (void) manager_setup_timezone_change(m);
+
+ /* Read the new timezone */
+ tzset();
+
+ log_debug("Timezone has been changed (now: %s).", tzname[daylight]);
+
+ HASHMAP_FOREACH(u, m->units)
+ if (UNIT_VTABLE(u)->timezone_change)
+ UNIT_VTABLE(u)->timezone_change(u);
+
+ return 0;
+}
+
+/* Callback for the Type=idle pipe: suppress further console output while console
+ * services run, and release all waiting children by closing the pipe. */
+static int manager_dispatch_idle_pipe_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+ Manager *m = ASSERT_PTR(userdata);
+
+ assert(m->idle_pipe[2] == fd);
+
+ /* There's at least one Type=idle child that just gave up on us waiting for the boot process to
+ * complete. Let's now turn off any further console output if there's at least one service that needs
+ * console access, so that from now on our own output should not spill into that service's output
+ * anymore. After all, we support Type=idle only to beautify console output and it generally is set
+ * on services that want to own the console exclusively without our interference. */
+ m->no_console_output = m->n_on_console > 0;
+
+ /* Acknowledge the child's request, and let all other children know too that they shouldn't wait
+ * any longer by closing the pipes towards them, which is what they are waiting for. */
+ manager_close_idle_pipe(m);
+
+ return 0;
+}
+
+/* Periodic timer callback: print the jobs-in-progress status line, then re-arm the
+ * timer one-shot for the next period. */
+static int manager_dispatch_jobs_in_progress(sd_event_source *source, usec_t usec, void *userdata) {
+ Manager *m = ASSERT_PTR(userdata);
+ int r;
+
+ assert(source);
+
+ manager_print_jobs_in_progress(m);
+
+ r = sd_event_source_set_time_relative(source, JOBS_IN_PROGRESS_PERIOD_USEC);
+ if (r < 0)
+ return r;
+
+ return sd_event_source_set_enabled(source, SD_EVENT_ONESHOT);
+}
+
+/* The manager's main loop: repeatedly drain the various work queues (restarting
+ * from the top whenever one of them did work), then wait in sd_event_run() until
+ * the objective changes away from MANAGER_OK. Returns the final objective. A rate
+ * limit guards against a runaway loop. */
+int manager_loop(Manager *m) {
+ RateLimit rl = { .interval = 1*USEC_PER_SEC, .burst = 50000 };
+ int r;
+
+ assert(m);
+ assert(m->objective == MANAGER_OK); /* Ensure manager_startup() has been called */
+
+ manager_check_finished(m);
+
+ /* There might still be some zombies hanging around from before we were exec()'ed. Let's reap them. */
+ r = sd_event_source_set_enabled(m->sigchld_event_source, SD_EVENT_ON);
+ if (r < 0)
+ return log_error_errno(r, "Failed to enable SIGCHLD event source: %m");
+
+ while (m->objective == MANAGER_OK) {
+
+ (void) watchdog_ping();
+
+ if (!ratelimit_below(&rl)) {
+ /* Yay, something is going seriously wrong, pause a little */
+ log_warning("Looping too fast. Throttling execution a little.");
+ sleep(1);
+ }
+
+ /* Each queue is dispatched in turn; any progress restarts the cycle so
+ * that earlier (higher-priority) queues are re-checked first. */
+ if (manager_dispatch_load_queue(m) > 0)
+ continue;
+
+ if (manager_dispatch_gc_job_queue(m) > 0)
+ continue;
+
+ if (manager_dispatch_gc_unit_queue(m) > 0)
+ continue;
+
+ if (manager_dispatch_cleanup_queue(m) > 0)
+ continue;
+
+ if (manager_dispatch_cgroup_realize_queue(m) > 0)
+ continue;
+
+ if (manager_dispatch_start_when_upheld_queue(m) > 0)
+ continue;
+
+ if (manager_dispatch_stop_when_bound_queue(m) > 0)
+ continue;
+
+ if (manager_dispatch_stop_when_unneeded_queue(m) > 0)
+ continue;
+
+ if (manager_dispatch_release_resources_queue(m) > 0)
+ continue;
+
+ if (manager_dispatch_dbus_queue(m) > 0)
+ continue;
+
+ /* Sleep for watchdog runtime wait time */
+ r = sd_event_run(m->event, watchdog_runtime_wait());
+ if (r < 0)
+ return log_error_errno(r, "Failed to run event loop: %m");
+ }
+
+ return m->objective;
+}
+
/* Resolve a D-Bus unit object path to a Unit, loading the unit if necessary. The path's unescaped
 * suffix may be either a 128-bit invocation ID or a unit name. On success returns 0 and stores the
 * unit in *_u; on failure returns a negative errno with *e populated for D-Bus consumption. */
int manager_load_unit_from_dbus_path(Manager *m, const char *s, sd_bus_error *e, Unit **_u) {
        _cleanup_free_ char *n = NULL;
        sd_id128_t invocation_id;
        Unit *u;
        int r;

        assert(m);
        assert(s);
        assert(_u);

        r = unit_name_from_dbus_path(s, &n);
        if (r < 0)
                return r;

        /* Permit addressing units by invocation ID: if the passed bus path is suffixed by a 128-bit ID then
         * we use it as invocation ID. */
        r = sd_id128_from_string(n, &invocation_id);
        if (r >= 0) {
                u = hashmap_get(m->units_by_invocation_id, &invocation_id);
                if (u) {
                        *_u = u;
                        return 0;
                }

                return sd_bus_error_setf(e, BUS_ERROR_NO_UNIT_FOR_INVOCATION_ID,
                                         "No unit with the specified invocation ID " SD_ID128_FORMAT_STR " known.",
                                         SD_ID128_FORMAT_VAL(invocation_id));
        }

        /* If this didn't work, we check if this is a unit name */
        if (!unit_name_is_valid(n, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE)) {
                _cleanup_free_ char *nn = NULL;

                /* C-escape the string so it is safe to embed in the error message. */
                nn = cescape(n);
                return sd_bus_error_setf(e, SD_BUS_ERROR_INVALID_ARGS,
                                         "Unit name %s is neither a valid invocation ID nor unit name.", strnull(nn));
        }

        r = manager_load_unit(m, n, NULL, e, &u);
        if (r < 0)
                return r;

        *_u = u;
        return 0;
}
+
+int manager_get_job_from_dbus_path(Manager *m, const char *s, Job **_j) {
+ const char *p;
+ unsigned id;
+ Job *j;
+ int r;
+
+ assert(m);
+ assert(s);
+ assert(_j);
+
+ p = startswith(s, "/org/freedesktop/systemd1/job/");
+ if (!p)
+ return -EINVAL;
+
+ r = safe_atou(p, &id);
+ if (r < 0)
+ return r;
+
+ j = manager_get_job(m, id);
+ if (!j)
+ return -ENOENT;
+
+ *_j = j;
+
+ return 0;
+}
+
/* Emit a kernel audit record about a unit state change of the given audit 'type', tagged with the
 * unit's prefix+instance name. Only the system manager generates audit events; the whole body is
 * compiled out when audit support is disabled. Best-effort: errors are logged, never propagated. */
void manager_send_unit_audit(Manager *m, Unit *u, int type, bool success) {

#if HAVE_AUDIT
        _cleanup_free_ char *p = NULL;
        const char *msg;
        int audit_fd, r;

        if (!MANAGER_IS_SYSTEM(m))
                return;

        audit_fd = get_audit_fd();
        if (audit_fd < 0)
                return;

        /* Don't generate audit events if the service was already
         * started and we're just deserializing */
        if (MANAGER_IS_RELOADING(m))
                return;

        r = unit_name_to_prefix_and_instance(u->id, &p);
        if (r < 0) {
                log_warning_errno(r, "Failed to extract prefix and instance of unit name, ignoring: %m");
                return;
        }

        msg = strjoina("unit=", p);
        if (audit_log_user_comm_message(audit_fd, type, msg, "systemd", NULL, NULL, NULL, success) < 0) {
                if (ERRNO_IS_PRIVILEGE(errno)) {
                        /* We aren't allowed to send audit messages? Then let's not retry again. */
                        log_debug_errno(errno, "Failed to send audit message, closing audit socket: %m");
                        close_audit_fd();
                } else
                        log_warning_errno(errno, "Failed to send audit message, ignoring: %m");
        }
#endif

}
+
/* Tell plymouth (the boot splash daemon) that the given unit is being started so it can update its
 * progress display. Only relevant for the system manager, outside of containers, and only for unit
 * types that opt in via their vtable. Best-effort: failures are logged but not propagated. */
void manager_send_unit_plymouth(Manager *m, Unit *u) {
        _cleanup_free_ char *message = NULL;
        int c, r;

        /* Don't generate plymouth events if the service was already
         * started and we're just deserializing */
        if (MANAGER_IS_RELOADING(m))
                return;

        if (!MANAGER_IS_SYSTEM(m))
                return;

        if (detect_container() > 0)
                return;

        if (!UNIT_VTABLE(u)->notify_plymouth)
                return;

        /* Wire format per the format string below: 'U', \x02, a length byte covering the unit name
         * plus its NUL terminator, then the NUL-terminated unit name. */
        c = asprintf(&message, "U\x02%c%s%c", (int) (strlen(u->id) + 1), u->id, '\x00');
        if (c < 0)
                return (void) log_oom();

        /* We set SOCK_NONBLOCK here so that we rather drop the message then wait for plymouth */
        r = plymouth_send_raw(message, c, SOCK_NONBLOCK);
        if (r < 0)
                log_full_errno(ERRNO_IS_NO_PLYMOUTH(r) ? LOG_DEBUG : LOG_WARNING, r,
                               "Failed to communicate with plymouth: %m");
}
+
+usec_t manager_get_watchdog(Manager *m, WatchdogType t) {
+ assert(m);
+
+ if (MANAGER_IS_USER(m))
+ return USEC_INFINITY;
+
+ if (m->watchdog_overridden[t] != USEC_INFINITY)
+ return m->watchdog_overridden[t];
+
+ return m->watchdog[t];
+}
+
+void manager_set_watchdog(Manager *m, WatchdogType t, usec_t timeout) {
+
+ assert(m);
+
+ if (MANAGER_IS_USER(m))
+ return;
+
+ if (m->watchdog[t] == timeout)
+ return;
+
+ if (m->watchdog_overridden[t] == USEC_INFINITY) {
+ if (t == WATCHDOG_RUNTIME)
+ (void) watchdog_setup(timeout);
+ else if (t == WATCHDOG_PRETIMEOUT)
+ (void) watchdog_setup_pretimeout(timeout);
+ }
+
+ m->watchdog[t] = timeout;
+}
+
/* Apply a runtime override to the watchdog timeout of the given type, reprogramming the hardware
 * immediately. Passing USEC_INFINITY removes the override and reverts to the configured value. */
void manager_override_watchdog(Manager *m, WatchdogType t, usec_t timeout) {
        usec_t usec;

        assert(m);

        if (MANAGER_IS_USER(m))
                return;

        if (m->watchdog_overridden[t] == timeout)
                return;

        /* When the override is being cleared, fall back to programming the configured value. */
        usec = timeout == USEC_INFINITY ? m->watchdog[t] : timeout;
        if (t == WATCHDOG_RUNTIME)
                (void) watchdog_setup(usec);
        else if (t == WATCHDOG_PRETIMEOUT)
                (void) watchdog_setup_pretimeout(usec);

        m->watchdog_overridden[t] = timeout;
}
+
/* Set the configured watchdog pretimeout governor, applying it to the kernel immediately. No-op for
 * user managers or when the value is unchanged. Returns 0 on success, negative errno on failure. */
int manager_set_watchdog_pretimeout_governor(Manager *m, const char *governor) {
        _cleanup_free_ char *p = NULL;
        int r;

        assert(m);

        if (MANAGER_IS_USER(m))
                return 0;

        if (streq_ptr(m->watchdog_pretimeout_governor, governor))
                return 0;

        /* NOTE(review): if 'governor' can be NULL while a governor is currently set, the streq_ptr()
         * check above does not return early and strdup(NULL) below is undefined behavior — confirm
         * all callers pass a non-NULL string here. */
        p = strdup(governor);
        if (!p)
                return -ENOMEM;

        /* Apply to the kernel first; only persist our copy if that worked. */
        r = watchdog_setup_pretimeout_governor(governor);
        if (r < 0)
                return r;

        return free_and_replace(m->watchdog_pretimeout_governor, p);
}
+
/* Like manager_set_watchdog_pretimeout_governor(), but records the value as a runtime override
 * (kept separately from the configured value so a reload can restore the configuration). */
int manager_override_watchdog_pretimeout_governor(Manager *m, const char *governor) {
        _cleanup_free_ char *p = NULL;
        int r;

        assert(m);

        if (MANAGER_IS_USER(m))
                return 0;

        if (streq_ptr(m->watchdog_pretimeout_governor_overridden, governor))
                return 0;

        /* NOTE(review): same caveat as the setter above — strdup(NULL) is undefined behavior if a
         * NULL governor can reach this point while an override is set. */
        p = strdup(governor);
        if (!p)
                return -ENOMEM;

        /* Apply to the kernel first; only persist our copy if that worked. */
        r = watchdog_setup_pretimeout_governor(governor);
        if (r < 0)
                return r;

        return free_and_replace(m->watchdog_pretimeout_governor_overridden, p);
}
+
/* Implements "daemon-reload": serialize all manager and unit state to a temporary file, flush out
 * jobs, units, generated files and runtime objects, re-run generators and re-enumerate, then
 * deserialize the stored state back and coldplug. Unlike reexecution, this stays within the same
 * process image. Returns 0 on success, negative errno on (early) failure. */
int manager_reload(Manager *m) {
        /* 'reloading' is a scope guard: if we bail out before the point of no return it decrements
         * the reload counter again via manager_reloading_stopp(). */
        _unused_ _cleanup_(manager_reloading_stopp) Manager *reloading = NULL;
        _cleanup_fdset_free_ FDSet *fds = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        int r;

        assert(m);

        r = manager_open_serialization(m, &f);
        if (r < 0)
                return log_error_errno(r, "Failed to create serialization file: %m");

        fds = fdset_new();
        if (!fds)
                return log_oom();

        /* We are officially in reload mode from here on. */
        reloading = manager_reloading_start(m);

        r = manager_serialize(m, f, fds, false);
        if (r < 0)
                return r;

        if (fseeko(f, 0, SEEK_SET) < 0)
                return log_error_errno(errno, "Failed to seek to beginning of serialization: %m");

        /* 💀 This is the point of no return, from here on there is no way back. 💀 */
        reloading = NULL;

        bus_manager_send_reloading(m, true);

        /* Start by flushing out all jobs and units, all generated units, all runtime environments, all dynamic users
         * and everything else that is worth flushing out. We'll get it all back from the serialization — if we need
         * it. */

        manager_clear_jobs_and_units(m);
        lookup_paths_flush_generator(&m->lookup_paths);
        lookup_paths_free(&m->lookup_paths);
        exec_shared_runtime_vacuum(m);
        dynamic_user_vacuum(m, false);
        m->uid_refs = hashmap_free(m->uid_refs);
        m->gid_refs = hashmap_free(m->gid_refs);

        r = lookup_paths_init_or_warn(&m->lookup_paths, m->runtime_scope, 0, NULL);
        if (r < 0)
                return r;

        (void) manager_run_environment_generators(m);
        (void) manager_run_generators(m);

        lookup_paths_log(&m->lookup_paths);

        /* We flushed out generated files, for which we don't watch mtime, so we should flush the old map. */
        manager_free_unit_name_maps(m);
        m->unit_file_state_outdated = false;

        /* First, enumerate what we can from kernel and suchlike */
        manager_enumerate_perpetual(m);
        manager_enumerate(m);

        /* Second, deserialize our stored data */
        r = manager_deserialize(m, f, fds);
        if (r < 0)
                log_warning_errno(r, "Deserialization failed, proceeding anyway: %m");

        /* We don't need the serialization anymore */
        f = safe_fclose(f);

        /* Re-register notify_fd as event source, and set up other sockets/communication channels we might need */
        (void) manager_setup_notify(m);
        (void) manager_setup_cgroups_agent(m);
        (void) manager_setup_user_lookup_fd(m);

        /* Third, fire things up! */
        manager_coldplug(m);

        /* Clean up runtime objects no longer referenced */
        manager_vacuum(m);

        /* Clean up deserialized tracked clients */
        m->deserialized_subscribed = strv_free(m->deserialized_subscribed);

        /* Consider the reload process complete now. */
        assert(m->n_reloading > 0);
        m->n_reloading--;

        manager_ready(m);

        m->send_reloading_done = true;
        return 0;
}
+
/* Clear the "failed" state of every unit the manager knows about
 * (i.e. "systemctl reset-failed" without arguments). */
void manager_reset_failed(Manager *m) {
        Unit *u;

        assert(m);

        HASHMAP_FOREACH(u, m->units)
                unit_reset_failed(u);
}
+
+bool manager_unit_inactive_or_pending(Manager *m, const char *name) {
+ Unit *u;
+
+ assert(m);
+ assert(name);
+
+ /* Returns true if the unit is inactive or going down */
+ u = manager_get_unit(m, name);
+ if (!u)
+ return true;
+
+ return unit_inactive_or_pending(u);
+}
+
/* Log the system taint string, if any, exactly once per manager lifetime (system manager only).
 * Subsequent calls are no-ops thanks to the taint_logged latch. */
static void log_taint_string(Manager *m) {
        _cleanup_free_ char *taint = NULL;

        assert(m);

        if (MANAGER_IS_USER(m) || m->taint_logged)
                return;

        m->taint_logged = true; /* only check for taint once */

        taint = manager_taint_string(m);
        if (isempty(taint))
                return;

        log_struct(LOG_NOTICE,
                   LOG_MESSAGE("System is tainted: %s", taint),
                   "TAINT=%s", taint,
                   "MESSAGE_ID=" SD_MESSAGE_TAINTED_STR);
}
+
/* Compute the boot time breakdown (firmware/loader/kernel/initrd/userspace), log the one-time
 * "Startup finished" message, broadcast the corresponding bus signal and log the taint string. */
static void manager_notify_finished(Manager *m) {
        usec_t firmware_usec, loader_usec, kernel_usec, initrd_usec, userspace_usec, total_usec;

        if (MANAGER_IS_TEST_RUN(m))
                return;

        if (MANAGER_IS_SYSTEM(m) && detect_container() <= 0) {
                /* Buffer sized for the two optional "<span> (firmware) + " / "<span> (loader) + " prefixes. */
                char buf[FORMAT_TIMESPAN_MAX + STRLEN(" (firmware) + ") + FORMAT_TIMESPAN_MAX + STRLEN(" (loader) + ")]
                        = {};
                char *p = buf;
                size_t size = sizeof buf;

                /* Note that MANAGER_TIMESTAMP_KERNEL's monotonic value is always at 0, and
                 * MANAGER_TIMESTAMP_FIRMWARE's and MANAGER_TIMESTAMP_LOADER's monotonic value should be considered
                 * negative values. */

                firmware_usec = m->timestamps[MANAGER_TIMESTAMP_FIRMWARE].monotonic - m->timestamps[MANAGER_TIMESTAMP_LOADER].monotonic;
                loader_usec = m->timestamps[MANAGER_TIMESTAMP_LOADER].monotonic - m->timestamps[MANAGER_TIMESTAMP_KERNEL].monotonic;
                userspace_usec = m->timestamps[MANAGER_TIMESTAMP_FINISH].monotonic - m->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic;
                /* Addition (not subtraction) is deliberate here: per the note above the firmware timestamp
                 * is an offset *before* the kernel's monotonic zero. */
                total_usec = m->timestamps[MANAGER_TIMESTAMP_FIRMWARE].monotonic + m->timestamps[MANAGER_TIMESTAMP_FINISH].monotonic;

                if (firmware_usec > 0)
                        size = strpcpyf(&p, size, "%s (firmware) + ", FORMAT_TIMESPAN(firmware_usec, USEC_PER_MSEC));
                if (loader_usec > 0)
                        size = strpcpyf(&p, size, "%s (loader) + ", FORMAT_TIMESPAN(loader_usec, USEC_PER_MSEC));

                if (dual_timestamp_is_set(&m->timestamps[MANAGER_TIMESTAMP_INITRD])) {

                        /* The initrd case on bare-metal */
                        kernel_usec = m->timestamps[MANAGER_TIMESTAMP_INITRD].monotonic - m->timestamps[MANAGER_TIMESTAMP_KERNEL].monotonic;
                        initrd_usec = m->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic - m->timestamps[MANAGER_TIMESTAMP_INITRD].monotonic;

                        log_struct(LOG_INFO,
                                   "MESSAGE_ID=" SD_MESSAGE_STARTUP_FINISHED_STR,
                                   "KERNEL_USEC="USEC_FMT, kernel_usec,
                                   "INITRD_USEC="USEC_FMT, initrd_usec,
                                   "USERSPACE_USEC="USEC_FMT, userspace_usec,
                                   LOG_MESSAGE("Startup finished in %s%s (kernel) + %s (initrd) + %s (userspace) = %s.",
                                               buf,
                                               FORMAT_TIMESPAN(kernel_usec, USEC_PER_MSEC),
                                               FORMAT_TIMESPAN(initrd_usec, USEC_PER_MSEC),
                                               FORMAT_TIMESPAN(userspace_usec, USEC_PER_MSEC),
                                               FORMAT_TIMESPAN(total_usec, USEC_PER_MSEC)));
                } else {
                        /* The initrd-less case on bare-metal */

                        kernel_usec = m->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic - m->timestamps[MANAGER_TIMESTAMP_KERNEL].monotonic;
                        initrd_usec = 0;

                        log_struct(LOG_INFO,
                                   "MESSAGE_ID=" SD_MESSAGE_STARTUP_FINISHED_STR,
                                   "KERNEL_USEC="USEC_FMT, kernel_usec,
                                   "USERSPACE_USEC="USEC_FMT, userspace_usec,
                                   LOG_MESSAGE("Startup finished in %s%s (kernel) + %s (userspace) = %s.",
                                               buf,
                                               FORMAT_TIMESPAN(kernel_usec, USEC_PER_MSEC),
                                               FORMAT_TIMESPAN(userspace_usec, USEC_PER_MSEC),
                                               FORMAT_TIMESPAN(total_usec, USEC_PER_MSEC)));
                }
        } else {
                /* The container and --user case */
                firmware_usec = loader_usec = initrd_usec = kernel_usec = 0;
                total_usec = userspace_usec = m->timestamps[MANAGER_TIMESTAMP_FINISH].monotonic - m->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic;

                log_struct(LOG_INFO,
                           "MESSAGE_ID=" SD_MESSAGE_USER_STARTUP_FINISHED_STR,
                           "USERSPACE_USEC="USEC_FMT, userspace_usec,
                           LOG_MESSAGE("Startup finished in %s.",
                                       FORMAT_TIMESPAN(total_usec, USEC_PER_MSEC)));
        }

        bus_manager_send_finished(m, firmware_usec, loader_usec, kernel_usec, initrd_usec, userspace_usec, total_usec);

        log_taint_string(m);
}
+
/* User managers announce readiness early, on reaching basic.target, with a matching STATUS= text. */
static void user_manager_send_ready(Manager *m) {
        int r;

        assert(m);

        /* We send READY=1 on reaching basic.target only when running in --user mode. */
        if (!MANAGER_IS_USER(m) || m->ready_sent)
                return;

        r = sd_notify(false,
                      "READY=1\n"
                      "STATUS=Reached " SPECIAL_BASIC_TARGET ".");
        if (r < 0)
                log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");

        m->ready_sent = true;
        /* status_ready stays false: the STATUS= text above is not the final "Ready." one, so a later
         * manager_send_ready() call will still send the full notification (it skips only when both
         * ready_sent and status_ready are set). */
        m->status_ready = false;
}
+
+static void manager_send_ready(Manager *m) {
+ int r;
+
+ if (m->ready_sent && m->status_ready)
+ /* Skip the notification if nothing changed. */
+ return;
+
+ r = sd_notify(false,
+ "READY=1\n"
+ "STATUS=Ready.");
+ if (r < 0)
+ log_full_errno(m->ready_sent ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to send readiness notification, ignoring: %m");
+
+ m->ready_sent = m->status_ready = true;
+}
+
/* Once basic.target is active (or reloading), perform the one-time actions tied to that milestone:
 * user managers announce readiness, and the taint string is logged. */
static void manager_check_basic_target(Manager *m) {
        Unit *u;

        assert(m);

        /* Small shortcut */
        if (m->ready_sent && m->taint_logged)
                return;

        u = manager_get_unit(m, SPECIAL_BASIC_TARGET);
        if (!u || !UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u)))
                return;

        /* For user managers, send out READY=1 as soon as we reach basic.target */
        user_manager_send_ready(m);

        /* Log the taint string as soon as we reach basic.target */
        log_taint_string(m);
}
+
/* Invoked on job/unit state changes to detect the moment the startup transaction completes (no jobs
 * left). At that point it announces readiness, closes the Type=idle pipes, turns interactive
 * features off, stamps MANAGER_TIMESTAMP_FINISH and emits the one-time finished notifications. */
void manager_check_finished(Manager *m) {
        assert(m);

        if (MANAGER_IS_RELOADING(m))
                return;

        /* Verify that we have entered the event loop already, and not left it again. */
        if (!MANAGER_IS_RUNNING(m))
                return;

        manager_check_basic_target(m);

        if (hashmap_size(m->jobs) > 0) {
                /* Still jobs pending: keep the in-progress feedback timer ticking and try again later. */
                if (m->jobs_in_progress_event_source)
                        /* Ignore any failure, this is only for feedback */
                        (void) sd_event_source_set_time(m->jobs_in_progress_event_source,
                                                        manager_watch_jobs_next_time(m));
                return;
        }

        /* The jobs hashmap tends to grow a lot during boot, and then it's not reused until shutdown. Let's
           kill the hashmap if it is relatively large. */
        if (hashmap_buckets(m->jobs) > hashmap_size(m->units) / 10)
                m->jobs = hashmap_free(m->jobs);

        manager_send_ready(m);

        /* Notify Type=idle units that we are done now */
        manager_close_idle_pipe(m);

        /* Everything below runs only once: MANAGER_IS_FINISHED() latches after the first pass. */
        if (MANAGER_IS_FINISHED(m))
                return;

        manager_flip_auto_status(m, false, "boot finished");

        /* Turn off confirm spawn now */
        m->confirm_spawn = NULL;

        /* No need to update ask password status when we're going non-interactive */
        manager_close_ask_password(m);

        /* This is no longer the first boot */
        manager_set_first_boot(m, false);

        dual_timestamp_now(m->timestamps + MANAGER_TIMESTAMP_FINISH);

        manager_notify_finished(m);

        manager_invalidate_startup_units(m);
}
+
/* Announce to our own supervisor that a reload has begun, following the sd_notify(3) reload
 * protocol (RELOADING=1 plus a MONOTONIC_USEC timestamp). */
void manager_send_reloading(Manager *m) {
        assert(m);

        /* Let whoever invoked us know that we are now reloading */
        (void) sd_notifyf(/* unset= */ false,
                          "RELOADING=1\n"
                          "MONOTONIC_USEC=" USEC_FMT "\n", now(CLOCK_MONOTONIC));

        /* And ensure that we'll send READY=1 again as soon as we are ready again */
        m->ready_sent = false;
}
+
/* Returns true if at least one of the given generator directories exists. Deliberately keeps
 * scanning after a hit so every inaccessible directory still gets its warning logged. */
static bool generator_path_any(const char* const* paths) {
        bool found = false;

        /* Optimize by skipping the whole process by not creating output directories
         * if no generators are found. */
        STRV_FOREACH(path, paths)
                if (access(*path, F_OK) == 0)
                        found = true;
                else if (errno != ENOENT)
                        log_warning_errno(errno, "Failed to open generator directory %s: %m", *path);

        return found;
}
+
/* Execute all installed environment generators (if any) and fold their output into the manager's
 * transient environment via the gather_environment() pipeline callbacks. */
static int manager_run_environment_generators(Manager *m) {
        char **tmp = NULL; /* this is only used in the forked process, no cleanup here */
        _cleanup_strv_free_ char **paths = NULL;
        /* Per-phase state handed to the gather_environment() callbacks. */
        void* args[] = {
                [STDOUT_GENERATE] = &tmp,
                [STDOUT_COLLECT] = &tmp,
                [STDOUT_CONSUME] = &m->transient_environment,
        };
        int r;

        /* In test mode only run when explicitly requested. */
        if (MANAGER_IS_TEST_RUN(m) && !(m->test_run_flags & MANAGER_TEST_RUN_ENV_GENERATORS))
                return 0;

        paths = env_generator_binary_paths(m->runtime_scope);
        if (!paths)
                return log_oom();

        if (!generator_path_any((const char* const*) paths))
                return 0;

        WITH_UMASK(0022)
                r = execute_directories((const char* const*) paths, DEFAULT_TIMEOUT_USEC, gather_environment,
                                        args, NULL, m->transient_environment,
                                        EXEC_DIR_PARALLEL | EXEC_DIR_IGNORE_ERRORS | EXEC_DIR_SET_SYSTEMD_EXEC_PID);
        return r;
}
+
/* Build the environment block passed to unit generators: the transient environment plus a set of
 * SYSTEMD_* variables describing scope, initrd/first-boot state, virtualization and architecture.
 * On success stores a newly allocated strv in *ret and returns 0. */
static int build_generator_environment(Manager *m, char ***ret) {
        _cleanup_strv_free_ char **nl = NULL;
        Virtualization v;
        ConfidentialVirtualization cv;
        int r;

        assert(m);
        assert(ret);

        /* Generators oftentimes want to know some basic facts about the environment they run in, in order to
         * adjust generated units to that. Let's pass down some bits of information that are easy for us to
         * determine (but a bit harder for generator scripts to determine), as environment variables. */

        nl = strv_copy(m->transient_environment);
        if (!nl)
                return -ENOMEM;

        r = strv_env_assign(&nl, "SYSTEMD_SCOPE", runtime_scope_to_string(m->runtime_scope));
        if (r < 0)
                return r;

        if (MANAGER_IS_SYSTEM(m)) {
                /* Note that $SYSTEMD_IN_INITRD may be used to override the initrd detection in much of our
                 * codebase. This is hence more than purely informational. It will shortcut detection of the
                 * initrd state if generators invoke our own tools. But that's OK, as it would come to the
                 * same results (hopefully). */
                r = strv_env_assign(&nl, "SYSTEMD_IN_INITRD", one_zero(in_initrd()));
                if (r < 0)
                        return r;

                /* m->first_boot is a tristate; stay silent while it is still undetermined (< 0). */
                if (m->first_boot >= 0) {
                        r = strv_env_assign(&nl, "SYSTEMD_FIRST_BOOT", one_zero(m->first_boot));
                        if (r < 0)
                                return r;
                }
        }

        v = detect_virtualization();
        if (v < 0)
                log_debug_errno(v, "Failed to detect virtualization, ignoring: %m");
        else if (v > 0) {
                const char *s;

                s = strjoina(VIRTUALIZATION_IS_VM(v) ? "vm:" :
                             VIRTUALIZATION_IS_CONTAINER(v) ? "container:" : ":",
                             virtualization_to_string(v));

                r = strv_env_assign(&nl, "SYSTEMD_VIRTUALIZATION", s);
                if (r < 0)
                        return r;
        }

        cv = detect_confidential_virtualization();
        if (cv < 0)
                log_debug_errno(cv, "Failed to detect confidential virtualization, ignoring: %m");
        else if (cv > 0) {
                r = strv_env_assign(&nl, "SYSTEMD_CONFIDENTIAL_VIRTUALIZATION", confidential_virtualization_to_string(cv));
                if (r < 0)
                        return r;
        }

        r = strv_env_assign(&nl, "SYSTEMD_ARCHITECTURE", architecture_to_string(uname_architecture()));
        if (r < 0)
                return r;

        *ret = TAKE_PTR(nl);
        return 0;
}
+
/* Actually run the generator binaries from the given paths, passing the three generator output
 * directories as argv[1..3]. When remount_ro is set (system manager inside its sandbox fork), most
 * of the filesystem tree is first remounted read-only. */
static int manager_execute_generators(Manager *m, char **paths, bool remount_ro) {
        _cleanup_strv_free_ char **ge = NULL;
        const char *argv[] = {
                NULL, /* Leave this empty, execute_directory() will fill something in */
                m->lookup_paths.generator,
                m->lookup_paths.generator_early,
                m->lookup_paths.generator_late,
                NULL,
        };
        int r;

        r = build_generator_environment(m, &ge);
        if (r < 0)
                return log_error_errno(r, "Failed to build generator environment: %m");

        if (remount_ro) {
                /* Remount most of the filesystem tree read-only. We leave /sys/ as-is, because our code
                 * checks whether it is read-only to detect containerized execution environments. We leave
                 * /run/ as-is too, because that's where our output goes. We also leave /proc/ and /dev/shm/
                 * because they're API, and /tmp/ that safe_fork() mounted for us.
                 */
                r = bind_remount_recursive("/", MS_RDONLY, MS_RDONLY,
                                           STRV_MAKE("/sys", "/run", "/proc", "/dev/shm", "/tmp"));
                if (r < 0)
                        log_warning_errno(r, "Read-only bind remount failed, ignoring: %m");
        }

        BLOCK_WITH_UMASK(0022);
        return execute_directories(
                        (const char* const*) paths,
                        DEFAULT_TIMEOUT_USEC,
                        /* callbacks= */ NULL, /* callback_args= */ NULL,
                        (char**) argv,
                        ge,
                        EXEC_DIR_PARALLEL | EXEC_DIR_IGNORE_ERRORS | EXEC_DIR_SET_SYSTEMD_EXEC_PID);
}
+
/* Run all unit generators. The system manager executes them in a forked, sandboxed mount namespace
 * (read-only root, private /tmp/ when possible), falling back to direct execution if sandboxing is
 * unavailable; the user manager always executes them directly. */
static int manager_run_generators(Manager *m) {
        ForkFlags flags = FORK_RESET_SIGNALS | FORK_WAIT | FORK_NEW_MOUNTNS | FORK_MOUNTNS_SLAVE;
        _cleanup_strv_free_ char **paths = NULL;
        int r;

        assert(m);

        /* In test mode only run when explicitly requested. */
        if (MANAGER_IS_TEST_RUN(m) && !(m->test_run_flags & MANAGER_TEST_RUN_GENERATORS))
                return 0;

        paths = generator_binary_paths(m->runtime_scope);
        if (!paths)
                return log_oom();

        if (!generator_path_any((const char* const*) paths))
                return 0;

        r = lookup_paths_mkdir_generator(&m->lookup_paths);
        if (r < 0) {
                log_error_errno(r, "Failed to create generator directories: %m");
                goto finish;
        }

        /* If we are the system manager, we fork and invoke the generators in a sanitized mount namespace. If
         * we are the user manager, let's just execute the generators directly. We might not have the
         * necessary privileges, and the system manager has already mounted /tmp/ and everything else for us.
         */
        if (MANAGER_IS_USER(m)) {
                r = manager_execute_generators(m, paths, /* remount_ro= */ false);
                goto finish;
        }

        /* On some systems /tmp/ doesn't exist, and on some other systems we cannot create it at all. Avoid
         * trying to mount a private tmpfs on it as there's no one size fits all. */
        if (is_dir("/tmp", /* follow= */ false) > 0)
                flags |= FORK_PRIVATE_TMP;

        r = safe_fork("(sd-gens)", flags, NULL);
        if (r == 0) {
                /* Child: run the generators sandboxed, then exit. */
                r = manager_execute_generators(m, paths, /* remount_ro= */ true);
                _exit(r >= 0 ? EXIT_SUCCESS : EXIT_FAILURE);
        }
        if (r < 0) {
                if (!ERRNO_IS_PRIVILEGE(r) && r != -EINVAL) {
                        log_error_errno(r, "Failed to fork off sandboxing environment for executing generators: %m");
                        goto finish;
                }

                /* Failed to fork with new mount namespace? Maybe, running in a container environment with
                 * seccomp or without capability.
                 *
                 * We also allow -EINVAL to allow running without CLONE_NEWNS.
                 *
                 * Also, when running on non-native userland architecture via systemd-nspawn and
                 * qemu-user-static QEMU-emulator, clone() with CLONE_NEWNS fails with EINVAL, see
                 * https://github.com/systemd/systemd/issues/28901.
                 */
                log_debug_errno(r,
                                "Failed to fork off sandboxing environment for executing generators. "
                                "Falling back to execute generators without sandboxing: %m");
                r = manager_execute_generators(m, paths, /* remount_ro= */ false);
        }

finish:
        lookup_paths_trim_generator(&m->lookup_paths);
        return r;
}
+
/* Merge the given variables into the manager's transient environment (the base environment inherited
 * by units), sanitizing the merged result. Returns 0 on success, negative errno on OOM. */
int manager_transient_environment_add(Manager *m, char **plus) {
        char **a;

        assert(m);

        if (strv_isempty(plus))
                return 0;

        a = strv_env_merge(m->transient_environment, plus);
        if (!a)
                return log_oom();

        sanitize_environment(a);

        return strv_free_and_replace(m->transient_environment, a);
}
+
/* Apply removals ('minus') and additions ('plus') to the client-supplied environment block (the part
 * settable via D-Bus). Returns 0 on success, -ENOMEM on allocation failure (leaving the existing
 * environment untouched). */
int manager_client_environment_modify(
                Manager *m,
                char **minus,
                char **plus) {

        /* 'l' always points at the current candidate list; 'a' and 'b' hold the intermediate
         * allocations so that whichever did not become the final list can be freed below. */
        char **a = NULL, **b = NULL, **l;

        assert(m);

        if (strv_isempty(minus) && strv_isempty(plus))
                return 0;

        l = m->client_environment;

        if (!strv_isempty(minus)) {
                a = strv_env_delete(l, 1, minus);
                if (!a)
                        return -ENOMEM;

                l = a;
        }

        if (!strv_isempty(plus)) {
                b = strv_env_merge(l, plus);
                if (!b) {
                        strv_free(a);
                        return -ENOMEM;
                }

                l = b;
        }

        /* Free every list that is not the final one; the comparisons guard against double-frees when
         * one of the steps was skipped and the pointers alias. */
        if (m->client_environment != l)
                strv_free(m->client_environment);

        if (a != l)
                strv_free(a);
        if (b != l)
                strv_free(b);

        m->client_environment = sanitize_environment(l);
        return 0;
}
+
+int manager_get_effective_environment(Manager *m, char ***ret) {
+ char **l;
+
+ assert(m);
+ assert(ret);
+
+ l = strv_env_merge(m->transient_environment, m->client_environment);
+ if (!l)
+ return -ENOMEM;
+
+ *ret = l;
+ return 0;
+}
+
/* Replace the manager's per-unit default settings with the given set. All allocations (SMACK label,
 * rlimit copies) are made up front so the function either fully succeeds or leaves the previous
 * defaults untouched. Returns 0 on success, negative errno on OOM. */
int manager_set_unit_defaults(Manager *m, const UnitDefaults *defaults) {
        _cleanup_free_ char *label = NULL;
        struct rlimit *rlimit[_RLIMIT_MAX];
        int r;

        assert(m);
        assert(defaults);

        /* "/" is treated as an explicit request for no SMACK process label (overriding any
         * compile-time default). */
        if (streq_ptr(defaults->smack_process_label, "/"))
                label = NULL;
        else {
                const char *l = defaults->smack_process_label;
#ifdef SMACK_DEFAULT_PROCESS_LABEL
                if (!l)
                        l = SMACK_DEFAULT_PROCESS_LABEL;
#endif
                if (l) {
                        label = strdup(l);
                        if (!label)
                                return -ENOMEM;
                } else
                        label = NULL;
        }

        r = rlimit_copy_all(rlimit, defaults->rlimit);
        if (r < 0)
                return r;

        /* From here on nothing can fail anymore; copy everything over field by field. */
        m->defaults.std_output = defaults->std_output;
        m->defaults.std_error = defaults->std_error;

        m->defaults.restart_usec = defaults->restart_usec;
        m->defaults.timeout_start_usec = defaults->timeout_start_usec;
        m->defaults.timeout_stop_usec = defaults->timeout_stop_usec;
        m->defaults.timeout_abort_usec = defaults->timeout_abort_usec;
        m->defaults.timeout_abort_set = defaults->timeout_abort_set;
        m->defaults.device_timeout_usec = defaults->device_timeout_usec;

        m->defaults.start_limit_interval = defaults->start_limit_interval;
        m->defaults.start_limit_burst = defaults->start_limit_burst;

        m->defaults.cpu_accounting = defaults->cpu_accounting;
        m->defaults.memory_accounting = defaults->memory_accounting;
        m->defaults.io_accounting = defaults->io_accounting;
        m->defaults.blockio_accounting = defaults->blockio_accounting;
        m->defaults.tasks_accounting = defaults->tasks_accounting;
        m->defaults.ip_accounting = defaults->ip_accounting;

        m->defaults.tasks_max = defaults->tasks_max;
        m->defaults.timer_accuracy_usec = defaults->timer_accuracy_usec;

        m->defaults.oom_policy = defaults->oom_policy;
        m->defaults.oom_score_adjust = defaults->oom_score_adjust;
        m->defaults.oom_score_adjust_set = defaults->oom_score_adjust_set;

        m->defaults.memory_pressure_watch = defaults->memory_pressure_watch;
        m->defaults.memory_pressure_threshold_usec = defaults->memory_pressure_threshold_usec;

        free_and_replace(m->defaults.smack_process_label, label);
        rlimit_free_all(m->defaults.rlimit);
        memcpy(m->defaults.rlimit, rlimit, sizeof(struct rlimit*) * _RLIMIT_MAX);

        return 0;
}
+
/* Called whenever the D-Bus service/socket units change state, to bring our bus connections up or
 * tear them down accordingly. */
void manager_recheck_dbus(Manager *m) {
        assert(m);

        /* Connects to the bus if the dbus service and socket are running. If we are running in user mode
         * this is all it does. In system mode we'll also connect to the system bus (which will most likely
         * just reuse the connection of the API bus). That's because the system bus after all runs as service
         * of the system instance, while in the user instance we can assume it's already there. */

        if (MANAGER_IS_RELOADING(m))
                return; /* don't check while we are reloading… */

        if (manager_dbus_is_running(m, false)) {
                (void) bus_init_api(m);

                if (MANAGER_IS_SYSTEM(m))
                        (void) bus_init_system(m);
        } else {
                /* dbus is gone — drop the connections again. */
                (void) bus_done_api(m);

                if (MANAGER_IS_SYSTEM(m))
                        (void) bus_done_system(m);
        }
}
+
/* Returns true if journald is fully up and able to receive log messages: both the journald socket
 * (in RUNNING state) and the journald service itself (RUNNING or RELOAD) must be up. */
static bool manager_journal_is_running(Manager *m) {
        Unit *u;

        assert(m);

        if (MANAGER_IS_TEST_RUN(m))
                return false;

        /* If we are the user manager we can safely assume that the journal is up */
        if (!MANAGER_IS_SYSTEM(m))
                return true;

        /* Check that the socket is not only up, but in RUNNING state */
        u = manager_get_unit(m, SPECIAL_JOURNALD_SOCKET);
        if (!u)
                return false;
        if (SOCKET(u)->state != SOCKET_RUNNING)
                return false;

        /* Similar, check if the daemon itself is fully up, too */
        u = manager_get_unit(m, SPECIAL_JOURNALD_SERVICE);
        if (!u)
                return false;
        if (!IN_SET(SERVICE(u)->state, SERVICE_RELOAD, SERVICE_RUNNING))
                return false;

        return true;
}
+
/* Disable kernel's printk ratelimit ("on" = never ratelimit userspace writes to /dev/kmsg).
 *
 * Logging to /dev/kmsg is most useful during early boot and shutdown, where normal logging
 * mechanisms are not available. The semantics of this sysctl are such that any kernel command-line
 * setting takes precedence. Best-effort: failure is only logged at debug level. */
void disable_printk_ratelimit(void) {
        int r;

        r = sysctl_write("kernel/printk_devkmsg", "on");
        if (r < 0)
                log_debug_errno(r, "Failed to set sysctl kernel.printk_devkmsg=on: %m");
}
+
/* Re-evaluate whether PID 1 may log via IPC (i.e. to journald) and reopen the log accordingly. */
void manager_recheck_journal(Manager *m) {

        assert(m);

        /* Don't bother with this unless we are in the special situation of being PID 1 */
        if (getpid_cached() != 1)
                return;

        /* Don't check this while we are reloading, things might still change */
        if (MANAGER_IS_RELOADING(m))
                return;

        /* The journal is fully and entirely up? If so, let's permit logging to it, if that's configured. If
         * the journal is down, don't ever log to it, otherwise we might end up deadlocking ourselves as we
         * might trigger an activation ourselves we can't fulfill. */
        log_set_prohibit_ipc(!manager_journal_is_running(m));
        log_open();
}
+
+static ShowStatus manager_get_show_status(Manager *m) {
+ assert(m);
+
+ if (MANAGER_IS_USER(m))
+ return _SHOW_STATUS_INVALID;
+
+ if (m->show_status_overridden != _SHOW_STATUS_INVALID)
+ return m->show_status_overridden;
+
+ return m->show_status;
+}
+
/* Returns true if console status output is currently enabled, taking any runtime override into account. */
bool manager_get_show_status_on(Manager *m) {
        assert(m);

        return show_status_on(manager_get_show_status(m));
}
+
+static void set_show_status_marker(bool b) {
+ if (b)
+ (void) touch("/run/systemd/show-status");
+ else
+ (void) unlink("/run/systemd/show-status");
+}
+
/* Change the configured show-status mode, logging the transition (with 'reason') and updating the
 * marker file — unless a runtime override is active, in which case only the stored value changes. */
void manager_set_show_status(Manager *m, ShowStatus mode, const char *reason) {
        assert(m);
        assert(reason);
        assert(mode >= 0 && mode < _SHOW_STATUS_MAX);

        if (MANAGER_IS_USER(m))
                return;

        if (mode == m->show_status)
                return;

        /* Only visible when no override is in effect; an override always wins. */
        if (m->show_status_overridden == _SHOW_STATUS_INVALID) {
                bool enabled;

                enabled = show_status_on(mode);
                log_debug("%s (%s) showing of status (%s).",
                          enabled ? "Enabling" : "Disabling",
                          strna(show_status_to_string(mode)),
                          reason);

                set_show_status_marker(enabled);
        }

        m->show_status = mode;
}
+
/* Apply (or, with _SHOW_STATUS_INVALID, remove) a runtime override of the show-status mode, logging
 * the transition and refreshing the marker file. */
void manager_override_show_status(Manager *m, ShowStatus mode, const char *reason) {
        assert(m);
        assert(mode < _SHOW_STATUS_MAX);

        if (MANAGER_IS_USER(m))
                return;

        if (mode == m->show_status_overridden)
                return;

        m->show_status_overridden = mode;

        /* When the override is removed, report and apply the configured mode instead. */
        if (mode == _SHOW_STATUS_INVALID)
                mode = m->show_status;

        log_debug("%s (%s) showing of status (%s).",
                  m->show_status_overridden != _SHOW_STATUS_INVALID ? "Overriding" : "Restoring",
                  strna(show_status_to_string(mode)),
                  reason);

        set_show_status_marker(show_status_on(mode));
}
+
/* Return the TTY to use for confirm-spawn prompts: the configured one if it passes a basic sanity
 * check (exists and is a character device), otherwise /dev/console. The first failure per distinct
 * errno is logged as a warning; repeats are silent. */
const char *manager_get_confirm_spawn(Manager *m) {
        static int last_errno = 0;
        struct stat st;
        int r;

        assert(m);

        /* Here's the deal: we want to test the validity of the console but don't want
         * PID1 to go through the whole console process which might block. But we also
         * want to warn the user only once if something is wrong with the console so we
         * cannot do the sanity checks after spawning our children. So here we simply do
         * really basic tests to hopefully trap common errors.
         *
         * If the console suddenly disappears by the time our children really need it,
         * then they will simply fail to acquire it and a positive answer will be
         * assumed. New children will fall back to /dev/console though.
         *
         * Note: TTYs are devices that can come and go any time, and frequently aren't
         * available yet during early boot (consider a USB rs232 dongle...). If for any
         * reason the configured console is not ready, we fall back to the default
         * console. */

        if (!m->confirm_spawn || path_equal(m->confirm_spawn, "/dev/console"))
                return m->confirm_spawn;

        if (stat(m->confirm_spawn, &st) < 0) {
                r = -errno;
                goto fail;
        }

        if (!S_ISCHR(st.st_mode)) {
                r = -ENOTTY;
                goto fail;
        }

        last_errno = 0;
        return m->confirm_spawn;

fail:
        if (last_errno != r)
                last_errno = log_warning_errno(r, "Failed to open %s, using default console: %m", m->confirm_spawn);

        return "/dev/console";
}
+
+/* Record whether this is the system's first boot, mirroring the state into the
+ * /run/systemd/first-boot flag file. No-op for user managers. Note that m->first_boot is a
+ * tri-state int, hence the (int) cast in the comparison. */
+void manager_set_first_boot(Manager *m, bool b) {
+ assert(m);
+
+ if (!MANAGER_IS_SYSTEM(m))
+ return;
+
+ if (m->first_boot != (int) b) {
+ if (b)
+ (void) touch("/run/systemd/first-boot");
+ else
+ (void) unlink("/run/systemd/first-boot");
+ }
+
+ m->first_boot = b;
+}
+
+/* Create the flag file that globally disables spawn confirmation; best-effort. */
+void manager_disable_confirm_spawn(void) {
+ (void) touch("/run/systemd/confirm_spawn_disabled");
+}
+
+/* Decide whether a status message of the given type should be printed to the console right now.
+ * Only the system manager during init/starting/stopping shows status, and not while a password
+ * is being asked for (except for emergency messages). */
+static bool manager_should_show_status(Manager *m, StatusType type) {
+ assert(m);
+
+ if (!MANAGER_IS_SYSTEM(m))
+ return false;
+
+ if (m->no_console_output)
+ return false;
+
+ if (!IN_SET(manager_state(m), MANAGER_INITIALIZING, MANAGER_STARTING, MANAGER_STOPPING))
+ return false;
+
+ /* If we cannot find out the status properly, just proceed. */
+ if (type != STATUS_TYPE_EMERGENCY && manager_check_ask_password(m) > 0)
+ return false;
+
+ /* Notices are suppressed only when status output is explicitly set to "no" */
+ if (type == STATUS_TYPE_NOTICE && m->show_status != SHOW_STATUS_NO)
+ return true;
+
+ return manager_get_show_status_on(m);
+}
+
+/* Print a (possibly ephemeral) status line to the console, subject to the visibility rules of
+ * manager_should_show_status(). */
+void manager_status_printf(Manager *m, StatusType type, const char *status, const char *format, ...) {
+ va_list ap;
+
+ /* If m is NULL, assume we're after shutdown and let the messages through. */
+
+ if (m && !manager_should_show_status(m, type))
+ return;
+
+ /* XXX We should totally drop the check for ephemeral here
+ * and thus effectively make 'Type=idle' pointless. */
+ if (type == STATUS_TYPE_EPHEMERAL && m && m->n_on_console > 0)
+ return;
+
+ va_start(ap, format);
+ status_vprintf(status, SHOW_STATUS_ELLIPSIZE|(type == STATUS_TYPE_EPHEMERAL ? SHOW_STATUS_EPHEMERAL : 0), format, ap);
+ va_end(ap);
+}
+
+/* Look up the set of units that require mounts for the given path prefix, or NULL if none.
+ * The root path "/" is stored under the empty-string key. */
+Set* manager_get_units_requiring_mounts_for(Manager *m, const char *path) {
+ assert(m);
+ assert(path);
+
+ if (path_equal(path, "/"))
+ path = "";
+
+ return hashmap_get(m->units_requiring_mounts_for, path);
+}
+
+/* Add 'u' to, or remove it from, the set of failed units, sending a bus change signal whenever
+ * the set's membership actually changed. Returns 0 on success, or the result of log_oom() if
+ * growing the set failed. */
+int manager_update_failed_units(Manager *m, Unit *u, bool failed) {
+ unsigned size;
+ int r;
+
+ assert(m);
+ assert(u); /* u is dereferenced below, so validate it explicitly */
+ assert(u->manager == m);
+
+ size = set_size(m->failed_units);
+
+ if (failed) {
+ r = set_ensure_put(&m->failed_units, NULL, u);
+ if (r < 0)
+ return log_oom();
+ } else
+ (void) set_remove(m->failed_units, u);
+
+ /* Only bother the bus when the set actually changed size */
+ if (set_size(m->failed_units) != size)
+ bus_manager_send_change_signal(m);
+
+ return 0;
+}
+
+/* Derive the externally visible manager state: shutdown target active/queued → STOPPING;
+ * boot not finished → INITIALIZING (before basic.target) or STARTING; rescue/emergency target
+ * active/queued (system manager only) → MAINTENANCE; failed units present → DEGRADED;
+ * otherwise RUNNING. */
+ManagerState manager_state(Manager *m) {
+ Unit *u;
+
+ assert(m);
+
+ /* Is the special shutdown target active or queued? If so, we are in shutdown state */
+ u = manager_get_unit(m, SPECIAL_SHUTDOWN_TARGET);
+ if (u && unit_active_or_pending(u))
+ return MANAGER_STOPPING;
+
+ /* Did we ever finish booting? If not then we are still starting up */
+ if (!MANAGER_IS_FINISHED(m)) {
+
+ u = manager_get_unit(m, SPECIAL_BASIC_TARGET);
+ if (!u || !UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u)))
+ return MANAGER_INITIALIZING;
+
+ return MANAGER_STARTING;
+ }
+
+ if (MANAGER_IS_SYSTEM(m)) {
+ /* Are the rescue or emergency targets active or queued? If so we are in maintenance state */
+ u = manager_get_unit(m, SPECIAL_RESCUE_TARGET);
+ if (u && unit_active_or_pending(u))
+ return MANAGER_MAINTENANCE;
+
+ u = manager_get_unit(m, SPECIAL_EMERGENCY_TARGET);
+ if (u && unit_active_or_pending(u))
+ return MANAGER_MAINTENANCE;
+ }
+
+ /* Are there any failed units? If so, we are in degraded mode */
+ if (set_size(m->failed_units) > 0)
+ return MANAGER_DEGRADED;
+
+ return MANAGER_RUNNING;
+}
+
+/* Drop one reference from the per-UID/GID refcount map; when the count hits zero and destroy_now
+ * is set, remove the entry and clean up the UID's/GID's IPC objects if flagged. */
+static void manager_unref_uid_internal(
+ Hashmap *uid_refs,
+ uid_t uid,
+ bool destroy_now,
+ int (*_clean_ipc)(uid_t uid)) {
+
+ uint32_t c, n;
+
+ assert(uid_is_valid(uid));
+ assert(_clean_ipc);
+
+ /* A generic implementation, covering both manager_unref_uid() and manager_unref_gid(), under the
+ * assumption that uid_t and gid_t are actually defined the same way, with the same validity rules.
+ *
+ * We store a hashmap where the key is the UID/GID and the value is a 32-bit reference counter, whose
+ * highest bit is used as flag for marking UIDs/GIDs whose IPC objects to remove when the last
+ * reference to the UID/GID is dropped. The flag is set to on, once at least one reference from a
+ * unit where RemoveIPC= is set is added on a UID/GID. It is reset when the UID's/GID's reference
+ * counter drops to 0 again. */
+
+ assert_cc(sizeof(uid_t) == sizeof(gid_t));
+ assert_cc(UID_INVALID == (uid_t) GID_INVALID);
+
+ if (uid == 0) /* We don't keep track of root, and will never destroy it */
+ return;
+
+ c = PTR_TO_UINT32(hashmap_get(uid_refs, UID_TO_PTR(uid)));
+
+ /* Strip the destroy-IPC marker bit to obtain the plain reference count */
+ n = c & ~DESTROY_IPC_FLAG;
+ assert(n > 0);
+ n--;
+
+ if (destroy_now && n == 0) {
+ hashmap_remove(uid_refs, UID_TO_PTR(uid));
+
+ if (c & DESTROY_IPC_FLAG) {
+ log_debug("%s " UID_FMT " is no longer referenced, cleaning up its IPC.",
+ _clean_ipc == clean_ipc_by_uid ? "UID" : "GID",
+ uid);
+ (void) _clean_ipc(uid);
+ }
+ } else {
+ /* Write back the decremented count, preserving the marker bit */
+ c = n | (c & DESTROY_IPC_FLAG);
+ assert_se(hashmap_update(uid_refs, UID_TO_PTR(uid), UINT32_TO_PTR(c)) >= 0);
+ }
+}
+
+/* Thin wrappers over manager_unref_uid_internal() for the UID and GID reference maps. */
+void manager_unref_uid(Manager *m, uid_t uid, bool destroy_now) {
+ manager_unref_uid_internal(m->uid_refs, uid, destroy_now, clean_ipc_by_uid);
+}
+
+void manager_unref_gid(Manager *m, gid_t gid, bool destroy_now) {
+ manager_unref_uid_internal(m->gid_refs, (uid_t) gid, destroy_now, clean_ipc_by_gid);
+}
+
+/* Add one reference for the given UID/GID, optionally marking its IPC objects for removal when
+ * the last reference is dropped. Returns 0 or positive on success, -EOVERFLOW if the counter
+ * would overflow into the marker bit, or another negative errno on allocation failure. */
+static int manager_ref_uid_internal(
+ Hashmap **uid_refs,
+ uid_t uid,
+ bool clean_ipc) {
+
+ uint32_t c, n;
+ int r;
+
+ assert(uid_refs);
+ assert(uid_is_valid(uid));
+
+ /* A generic implementation, covering both manager_ref_uid() and manager_ref_gid(), under the
+ * assumption that uid_t and gid_t are actually defined the same way, with the same validity
+ * rules. */
+
+ assert_cc(sizeof(uid_t) == sizeof(gid_t));
+ assert_cc(UID_INVALID == (uid_t) GID_INVALID);
+
+ if (uid == 0) /* We don't keep track of root, and will never destroy it */
+ return 0;
+
+ r = hashmap_ensure_allocated(uid_refs, &trivial_hash_ops);
+ if (r < 0)
+ return r;
+
+ c = PTR_TO_UINT32(hashmap_get(*uid_refs, UID_TO_PTR(uid)));
+
+ /* Strip the marker bit, then bump the plain reference count */
+ n = c & ~DESTROY_IPC_FLAG;
+ n++;
+
+ if (n & DESTROY_IPC_FLAG) /* check for overflow */
+ return -EOVERFLOW;
+
+ /* Keep an existing marker, and set it if this reference requests IPC clean-up */
+ c = n | (c & DESTROY_IPC_FLAG) | (clean_ipc ? DESTROY_IPC_FLAG : 0);
+
+ return hashmap_replace(*uid_refs, UID_TO_PTR(uid), UINT32_TO_PTR(c));
+}
+
+/* Thin wrappers over manager_ref_uid_internal() for the UID and GID reference maps. */
+int manager_ref_uid(Manager *m, uid_t uid, bool clean_ipc) {
+ return manager_ref_uid_internal(&m->uid_refs, uid, clean_ipc);
+}
+
+int manager_ref_gid(Manager *m, gid_t gid, bool clean_ipc) {
+ return manager_ref_uid_internal(&m->gid_refs, (uid_t) gid, clean_ipc);
+}
+
+/* Remove all zero-refcount entries from a UID/GID reference map (left over e.g. after
+ * reload/reexec), cleaning up the IPC objects of entries carrying the destroy marker. */
+static void manager_vacuum_uid_refs_internal(
+ Hashmap *uid_refs,
+ int (*_clean_ipc)(uid_t uid)) {
+
+ void *p, *k;
+
+ assert(_clean_ipc);
+
+ HASHMAP_FOREACH_KEY(p, k, uid_refs) {
+ uint32_t c, n;
+ uid_t uid;
+
+ uid = PTR_TO_UID(k);
+ c = PTR_TO_UINT32(p);
+
+ /* Entries with a non-zero plain refcount are still in use, keep them */
+ n = c & ~DESTROY_IPC_FLAG;
+ if (n > 0)
+ continue;
+
+ if (c & DESTROY_IPC_FLAG) {
+ log_debug("Found unreferenced %s " UID_FMT " after reload/reexec. Cleaning up.",
+ _clean_ipc == clean_ipc_by_uid ? "UID" : "GID",
+ uid);
+ (void) _clean_ipc(uid);
+ }
+
+ assert_se(hashmap_remove(uid_refs, k) == p);
+ }
+}
+
+/* Thin wrappers over manager_vacuum_uid_refs_internal() for the UID and GID maps. */
+static void manager_vacuum_uid_refs(Manager *m) {
+ manager_vacuum_uid_refs_internal(m->uid_refs, clean_ipc_by_uid);
+}
+
+static void manager_vacuum_gid_refs(Manager *m) {
+ manager_vacuum_uid_refs_internal(m->gid_refs, clean_ipc_by_gid);
+}
+
+/* Release all no-longer-referenced resources: dynamic users, UID/GID refs (and their IPC),
+ * and shared exec runtimes. */
+static void manager_vacuum(Manager *m) {
+ assert(m);
+
+ /* Release any dynamic users no longer referenced */
+ dynamic_user_vacuum(m, true);
+
+ /* Release any references to UIDs/GIDs no longer referenced, and destroy any IPC owned by them */
+ manager_vacuum_uid_refs(m);
+ manager_vacuum_gid_refs(m);
+
+ /* Release any runtimes no longer referenced */
+ exec_shared_runtime_vacuum(m);
+}
+
+/* Event handler for the user-lookup socket: parse a {uid, gid, unit_name} datagram sent by a
+ * forked child and forward the resolved UID/GID to the named unit. Always returns 0 (ignoring
+ * malformed messages) except on hard read errors. */
+int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+ struct buffer {
+ uid_t uid;
+ gid_t gid;
+ char unit_name[UNIT_NAME_MAX+1];
+ } _packed_ buffer;
+
+ Manager *m = userdata;
+ ssize_t l;
+ size_t n;
+ Unit *u;
+
+ assert_se(source);
+ assert_se(m);
+
+ /* Invoked whenever a child process succeeded resolving its user/group to use and sent us the
+ * resulting UID/GID in a datagram. We parse the datagram here and pass it off to the unit, so that
+ * it can add a reference to the UID/GID so that it can destroy the UID/GID's IPC objects when the
+ * reference counter drops to 0. */
+
+ l = recv(fd, &buffer, sizeof(buffer), MSG_DONTWAIT);
+ if (l < 0) {
+ if (ERRNO_IS_TRANSIENT(errno))
+ return 0;
+
+ return log_error_errno(errno, "Failed to read from user lookup fd: %m");
+ }
+
+ /* The message must contain at least one byte of unit name after the uid/gid header */
+ if ((size_t) l <= offsetof(struct buffer, unit_name)) {
+ log_warning("Received too short user lookup message, ignoring.");
+ return 0;
+ }
+
+ if ((size_t) l > offsetof(struct buffer, unit_name) + UNIT_NAME_MAX) {
+ log_warning("Received too long user lookup message, ignoring.");
+ return 0;
+ }
+
+ if (!uid_is_valid(buffer.uid) && !gid_is_valid(buffer.gid)) {
+ log_warning("Got user lookup message with invalid UID/GID pair, ignoring.");
+ return 0;
+ }
+
+ /* The unit name is not NUL terminated on the wire; an embedded NUL would be bogus */
+ n = (size_t) l - offsetof(struct buffer, unit_name);
+ if (memchr(buffer.unit_name, 0, n)) {
+ log_warning("Received lookup message with embedded NUL character, ignoring.");
+ return 0;
+ }
+
+ buffer.unit_name[n] = 0;
+ u = manager_get_unit(m, buffer.unit_name);
+ if (!u) {
+ log_debug("Got user lookup message but unit doesn't exist, ignoring.");
+ return 0;
+ }
+
+ log_unit_debug(u, "User lookup succeeded: uid=" UID_FMT " gid=" GID_FMT, buffer.uid, buffer.gid);
+
+ unit_notify_user_lookup(u, buffer.uid, buffer.gid);
+ return 0;
+}
+
+/* Returns > 0 if the UID/GID range in the given userns map file does not cover at least 0…65534,
+ * 0 if it does (or user namespaces are unsupported), negative errno on other load errors. */
+static int short_uid_range(const char *path) {
+ _cleanup_(uid_range_freep) UidRange *p = NULL;
+ int r;
+
+ assert(path);
+
+ /* Taint systemd if the UID range assigned to this environment doesn't at least cover 0…65534,
+ * i.e. from root to nobody. */
+
+ r = uid_range_load_userns(&p, path);
+ if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
+ return false;
+ if (r < 0)
+ return log_debug_errno(r, "Failed to load %s: %m", path);
+
+ /* uid_range_covers(p, 0, 65535) checks 65535 consecutive IDs starting at 0, i.e. 0…65534 */
+ return !uid_range_covers(p, 0, 65535);
+}
+
+/* Build the colon-separated taint string describing runtime-detected system irregularities.
+ * Caller owns the returned string (may be NULL on allocation failure from strv_join()). */
+char* manager_taint_string(const Manager *m) {
+ /* Returns a "taint string", e.g. "local-hwclock:var-run-bad". Only things that are detected at
+ * runtime should be tagged here. For stuff that is known during compilation, emit a warning in the
+ * configuration phase. */
+
+ assert(m);
+
+ /* Sized to hold every possible tag plus a NULL terminator; checked by the assert below */
+ const char* stage[12] = {};
+ size_t n = 0;
+
+ _cleanup_free_ char *usrbin = NULL;
+ if (readlink_malloc("/bin", &usrbin) < 0 || !PATH_IN_SET(usrbin, "usr/bin", "/usr/bin"))
+ stage[n++] = "unmerged-usr";
+
+ if (access("/proc/cgroups", F_OK) < 0)
+ stage[n++] = "cgroups-missing";
+
+ if (cg_all_unified() == 0)
+ stage[n++] = "cgroupsv1";
+
+ if (clock_is_localtime(NULL) > 0)
+ stage[n++] = "local-hwclock";
+
+ if (os_release_support_ended(NULL, /* quiet= */ true, NULL) > 0)
+ stage[n++] = "support-ended";
+
+ _cleanup_free_ char *destination = NULL;
+ if (readlink_malloc("/var/run", &destination) < 0 ||
+ !PATH_IN_SET(destination, "../run", "/run"))
+ stage[n++] = "var-run-bad";
+
+ _cleanup_free_ char *overflowuid = NULL, *overflowgid = NULL;
+ if (read_one_line_file("/proc/sys/kernel/overflowuid", &overflowuid) >= 0 &&
+ !streq(overflowuid, "65534"))
+ stage[n++] = "overflowuid-not-65534";
+ if (read_one_line_file("/proc/sys/kernel/overflowgid", &overflowgid) >= 0 &&
+ !streq(overflowgid, "65534"))
+ stage[n++] = "overflowgid-not-65534";
+
+ struct utsname uts;
+ assert_se(uname(&uts) >= 0);
+ if (strverscmp_improved(uts.release, KERNEL_BASELINE_VERSION) < 0)
+ stage[n++] = "old-kernel";
+
+ if (short_uid_range("/proc/self/uid_map") > 0)
+ stage[n++] = "short-uid-range";
+ if (short_uid_range("/proc/self/gid_map") > 0)
+ stage[n++] = "short-gid-range";
+
+ assert(n < ELEMENTSOF(stage) - 1); /* One extra for NULL terminator */
+
+ return strv_join((char**) stage, ":");
+}
+
+/* Take one reference on the console (a job/unit is currently using it). */
+void manager_ref_console(Manager *m) {
+ assert(m);
+
+ m->n_on_console++;
+}
+
+/* Drop one reference on the console; when the last one goes away, clear the no_console_output
+ * flag since the console is definitely free again. */
+void manager_unref_console(Manager *m) {
+ assert(m); /* validate m before dereferencing, mirroring manager_ref_console() */
+ assert(m->n_on_console > 0);
+
+ m->n_on_console--;
+
+ if (m->n_on_console == 0)
+ m->no_console_output = false; /* unset no_console_output flag, since the console is definitely free now */
+}
+
+/* Override the maximum log level, remembering the original the first time so it can be restored
+ * by manager_restore_original_log_level(). */
+void manager_override_log_level(Manager *m, int level) {
+ _cleanup_free_ char *s = NULL;
+ assert(m);
+
+ /* Only capture the original level on the first override, not on repeated ones */
+ if (!m->log_level_overridden) {
+ m->original_log_level = log_get_max_level();
+ m->log_level_overridden = true;
+ }
+
+ (void) log_level_to_string_alloc(level, &s);
+ log_info("Setting log level to %s.", strna(s));
+
+ log_set_max_level(level);
+}
+
+/* Undo manager_override_log_level(): restore the saved log level; no-op if never overridden. */
+void manager_restore_original_log_level(Manager *m) {
+ _cleanup_free_ char *s = NULL;
+ assert(m);
+
+ if (!m->log_level_overridden)
+ return;
+
+ (void) log_level_to_string_alloc(m->original_log_level, &s);
+ log_info("Restoring log level to original (%s).", strna(s));
+
+ log_set_max_level(m->original_log_level);
+ m->log_level_overridden = false;
+}
+
+/* Override the log target, remembering the original the first time so it can be restored by
+ * manager_restore_original_log_target(). */
+void manager_override_log_target(Manager *m, LogTarget target) {
+ assert(m);
+
+ /* Only capture the original target on the first override, not on repeated ones */
+ if (!m->log_target_overridden) {
+ m->original_log_target = log_get_target();
+ m->log_target_overridden = true;
+ }
+
+ log_info("Setting log target to %s.", log_target_to_string(target));
+ log_set_target(target);
+}
+
+/* Undo manager_override_log_target(): restore the saved log target; no-op if never overridden. */
+void manager_restore_original_log_target(Manager *m) {
+ assert(m);
+
+ if (!m->log_target_overridden)
+ return;
+
+ log_info("Restoring log target to original %s.", log_target_to_string(m->original_log_target));
+
+ log_set_target(m->original_log_target);
+ m->log_target_overridden = false;
+}
+
+/* When running in the initrd, translate the SECURITY_START…UNITS_LOAD_FINISH timestamps to their
+ * INITRD_* counterparts (the two enum ranges are laid out in parallel); otherwise pass through. */
+ManagerTimestamp manager_timestamp_initrd_mangle(ManagerTimestamp s) {
+ if (in_initrd() &&
+ s >= MANAGER_TIMESTAMP_SECURITY_START &&
+ s <= MANAGER_TIMESTAMP_UNITS_LOAD_FINISH)
+ return s - MANAGER_TIMESTAMP_SECURITY_START + MANAGER_TIMESTAMP_INITRD_SECURITY_START;
+ return s;
+}
+
+/* Allocate the two non-blocking pipe pairs stored in m->idle_pipe[4] (used for Type=idle
+ * synchronization). Returns 1 if freshly allocated, 0 if they already existed, negative errno
+ * on failure (with any half-allocated pair closed again). */
+int manager_allocate_idle_pipe(Manager *m) {
+ int r;
+
+ assert(m);
+
+ /* All four fds are allocated together; either all are valid or none is */
+ if (m->idle_pipe[0] >= 0) {
+ assert(m->idle_pipe[1] >= 0);
+ assert(m->idle_pipe[2] >= 0);
+ assert(m->idle_pipe[3] >= 0);
+ return 0;
+ }
+
+ assert(m->idle_pipe[1] < 0);
+ assert(m->idle_pipe[2] < 0);
+ assert(m->idle_pipe[3] < 0);
+
+ r = RET_NERRNO(pipe2(m->idle_pipe + 0, O_NONBLOCK|O_CLOEXEC));
+ if (r < 0)
+ return r;
+
+ r = RET_NERRNO(pipe2(m->idle_pipe + 2, O_NONBLOCK|O_CLOEXEC));
+ if (r < 0) {
+ /* Don't leak the first pair if the second one failed */
+ safe_close_pair(m->idle_pipe + 0);
+ return r;
+ }
+
+ return 1;
+}
+
+/* Initialize a UnitDefaults structure with the built-in defaults for the given runtime scope
+ * (several timeouts depend on whether this is the system or a user manager). */
+void unit_defaults_init(UnitDefaults *defaults, RuntimeScope scope) {
+ assert(defaults);
+ assert(scope >= 0);
+ assert(scope < _RUNTIME_SCOPE_MAX);
+
+ *defaults = (UnitDefaults) {
+ .std_output = EXEC_OUTPUT_JOURNAL,
+ .std_error = EXEC_OUTPUT_INHERIT,
+ .restart_usec = DEFAULT_RESTART_USEC,
+ .timeout_start_usec = manager_default_timeout(scope),
+ .timeout_stop_usec = manager_default_timeout(scope),
+ .timeout_abort_usec = manager_default_timeout(scope),
+ .timeout_abort_set = false,
+ .device_timeout_usec = manager_default_timeout(scope),
+ .start_limit_interval = DEFAULT_START_LIMIT_INTERVAL,
+ .start_limit_burst = DEFAULT_START_LIMIT_BURST,
+
+ /* On 4.15+ with unified hierarchy, CPU accounting is essentially free as it doesn't require the CPU
+ * controller to be enabled, so the default is to enable it unless we got told otherwise. */
+ .cpu_accounting = cpu_accounting_is_cheap(),
+ .memory_accounting = MEMORY_ACCOUNTING_DEFAULT,
+ .io_accounting = false,
+ .blockio_accounting = false,
+ .tasks_accounting = true,
+ .ip_accounting = false,
+
+ .tasks_max = DEFAULT_TASKS_MAX,
+ .timer_accuracy_usec = 1 * USEC_PER_MINUTE,
+
+ .memory_pressure_watch = CGROUP_PRESSURE_WATCH_AUTO,
+ .memory_pressure_threshold_usec = MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC,
+
+ .oom_policy = OOM_STOP,
+ .oom_score_adjust_set = false,
+ };
+}
+
+/* Free the dynamically allocated members of a UnitDefaults (the structure itself is not freed). */
+void unit_defaults_done(UnitDefaults *defaults) {
+ assert(defaults);
+
+ defaults->smack_process_label = mfree(defaults->smack_process_label);
+ rlimit_free_all(defaults->rlimit);
+}
+
+/* Pick the log target sd-executor should use: the manager's own target when journald is running,
+ * kmsg otherwise. */
+LogTarget manager_get_executor_log_target(Manager *m) {
+ assert(m);
+
+ /* If journald is not available tell sd-executor to go to kmsg, as it might be starting journald */
+
+ if (manager_journal_is_running(m))
+ return log_get_target();
+
+ return LOG_TARGET_KMSG;
+}
+
+/* String table backing manager_state_to_string()/manager_state_from_string() (generated below). */
+static const char *const manager_state_table[_MANAGER_STATE_MAX] = {
+ [MANAGER_INITIALIZING] = "initializing",
+ [MANAGER_STARTING] = "starting",
+ [MANAGER_RUNNING] = "running",
+ [MANAGER_DEGRADED] = "degraded",
+ [MANAGER_MAINTENANCE] = "maintenance",
+ [MANAGER_STOPPING] = "stopping",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(manager_state, ManagerState);
+
+/* String table backing manager_timestamp_to_string()/manager_timestamp_from_string(). */
+static const char *const manager_timestamp_table[_MANAGER_TIMESTAMP_MAX] = {
+ [MANAGER_TIMESTAMP_FIRMWARE] = "firmware",
+ [MANAGER_TIMESTAMP_LOADER] = "loader",
+ [MANAGER_TIMESTAMP_KERNEL] = "kernel",
+ [MANAGER_TIMESTAMP_INITRD] = "initrd",
+ [MANAGER_TIMESTAMP_USERSPACE] = "userspace",
+ [MANAGER_TIMESTAMP_FINISH] = "finish",
+ [MANAGER_TIMESTAMP_SECURITY_START] = "security-start",
+ [MANAGER_TIMESTAMP_SECURITY_FINISH] = "security-finish",
+ [MANAGER_TIMESTAMP_GENERATORS_START] = "generators-start",
+ [MANAGER_TIMESTAMP_GENERATORS_FINISH] = "generators-finish",
+ [MANAGER_TIMESTAMP_UNITS_LOAD_START] = "units-load-start",
+ [MANAGER_TIMESTAMP_UNITS_LOAD_FINISH] = "units-load-finish",
+ [MANAGER_TIMESTAMP_UNITS_LOAD] = "units-load",
+ [MANAGER_TIMESTAMP_INITRD_SECURITY_START] = "initrd-security-start",
+ [MANAGER_TIMESTAMP_INITRD_SECURITY_FINISH] = "initrd-security-finish",
+ [MANAGER_TIMESTAMP_INITRD_GENERATORS_START] = "initrd-generators-start",
+ [MANAGER_TIMESTAMP_INITRD_GENERATORS_FINISH] = "initrd-generators-finish",
+ [MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_START] = "initrd-units-load-start",
+ [MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_FINISH] = "initrd-units-load-finish",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(manager_timestamp, ManagerTimestamp);
+
+/* String table backing oom_policy_to_string()/oom_policy_from_string(). */
+static const char* const oom_policy_table[_OOM_POLICY_MAX] = {
+ [OOM_CONTINUE] = "continue",
+ [OOM_STOP] = "stop",
+ [OOM_KILL] = "kill",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(oom_policy, OOMPolicy);
diff --git a/src/core/manager.h b/src/core/manager.h
new file mode 100644
index 0000000..d96eb7b
--- /dev/null
+++ b/src/core/manager.h
@@ -0,0 +1,646 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "sd-bus.h"
+#include "sd-device.h"
+#include "sd-event.h"
+
+#include "common-signal.h"
+#include "cgroup-util.h"
+#include "cgroup.h"
+#include "fdset.h"
+#include "hashmap.h"
+#include "list.h"
+#include "prioq.h"
+#include "ratelimit.h"
+#include "varlink.h"
+
+struct libmnt_monitor;
+typedef struct Unit Unit;
+
+/* Enforce upper limit how many names we allow */
+#define MANAGER_MAX_NAMES 131072 /* 128K */
+
+/* On sigrtmin+18, private commands */
+enum {
+ MANAGER_SIGNAL_COMMAND_DUMP_JOBS = _COMMON_SIGNAL_COMMAND_PRIVATE_BASE + 0,
+ _MANAGER_SIGNAL_COMMAND_MAX,
+};
+
+/* Ensure the private signal commands fit into the range reserved for them */
+assert_cc((int) _MANAGER_SIGNAL_COMMAND_MAX <= (int) _COMMON_SIGNAL_COMMAND_PRIVATE_END);
+
+typedef struct Manager Manager;
+
+/* An externally visible state. We don't actually maintain this as state variable, but derive it from various fields
+ * when requested */
+typedef enum ManagerState {
+ MANAGER_INITIALIZING, /* booting, before basic.target is reached */
+ MANAGER_STARTING, /* booting, after basic.target */
+ MANAGER_RUNNING,
+ MANAGER_DEGRADED, /* running, but some units have failed */
+ MANAGER_MAINTENANCE, /* rescue or emergency target active/queued */
+ MANAGER_STOPPING, /* shutdown target active/queued */
+ _MANAGER_STATE_MAX,
+ _MANAGER_STATE_INVALID = -EINVAL,
+} ManagerState;
+
+/* What the manager intends to do next: keep running, or exit/reload/re-execute/reboot etc. */
+typedef enum ManagerObjective {
+ MANAGER_OK,
+ MANAGER_EXIT,
+ MANAGER_RELOAD,
+ MANAGER_REEXECUTE,
+ MANAGER_REBOOT,
+ MANAGER_SOFT_REBOOT,
+ MANAGER_POWEROFF,
+ MANAGER_HALT,
+ MANAGER_KEXEC,
+ MANAGER_SWITCH_ROOT,
+ _MANAGER_OBJECTIVE_MAX,
+ _MANAGER_OBJECTIVE_INVALID = -EINVAL,
+} ManagerObjective;
+
+/* Classes of console status messages; controls visibility in manager_status_printf() and
+ * manager_should_show_status(). */
+typedef enum StatusType {
+ STATUS_TYPE_EPHEMERAL,
+ STATUS_TYPE_NORMAL,
+ STATUS_TYPE_NOTICE,
+ STATUS_TYPE_EMERGENCY,
+} StatusType;
+
+/* Reaction when the kernel or systemd-oomd OOM-kills a process belonging to a unit. */
+typedef enum OOMPolicy {
+ OOM_CONTINUE, /* The kernel or systemd-oomd kills the process it wants to kill, and that's it */
+ OOM_STOP, /* The kernel or systemd-oomd kills the process it wants to kill, and we stop the unit */
+ OOM_KILL, /* The kernel or systemd-oomd kills the process it wants to kill, and all others in the unit, and we stop the unit */
+ _OOM_POLICY_MAX,
+ _OOM_POLICY_INVALID = -EINVAL,
+} OOMPolicy;
+
+/* Notes:
+ * 1. TIMESTAMP_FIRMWARE, TIMESTAMP_LOADER, TIMESTAMP_KERNEL, TIMESTAMP_INITRD,
+ * TIMESTAMP_SECURITY_START, and TIMESTAMP_SECURITY_FINISH are set only when
+ * the manager is system and not running under container environment.
+ *
+ * 2. The monotonic timestamp of TIMESTAMP_KERNEL is always zero.
+ *
+ * 3. The realtime timestamp of TIMESTAMP_KERNEL will be unset if the system does not
+ * have RTC.
+ *
+ * 4. TIMESTAMP_FIRMWARE and TIMESTAMP_LOADER will be unset if the system does not
+ * have RTC, or systemd is built without EFI support.
+ *
+ * 5. The monotonic timestamps of TIMESTAMP_FIRMWARE and TIMESTAMP_LOADER are stored as
+ * negative of the actual value.
+ *
+ * 6. TIMESTAMP_USERSPACE is the timestamp of when the manager was started.
+ *
+ * 7. TIMESTAMP_INITRD_* are set only when the system is booted with an initrd.
+ */
+
+/* Boot/progress timestamps recorded by the manager; see the notes above. The INITRD_* block must
+ * stay parallel to SECURITY_START…UNITS_LOAD_FINISH (manager_timestamp_initrd_mangle() relies on
+ * the offset between them). */
+typedef enum ManagerTimestamp {
+ MANAGER_TIMESTAMP_FIRMWARE,
+ MANAGER_TIMESTAMP_LOADER,
+ MANAGER_TIMESTAMP_KERNEL,
+ MANAGER_TIMESTAMP_INITRD,
+ MANAGER_TIMESTAMP_USERSPACE,
+ MANAGER_TIMESTAMP_FINISH,
+
+ MANAGER_TIMESTAMP_SECURITY_START,
+ MANAGER_TIMESTAMP_SECURITY_FINISH,
+ MANAGER_TIMESTAMP_GENERATORS_START,
+ MANAGER_TIMESTAMP_GENERATORS_FINISH,
+ MANAGER_TIMESTAMP_UNITS_LOAD_START,
+ MANAGER_TIMESTAMP_UNITS_LOAD_FINISH,
+ MANAGER_TIMESTAMP_UNITS_LOAD,
+
+ MANAGER_TIMESTAMP_INITRD_SECURITY_START,
+ MANAGER_TIMESTAMP_INITRD_SECURITY_FINISH,
+ MANAGER_TIMESTAMP_INITRD_GENERATORS_START,
+ MANAGER_TIMESTAMP_INITRD_GENERATORS_FINISH,
+ MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_START,
+ MANAGER_TIMESTAMP_INITRD_UNITS_LOAD_FINISH,
+ _MANAGER_TIMESTAMP_MAX,
+ _MANAGER_TIMESTAMP_INVALID = -EINVAL,
+} ManagerTimestamp;
+
+/* Kinds of watchdog timeouts tracked by the manager (indexes Manager.watchdog[] and
+ * Manager.watchdog_overridden[]). */
+typedef enum WatchdogType {
+ WATCHDOG_RUNTIME,
+ WATCHDOG_REBOOT,
+ WATCHDOG_KEXEC,
+ WATCHDOG_PRETIMEOUT,
+ _WATCHDOG_TYPE_MAX,
+} WatchdogType;
+
+#include "execute.h"
+#include "job.h"
+#include "path-lookup.h"
+#include "show-status.h"
+#include "unit-name.h"
+
+/* Flags selecting how much of the real environment a test-run manager is allowed to touch. */
+typedef enum ManagerTestRunFlags {
+ MANAGER_TEST_NORMAL = 0, /* run normally */
+ MANAGER_TEST_RUN_MINIMAL = 1 << 0, /* create basic data structures */
+ MANAGER_TEST_RUN_BASIC = 1 << 1, /* interact with the environment */
+ MANAGER_TEST_RUN_ENV_GENERATORS = 1 << 2, /* also run env generators */
+ MANAGER_TEST_RUN_GENERATORS = 1 << 3, /* also run unit generators */
+ MANAGER_TEST_RUN_IGNORE_DEPENDENCIES = 1 << 4, /* run while ignoring dependencies */
+ MANAGER_TEST_DONT_OPEN_EXECUTOR = 1 << 5, /* avoid trying to load sd-executor */
+ MANAGER_TEST_FULL = MANAGER_TEST_RUN_BASIC | MANAGER_TEST_RUN_ENV_GENERATORS | MANAGER_TEST_RUN_GENERATORS,
+} ManagerTestRunFlags;
+
+/* Various defaults for unit file settings. Initialized by unit_defaults_init(), freed by
+ * unit_defaults_done(). */
+typedef struct UnitDefaults {
+ /* Default stdout/stderr destinations for executed processes */
+ ExecOutput std_output, std_error;
+
+ usec_t restart_usec, timeout_start_usec, timeout_stop_usec, timeout_abort_usec, device_timeout_usec;
+ /* Whether timeout_abort_usec was explicitly configured (vs. inherited default) */
+ bool timeout_abort_set;
+
+ usec_t start_limit_interval;
+ unsigned start_limit_burst;
+
+ /* Default resource-accounting switches for the various cgroup controllers */
+ bool cpu_accounting;
+ bool memory_accounting;
+ bool io_accounting;
+ bool blockio_accounting;
+ bool tasks_accounting;
+ bool ip_accounting;
+
+ CGroupTasksMax tasks_max;
+ usec_t timer_accuracy_usec;
+
+ OOMPolicy oom_policy;
+ int oom_score_adjust;
+ /* Whether oom_score_adjust was explicitly configured */
+ bool oom_score_adjust_set;
+
+ CGroupPressureWatch memory_pressure_watch;
+ usec_t memory_pressure_threshold_usec;
+
+ /* Owned string, freed in unit_defaults_done() */
+ char *smack_process_label;
+
+ /* Owned rlimit array, freed via rlimit_free_all() */
+ struct rlimit *rlimit[_RLIMIT_MAX];
+} UnitDefaults;
+
+struct Manager {
+ /* Note that the set of units we know of is allowed to be
+ * inconsistent. However the subset of it that is loaded may
+ * not, and the list of jobs may neither. */
+
+ /* Active jobs and units */
+ Hashmap *units; /* name string => Unit object n:1 */
+ Hashmap *units_by_invocation_id;
+ Hashmap *jobs; /* job id => Job object 1:1 */
+
+ /* To make it easy to iterate through the units of a specific
+ * type we maintain a per type linked list */
+ LIST_HEAD(Unit, units_by_type[_UNIT_TYPE_MAX]);
+
+ /* Units that need to be loaded */
+ LIST_HEAD(Unit, load_queue); /* this is actually more a stack than a queue, but uh. */
+
+ /* Jobs that need to be run */
+ struct Prioq *run_queue;
+
+ /* Units and jobs that have not yet been announced via
+ * D-Bus. When something about a job changes it is added here
+ * if it is not in there yet. This allows easy coalescing of
+ * D-Bus change signals. */
+ LIST_HEAD(Unit, dbus_unit_queue);
+ LIST_HEAD(Job, dbus_job_queue);
+
+ /* Units to remove */
+ LIST_HEAD(Unit, cleanup_queue);
+
+ /* Units and jobs to check when doing GC */
+ LIST_HEAD(Unit, gc_unit_queue);
+ LIST_HEAD(Job, gc_job_queue);
+
+ /* Units that should be realized */
+ LIST_HEAD(Unit, cgroup_realize_queue);
+
+ /* Units whose cgroup ran empty */
+ LIST_HEAD(Unit, cgroup_empty_queue);
+
+ /* Units whose memory.event fired */
+ LIST_HEAD(Unit, cgroup_oom_queue);
+
+ /* Target units whose default target dependencies haven't been set yet */
+ LIST_HEAD(Unit, target_deps_queue);
+
+ /* Units that might be subject to StopWhenUnneeded= clean-up */
+ LIST_HEAD(Unit, stop_when_unneeded_queue);
+
+ /* Units which are upheld by another other which we might need to act on */
+ LIST_HEAD(Unit, start_when_upheld_queue);
+
+ /* Units that have BindsTo= another unit, and might need to be shutdown because the bound unit is not active. */
+ LIST_HEAD(Unit, stop_when_bound_queue);
+
+ /* Units that have resources open, and where it might be good to check if they can be released now */
+ LIST_HEAD(Unit, release_resources_queue);
+
+ sd_event *event;
+
+ /* This maps PIDs we care about to units that are interested in them. We allow multiple units to be
+ * interested in the same PID and multiple PIDs to be relevant to the same unit. Since in most cases
+ * only a single unit will be interested in the same PID though, we use a somewhat special structure
+ * here: the first unit interested in a PID is stored in the hashmap 'watch_pids', keyed by the
+ * PID. If there are other units interested too they'll be stored in a NULL-terminated array, stored
+ * in the hashmap 'watch_pids_more', keyed by the PID. Thus to go through the full list of units
+ * interested in a PID we must look into both hashmaps. */
+ Hashmap *watch_pids; /* PidRef* → Unit* */
+ Hashmap *watch_pids_more; /* PidRef* → NUL terminated array of Unit* */
+
+ /* A set contains all units which cgroup should be refreshed after startup */
+ Set *startup_units;
+
+ /* A set which contains all currently failed units */
+ Set *failed_units;
+
+ sd_event_source *run_queue_event_source;
+
+ char *notify_socket;
+ int notify_fd;
+ sd_event_source *notify_event_source;
+
+ int cgroups_agent_fd;
+ sd_event_source *cgroups_agent_event_source;
+
+ int signal_fd;
+ sd_event_source *signal_event_source;
+
+ sd_event_source *sigchld_event_source;
+
+ sd_event_source *time_change_event_source;
+
+ sd_event_source *timezone_change_event_source;
+
+ sd_event_source *jobs_in_progress_event_source;
+
+ int user_lookup_fds[2];
+ sd_event_source *user_lookup_event_source;
+
+ RuntimeScope runtime_scope;
+
+ LookupPaths lookup_paths;
+ Hashmap *unit_id_map;
+ Hashmap *unit_name_map;
+ Set *unit_path_cache;
+ uint64_t unit_cache_timestamp_hash;
+
+ /* We don't have support for atomically enabling/disabling units, and unit_file_state might become
+ * outdated if such operations failed half-way. Therefore, we set this flag if changes to unit files
+ * are made, and reset it after daemon-reload. If set, we report that daemon-reload is needed through
+ * unit's NeedDaemonReload property. */
+ bool unit_file_state_outdated;
+
+ char **transient_environment; /* The environment, as determined from config files, kernel cmdline and environment generators */
+ char **client_environment; /* Environment variables created by clients through the bus API */
+
+ usec_t watchdog[_WATCHDOG_TYPE_MAX];
+ usec_t watchdog_overridden[_WATCHDOG_TYPE_MAX];
+ char *watchdog_pretimeout_governor;
+ char *watchdog_pretimeout_governor_overridden;
+
+ dual_timestamp timestamps[_MANAGER_TIMESTAMP_MAX];
+
+ /* Data specific to the device subsystem */
+ sd_device_monitor *device_monitor;
+ Hashmap *devices_by_sysfs;
+
+ /* Data specific to the mount subsystem */
+ struct libmnt_monitor *mount_monitor;
+ sd_event_source *mount_event_source;
+
+ /* Data specific to the swap filesystem */
+ FILE *proc_swaps;
+ sd_event_source *swap_event_source;
+ Hashmap *swaps_by_devnode;
+
+ /* Data specific to the D-Bus subsystem */
+ sd_bus *api_bus, *system_bus;
+ Set *private_buses;
+ int private_listen_fd;
+ sd_event_source *private_listen_event_source;
+
+ /* Contains all the clients that are subscribed to signals via
+ the API bus. Note that private bus connections are always
+ considered subscribes, since they last for very short only,
+ and it is much simpler that way. */
+ sd_bus_track *subscribed;
+ char **deserialized_subscribed;
+
+ /* This is used during reloading: before the reload we queue
+ * the reply message here, and afterwards we send it */
+ sd_bus_message *pending_reload_message;
+
+ Hashmap *watch_bus; /* D-Bus names => Unit object n:1 */
+
+ bool send_reloading_done;
+
+ uint32_t current_job_id;
+ uint32_t default_unit_job_id;
+
+ /* Data specific to the Automount subsystem */
+ int dev_autofs_fd;
+
+ /* Data specific to the cgroup subsystem */
+ Hashmap *cgroup_unit;
+ CGroupMask cgroup_supported;
+ char *cgroup_root;
+
+ /* Notifications from cgroups, when the unified hierarchy is used is done via inotify. */
+ int cgroup_inotify_fd;
+ sd_event_source *cgroup_inotify_event_source;
+
+ /* Maps for finding the unit for each inotify watch descriptor for the cgroup.events and
+ * memory.events cgroupv2 attributes. */
+ Hashmap *cgroup_control_inotify_wd_unit;
+ Hashmap *cgroup_memory_inotify_wd_unit;
+
+ /* A defer event for handling cgroup empty events and processing them after SIGCHLD in all cases. */
+ sd_event_source *cgroup_empty_event_source;
+ sd_event_source *cgroup_oom_event_source;
+
+ /* Make sure the user cannot accidentally unmount our cgroup
+ * file system */
+ int pin_cgroupfs_fd;
+
+ unsigned gc_marker;
+
+ /* The stat() data the last time we saw /etc/localtime */
+ usec_t etc_localtime_mtime;
+ bool etc_localtime_accessible;
+
+ ManagerObjective objective;
+
+ /* Flags */
+ bool dispatching_load_queue;
+
+ /* Have we already sent out the READY=1 notification? */
+ bool ready_sent;
+
+ /* Was the last status sent "STATUS=Ready."? */
+ bool status_ready;
+
+ /* Have we already printed the taint line if necessary? */
+ bool taint_logged;
+
+ /* Have we ever changed the "kernel.pid_max" sysctl? */
+ bool sysctl_pid_max_changed;
+
+ ManagerTestRunFlags test_run_flags;
+
+ /* If non-zero, exit with the following value when the systemd
+ * process terminate. Useful for containers: systemd-nspawn could get
+ * the return value. */
+ uint8_t return_value;
+
+ ShowStatus show_status;
+ ShowStatus show_status_overridden;
+ StatusUnitFormat status_unit_format;
+ char *confirm_spawn;
+ bool no_console_output;
+ bool service_watchdogs;
+
+ UnitDefaults defaults;
+
+ int original_log_level;
+ LogTarget original_log_target;
+ bool log_level_overridden;
+ bool log_target_overridden;
+
+ /* non-zero if we are reloading or reexecuting, */
+ int n_reloading;
+
+ unsigned n_installed_jobs;
+ unsigned n_failed_jobs;
+
+ /* Jobs in progress watching */
+ unsigned n_running_jobs;
+ unsigned n_on_console;
+ unsigned jobs_in_progress_iteration;
+
+ /* Do we have any outstanding password prompts? */
+ int have_ask_password;
+ int ask_password_inotify_fd;
+ sd_event_source *ask_password_event_source;
+
+ /* Type=idle pipes */
+ int idle_pipe[4];
+ sd_event_source *idle_pipe_event_source;
+
+ char *switch_root;
+ char *switch_root_init;
+
+ /* This is true before and after switching root. */
+ bool switching_root;
+
+ /* This maps all possible path prefixes to the units needing
+ * them. It's a hashmap with a path string as key and a Set as
+ * value where Unit objects are contained. */
+ Hashmap *units_requiring_mounts_for;
+
+ /* Used for processing polkit authorization responses */
+ Hashmap *polkit_registry;
+
+ /* Dynamic users/groups, indexed by their name */
+ Hashmap *dynamic_users;
+
+ /* Keep track of all UIDs and GIDs any of our services currently use. This is useful for the RemoveIPC= logic. */
+ Hashmap *uid_refs;
+ Hashmap *gid_refs;
+
+ /* ExecSharedRuntime, indexed by their owner unit id */
+ Hashmap *exec_shared_runtime_by_id;
+
+ /* When the user hits C-A-D more than 7 times per 2s, do something immediately... */
+ RateLimit ctrl_alt_del_ratelimit;
+ EmergencyAction cad_burst_action;
+
+ const char *unit_log_field;
+ const char *unit_log_format_string;
+
+ const char *invocation_log_field;
+ const char *invocation_log_format_string;
+
+ int first_boot; /* tri-state */
+
+ /* Prefixes of e.g. RuntimeDirectory= */
+ char *prefix[_EXEC_DIRECTORY_TYPE_MAX];
+ char *received_credentials_directory;
+ char *received_encrypted_credentials_directory;
+
+ /* Used in the SIGCHLD and sd_notify() message invocation logic to avoid that we dispatch the same event
+ * multiple times on the same unit. */
+ unsigned sigchldgen;
+ unsigned notifygen;
+
+ VarlinkServer *varlink_server;
+ /* When we're a system manager, this object manages the subscription from systemd-oomd to PID1 that's
+ * used to report changes in ManagedOOM settings (systemd server - oomd client). When
+ * we're a user manager, this object manages the client connection from the user manager to
+ * systemd-oomd to report changes in ManagedOOM settings (systemd client - oomd server). */
+ Varlink *managed_oom_varlink;
+
+ /* Reference to RestrictFileSystems= BPF program */
+ struct restrict_fs_bpf *restrict_fs;
+
+ /* Allow users to configure a rate limit for Reload() operations */
+ RateLimit reload_ratelimit;
+ /* Dump*() are slow, so always rate limit them to 10 per 10 minutes */
+ RateLimit dump_ratelimit;
+
+ sd_event_source *memory_pressure_event_source;
+
+ /* For NFTSet= */
+ FirewallContext *fw_ctx;
+
+ /* Pin the systemd-executor binary, so that it never changes until re-exec, ensuring we don't have
+ * serialization/deserialization compatibility issues during upgrades. */
+ int executor_fd;
+};
+
+/* Returns the effective abort timeout: TimeoutAbortSec= if it was explicitly set,
+ * otherwise falls back to the default stop timeout. */
+static inline usec_t manager_default_timeout_abort_usec(Manager *m) {
+ assert(m);
+ return m->defaults.timeout_abort_set ? m->defaults.timeout_abort_usec : m->defaults.timeout_stop_usec;
+}
+
+/* Whether this manager instance runs in system scope (PID 1) or per-user scope. */
+#define MANAGER_IS_SYSTEM(m) ((m)->runtime_scope == RUNTIME_SCOPE_SYSTEM)
+#define MANAGER_IS_USER(m) ((m)->runtime_scope == RUNTIME_SCOPE_USER)
+
+/* True while a reload/reexec is in progress; n_reloading is a counter, not a boolean. */
+#define MANAGER_IS_RELOADING(m) ((m)->n_reloading > 0)
+
+/* True once start-up completed, i.e. the FINISH timestamp has been taken. */
+#define MANAGER_IS_FINISHED(m) (dual_timestamp_is_set((m)->timestamps + MANAGER_TIMESTAMP_FINISH))
+
+/* The objective is set to OK as soon as we enter the main loop, and set otherwise as soon as we are done with it */
+#define MANAGER_IS_RUNNING(m) ((m)->objective == MANAGER_OK)
+
+/* True before and after switching root (see Manager.switching_root). */
+#define MANAGER_IS_SWITCHING_ROOT(m) ((m)->switching_root)
+
+/* True if any ManagerTestRunFlags are set, i.e. we run in a test harness. */
+#define MANAGER_IS_TEST_RUN(m) ((m)->test_run_flags != 0)
+
+/* Default job timeout for the given runtime scope (system vs. user manager). */
+static inline usec_t manager_default_timeout(RuntimeScope scope) {
+ return scope == RUNTIME_SCOPE_SYSTEM ? DEFAULT_TIMEOUT_USEC : DEFAULT_USER_TIMEOUT_USEC;
+}
+
+/* Lifecycle: allocation, start-up and teardown of the manager object. */
+int manager_new(RuntimeScope scope, ManagerTestRunFlags test_run_flags, Manager **m);
+Manager* manager_free(Manager *m);
+DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free);
+
+int manager_startup(Manager *m, FILE *serialization, FDSet *fds, const char *root);
+
+/* Lookup of jobs and units by ID, name or D-Bus object path. */
+Job *manager_get_job(Manager *m, uint32_t id);
+Unit *manager_get_unit(Manager *m, const char *name);
+
+int manager_get_job_from_dbus_path(Manager *m, const char *s, Job **_j);
+
+/* Loading units from disk. */
+bool manager_unit_cache_should_retry_load(Unit *u);
+int manager_load_unit_prepare(Manager *m, const char *name, const char *path, sd_bus_error *e, Unit **ret);
+int manager_load_unit(Manager *m, const char *name, const char *path, sd_bus_error *e, Unit **ret);
+int manager_load_startable_unit_or_warn(Manager *m, const char *name, const char *path, Unit **ret);
+int manager_load_unit_from_dbus_path(Manager *m, const char *s, sd_bus_error *e, Unit **_u);
+
+/* Enqueuing jobs (transactions). */
+int manager_add_job(Manager *m, JobType type, Unit *unit, JobMode mode, Set *affected_jobs, sd_bus_error *e, Job **_ret);
+int manager_add_job_by_name(Manager *m, JobType type, const char *name, JobMode mode, Set *affected_jobs, sd_bus_error *e, Job **_ret);
+int manager_add_job_by_name_and_warn(Manager *m, JobType type, const char *name, JobMode mode, Set *affected_jobs, Job **ret);
+int manager_propagate_reload(Manager *m, Unit *unit, JobMode mode, sd_bus_error *e);
+
+void manager_clear_jobs(Manager *m);
+
+void manager_unwatch_pidref(Manager *m, PidRef *pid);
+
+unsigned manager_dispatch_load_queue(Manager *m);
+
+int manager_setup_memory_pressure_event_source(Manager *m);
+
+/* Environment handling for spawned processes. */
+int manager_default_environment(Manager *m);
+int manager_transient_environment_add(Manager *m, char **plus);
+int manager_client_environment_modify(Manager *m, char **minus, char **plus);
+int manager_get_effective_environment(Manager *m, char ***ret);
+
+int manager_set_unit_defaults(Manager *m, const UnitDefaults *defaults);
+
+void manager_trigger_run_queue(Manager *m);
+
+/* Main loop, reload and re-execution. */
+int manager_loop(Manager *m);
+
+int manager_reload(Manager *m);
+Manager* manager_reloading_start(Manager *m);
+void manager_reloading_stopp(Manager **m);
+
+void manager_reset_failed(Manager *m);
+
+void manager_send_unit_audit(Manager *m, Unit *u, int type, bool success);
+void manager_send_unit_plymouth(Manager *m, Unit *u);
+
+bool manager_unit_inactive_or_pending(Manager *m, const char *name);
+
+void manager_check_finished(Manager *m);
+void manager_send_reloading(Manager *m);
+
+void disable_printk_ratelimit(void);
+void manager_recheck_dbus(Manager *m);
+void manager_recheck_journal(Manager *m);
+
+/* Console status output control. */
+bool manager_get_show_status_on(Manager *m);
+void manager_set_show_status(Manager *m, ShowStatus mode, const char *reason);
+void manager_override_show_status(Manager *m, ShowStatus mode, const char *reason);
+
+void manager_set_first_boot(Manager *m, bool b);
+void manager_set_switching_root(Manager *m, bool switching_root);
+
+double manager_get_progress(Manager *m);
+
+void manager_status_printf(Manager *m, StatusType type, const char *status, const char *format, ...) _printf_(4,5);
+
+Set *manager_get_units_requiring_mounts_for(Manager *m, const char *path);
+
+ManagerState manager_state(Manager *m);
+
+int manager_update_failed_units(Manager *m, Unit *u, bool failed);
+
+/* UID/GID reference counting, used for the RemoveIPC= logic. */
+void manager_unref_uid(Manager *m, uid_t uid, bool destroy_now);
+int manager_ref_uid(Manager *m, uid_t uid, bool clean_ipc);
+
+void manager_unref_gid(Manager *m, gid_t gid, bool destroy_now);
+int manager_ref_gid(Manager *m, gid_t gid, bool clean_ipc);
+
+char* manager_taint_string(const Manager *m);
+
+void manager_ref_console(Manager *m);
+void manager_unref_console(Manager *m);
+
+/* Temporary overrides of log level/target, restorable to the original values. */
+void manager_override_log_level(Manager *m, int level);
+void manager_restore_original_log_level(Manager *m);
+
+void manager_override_log_target(Manager *m, LogTarget target);
+void manager_restore_original_log_target(Manager *m);
+
+const char *manager_state_to_string(ManagerState m) _const_;
+ManagerState manager_state_from_string(const char *s) _pure_;
+
+const char *manager_get_confirm_spawn(Manager *m);
+void manager_disable_confirm_spawn(void);
+
+const char *manager_timestamp_to_string(ManagerTimestamp m) _const_;
+ManagerTimestamp manager_timestamp_from_string(const char *s) _pure_;
+ManagerTimestamp manager_timestamp_initrd_mangle(ManagerTimestamp s);
+
+/* Hardware watchdog control. */
+usec_t manager_get_watchdog(Manager *m, WatchdogType t);
+void manager_set_watchdog(Manager *m, WatchdogType t, usec_t timeout);
+void manager_override_watchdog(Manager *m, WatchdogType t, usec_t timeout);
+int manager_set_watchdog_pretimeout_governor(Manager *m, const char *governor);
+int manager_override_watchdog_pretimeout_governor(Manager *m, const char *governor);
+
+LogTarget manager_get_executor_log_target(Manager *m);
+
+int manager_allocate_idle_pipe(Manager *m);
+
+const char* oom_policy_to_string(OOMPolicy i) _const_;
+OOMPolicy oom_policy_from_string(const char *s) _pure_;
+
+void unit_defaults_init(UnitDefaults *defaults, RuntimeScope scope);
+void unit_defaults_done(UnitDefaults *defaults);
diff --git a/src/core/meson.build b/src/core/meson.build
new file mode 100644
index 0000000..7701d3d
--- /dev/null
+++ b/src/core/meson.build
@@ -0,0 +1,260 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+# Sources compiled into the private systemd-core shared library, which PID 1 and
+# related executables below link against.
+libcore_sources = files(
+ 'apparmor-setup.c',
+ 'audit-fd.c',
+ 'automount.c',
+ 'bpf-devices.c',
+ 'bpf-firewall.c',
+ 'bpf-foreign.c',
+ 'bpf-lsm.c',
+ 'bpf-socket-bind.c',
+ 'cgroup.c',
+ 'core-varlink.c',
+ 'dbus-automount.c',
+ 'dbus-cgroup.c',
+ 'dbus-device.c',
+ 'dbus-execute.c',
+ 'dbus-job.c',
+ 'dbus-kill.c',
+ 'dbus-manager.c',
+ 'dbus-mount.c',
+ 'dbus-path.c',
+ 'dbus-scope.c',
+ 'dbus-service.c',
+ 'dbus-slice.c',
+ 'dbus-socket.c',
+ 'dbus-swap.c',
+ 'dbus-target.c',
+ 'dbus-timer.c',
+ 'dbus-unit.c',
+ 'dbus-util.c',
+ 'dbus.c',
+ 'device.c',
+ 'dynamic-user.c',
+ 'efi-random.c',
+ 'emergency-action.c',
+ 'exec-credential.c',
+ 'execute.c',
+ 'execute-serialize.c',
+ 'generator-setup.c',
+ 'ima-setup.c',
+ 'import-creds.c',
+ 'job.c',
+ 'kill.c',
+ 'kmod-setup.c',
+ 'load-dropin.c',
+ 'load-fragment.c',
+ 'manager-dump.c',
+ 'manager-serialize.c',
+ 'manager.c',
+ 'mount.c',
+ 'namespace.c',
+ 'path.c',
+ 'restrict-ifaces.c',
+ 'scope.c',
+ 'selinux-access.c',
+ 'selinux-setup.c',
+ 'service.c',
+ 'show-status.c',
+ 'slice.c',
+ 'smack-setup.c',
+ 'socket.c',
+ 'swap.c',
+ 'target.c',
+ 'timer.c',
+ 'transaction.c',
+ 'unit-dependency-atom.c',
+ 'unit-printf.c',
+ 'unit-serialize.c',
+ 'unit.c',
+)
+
+# bpf-util.c is only built when the BPF framework is enabled at configure time.
+if conf.get('BPF_FRAMEWORK') == 1
+ libcore_sources += files(
+ 'bpf-util.c',
+ )
+endif
+
+# These subdirs define the *_skel_h targets referenced below.
+subdir('bpf/socket_bind')
+subdir('bpf/restrict_fs')
+subdir('bpf/restrict_ifaces')
+
+# Generated BPF skeleton headers are compiled into libcore as well.
+if conf.get('BPF_FRAMEWORK') == 1
+ libcore_sources += [
+ socket_bind_skel_h,
+ restrict_fs_skel_h,
+ restrict_ifaces_skel_h]
+endif
+
+# Expand the jinja2 template to a .gperf file, then run gperf over it to generate
+# the unit-file directive lookup table used by load-fragment.c.
+load_fragment_gperf_gperf = custom_target(
+ 'load-fragment-gperf.gperf',
+ input : 'load-fragment-gperf.gperf.in',
+ output: 'load-fragment-gperf.gperf',
+ command : [jinja2_cmdline, '@INPUT@', '@OUTPUT@'])
+
+load_fragment_gperf_c = custom_target(
+ 'load-fragment-gperf.c',
+ input : load_fragment_gperf_gperf,
+ output : 'load-fragment-gperf.c',
+ command : [gperf, '@INPUT@', '--output-file', '@OUTPUT@'])
+
+# The awk script extracts a NUL-separated list of directive names from the
+# gperf input (output captured from stdout).
+awkscript = 'load-fragment-gperf-nulstr.awk'
+load_fragment_gperf_nulstr_c = custom_target(
+ 'load-fragment-gperf-nulstr.c',
+ input : [awkscript, load_fragment_gperf_gperf],
+ output : 'load-fragment-gperf-nulstr.c',
+ command : [awk, '-f', '@INPUT0@', '@INPUT1@'],
+ capture : true)
+
+# The library name embeds the shared lib tag so different versions can coexist
+# under pkglibdir.
+libcore_name = 'systemd-core-@0@'.format(shared_lib_tag)
+
+libcore = shared_library(
+ libcore_name,
+ libcore_sources,
+ load_fragment_gperf_c,
+ load_fragment_gperf_nulstr_c,
+ include_directories : includes,
+ c_args : ['-fvisibility=default'],
+ link_args : ['-shared',
+ '-Wl,--version-script=' + libshared_sym_path],
+ link_depends : libshared_sym_path,
+ link_with : libshared,
+ dependencies : [libacl,
+ libapparmor,
+ libaudit,
+ libblkid,
+ libdl,
+ libkmod,
+ libm,
+ libmount,
+ libpam,
+ librt,
+ libseccomp,
+ libselinux,
+ threads,
+ userspace],
+ install : true,
+ install_dir : pkglibdir)
+
+# Include path for consumers that need headers from this directory.
+core_includes = [includes, include_directories('.')]
+
+systemd_sources = files(
+ 'main.c',
+ 'crash-handler.c',
+)
+
+systemd_executor_sources = files(
+ 'executor.c',
+ 'exec-invoke.c',
+)
+
+# Executable definitions picked up by the top-level meson.build; all of them
+# link against libcore and libshared.
+executables += [
+ libexec_template + {
+ 'name' : 'systemd',
+ 'dbus' : true,
+ 'public' : true,
+ 'sources' : systemd_sources,
+ 'link_with' : [
+ libcore,
+ libshared,
+ ],
+ 'dependencies' : libseccomp,
+ },
+ libexec_template + {
+ 'name' : 'systemd-executor',
+ 'public' : true,
+ 'sources' : systemd_executor_sources,
+ 'include_directories' : core_includes,
+ 'link_with' : [
+ libcore,
+ libshared,
+ ],
+ 'dependencies' : [
+ libapparmor,
+ libpam,
+ libseccomp,
+ libselinux,
+ ],
+ },
+ fuzz_template + {
+ 'sources' : files('fuzz-unit-file.c'),
+ 'link_with' : [
+ libcore,
+ libshared
+ ],
+ 'dependencies' : libmount,
+ },
+ fuzz_template + {
+ 'sources' : files('fuzz-manager-serialize.c'),
+ 'link_with' : [
+ libcore,
+ libshared
+ ],
+ },
+ fuzz_template + {
+ 'sources' : files('fuzz-execute-serialize.c'),
+ 'link_with' : [
+ libcore,
+ libshared
+ ],
+ },
+]
+
+# Jinja2-templated configuration/policy files and their install destinations.
+in_files = [['system.conf', pkgconfigfiledir],
+ ['user.conf', pkgconfigfiledir],
+ ['org.freedesktop.systemd1.policy', polkitpolicydir]]
+
+foreach item : in_files
+ file = item[0]
+ dir = item[1]
+
+ custom_target(
+ file,
+ input : file + '.in',
+ output: file,
+ command : [jinja2_cmdline, '@INPUT@', '@OUTPUT@'],
+ install : (dir == pkgconfigfiledir) ? install_sysconfdir_samples : (dir != 'no'),
+ install_dir : dir)
+endforeach
+
+systemd_pc = custom_target(
+ 'systemd.pc',
+ input : 'systemd.pc.in',
+ output : 'systemd.pc',
+ command : [jinja2_cmdline, '@INPUT@', '@OUTPUT@'],
+ install : pkgconfigdatadir != 'no',
+ install_tag : 'devel',
+ install_dir : pkgconfigdatadir)
+
+install_data('org.freedesktop.systemd1.conf',
+ install_dir : dbuspolicydir)
+install_data('org.freedesktop.systemd1.service',
+ install_dir : dbussystemservicedir)
+
+# Drop-in directories for admin/generator-provided units.
+install_emptydir(systemshutdowndir)
+install_emptydir(systemsleepdir)
+install_emptydir(systemgeneratordir)
+install_emptydir(usergeneratordir)
+
+if install_sysconfdir
+ install_emptydir(pkgsysconfdir / 'system')
+ install_emptydir(pkgsysconfdir / 'user')
+ install_emptydir(sysconfdir / 'xdg/systemd')
+ meson.add_install_script(sh, '-c', ln_s.format(pkgsysconfdir / 'user',
+ sysconfdir / 'xdg/systemd/user'))
+endif
+
+# /sbin/init is a symlink to the systemd binary.
+install_emptydir(sbindir)
+meson.add_install_script(sh, '-c', ln_s.format(libexecdir / 'systemd', sbindir / 'init'))
+############################################################
+
+# Shared template for tests that need to link against libcore.
+core_test_template = test_template + {
+ 'link_with' : [
+ libcore,
+ libshared,
+ ],
+ 'include_directories' : core_includes,
+ 'suite' : 'core',
+}
diff --git a/src/core/mount.c b/src/core/mount.c
new file mode 100644
index 0000000..ded322d
--- /dev/null
+++ b/src/core/mount.c
@@ -0,0 +1,2502 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <sys/epoll.h>
+
+#include "sd-messages.h"
+
+#include "alloc-util.h"
+#include "dbus-mount.h"
+#include "dbus-unit.h"
+#include "device.h"
+#include "exit-status.h"
+#include "format-util.h"
+#include "fs-util.h"
+#include "fstab-util.h"
+#include "initrd-util.h"
+#include "libmount-util.h"
+#include "log.h"
+#include "manager.h"
+#include "mkdir-label.h"
+#include "mount-setup.h"
+#include "mount.h"
+#include "mountpoint-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "serialize.h"
+#include "special.h"
+#include "stat-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit-name.h"
+#include "unit.h"
+#include "utf8.h"
+
+/* Cap on consecutive umount retries; consumed by the unmount logic further down in this
+ * file (not visible in this chunk). */
+#define RETRY_UMOUNT_MAX 32
+
+/* Maps each mount-specific state to the generic unit state reported to the manager core. */
+static const UnitActiveState state_translation_table[_MOUNT_STATE_MAX] = {
+ [MOUNT_DEAD] = UNIT_INACTIVE,
+ [MOUNT_MOUNTING] = UNIT_ACTIVATING,
+ [MOUNT_MOUNTING_DONE] = UNIT_ACTIVATING,
+ [MOUNT_MOUNTED] = UNIT_ACTIVE,
+ [MOUNT_REMOUNTING] = UNIT_RELOADING,
+ [MOUNT_UNMOUNTING] = UNIT_DEACTIVATING,
+ [MOUNT_REMOUNTING_SIGTERM] = UNIT_RELOADING,
+ [MOUNT_REMOUNTING_SIGKILL] = UNIT_RELOADING,
+ [MOUNT_UNMOUNTING_SIGTERM] = UNIT_DEACTIVATING,
+ [MOUNT_UNMOUNTING_SIGKILL] = UNIT_DEACTIVATING,
+ [MOUNT_FAILED] = UNIT_FAILED,
+ [MOUNT_CLEANING] = UNIT_MAINTENANCE,
+};
+
+static int mount_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata);
+static int mount_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static void mount_enter_dead(Mount *m, MountResult f);
+static void mount_enter_mounted(Mount *m, MountResult f);
+static void mount_cycle_clear(Mount *m);
+static int mount_process_proc_self_mountinfo(Manager *m);
+
+/* Returns true for states in which a control process (mount/umount/remount helper or a
+ * cleaning job) is expected to be running. */
+static bool MOUNT_STATE_WITH_PROCESS(MountState state) {
+ return IN_SET(state,
+ MOUNT_MOUNTING,
+ MOUNT_MOUNTING_DONE,
+ MOUNT_REMOUNTING,
+ MOUNT_REMOUNTING_SIGTERM,
+ MOUNT_REMOUNTING_SIGKILL,
+ MOUNT_UNMOUNTING,
+ MOUNT_UNMOUNTING_SIGTERM,
+ MOUNT_UNMOUNTING_SIGKILL,
+ MOUNT_CLEANING);
+}
+
+/* Returns the parameters parsed from the unit file fragment, or NULL if this unit was not
+ * configured via a fragment. */
+static MountParameters* get_mount_parameters_fragment(Mount *m) {
+ assert(m);
+
+ if (m->from_fragment)
+ return &m->parameters_fragment;
+
+ return NULL;
+}
+
+/* Returns the most authoritative parameters: what the kernel reports via
+ * /proc/self/mountinfo if the mount is established, otherwise the fragment's (or NULL). */
+static MountParameters* get_mount_parameters(Mount *m) {
+ assert(m);
+
+ if (m->from_proc_self_mountinfo)
+ return &m->parameters_proc_self_mountinfo;
+
+ return get_mount_parameters_fragment(m);
+}
+
+/* True if this mount needs the network: either the _netdev option is set or the file
+ * system type is inherently a network one. */
+static bool mount_is_network(const MountParameters *p) {
+ assert(p);
+
+ if (fstab_test_option(p->options, "_netdev\0"))
+ return true;
+
+ if (p->fstype && fstype_is_network(p->fstype))
+ return true;
+
+ return false;
+}
+
+/* Whether the fragment requests nofail behaviour. Only fragment-based units can be
+ * nofail; mountinfo-only units always return false. */
+static bool mount_is_nofail(const Mount *m) {
+ assert(m);
+
+ if (!m->from_fragment)
+ return false;
+
+ return fstab_test_yes_no_option(m->parameters_fragment.options, "nofail\0" "fail\0");
+}
+
+/* True if the mount options request a loop device. */
+static bool mount_is_loop(const MountParameters *p) {
+ assert(p);
+
+ if (fstab_test_option(p->options, "loop\0"))
+ return true;
+
+ return false;
+}
+
+/* True if this is a bind mount, judged from the options and fstype. */
+static bool mount_is_bind(const MountParameters *p) {
+ assert(p);
+ return fstab_is_bind(p->options, p->fstype);
+}
+
+/* Returns > 0 if BindsTo= should be used on the backing device, 0 if Requires= should be
+ * used, -EIDRM if x-systemd.device-bound= was not specified at all, or another negative
+ * errno on error. */
+static int mount_is_bound_to_device(Mount *m) {
+ _cleanup_free_ char *value = NULL;
+ const MountParameters *p;
+ int r;
+
+ assert(m);
+
+ /* Determines whether to place a Requires= or BindsTo= dependency on the backing device unit. We do
+ * this by checking for the x-systemd.device-bound= mount option. If it is enabled we use BindsTo=,
+ * otherwise Requires=. But note that we might combine the latter with StopPropagatedFrom=, see
+ * below. */
+
+ p = get_mount_parameters(m);
+ if (!p)
+ return false;
+
+ r = fstab_filter_options(p->options, "x-systemd.device-bound\0", NULL, &value, NULL, NULL);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return -EIDRM; /* If unspecified at all, return recognizable error */
+
+ /* Option present without a value means "bound". */
+ if (isempty(value))
+ return true;
+
+ return parse_boolean(value);
+}
+
+/* Whether to add a StopPropagatedFrom= dependency on the backing device: only when
+ * x-systemd.device-bound= was not specified at all (-EIDRM) and this unit comes from a
+ * fragment. */
+static bool mount_propagate_stop(Mount *m) {
+ int r;
+
+ assert(m);
+
+ r = mount_is_bound_to_device(m);
+ if (r >= 0)
+ /* If x-systemd.device-bound=no is explicitly requested by user, don't try to set StopPropagatedFrom=.
+ * Also don't bother if true, since with BindsTo= the stop propagation is implicit. */
+ return false;
+ if (r != -EIDRM)
+ log_debug_errno(r, "Failed to get x-systemd.device-bound= option, ignoring: %m");
+
+ return m->from_fragment; /* let's propagate stop whenever this is an explicitly configured unit,
+ * otherwise let's not bother. */
+}
+
+/* True if any quota option is set in fstab and the file system can actually do quota
+ * (bind mounts are excluded). */
+static bool mount_needs_quota(const MountParameters *p) {
+ assert(p);
+
+ if (p->fstype && !fstype_needs_quota(p->fstype))
+ return false;
+
+ if (mount_is_bind(p))
+ return false;
+
+ return fstab_test_option(p->options,
+ "usrquota\0" "grpquota\0" "quota\0" "usrjquota\0" "grpjquota\0");
+}
+
+/* UnitVTable.init hook: establishes per-type defaults before the unit file is parsed. */
+static void mount_init(Unit *u) {
+ Mount *m = MOUNT(u);
+
+ assert(m);
+ assert(u);
+ assert(u->load_state == UNIT_STUB);
+
+ m->timeout_usec = u->manager->defaults.timeout_start_usec;
+
+ m->exec_context.std_output = u->manager->defaults.std_output;
+ m->exec_context.std_error = u->manager->defaults.std_error;
+
+ m->directory_mode = 0755;
+
+ /* We need to make sure that /usr/bin/mount is always called
+ * in the same process group as us, so that the autofs kernel
+ * side doesn't send us another mount request while we are
+ * already trying to comply with its last one. */
+ m->exec_context.same_pgrp = true;
+
+ m->control_pid = PIDREF_NULL;
+ m->control_command_id = _MOUNT_EXEC_COMMAND_INVALID;
+
+ u->ignore_on_isolate = true;
+}
+
+/* (Re)arms this unit's timeout timer; 'usec' is relative or absolute per 'relative'. */
+static int mount_arm_timer(Mount *m, bool relative, usec_t usec) {
+ assert(m);
+
+ return unit_arm_timer(UNIT(m), &m->timer_event_source, relative, usec, mount_dispatch_timer);
+}
+
+/* Stops watching the control process, if any, and invalidates the stored PID reference. */
+static void mount_unwatch_control_pid(Mount *m) {
+ assert(m);
+
+ if (!pidref_is_set(&m->control_pid))
+ return;
+
+ unit_unwatch_pidref(UNIT(m), &m->control_pid);
+ pidref_done(&m->control_pid);
+}
+
+/* Frees the strings held by a MountParameters and resets the fields to NULL. */
+static void mount_parameters_done(MountParameters *p) {
+ assert(p);
+
+ p->what = mfree(p->what);
+ p->options = mfree(p->options);
+ p->fstype = mfree(p->fstype);
+}
+
+/* UnitVTable.done hook: releases all dynamically allocated per-unit state. */
+static void mount_done(Unit *u) {
+ Mount *m = MOUNT(u);
+
+ assert(m);
+
+ m->where = mfree(m->where);
+
+ mount_parameters_done(&m->parameters_proc_self_mountinfo);
+ mount_parameters_done(&m->parameters_fragment);
+
+ m->exec_runtime = exec_runtime_free(m->exec_runtime);
+ exec_command_done_array(m->exec_command, _MOUNT_EXEC_COMMAND_MAX);
+ m->control_command = NULL;
+
+ mount_unwatch_control_pid(m);
+
+ m->timer_event_source = sd_event_source_disable_unref(m->timer_event_source);
+}
+
+/* Updates the mountinfo-sourced parameters from the given strings. Returns > 0 if any of
+ * the three fields changed, 0 if everything was already up-to-date, negative on OOM. */
+static int update_parameters_proc_self_mountinfo(
+ Mount *m,
+ const char *what,
+ const char *options,
+ const char *fstype) {
+
+ MountParameters *p;
+ int r, q, w;
+
+ p = &m->parameters_proc_self_mountinfo;
+
+ r = free_and_strdup(&p->what, what);
+ if (r < 0)
+ return r;
+
+ q = free_and_strdup(&p->options, options);
+ if (q < 0)
+ return q;
+
+ w = free_and_strdup(&p->fstype, fstype);
+ if (w < 0)
+ return w;
+
+ /* free_and_strdup() returns > 0 iff the value actually changed. */
+ return r > 0 || q > 0 || w > 0;
+}
+
+/* Adds ordering/requirement dependencies between this mount unit and other units that
+ * sit above or below it in the file-system hierarchy. */
+static int mount_add_mount_dependencies(Mount *m) {
+ MountParameters *pm;
+ Unit *other;
+ Set *s;
+ int r;
+
+ assert(m);
+
+ if (!path_equal(m->where, "/")) {
+ _cleanup_free_ char *parent = NULL;
+
+ /* Adds in links to other mount points that might lie further up in the hierarchy */
+
+ r = path_extract_directory(m->where, &parent);
+ if (r < 0)
+ return r;
+
+ r = unit_require_mounts_for(UNIT(m), parent, UNIT_DEPENDENCY_IMPLICIT);
+ if (r < 0)
+ return r;
+ }
+
+ /* Adds in dependencies to other mount points that might be needed for the source path (if this is a bind mount
+ * or a loop mount) to be available. */
+ pm = get_mount_parameters_fragment(m);
+ if (pm && pm->what &&
+ path_is_absolute(pm->what) &&
+ (mount_is_bind(pm) || mount_is_loop(pm) || !mount_is_network(pm))) {
+
+ r = unit_require_mounts_for(UNIT(m), pm->what, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+ }
+
+ /* Adds in dependencies to other units that use this path or paths further down in the hierarchy */
+ s = manager_get_units_requiring_mounts_for(UNIT(m)->manager, m->where);
+ SET_FOREACH(other, s) {
+
+ if (other->load_state != UNIT_LOADED)
+ continue;
+
+ if (other == UNIT(m))
+ continue;
+
+ r = unit_add_dependency(other, UNIT_AFTER, UNIT(m), true, UNIT_DEPENDENCY_PATH);
+ if (r < 0)
+ return r;
+
+ if (UNIT(m)->fragment_path) {
+ /* If we have fragment configuration, then make this dependency required */
+ r = unit_add_dependency(other, UNIT_REQUIRES, UNIT(m), true, UNIT_DEPENDENCY_PATH);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ return 0;
+}
+
+/* Adds Requires=/BindsTo= (and possibly StopPropagatedFrom=) dependencies on the unit of
+ * the backing device, when the mount source is an actual device node. */
+static int mount_add_device_dependencies(Mount *m) {
+ UnitDependencyMask mask;
+ MountParameters *p;
+ UnitDependency dep;
+ int r;
+
+ assert(m);
+
+ log_unit_trace(UNIT(m), "Processing implicit device dependencies");
+
+ p = get_mount_parameters(m);
+ if (!p) {
+ log_unit_trace(UNIT(m), "Missing mount parameters, skipping implicit device dependencies");
+ return 0;
+ }
+
+ if (!p->what) {
+ log_unit_trace(UNIT(m), "Missing mount source, skipping implicit device dependencies");
+ return 0;
+ }
+
+ if (mount_is_bind(p)) {
+ log_unit_trace(UNIT(m), "Mount unit is a bind mount, skipping implicit device dependencies");
+ return 0;
+ }
+
+ if (!is_device_path(p->what)) {
+ log_unit_trace(UNIT(m), "Mount source is not a device path, skipping implicit device dependencies");
+ return 0;
+ }
+
+ /* /dev/root is a really weird thing, it's not a real device, but just a path the kernel exports for
+ * the root file system specified on the kernel command line. Ignore it here. */
+ if (PATH_IN_SET(p->what, "/dev/root", "/dev/nfs")) {
+ log_unit_trace(UNIT(m), "Mount source is in /dev/root or /dev/nfs, skipping implicit device dependencies");
+ return 0;
+ }
+
+ if (path_equal(m->where, "/")) {
+ log_unit_trace(UNIT(m), "Mount destination is '/', skipping implicit device dependencies");
+ return 0;
+ }
+
+ /* Mount units from /proc/self/mountinfo are not bound to devices by default since they're subject to
+ * races when mounts are established by other tools with different backing devices than what we
+ * maintain. The user can still force this to be a BindsTo= dependency with an appropriate option (or
+ * udev property) so the mount units are automatically stopped when the device disappears
+ * suddenly. */
+ dep = mount_is_bound_to_device(m) > 0 ? UNIT_BINDS_TO : UNIT_REQUIRES;
+
+ /* We always use 'what' from /proc/self/mountinfo if mounted */
+ mask = m->from_proc_self_mountinfo ? UNIT_DEPENDENCY_MOUNTINFO : UNIT_DEPENDENCY_MOUNT_FILE;
+
+ r = unit_add_node_dependency(UNIT(m), p->what, dep, mask);
+ if (r < 0)
+ return r;
+ if (r > 0)
+ log_unit_trace(UNIT(m), "Added %s dependency on %s", unit_dependency_to_string(dep), p->what);
+
+ if (mount_propagate_stop(m)) {
+ r = unit_add_node_dependency(UNIT(m), p->what, UNIT_STOP_PROPAGATED_FROM, mask);
+ if (r < 0)
+ return r;
+ if (r > 0)
+ log_unit_trace(UNIT(m), "Added %s dependency on %s",
+ unit_dependency_to_string(UNIT_STOP_PROPAGATED_FROM), p->what);
+ }
+
+ /* NOTE(review): errors from unit_add_blockdev_dependency() are not propagated here —
+ * presumably intentional best-effort; confirm against its semantics. */
+ r = unit_add_blockdev_dependency(UNIT(m), p->what, mask);
+ if (r > 0)
+ log_unit_trace(UNIT(m), "Added %s dependency on %s", unit_dependency_to_string(UNIT_AFTER), p->what);
+
+ return 0;
+}
+
+/* If quota options are configured, order this mount before quotacheck/quotaon and pull
+ * both services in. Only relevant for the system manager. */
+static int mount_add_quota_dependencies(Mount *m) {
+ MountParameters *p;
+ int r;
+
+ assert(m);
+
+ if (!MANAGER_IS_SYSTEM(UNIT(m)->manager))
+ return 0;
+
+ p = get_mount_parameters_fragment(m);
+ if (!p)
+ return 0;
+
+ if (!mount_needs_quota(p))
+ return 0;
+
+ r = unit_add_two_dependencies_by_name(UNIT(m), UNIT_BEFORE, UNIT_WANTS, SPECIAL_QUOTACHECK_SERVICE,
+ /* add_reference= */ true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+
+ r = unit_add_two_dependencies_by_name(UNIT(m), UNIT_BEFORE, UNIT_WANTS, SPECIAL_QUOTAON_SERVICE,
+ /* add_reference= */true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+/* u must be a Mount unit. */
+static bool mount_is_extrinsic(Unit *u) {
+ MountParameters *p;
+ Mount *m = MOUNT(u);
+ assert(m);
+
+ /* Returns true for all units that are "magic" and should be excluded from the usual
+ * start-up and shutdown dependencies. We call them "extrinsic" here, as they are generally
+ * mounted outside of the systemd dependency logic. We shouldn't attempt to manage them
+ * ourselves but it's fine if the user operates on them with us. */
+
+ /* We only automatically manage mounts if we are in system mode */
+ if (MANAGER_IS_USER(u->manager))
+ return true;
+
+ p = get_mount_parameters(m);
+ if (p && fstab_is_extrinsic(m->where, p->options))
+ return true;
+
+ return false;
+}
+
+static bool mount_is_credentials(Mount *m) {
+ const char *e;
+
+ assert(m);
+
+ /* Returns true if this is a credentials mount. We don't want automatic dependencies on credential
+ * mounts, since they are managed by us for even the earliest services, and we never want anything to
+ * be ordered before them hence. */
+
+ /* prefix[EXEC_DIRECTORY_RUNTIME] is presumably /run (system) resp. $XDG_RUNTIME_DIR
+ * (user) — confirm against manager setup. */
+ e = path_startswith(m->where, UNIT(m)->manager->prefix[EXEC_DIRECTORY_RUNTIME]);
+ if (!e)
+ return false;
+
+ /* Matches <runtime prefix>/credentials and anything below it. */
+ return !isempty(path_startswith(e, "credentials"));
+}
+
+/* Picks the appropriate local-fs/remote-fs/initrd ordering targets for this mount and adds
+ * the matching Before=/After=/Conflicts= dependencies. */
+static int mount_add_default_ordering_dependencies(Mount *m, MountParameters *p, UnitDependencyMask mask) {
+ const char *after, *before, *e;
+ int r;
+
+ assert(m);
+
+ e = path_startswith(m->where, "/sysroot");
+ if (e && in_initrd()) {
+ /* All mounts under /sysroot need to happen later, at initrd-fs.target time. IOW,
+ * it's not technically part of the basic initrd filesystem itself, and so
+ * shouldn't inherit the default Before=local-fs.target dependency. However,
+ * these mounts still need to start after local-fs-pre.target, as a sync point
+ * for things like systemd-hibernate-resume.service that should start before
+ * any mounts. */
+
+ after = SPECIAL_LOCAL_FS_PRE_TARGET;
+ before = isempty(e) ? SPECIAL_INITRD_ROOT_FS_TARGET : SPECIAL_INITRD_FS_TARGET;
+
+ } else if (in_initrd() && path_startswith(m->where, "/sysusr/usr")) {
+ after = SPECIAL_LOCAL_FS_PRE_TARGET;
+ before = SPECIAL_INITRD_USR_FS_TARGET;
+
+ } else if (mount_is_credentials(m))
+ after = before = NULL;
+
+ else if (mount_is_network(p)) {
+ after = SPECIAL_REMOTE_FS_PRE_TARGET;
+ before = SPECIAL_REMOTE_FS_TARGET;
+
+ } else {
+ after = SPECIAL_LOCAL_FS_PRE_TARGET;
+ before = SPECIAL_LOCAL_FS_TARGET;
+ }
+
+ /* nofail mounts do not delay their Before= target. */
+ if (before && !mount_is_nofail(m)) {
+ r = unit_add_dependency_by_name(UNIT(m), UNIT_BEFORE, before, /* add_reference= */ true, mask);
+ if (r < 0)
+ return r;
+ }
+
+ if (after) {
+ r = unit_add_dependency_by_name(UNIT(m), UNIT_AFTER, after, /* add_reference= */ true, mask);
+ if (r < 0)
+ return r;
+ }
+
+ r = unit_add_two_dependencies_by_name(UNIT(m), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_UMOUNT_TARGET,
+ /* add_reference= */ true, mask);
+ if (r < 0)
+ return r;
+
+ /* If this is a tmpfs mount then we have to unmount it before we try to deactivate swaps */
+ if (streq_ptr(p->fstype, "tmpfs") && !mount_is_credentials(m)) {
+ r = unit_add_dependency_by_name(UNIT(m), UNIT_AFTER, SPECIAL_SWAP_TARGET,
+ /* add_reference= */ true, mask);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+/* For network mounts only: order after network.target and pull in (and order after)
+ * network-online.target. */
+static int mount_add_default_network_dependencies(Mount *m, MountParameters *p, UnitDependencyMask mask) {
+ int r;
+
+ assert(m);
+
+ if (!mount_is_network(p))
+ return 0;
+
+ /* We order ourselves after network.target. This is primarily useful at shutdown: services that take
+ * down the network should order themselves before network.target, so that they are shut down only
+ * after this mount unit is stopped. */
+
+ r = unit_add_dependency_by_name(UNIT(m), UNIT_AFTER, SPECIAL_NETWORK_TARGET,
+ /* add_reference= */ true, mask);
+ if (r < 0)
+ return r;
+
+ /* We pull in network-online.target, and order ourselves after it. This is useful at start-up to
+ * actively pull in tools that want to be started before we start mounting network file systems, and
+ * whose purpose it is to delay this until the network is "up". */
+
+ return unit_add_two_dependencies_by_name(UNIT(m), UNIT_WANTS, UNIT_AFTER, SPECIAL_NETWORK_ONLINE_TARGET,
+ /* add_reference= */ true, mask);
+}
+
+/* Entry point for DefaultDependencies= handling: skips extrinsic mounts, then delegates to
+ * mount_add_default_ordering_dependencies() and mount_add_default_network_dependencies(). */
+static int mount_add_default_dependencies(Mount *m) {
+ UnitDependencyMask mask;
+ MountParameters *p;
+ int r;
+
+ assert(m);
+
+ if (!UNIT(m)->default_dependencies)
+ return 0;
+
+ /* We do not add any default dependencies to /, /usr or /run/initramfs/, since they are
+ * guaranteed to stay mounted the whole time, since our system is on it. Also, don't
+ * bother with anything mounted below virtual file systems, it's also going to be virtual,
+ * and hence not worth the effort. */
+ if (mount_is_extrinsic(UNIT(m)))
+ return 0;
+
+ p = get_mount_parameters(m);
+ if (!p)
+ return 0;
+
+ mask = m->from_proc_self_mountinfo ? UNIT_DEPENDENCY_MOUNTINFO : UNIT_DEPENDENCY_MOUNT_FILE;
+
+ r = mount_add_default_ordering_dependencies(m, p, mask);
+ if (r < 0)
+ return r;
+
+ r = mount_add_default_network_dependencies(m, p, mask);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+/* Validates a fully loaded mount unit. Returns 0 on success, or a negative errno if the
+ * configuration is unusable and the unit must be refused. */
+static int mount_verify(Mount *m) {
+ _cleanup_free_ char *e = NULL;
+ MountParameters *p;
+ int r;
+
+ assert(m);
+ assert(UNIT(m)->load_state == UNIT_LOADED);
+
+ /* The unit must come from somewhere: a fragment, /proc/self/mountinfo, or be perpetual. */
+ if (!m->from_fragment && !m->from_proc_self_mountinfo && !UNIT(m)->perpetual)
+ return -ENOENT;
+
+ r = unit_name_from_path(m->where, ".mount", &e);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(m), r, "Failed to generate unit name from mount path: %m");
+
+ /* The unit name must be the escaped form of Where=, otherwise path-based lookups break. */
+ if (!unit_has_name(UNIT(m), e))
+ return log_unit_error_errno(UNIT(m), SYNTHETIC_ERRNO(ENOEXEC), "Where= setting doesn't match unit name. Refusing.");
+
+ if (mount_point_is_api(m->where) || mount_point_ignore(m->where))
+ return log_unit_error_errno(UNIT(m), SYNTHETIC_ERRNO(ENOEXEC), "Cannot create mount unit for API file system %s. Refusing.", m->where);
+
+ p = get_mount_parameters_fragment(m);
+ if (p && !p->what && !UNIT(m)->perpetual)
+ return log_unit_error_errno(UNIT(m), SYNTHETIC_ERRNO(ENOEXEC),
+ "What= setting is missing. Refusing.");
+
+ /* Fixed error message: it previously read "control-group'." with an unbalanced quote. */
+ if (m->exec_context.pam_name && m->kill_context.kill_mode != KILL_CONTROL_GROUP)
+ return log_unit_error_errno(UNIT(m), SYNTHETIC_ERRNO(ENOEXEC), "Unit has PAM enabled. Kill mode must be set to 'control-group'. Refusing.");
+
+ return 0;
+}
+
+static int mount_add_non_exec_dependencies(Mount *m) {
+ int r;
+
+ assert(m);
+
+ /* We may be called due to this mount appearing in /proc/self/mountinfo, hence we clear all existing
+ * dependencies that were initialized from the unit file but whose final value really depends on the
+ * content of /proc/self/mountinfo. Some (such as m->where) might have become stale now. */
+ unit_remove_dependencies(UNIT(m), UNIT_DEPENDENCY_MOUNTINFO | UNIT_DEPENDENCY_MOUNT_FILE);
+
+ if (!m->where)
+ return 0;
+
+ /* Adds in all dependencies directly responsible for ordering the mount, as opposed to dependencies
+ * resulting from the ExecContext and such. */
+
+ r = mount_add_device_dependencies(m);
+ if (r < 0)
+ return r;
+
+ r = mount_add_mount_dependencies(m);
+ if (r < 0)
+ return r;
+
+ r = mount_add_quota_dependencies(m);
+ if (r < 0)
+ return r;
+
+ r = mount_add_default_dependencies(m);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
static int mount_add_extras(Mount *m) {
        Unit *u = UNIT(m);
        int r;

        assert(m);

        /* Fills in everything derivable from other settings: Where=, the description, patched contexts, the
         * slice, and the non-exec dependencies.
         *
         * Note: this call might be called after we already have been loaded once (and even when it has already been
         * activated), in case data from /proc/self/mountinfo has changed. This means all code here needs to be ready
         * to run with an already set up unit. */

        if (u->fragment_path)
                m->from_fragment = true;

        if (!m->where) {
                /* No explicit Where= — derive the mount point from the unit name. This is impossible for
                 * hashed (over-long) unit names, reported as -ENAMETOOLONG. */
                r = unit_name_to_path(u->id, &m->where);
                if (r == -ENAMETOOLONG)
                        log_unit_error_errno(u, r, "Failed to derive mount point path from unit name, because unit name is hashed. "
                                                   "Set \"Where=\" in the unit file explicitly.");
                if (r < 0)
                        return r;
        }

        path_simplify(m->where);

        if (!u->description) {
                /* Default the description to the mount point path. */
                r = unit_set_description(u, m->where);
                if (r < 0)
                        return r;
        }

        r = unit_patch_contexts(u);
        if (r < 0)
                return r;

        r = unit_add_exec_dependencies(u, &m->exec_context);
        if (r < 0)
                return r;

        r = unit_set_default_slice(u);
        if (r < 0)
                return r;

        r = mount_add_non_exec_dependencies(m);
        if (r < 0)
                return r;

        return 0;
}
+
+static void mount_load_root_mount(Unit *u) {
+ assert(u);
+
+ if (!unit_has_name(u, SPECIAL_ROOT_MOUNT))
+ return;
+
+ u->perpetual = true;
+ u->default_dependencies = false;
+
+ /* The stdio/kmsg bridge socket is on /, in order to avoid a dep loop, don't use kmsg logging for -.mount */
+ MOUNT(u)->exec_context.std_output = EXEC_OUTPUT_NULL;
+ MOUNT(u)->exec_context.std_input = EXEC_INPUT_NULL;
+
+ if (!u->description)
+ u->description = strdup("Root Mount");
+}
+
static int mount_load(Unit *u) {
        Mount *m = MOUNT(u);
        int r, q = 0;

        assert(m);
        assert(u);
        assert(u->load_state == UNIT_STUB);

        /* Unit vtable load() hook: loads configuration from fragment/drop-ins and sets up derived state. */

        mount_load_root_mount(u);

        /* Units announced via /proc/self/mountinfo (or perpetual ones) need no unit file on disk. */
        bool fragment_optional = m->from_proc_self_mountinfo || u->perpetual;
        r = unit_load_fragment_and_dropin(u, !fragment_optional);

        /* Add in some extras. Note we do this in all cases (even if we failed to load the unit) when announced by the
         * kernel, because we need some things to be set up no matter what when the kernel establishes a mount and thus
         * we need to update the state in our unit to track it. After all, consider that we don't allow changing the
         * 'slice' field for a unit once it is active. */
        if (u->load_state == UNIT_LOADED || m->from_proc_self_mountinfo || u->perpetual)
                q = mount_add_extras(m);

        /* Report the fragment-loading error first, then any error from the extras. */
        if (r < 0)
                return r;
        if (q < 0)
                return q;
        if (u->load_state != UNIT_LOADED)
                return 0;

        return mount_verify(m);
}
+
static void mount_set_state(Mount *m, MountState state) {
        MountState old_state;
        assert(m);

        /* Central state-transition helper: flushes pending bus change signals, drops process-related
         * resources when the new state expects no helper process, and notifies the unit core. */

        if (m->state != state)
                bus_unit_send_pending_change_signal(UNIT(m), false);

        old_state = m->state;
        m->state = state;

        if (!MOUNT_STATE_WITH_PROCESS(state)) {
                /* No mount/umount helper is expected in this state: disarm the timer, stop watching the
                 * control process, and forget the current command. */
                m->timer_event_source = sd_event_source_disable_unref(m->timer_event_source);
                mount_unwatch_control_pid(m);
                m->control_command = NULL;
                m->control_command_id = _MOUNT_EXEC_COMMAND_INVALID;
        }

        if (state != old_state)
                log_unit_debug(UNIT(m), "Changed %s -> %s", mount_state_to_string(old_state), mount_state_to_string(state));

        unit_notify(UNIT(m), state_translation_table[old_state], state_translation_table[state], m->reload_result == MOUNT_SUCCESS);
}
+
static int mount_coldplug(Unit *u) {
        Mount *m = MOUNT(u);
        int r;

        assert(m);
        assert(m->state == MOUNT_DEAD);

        /* Unit vtable coldplug() hook: re-establishes runtime state (process watch, timeout timer, exec
         * runtime) from the deserialized state after daemon reload/reexec. */

        if (m->deserialized_state == m->state)
                return 0;

        if (pidref_is_set(&m->control_pid) &&
            pidref_is_unwaited(&m->control_pid) > 0 &&
            MOUNT_STATE_WITH_PROCESS(m->deserialized_state)) {

                /* The control process is still alive and the target state expects one: watch it again, and
                 * re-arm the timeout as an absolute deadline relative to the last state change. */
                r = unit_watch_pidref(UNIT(m), &m->control_pid, /* exclusive= */ false);
                if (r < 0)
                        return r;

                r = mount_arm_timer(m, /* relative= */ false, usec_add(u->state_change_timestamp.monotonic, m->timeout_usec));
                if (r < 0)
                        return r;
        }

        /* Any live state needs the exec runtime back; failures are ignored (best effort). */
        if (!IN_SET(m->deserialized_state, MOUNT_DEAD, MOUNT_FAILED))
                (void) unit_setup_exec_runtime(u);

        mount_set_state(m, m->deserialized_state);
        return 0;
}
+
static void mount_catchup(Unit *u) {
        Mount *m = MOUNT(ASSERT_PTR(u));

        assert(m);

        /* Adjust the deserialized state. See comments in mount_process_proc_self_mountinfo(). The kernel's
         * view may have changed while we were not looking; reconcile our state with it in both directions. */
        if (m->from_proc_self_mountinfo)
                /* The mount exists in the kernel, but our state says otherwise. */
                switch (m->state) {
                case MOUNT_DEAD:
                case MOUNT_FAILED:
                        assert(!pidref_is_set(&m->control_pid));
                        (void) unit_acquire_invocation_id(u);
                        mount_cycle_clear(m);
                        mount_enter_mounted(m, MOUNT_SUCCESS);
                        break;
                case MOUNT_MOUNTING:
                        assert(pidref_is_set(&m->control_pid));
                        mount_set_state(m, MOUNT_MOUNTING_DONE);
                        break;
                default:
                        break;
                }
        else
                /* The mount is gone from the kernel, but our state still assumes it is there. */
                switch (m->state) {
                case MOUNT_MOUNTING_DONE:
                        assert(pidref_is_set(&m->control_pid));
                        mount_set_state(m, MOUNT_MOUNTING);
                        break;
                case MOUNT_MOUNTED:
                        assert(!pidref_is_set(&m->control_pid));
                        mount_enter_dead(m, MOUNT_SUCCESS);
                        break;
                default:
                        break;
                }
}
+
static void mount_dump(Unit *u, FILE *f, const char *prefix) {
        Mount *m = MOUNT(u);
        MountParameters *p;

        assert(m);
        assert(f);

        /* Unit vtable dump() hook: writes a human-readable description of the unit's current state to f,
         * each line prepended with 'prefix'. */

        p = get_mount_parameters(m);

        fprintf(f,
                "%sMount State: %s\n"
                "%sResult: %s\n"
                "%sClean Result: %s\n"
                "%sWhere: %s\n"
                "%sWhat: %s\n"
                "%sFile System Type: %s\n"
                "%sOptions: %s\n"
                "%sFrom /proc/self/mountinfo: %s\n"
                "%sFrom fragment: %s\n"
                "%sExtrinsic: %s\n"
                "%sDirectoryMode: %04o\n"
                "%sSloppyOptions: %s\n"
                "%sLazyUnmount: %s\n"
                "%sForceUnmount: %s\n"
                "%sReadWriteOnly: %s\n"
                "%sTimeoutSec: %s\n",
                prefix, mount_state_to_string(m->state),
                prefix, mount_result_to_string(m->result),
                prefix, mount_result_to_string(m->clean_result),
                prefix, m->where,
                prefix, p ? strna(p->what) : "n/a",
                prefix, p ? strna(p->fstype) : "n/a",
                prefix, p ? strna(p->options) : "n/a",
                prefix, yes_no(m->from_proc_self_mountinfo),
                prefix, yes_no(m->from_fragment),
                prefix, yes_no(mount_is_extrinsic(u)),
                prefix, m->directory_mode,
                prefix, yes_no(m->sloppy_options),
                prefix, yes_no(m->lazy_unmount),
                prefix, yes_no(m->force_unmount),
                prefix, yes_no(m->read_write_only),
                prefix, FORMAT_TIMESPAN(m->timeout_usec, USEC_PER_SEC));

        /* Only print the control PID while one is actually tracked. */
        if (pidref_is_set(&m->control_pid))
                fprintf(f,
                        "%sControl PID: "PID_FMT"\n",
                        prefix, m->control_pid.pid);

        exec_context_dump(&m->exec_context, f, prefix);
        kill_context_dump(&m->kill_context, f, prefix);
        cgroup_context_dump(UNIT(m), f, prefix);
}
+
static int mount_spawn(Mount *m, ExecCommand *c, PidRef *ret_pid) {

        /* Forks off the given mount/umount/remount command as the unit's control process: arms the unit
         * timeout, spawns the command with the unit's exec/cgroup contexts, and starts watching the child.
         * On success *ret_pid holds a reference to the new process. */

        _cleanup_(exec_params_shallow_clear) ExecParameters exec_params = EXEC_PARAMETERS_INIT(
                        EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN);
        _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
        pid_t pid;
        int r;

        assert(m);
        assert(c);
        assert(ret_pid);

        r = unit_prepare_exec(UNIT(m));
        if (r < 0)
                return r;

        /* Start the timeout clock for this operation. */
        r = mount_arm_timer(m, /* relative= */ true, m->timeout_usec);
        if (r < 0)
                return r;

        r = unit_set_exec_params(UNIT(m), &exec_params);
        if (r < 0)
                return r;

        r = exec_spawn(UNIT(m),
                       c,
                       &m->exec_context,
                       &exec_params,
                       m->exec_runtime,
                       &m->cgroup_context,
                       &pid);
        if (r < 0)
                return r;

        /* Wrap the raw PID in a PidRef and watch it so we receive its SIGCHLD. */
        r = pidref_set_pid(&pidref, pid);
        if (r < 0)
                return r;

        r = unit_watch_pidref(UNIT(m), &pidref, /* exclusive= */ true);
        if (r < 0)
                return r;

        *ret_pid = TAKE_PIDREF(pidref);
        return 0;
}
+
static void mount_enter_dead(Mount *m, MountResult f) {
        assert(m);

        /* Enters the DEAD or FAILED state (depending on the accumulated result) and releases the unit's
         * runtime resources. Only the first non-success result is kept. */

        if (m->result == MOUNT_SUCCESS)
                m->result = f;

        unit_log_result(UNIT(m), m->result == MOUNT_SUCCESS, mount_result_to_string(m->result));
        unit_warn_leftover_processes(UNIT(m), unit_log_leftover_process_stop);

        mount_set_state(m, m->result != MOUNT_SUCCESS ? MOUNT_FAILED : MOUNT_DEAD);

        m->exec_runtime = exec_runtime_destroy(m->exec_runtime);

        unit_destroy_runtime_data(UNIT(m), &m->exec_context);

        unit_unref_uid_gid(UNIT(m), true);

        /* Any dependencies based on /proc/self/mountinfo are now stale. Let's re-generate dependencies from
         * .mount unit. */
        (void) mount_add_non_exec_dependencies(m);
}
+
+static void mount_enter_mounted(Mount *m, MountResult f) {
+ assert(m);
+
+ if (m->result == MOUNT_SUCCESS)
+ m->result = f;
+
+ mount_set_state(m, MOUNT_MOUNTED);
+}
+
+static void mount_enter_dead_or_mounted(Mount *m, MountResult f) {
+ assert(m);
+
+ /* Enter DEAD or MOUNTED state, depending on what the kernel currently says about the mount point. We use this
+ * whenever we executed an operation, so that our internal state reflects what the kernel says again, after all
+ * ultimately we just mirror the kernel's internal state on this. */
+
+ if (m->from_proc_self_mountinfo)
+ mount_enter_mounted(m, f);
+ else
+ mount_enter_dead(m, f);
+}
+
+static int state_to_kill_operation(MountState state) {
+ switch (state) {
+
+ case MOUNT_REMOUNTING_SIGTERM:
+ return KILL_RESTART;
+
+ case MOUNT_UNMOUNTING_SIGTERM:
+ return KILL_TERMINATE;
+
+ case MOUNT_REMOUNTING_SIGKILL:
+ case MOUNT_UNMOUNTING_SIGKILL:
+ return KILL_KILL;
+
+ default:
+ return _KILL_OPERATION_INVALID;
+ }
+}
+
static void mount_enter_signal(Mount *m, MountState state, MountResult f) {
        int r;

        assert(m);

        /* Sends the kill operation matching 'state' to the unit's processes. If something was killed we
         * wait for it (with timeout); otherwise we escalate to SIGKILL or finish up directly. */

        if (m->result == MOUNT_SUCCESS)
                m->result = f;

        r = unit_kill_context(
                        UNIT(m),
                        &m->kill_context,
                        state_to_kill_operation(state),
                        /* main_pid= */ NULL,
                        &m->control_pid,
                        /* main_pid_alien= */ false);
        if (r < 0) {
                log_unit_warning_errno(UNIT(m), r, "Failed to kill processes: %m");
                goto fail;
        }

        if (r > 0) {
                /* Something was signalled: arm the timeout and wait in the requested state. */
                r = mount_arm_timer(m, /* relative= */ true, m->timeout_usec);
                if (r < 0) {
                        log_unit_warning_errno(UNIT(m), r, "Failed to install timer: %m");
                        goto fail;
                }

                mount_set_state(m, state);
        } else if (state == MOUNT_REMOUNTING_SIGTERM && m->kill_context.send_sigkill)
                /* Nothing left to SIGTERM during a remount teardown: escalate to SIGKILL. The recursion
                 * terminates because the SIGKILL state takes one of the branches below. */
                mount_enter_signal(m, MOUNT_REMOUNTING_SIGKILL, MOUNT_SUCCESS);
        else if (IN_SET(state, MOUNT_REMOUNTING_SIGTERM, MOUNT_REMOUNTING_SIGKILL))
                /* A failed remount leaves the mount itself established. */
                mount_enter_mounted(m, MOUNT_SUCCESS);
        else if (state == MOUNT_UNMOUNTING_SIGTERM && m->kill_context.send_sigkill)
                mount_enter_signal(m, MOUNT_UNMOUNTING_SIGKILL, MOUNT_SUCCESS);
        else
                mount_enter_dead_or_mounted(m, MOUNT_SUCCESS);

        return;

fail:
        mount_enter_dead_or_mounted(m, MOUNT_FAILURE_RESOURCES);
}
+
+static int mount_set_umount_command(Mount *m, ExecCommand *c) {
+ int r;
+
+ assert(m);
+ assert(c);
+
+ r = exec_command_set(c, UMOUNT_PATH, m->where, "-c", NULL);
+ if (r < 0)
+ return r;
+
+ if (m->lazy_unmount) {
+ r = exec_command_append(c, "-l", NULL);
+ if (r < 0)
+ return r;
+ }
+
+ if (m->force_unmount) {
+ r = exec_command_append(c, "-f", NULL);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
static void mount_enter_unmounting(Mount *m) {
        int r;

        assert(m);

        /* Spawns the umount helper for this mount point and transitions to MOUNT_UNMOUNTING. On any setup
         * failure we fall back to whatever state the kernel reports. */

        /* Start counting our attempts */
        if (!IN_SET(m->state,
                    MOUNT_UNMOUNTING,
                    MOUNT_UNMOUNTING_SIGTERM,
                    MOUNT_UNMOUNTING_SIGKILL))
                m->n_retry_umount = 0;

        m->control_command_id = MOUNT_EXEC_UNMOUNT;
        m->control_command = m->exec_command + MOUNT_EXEC_UNMOUNT;

        r = mount_set_umount_command(m, m->control_command);
        if (r < 0) {
                log_unit_warning_errno(UNIT(m), r, "Failed to prepare umount command line: %m");
                goto fail;
        }

        /* Stop watching any previous control process before spawning the new one. */
        mount_unwatch_control_pid(m);

        r = mount_spawn(m, m->control_command, &m->control_pid);
        if (r < 0) {
                log_unit_warning_errno(UNIT(m), r, "Failed to spawn 'umount' task: %m");
                goto fail;
        }

        mount_set_state(m, MOUNT_UNMOUNTING);

        return;

fail:
        mount_enter_dead_or_mounted(m, MOUNT_FAILURE_RESOURCES);
}
+
+static int mount_set_mount_command(Mount *m, ExecCommand *c, const MountParameters *p) {
+ int r;
+
+ assert(m);
+ assert(c);
+ assert(p);
+
+ r = exec_command_set(c, MOUNT_PATH, p->what, m->where, NULL);
+ if (r < 0)
+ return r;
+
+ if (m->sloppy_options) {
+ r = exec_command_append(c, "-s", NULL);
+ if (r < 0)
+ return r;
+ }
+
+ if (m->read_write_only) {
+ r = exec_command_append(c, "-w", NULL);
+ if (r < 0)
+ return r;
+ }
+
+ if (p->fstype) {
+ r = exec_command_append(c, "-t", p->fstype, NULL);
+ if (r < 0)
+ return r;
+ }
+
+ _cleanup_free_ char *opts = NULL;
+ r = fstab_filter_options(p->options, "nofail\0" "noauto\0" "auto\0", NULL, NULL, NULL, &opts);
+ if (r < 0)
+ return r;
+
+ if (!isempty(opts)) {
+ r = exec_command_append(c, "-o", opts, NULL);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
static void mount_enter_mounting(Mount *m) {
        int r;
        MountParameters *p;
        bool source_is_dir = true;

        assert(m);

        /* Prepares the mount point (and, for bind mounts, the source), builds the mount command line, and
         * spawns the mount helper, transitioning to MOUNT_MOUNTING. */

        r = unit_fail_if_noncanonical(UNIT(m), m->where);
        if (r < 0)
                goto fail;

        /* For bind mounts, the type of the node we create for the mount point must match the source:
         * directory for directories, regular file for anything else. */
        p = get_mount_parameters_fragment(m);
        if (p && mount_is_bind(p)) {
                r = is_dir(p->what, /* follow = */ true);
                if (r < 0 && r != -ENOENT)
                        log_unit_info_errno(UNIT(m), r, "Failed to determine type of bind mount source '%s', ignoring: %m", p->what);
                else if (r == 0)
                        source_is_dir = false;
        }

        if (source_is_dir)
                r = mkdir_p_label(m->where, m->directory_mode);
        else
                r = touch_file(m->where, /* parents = */ true, USEC_INFINITY, UID_INVALID, GID_INVALID, MODE_INVALID);
        if (r < 0 && r != -EEXIST)
                log_unit_warning_errno(UNIT(m), r, "Failed to create mount point '%s', ignoring: %m", m->where);

        if (source_is_dir)
                unit_warn_if_dir_nonempty(UNIT(m), m->where);
        unit_warn_leftover_processes(UNIT(m), unit_log_leftover_process_start);

        m->control_command_id = MOUNT_EXEC_MOUNT;
        m->control_command = m->exec_command + MOUNT_EXEC_MOUNT;

        /* Create the source directory for bind-mounts if needed */
        if (p && mount_is_bind(p)) {
                r = mkdir_p_label(p->what, m->directory_mode);
                /* mkdir_p_label() can return -EEXIST if the target path exists and is not a directory - which is
                 * totally OK, in case the user wants us to overmount a non-directory inode. Also -EROFS can be
                 * returned on read-only filesystem. Moreover, -EACCES (and also maybe -EPERM?) may be returned
                 * when the path is on NFS. See issue #24120. All such errors will be logged in the debug level. */
                if (r < 0 && r != -EEXIST)
                        log_unit_full_errno(UNIT(m),
                                            (r == -EROFS || ERRNO_IS_PRIVILEGE(r)) ? LOG_DEBUG : LOG_WARNING,
                                            r, "Failed to make bind mount source '%s', ignoring: %m", p->what);
        }

        if (p) {
                r = mount_set_mount_command(m, m->control_command, p);
                if (r < 0) {
                        log_unit_warning_errno(UNIT(m), r, "Failed to prepare mount command line: %m");
                        goto fail;
                }
        } else {
                /* Without parameters there is nothing we could mount. */
                r = log_unit_warning_errno(UNIT(m), SYNTHETIC_ERRNO(ENOENT), "No mount parameters to operate on.");
                goto fail;
        }

        mount_unwatch_control_pid(m);

        r = mount_spawn(m, m->control_command, &m->control_pid);
        if (r < 0) {
                log_unit_warning_errno(UNIT(m), r, "Failed to spawn 'mount' task: %m");
                goto fail;
        }

        mount_set_state(m, MOUNT_MOUNTING);
        return;

fail:
        mount_enter_dead_or_mounted(m, MOUNT_FAILURE_RESOURCES);
}
+
+static void mount_set_reload_result(Mount *m, MountResult result) {
+ assert(m);
+
+ /* Only store the first error we encounter */
+ if (m->reload_result != MOUNT_SUCCESS)
+ return;
+
+ m->reload_result = result;
+}
+
static void mount_enter_remounting(Mount *m) {
        int r;
        MountParameters *p;

        assert(m);

        /* Spawns the mount helper with "-o remount[,options]" and transitions to MOUNT_REMOUNTING. On
         * failure the reload result is recorded but the mount itself is kept as the kernel reports it. */

        /* Reset reload result when we are about to start a new remount operation */
        m->reload_result = MOUNT_SUCCESS;

        m->control_command_id = MOUNT_EXEC_REMOUNT;
        m->control_command = m->exec_command + MOUNT_EXEC_REMOUNT;

        p = get_mount_parameters_fragment(m);
        if (p) {
                const char *o;

                /* Prepend "remount" to the configured option string (if any). */
                if (p->options)
                        o = strjoina("remount,", p->options);
                else
                        o = "remount";

                r = exec_command_set(m->control_command, MOUNT_PATH,
                                     p->what, m->where,
                                     "-o", o, NULL);
                if (r >= 0 && m->sloppy_options)
                        r = exec_command_append(m->control_command, "-s", NULL);
                if (r >= 0 && m->read_write_only)
                        r = exec_command_append(m->control_command, "-w", NULL);
                if (r >= 0 && p->fstype)
                        r = exec_command_append(m->control_command, "-t", p->fstype, NULL);
                if (r < 0) {
                        log_unit_warning_errno(UNIT(m), r, "Failed to prepare remount command line: %m");
                        goto fail;
                }

        } else {
                r = log_unit_warning_errno(UNIT(m), SYNTHETIC_ERRNO(ENOENT), "No mount parameters to operate on.");
                goto fail;
        }

        mount_unwatch_control_pid(m);

        r = mount_spawn(m, m->control_command, &m->control_pid);
        if (r < 0) {
                log_unit_warning_errno(UNIT(m), r, "Failed to spawn 'remount' task: %m");
                goto fail;
        }

        mount_set_state(m, MOUNT_REMOUNTING);
        return;

fail:
        /* A failed remount is recorded in the reload result; the unit's own result stays untouched. */
        mount_set_reload_result(m, MOUNT_FAILURE_RESOURCES);
        mount_enter_dead_or_mounted(m, MOUNT_SUCCESS);
}
+
+static void mount_cycle_clear(Mount *m) {
+ assert(m);
+
+ /* Clear all state we shall forget for this new cycle */
+
+ m->result = MOUNT_SUCCESS;
+ m->reload_result = MOUNT_SUCCESS;
+ exec_command_reset_status_array(m->exec_command, _MOUNT_EXEC_COMMAND_MAX);
+ UNIT(m)->reset_accounting = true;
+}
+
static int mount_start(Unit *u) {
        Mount *m = MOUNT(u);
        int r;

        assert(m);

        /* Unit vtable start() hook. Returns 1 when a new mount operation was started, 0 when one is already
         * in progress, -EAGAIN while a conflicting unmount/clean operation must finish first. */

        /* We cannot fulfill this request right now, try again later
         * please! */
        if (IN_SET(m->state,
                   MOUNT_UNMOUNTING,
                   MOUNT_UNMOUNTING_SIGTERM,
                   MOUNT_UNMOUNTING_SIGKILL,
                   MOUNT_CLEANING))
                return -EAGAIN;

        /* Already on it! */
        if (IN_SET(m->state, MOUNT_MOUNTING, MOUNT_MOUNTING_DONE))
                return 0;

        assert(IN_SET(m->state, MOUNT_DEAD, MOUNT_FAILED));

        r = unit_acquire_invocation_id(u);
        if (r < 0)
                return r;

        mount_cycle_clear(m);
        mount_enter_mounting(m);

        return 1;
}
+
static int mount_stop(Unit *u) {
        Mount *m = MOUNT(u);

        assert(m);

        /* Unit vtable stop() hook. Returns 1 when a new unmount operation was started, 0 when stopping is
         * already in progress (or nothing is left to stop). */

        /* When we directly call umount() for a path, then the state of the corresponding mount unit may be
         * outdated. Let's re-read mountinfo now and update the state. */
        if (m->invalidated_state)
                (void) mount_process_proc_self_mountinfo(u->manager);

        switch (m->state) {

        case MOUNT_UNMOUNTING:
        case MOUNT_UNMOUNTING_SIGKILL:
        case MOUNT_UNMOUNTING_SIGTERM:
                /* Already on it */
                return 0;

        case MOUNT_MOUNTING:
        case MOUNT_MOUNTING_DONE:
        case MOUNT_REMOUNTING:
                /* If we are still waiting for /bin/mount, we go directly into kill mode. */
                mount_enter_signal(m, MOUNT_UNMOUNTING_SIGTERM, MOUNT_SUCCESS);
                return 0;

        case MOUNT_REMOUNTING_SIGTERM:
                /* If we are already waiting for a hung remount, convert this to the matching unmounting state */
                mount_set_state(m, MOUNT_UNMOUNTING_SIGTERM);
                return 0;

        case MOUNT_REMOUNTING_SIGKILL:
                /* as above */
                mount_set_state(m, MOUNT_UNMOUNTING_SIGKILL);
                return 0;

        case MOUNT_MOUNTED:
                mount_enter_unmounting(m);
                return 1;

        case MOUNT_CLEANING:
                /* If we are currently cleaning, then abort it, brutally. */
                mount_enter_signal(m, MOUNT_UNMOUNTING_SIGKILL, MOUNT_SUCCESS);
                return 0;

        case MOUNT_DEAD:
        case MOUNT_FAILED:
                /* The mount has just been unmounted by somebody else. */
                return 0;

        default:
                assert_not_reached();
        }
}
+
+static int mount_reload(Unit *u) {
+ Mount *m = MOUNT(u);
+
+ assert(m);
+ assert(m->state == MOUNT_MOUNTED);
+
+ mount_enter_remounting(m);
+
+ return 1;
+}
+
static int mount_serialize(Unit *u, FILE *f, FDSet *fds) {
        Mount *m = MOUNT(u);

        assert(m);
        assert(f);
        assert(fds);

        /* Unit vtable serialize() hook: writes the unit's volatile state as key/value lines so it survives
         * daemon reload/reexec. Counterpart of mount_deserialize_item(). */

        (void) serialize_item(f, "state", mount_state_to_string(m->state));
        (void) serialize_item(f, "result", mount_result_to_string(m->result));
        (void) serialize_item(f, "reload-result", mount_result_to_string(m->reload_result));
        (void) serialize_item_format(f, "n-retry-umount", "%u", m->n_retry_umount);
        (void) serialize_pidref(f, fds, "control-pid", &m->control_pid);

        /* Only serialize a command id when one is currently in effect. */
        if (m->control_command_id >= 0)
                (void) serialize_item(f, "control-command", mount_exec_command_to_string(m->control_command_id));

        return 0;
}
+
static int mount_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
        Mount *m = MOUNT(u);
        int r;

        assert(m);
        assert(u);
        assert(key);
        assert(value);
        assert(fds);

        /* Unit vtable deserialize_item() hook: restores one key/value pair written by mount_serialize().
         * Unparsable values are logged and skipped rather than treated as fatal. */

        if (streq(key, "state")) {
                MountState state;

                state = mount_state_from_string(value);
                if (state < 0)
                        log_unit_debug_errno(u, state, "Failed to parse state value: %s", value);
                else
                        m->deserialized_state = state;

        } else if (streq(key, "result")) {
                MountResult f;

                f = mount_result_from_string(value);
                if (f < 0)
                        log_unit_debug_errno(u, f, "Failed to parse result value: %s", value);
                else if (f != MOUNT_SUCCESS)
                        m->result = f;

        } else if (streq(key, "reload-result")) {
                MountResult f;

                f = mount_result_from_string(value);
                if (f < 0)
                        log_unit_debug_errno(u, f, "Failed to parse reload result value: %s", value);
                else if (f != MOUNT_SUCCESS)
                        m->reload_result = f;

        } else if (streq(key, "n-retry-umount")) {

                r = safe_atou(value, &m->n_retry_umount);
                if (r < 0)
                        log_unit_debug_errno(u, r, "Failed to parse n-retry-umount value: %s", value);

        } else if (streq(key, "control-pid")) {

                /* Drop any previously deserialized PID before taking over the new one. */
                pidref_done(&m->control_pid);
                (void) deserialize_pidref(fds, value, &m->control_pid);

        } else if (streq(key, "control-command")) {
                MountExecCommand id;

                id = mount_exec_command_from_string(value);
                if (id < 0)
                        log_unit_debug_errno(u, id, "Failed to parse exec-command value: %s", value);
                else {
                        m->control_command_id = id;
                        m->control_command = m->exec_command + id;
                }
        } else
                log_unit_debug(u, "Unknown serialization key: %s", key);

        return 0;
}
+
static UnitActiveState mount_active_state(Unit *u) {
        assert(u);

        /* Unit vtable hook: maps the mount-specific state to the generic unit active state. */
        return state_translation_table[MOUNT(u)->state];
}
+
static const char *mount_sub_state_to_string(Unit *u) {
        assert(u);

        /* Unit vtable hook: the mount-specific state name serves as the unit's sub-state string. */
        return mount_state_to_string(MOUNT(u)->state);
}
+
+static bool mount_may_gc(Unit *u) {
+ Mount *m = MOUNT(u);
+
+ assert(m);
+
+ if (m->from_proc_self_mountinfo)
+ return false;
+
+ return true;
+}
+
static void mount_sigchld_event(Unit *u, pid_t pid, int code, int status) {
        Mount *m = MOUNT(u);
        MountResult f;

        assert(m);
        assert(pid >= 0);

        /* Unit vtable sigchld_event() hook: handles the exit of the mount/umount/remount control process
         * and advances the state machine accordingly. */

        if (pid != m->control_pid.pid)
                return;

        /* So here's the thing, we really want to know before /usr/bin/mount or /usr/bin/umount exit whether
         * they established/remove a mount. This is important when mounting, but even more so when unmounting
         * since we need to deal with nested mounts and otherwise cannot safely determine whether to repeat
         * the unmounts. In theory, the kernel fires /proc/self/mountinfo changes off before returning from
         * the mount() or umount() syscalls, and thus we should see the changes to the proc file before we
         * process the waitid() for the /usr/bin/(u)mount processes. However, this is unfortunately racy: we
         * have to waitid() for processes using P_ALL (since we need to reap unexpected children that got
         * reparented to PID 1), but when using P_ALL we might end up reaping processes that terminated just
         * instants ago, i.e. already after our last event loop iteration (i.e. after the last point we might
         * have noticed /proc/self/mountinfo events via epoll). This means event loop priorities for
         * processing SIGCHLD vs. /proc/self/mountinfo IO events are not as relevant as we want. To fix that
         * race, let's explicitly scan /proc/self/mountinfo before we start processing /usr/bin/(u)mount
         * dying. It's ugly, but it makes our ordering systematic again, and makes sure we always see
         * /proc/self/mountinfo changes before our mount/umount exits. */
        (void) mount_process_proc_self_mountinfo(u->manager);

        pidref_done(&m->control_pid);

        /* Classify the helper's exit into a MountResult. */
        if (is_clean_exit(code, status, EXIT_CLEAN_COMMAND, NULL))
                f = MOUNT_SUCCESS;
        else if (code == CLD_EXITED)
                f = MOUNT_FAILURE_EXIT_CODE;
        else if (code == CLD_KILLED)
                f = MOUNT_FAILURE_SIGNAL;
        else if (code == CLD_DUMPED)
                f = MOUNT_FAILURE_CORE_DUMP;
        else
                assert_not_reached();

        /* Remount failures go into the reload result; everything else into the unit result (first error
         * wins). */
        if (IN_SET(m->state, MOUNT_REMOUNTING, MOUNT_REMOUNTING_SIGKILL, MOUNT_REMOUNTING_SIGTERM))
                mount_set_reload_result(m, f);
        else if (m->result == MOUNT_SUCCESS)
                m->result = f;

        if (m->control_command) {
                exec_status_exit(&m->control_command->exec_status, &m->exec_context, pid, code, status);

                m->control_command = NULL;
                m->control_command_id = _MOUNT_EXEC_COMMAND_INVALID;
        }

        unit_log_process_exit(
                        u,
                        "Mount process",
                        mount_exec_command_to_string(m->control_command_id),
                        f == MOUNT_SUCCESS,
                        code, status);

        /* Note that due to the io event priority logic, we can be sure the new mountinfo is loaded
         * before we process the SIGCHLD for the mount command. */

        switch (m->state) {

        case MOUNT_MOUNTING:
                /* Our mount point has not appeared in mountinfo. Something went wrong. */

                if (f == MOUNT_SUCCESS) {
                        /* Either /bin/mount has an unexpected definition of success,
                         * or someone raced us and we lost. */
                        log_unit_warning(UNIT(m), "Mount process finished, but there is no mount.");
                        f = MOUNT_FAILURE_PROTOCOL;
                }
                mount_enter_dead(m, f);
                break;

        case MOUNT_MOUNTING_DONE:
                mount_enter_mounted(m, f);
                break;

        case MOUNT_REMOUNTING:
        case MOUNT_REMOUNTING_SIGTERM:
        case MOUNT_REMOUNTING_SIGKILL:
                /* The remount result was already recorded above; the mount itself stays as the kernel
                 * reports it. */
                mount_enter_dead_or_mounted(m, MOUNT_SUCCESS);
                break;

        case MOUNT_UNMOUNTING:

                if (f == MOUNT_SUCCESS && m->from_proc_self_mountinfo) {

                        /* Still a mount point? If so, let's try again. Most likely there were multiple mount points
                         * stacked on top of each other. We might exceed the timeout specified by the user overall,
                         * but we will stop as soon as any one umount times out. */

                        if (m->n_retry_umount < RETRY_UMOUNT_MAX) {
                                log_unit_debug(u, "Mount still present, trying again.");
                                m->n_retry_umount++;
                                mount_enter_unmounting(m);
                        } else {
                                log_unit_warning(u, "Mount still present after %u attempts to unmount, giving up.", m->n_retry_umount);
                                mount_enter_mounted(m, f);
                        }
                } else
                        mount_enter_dead_or_mounted(m, f);

                break;

        case MOUNT_UNMOUNTING_SIGKILL:
        case MOUNT_UNMOUNTING_SIGTERM:
                mount_enter_dead_or_mounted(m, f);
                break;

        case MOUNT_CLEANING:
                /* Cleaning results are tracked separately from the unit result. */
                if (m->clean_result == MOUNT_SUCCESS)
                        m->clean_result = f;

                mount_enter_dead(m, MOUNT_SUCCESS);
                break;

        default:
                assert_not_reached();
        }

        /* Notify clients about changed exit status */
        unit_add_to_dbus_queue(u);
}
+
+static int mount_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) {
+ Mount *m = MOUNT(userdata);
+
+ assert(m);
+ assert(m->timer_event_source == source);
+
+ switch (m->state) {
+
+ case MOUNT_MOUNTING:
+ case MOUNT_MOUNTING_DONE:
+ log_unit_warning(UNIT(m), "Mounting timed out. Terminating.");
+ mount_enter_signal(m, MOUNT_UNMOUNTING_SIGTERM, MOUNT_FAILURE_TIMEOUT);
+ break;
+
+ case MOUNT_REMOUNTING:
+ log_unit_warning(UNIT(m), "Remounting timed out. Terminating remount process.");
+ mount_set_reload_result(m, MOUNT_FAILURE_TIMEOUT);
+ mount_enter_signal(m, MOUNT_REMOUNTING_SIGTERM, MOUNT_SUCCESS);
+ break;
+
+ case MOUNT_REMOUNTING_SIGTERM:
+ mount_set_reload_result(m, MOUNT_FAILURE_TIMEOUT);
+
+ if (m->kill_context.send_sigkill) {
+ log_unit_warning(UNIT(m), "Remounting timed out. Killing.");
+ mount_enter_signal(m, MOUNT_REMOUNTING_SIGKILL, MOUNT_SUCCESS);
+ } else {
+ log_unit_warning(UNIT(m), "Remounting timed out. Skipping SIGKILL. Ignoring.");
+ mount_enter_dead_or_mounted(m, MOUNT_SUCCESS);
+ }
+ break;
+
+ case MOUNT_REMOUNTING_SIGKILL:
+ mount_set_reload_result(m, MOUNT_FAILURE_TIMEOUT);
+
+ log_unit_warning(UNIT(m), "Mount process still around after SIGKILL. Ignoring.");
+ mount_enter_dead_or_mounted(m, MOUNT_SUCCESS);
+ break;
+
+ case MOUNT_UNMOUNTING:
+ log_unit_warning(UNIT(m), "Unmounting timed out. Terminating.");
+ mount_enter_signal(m, MOUNT_UNMOUNTING_SIGTERM, MOUNT_FAILURE_TIMEOUT);
+ break;
+
+ case MOUNT_UNMOUNTING_SIGTERM:
+ if (m->kill_context.send_sigkill) {
+ log_unit_warning(UNIT(m), "Mount process timed out. Killing.");
+ mount_enter_signal(m, MOUNT_UNMOUNTING_SIGKILL, MOUNT_FAILURE_TIMEOUT);
+ } else {
+ log_unit_warning(UNIT(m), "Mount process timed out. Skipping SIGKILL. Ignoring.");
+ mount_enter_dead_or_mounted(m, MOUNT_FAILURE_TIMEOUT);
+ }
+ break;
+
+ case MOUNT_UNMOUNTING_SIGKILL:
+ log_unit_warning(UNIT(m), "Mount process still around after SIGKILL. Ignoring.");
+ mount_enter_dead_or_mounted(m, MOUNT_FAILURE_TIMEOUT);
+ break;
+
+ case MOUNT_CLEANING:
+ log_unit_warning(UNIT(m), "Cleaning timed out. killing.");
+
+ if (m->clean_result == MOUNT_SUCCESS)
+ m->clean_result = MOUNT_FAILURE_TIMEOUT;
+
+ mount_enter_signal(m, MOUNT_UNMOUNTING_SIGKILL, 0);
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ return 0;
+}
+
static int mount_setup_new_unit(
                Manager *m,
                const char *name,
                const char *what,
                const char *where,
                const char *options,
                const char *fstype,
                MountProcFlags *ret_flags,
                Unit **ret) {

        /* Allocates a fresh mount unit for an entry newly discovered in /proc/self/mountinfo and queues it
         * for loading. On success *ret takes ownership of the unit and *ret_flags describes it as freshly
         * mounted and changed. */

        _cleanup_(unit_freep) Unit *u = NULL;
        int r;

        assert(m);
        assert(name);
        assert(ret_flags);
        assert(ret);

        r = unit_new_for_name(m, sizeof(Mount), name, &u);
        if (r < 0)
                return r;

        r = free_and_strdup(&u->source_path, "/proc/self/mountinfo");
        if (r < 0)
                return r;

        r = free_and_strdup(&MOUNT(u)->where, where);
        if (r < 0)
                return r;

        r = update_parameters_proc_self_mountinfo(MOUNT(u), what, options, fstype);
        if (r < 0)
                return r;

        /* This unit was generated because /proc/self/mountinfo reported it. Remember this, so that by the
         * time we load the unit file for it (and thus add in extra deps right after) we know what source to
         * attributes the deps to. */
        MOUNT(u)->from_proc_self_mountinfo = true;

        r = mount_add_non_exec_dependencies(MOUNT(u));
        if (r < 0)
                return r;

        /* We have only allocated the stub now, let's enqueue this unit for loading now, so that everything
         * else is loaded in now. */
        unit_add_to_load_queue(u);

        *ret_flags = MOUNT_PROC_IS_MOUNTED | MOUNT_PROC_JUST_MOUNTED | MOUNT_PROC_JUST_CHANGED;
        *ret = TAKE_PTR(u);
        return 0;
}
+
static int mount_setup_existing_unit(
                Unit *u,
                const char *what,
                const char *where,
                const char *options,
                const char *fstype,
                MountProcFlags *ret_flags) {

        /* Updates an already-existing mount unit from a (re-)appearing /proc/self/mountinfo entry and
         * reports via *ret_flags whether the mount is new/changed from the unit's point of view. */

        int r;

        assert(u);
        assert(ret_flags);

        /* Fill in Where= from mountinfo if the unit did not carry one yet. */
        if (!MOUNT(u)->where) {
                MOUNT(u)->where = strdup(where);
                if (!MOUNT(u)->where)
                        return -ENOMEM;
        }

        /* In case we have multiple mounts established on the same mount point, let's merge flags set already
         * for the current unit. Note that the flags field is reset on each iteration of reading
         * /proc/self/mountinfo, hence we know for sure anything already set here is from the current
         * iteration and thus worthy of taking into account. */
        MountProcFlags flags =
                MOUNT(u)->proc_flags | MOUNT_PROC_IS_MOUNTED;

        r = update_parameters_proc_self_mountinfo(MOUNT(u), what, options, fstype);
        if (r < 0)
                return r;
        if (r > 0)
                flags |= MOUNT_PROC_JUST_CHANGED;

        /* There are two conditions when we consider a mount point just mounted: when we haven't seen it in
         * /proc/self/mountinfo before or when MOUNT_MOUNTING is our current state. Why bother with the
         * latter? Shouldn't that be covered by the former? No, during reload it is not because we might then
         * encounter a new /proc/self/mountinfo in combination with an old mount unit state (since it stems
         * from the serialized state), and need to catch up. Since we know that the MOUNT_MOUNTING state is
         * reached when we wait for the mount to appear we hence can assume that if we are in it, we are
         * actually seeing it established for the first time. */
        if (!MOUNT(u)->from_proc_self_mountinfo || MOUNT(u)->state == MOUNT_MOUNTING)
                flags |= MOUNT_PROC_JUST_MOUNTED;

        MOUNT(u)->from_proc_self_mountinfo = true;

        if (IN_SET(u->load_state, UNIT_NOT_FOUND, UNIT_BAD_SETTING, UNIT_ERROR)) {
                /* The unit was previously not found or otherwise not loaded. Now that the unit shows up in
                 * /proc/self/mountinfo we should reconsider it this, hence set it to UNIT_LOADED. */
                u->load_state = UNIT_LOADED;
                u->load_error = 0;

                flags |= MOUNT_PROC_JUST_CHANGED;
        }

        if (FLAGS_SET(flags, MOUNT_PROC_JUST_CHANGED)) {
                /* If things changed, then make sure that all deps are regenerated. Let's
                 * first remove all automatic deps, and then add in the new ones. */
                r = mount_add_non_exec_dependencies(MOUNT(u));
                if (r < 0)
                        return r;
        }

        *ret_flags = flags;
        return 0;
}
+
+static int mount_setup_unit(
+                Manager *m,
+                const char *what,
+                const char *where,
+                const char *options,
+                const char *fstype,
+                bool set_flags) {
+
+        _cleanup_free_ char *e = NULL;
+        MountProcFlags flags;
+        Unit *u;
+        int r;
+
+        assert(m);
+        assert(what);
+        assert(where);
+        assert(options);
+        assert(fstype);
+
+        /* Creates or updates the mount unit corresponding to one /proc/self/mountinfo entry. If set_flags
+         * is true the computed MountProcFlags are stored on the unit for the current rescan pass. */
+
+        /* Ignore API mount points. They should never be referenced in
+         * dependencies ever. */
+        if (mount_point_is_api(where) || mount_point_ignore(where))
+                return 0;
+
+        if (streq(fstype, "autofs"))
+                return 0;
+
+        /* probably some kind of swap, ignore */
+        if (!is_path(where))
+                return 0;
+
+        r = unit_name_from_path(where, ".mount", &e);
+        if (r < 0)
+                return log_struct_errno(
+                                LOG_WARNING, r,
+                                "MESSAGE_ID=" SD_MESSAGE_MOUNT_POINT_PATH_NOT_SUITABLE_STR,
+                                "MOUNT_POINT=%s", where,
+                                LOG_MESSAGE("Failed to generate valid unit name from mount point path '%s', ignoring mount point: %m",
+                                            where));
+
+        u = manager_get_unit(m, e);
+        if (u)
+                r = mount_setup_existing_unit(u, what, where, options, fstype, &flags);
+        else
+                /* First time we see this mount point meaning that it's not been initiated by a mount unit
+                 * but rather by the sysadmin having called mount(8) directly. */
+                r = mount_setup_new_unit(m, e, what, where, options, fstype, &flags, &u);
+        if (r < 0)
+                return log_warning_errno(r, "Failed to set up mount unit for '%s': %m", where);
+
+        /* If the mount changed properties or state, let's notify our clients */
+        if (flags & (MOUNT_PROC_JUST_CHANGED|MOUNT_PROC_JUST_MOUNTED))
+                unit_add_to_dbus_queue(u);
+
+        if (set_flags)
+                MOUNT(u)->proc_flags = flags;
+
+        return 0;
+}
+
+static int mount_load_proc_self_mountinfo(Manager *m, bool set_flags) {
+        _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
+        _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL;
+        int r;
+
+        assert(m);
+
+        /* Reads the whole of /proc/self/mountinfo and creates/updates a mount unit for every entry. When
+         * set_flags is true, the per-unit MountProcFlags are updated too (this is used by the rescan path;
+         * the initial enumeration passes false). */
+
+        r = libmount_parse(NULL, NULL, &table, &iter);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse /proc/self/mountinfo: %m");
+
+        for (;;) {
+                struct libmnt_fs *fs;
+                const char *device, *path, *options, *fstype;
+
+                /* mnt_table_next_fs() returns 1 once the end of the table is reached. */
+                r = mnt_table_next_fs(table, iter, &fs);
+                if (r == 1)
+                        break;
+                if (r < 0)
+                        return log_error_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m");
+
+                device = mnt_fs_get_source(fs);
+                path = mnt_fs_get_target(fs);
+                options = mnt_fs_get_options(fs);
+                fstype = mnt_fs_get_fstype(fs);
+
+                if (!device || !path)
+                        continue;
+
+                /* Let the device units know their device now backs a mount. */
+                device_found_node(m, device, DEVICE_FOUND_MOUNT, DEVICE_FOUND_MOUNT);
+
+                /* Failure to set up one unit should not prevent processing the remaining entries. */
+                (void) mount_setup_unit(m, device, path, options, fstype, set_flags);
+        }
+
+        return 0;
+}
+
+static void mount_shutdown(Manager *m) {
+        assert(m);
+
+        /* Tear down the mount table monitoring: first stop dispatching events, then drop the monitor
+         * itself. As noted in mount_enumerate(), unreffing the monitor also closes its fd. */
+
+        m->mount_event_source = sd_event_source_disable_unref(m->mount_event_source);
+
+        mnt_unref_monitor(TAKE_PTR(m->mount_monitor));
+}
+
+static int mount_get_timeout(Unit *u, usec_t *timeout) {
+        Mount *m = MOUNT(u);
+        usec_t when;
+        int r;
+
+        assert(u);
+        assert(m);
+
+        /* Reports the absolute expiry time of the unit's timer, if one is armed. Returns 1 and fills in
+         * *timeout when a finite expiry exists, 0 when there is none, negative errno on failure. */
+
+        if (!m->timer_event_source)
+                return 0;
+
+        r = sd_event_source_get_time(m->timer_event_source, &when);
+        if (r < 0)
+                return r;
+
+        /* An infinite expiry means the timer is effectively off. */
+        if (when == USEC_INFINITY)
+                return 0;
+
+        *timeout = when;
+        return 1;
+}
+
+static void mount_enumerate_perpetual(Manager *m) {
+        Unit *u;
+        int r;
+
+        assert(m);
+
+        /* Whatever happens, we know for sure that the root directory is around, and cannot go away. Let's
+         * unconditionally synthesize it here and mark it as perpetual. */
+
+        u = manager_get_unit(m, SPECIAL_ROOT_MOUNT);
+        if (!u) {
+                r = unit_new_for_name(m, sizeof(Mount), SPECIAL_ROOT_MOUNT, &u);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to allocate the special " SPECIAL_ROOT_MOUNT " unit: %m");
+                        return;
+                }
+        }
+
+        u->perpetual = true;
+        /* The root mount is by definition already mounted. */
+        MOUNT(u)->deserialized_state = MOUNT_MOUNTED;
+
+        unit_add_to_load_queue(u);
+        unit_add_to_dbus_queue(u);
+}
+
+static bool mount_is_mounted(Mount *m) {
+        assert(m);
+
+        /* Perpetual units (i.e. the root mount) count as mounted unconditionally; everybody else needs to
+         * actually show up in /proc/self/mountinfo. */
+
+        if (UNIT(m)->perpetual)
+                return true;
+
+        return (m->proc_flags & MOUNT_PROC_IS_MOUNTED) != 0;
+}
+
+static int mount_on_ratelimit_expire(sd_event_source *s, void *userdata) {
+        Manager *m = ASSERT_PTR(userdata);
+        Job *j;
+
+        /* The ratelimit window is over: re-enqueue every mount start job that was previously skipped
+         * because the ratelimit was active. */
+        HASHMAP_FOREACH(j, m->jobs)
+                if (j->unit->type == UNIT_MOUNT)
+                        job_add_to_run_queue(j);
+
+        /* By entering ratelimited state we made all mount start jobs not runnable, now rate limit is over so
+         * let's make sure we dispatch them in the next iteration. */
+        manager_trigger_run_queue(m);
+
+        return 0;
+}
+
+static void mount_enumerate(Manager *m) {
+        int r;
+
+        assert(m);
+
+        /* Sets up libmount monitoring of kernel and userspace mount table changes (on first invocation)
+         * and loads the initial mount table from /proc/self/mountinfo. On any failure everything is torn
+         * down again via mount_shutdown(). */
+
+        mnt_init_debug(0);
+
+        if (!m->mount_monitor) {
+                unsigned mount_rate_limit_burst = 5;
+                int fd;
+
+                m->mount_monitor = mnt_new_monitor();
+                if (!m->mount_monitor) {
+                        log_oom();
+                        goto fail;
+                }
+
+                r = mnt_monitor_enable_kernel(m->mount_monitor, 1);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to enable watching of kernel mount events: %m");
+                        goto fail;
+                }
+
+                r = mnt_monitor_enable_userspace(m->mount_monitor, 1, NULL);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to enable watching of userspace mount events: %m");
+                        goto fail;
+                }
+
+                /* mnt_unref_monitor() will close the fd */
+                fd = r = mnt_monitor_get_fd(m->mount_monitor);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to acquire watch file descriptor: %m");
+                        goto fail;
+                }
+
+                r = sd_event_add_io(m->event, &m->mount_event_source, fd, EPOLLIN, mount_dispatch_io, m);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to watch mount file descriptor: %m");
+                        goto fail;
+                }
+
+                /* Process mount table changes at a slightly elevated priority. */
+                r = sd_event_source_set_priority(m->mount_event_source, SD_EVENT_PRIORITY_NORMAL-10);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to adjust mount watch priority: %m");
+                        goto fail;
+                }
+
+                /* Let users override the default (5 in 1s), as it stalls the boot sequence on busy systems. */
+                const char *e = secure_getenv("SYSTEMD_DEFAULT_MOUNT_RATE_LIMIT_BURST");
+                if (e) {
+                        r = safe_atou(e, &mount_rate_limit_burst);
+                        if (r < 0)
+                                log_debug("Invalid value in $SYSTEMD_DEFAULT_MOUNT_RATE_LIMIT_BURST, ignoring: %s", e);
+                }
+
+                r = sd_event_source_set_ratelimit(m->mount_event_source, 1 * USEC_PER_SEC, mount_rate_limit_burst);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to enable rate limit for mount events: %m");
+                        goto fail;
+                }
+
+                /* Once the ratelimit window has passed, catch up on the start jobs skipped meanwhile. Note
+                 * the distinct error message: this installs the expiry callback, it does not enable the
+                 * rate limit itself (done right above). */
+                r = sd_event_source_set_ratelimit_expire_callback(m->mount_event_source, mount_on_ratelimit_expire);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to install rate limit expiry callback for mount events: %m");
+                        goto fail;
+                }
+
+                (void) sd_event_source_set_description(m->mount_event_source, "mount-monitor-dispatch");
+        }
+
+        r = mount_load_proc_self_mountinfo(m, false);
+        if (r < 0)
+                goto fail;
+
+        return;
+
+fail:
+        mount_shutdown(m);
+}
+
+static int drain_libmount(Manager *m) {
+        bool rescan = false;
+
+        assert(m);
+
+        /* Flush all queued monitor events, remembering whether at least one was relevant.
+         *
+         * libmount also monitors /run/mount mkdir if the directory does not exist yet, which can generate
+         * wakeups that are irrelevant for us and must not trigger a rescan.
+         *
+         * mnt_monitor_next_change(): < 0 error, 0 valid event, 1 false positive / nothing pending.
+         *
+         * Returns > 0 if a rescan of /proc/self/mountinfo is warranted, 0 if not, negative errno on
+         * failure. */
+        for (;;) {
+                int r;
+
+                r = mnt_monitor_next_change(m->mount_monitor, NULL, NULL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to drain libmount events: %m");
+                if (r > 0)
+                        break;
+
+                rescan = true;
+        }
+
+        return rescan;
+}
+
+static int mount_process_proc_self_mountinfo(Manager *m) {
+        _cleanup_set_free_ Set *around = NULL, *gone = NULL;
+        const char *what;
+        int r;
+
+        assert(m);
+
+        /* Rescans /proc/self/mountinfo after the monitor signalled a change and translates the differences
+         * into unit state transitions: units whose mount point vanished follow into dead state, freshly
+         * established mount points move their units to mounted, and device units are informed which backing
+         * devices appeared or disappeared. */
+
+        r = drain_libmount(m);
+        if (r <= 0)
+                return r;
+
+        r = mount_load_proc_self_mountinfo(m, true);
+        if (r < 0) {
+                /* Reset flags, just in case, for later calls */
+                LIST_FOREACH(units_by_type, u, m->units_by_type[UNIT_MOUNT])
+                        MOUNT(u)->proc_flags = 0;
+
+                return 0;
+        }
+
+        manager_dispatch_load_queue(m);
+
+        LIST_FOREACH(units_by_type, u, m->units_by_type[UNIT_MOUNT]) {
+                Mount *mount = MOUNT(u);
+
+                /* We just re-read the mount table, hence the state is current again. */
+                mount->invalidated_state = false;
+
+                if (!mount_is_mounted(mount)) {
+
+                        /* A mount point is not around right now. It might be gone, or might never have
+                         * existed. */
+
+                        if (mount->from_proc_self_mountinfo &&
+                            mount->parameters_proc_self_mountinfo.what)
+                                /* Remember that this device might just have disappeared */
+                                if (set_put_strdup_full(&gone, &path_hash_ops_free, mount->parameters_proc_self_mountinfo.what) < 0)
+                                        log_oom(); /* we don't care too much about OOM here... */
+
+                        mount->from_proc_self_mountinfo = false;
+                        assert_se(update_parameters_proc_self_mountinfo(mount, NULL, NULL, NULL) >= 0);
+
+                        switch (mount->state) {
+
+                        case MOUNT_MOUNTED:
+                                /* This has just been unmounted by somebody else, follow the state change. */
+                                mount_enter_dead(mount, MOUNT_SUCCESS);
+                                break;
+
+                        case MOUNT_MOUNTING_DONE:
+                                /* The mount command may add the corresponding proc mountinfo entry and
+                                 * then remove it because of an internal error. E.g., fuse.sshfs seems
+                                 * to do that when the connection fails. See #17617. To handle such the
+                                 * case, let's once set the state back to mounting. Then, the unit can
+                                 * correctly enter the failed state later in mount_sigchld(). */
+                                mount_set_state(mount, MOUNT_MOUNTING);
+                                break;
+
+                        default:
+                                break;
+                        }
+
+                } else if (mount->proc_flags & (MOUNT_PROC_JUST_MOUNTED|MOUNT_PROC_JUST_CHANGED)) {
+
+                        /* A mount point was added or changed */
+
+                        switch (mount->state) {
+
+                        case MOUNT_DEAD:
+                        case MOUNT_FAILED:
+
+                                /* This has just been mounted by somebody else, follow the state change, but let's
+                                 * generate a new invocation ID for this implicitly and automatically. */
+                                (void) unit_acquire_invocation_id(u);
+                                mount_cycle_clear(mount);
+                                mount_enter_mounted(mount, MOUNT_SUCCESS);
+                                break;
+
+                        case MOUNT_MOUNTING:
+                                mount_set_state(mount, MOUNT_MOUNTING_DONE);
+                                break;
+
+                        default:
+                                /* Nothing really changed, but let's issue an notification call nonetheless,
+                                 * in case somebody is waiting for this. (e.g. file system ro/rw
+                                 * remounts.) */
+                                mount_set_state(mount, mount->state);
+                                break;
+                        }
+                }
+
+                if (mount_is_mounted(mount) &&
+                    mount->from_proc_self_mountinfo &&
+                    mount->parameters_proc_self_mountinfo.what)
+                        /* Track devices currently used */
+                        if (set_put_strdup_full(&around, &path_hash_ops_free, mount->parameters_proc_self_mountinfo.what) < 0)
+                                log_oom();
+
+                /* Reset the flags for later calls */
+                mount->proc_flags = 0;
+        }
+
+        /* Devices that disappeared from the table and are not used by any other mount anymore. */
+        SET_FOREACH(what, gone) {
+                if (set_contains(around, what))
+                        continue;
+
+                /* Let the device units know that the device is no longer mounted */
+                device_found_node(m, what, DEVICE_NOT_FOUND, DEVICE_FOUND_MOUNT);
+        }
+
+        return 0;
+}
+
+/* Called by sd-event when the libmount monitor fd becomes readable, i.e. the mount table changed. */
+static int mount_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+        Manager *m = ASSERT_PTR(userdata);
+
+        assert(revents & EPOLLIN);
+
+        return mount_process_proc_self_mountinfo(m);
+}
+
+int mount_invalidate_state_by_path(Manager *manager, const char *path) {
+        _cleanup_free_ char *unit_name = NULL;
+        Unit *unit;
+        int r;
+
+        assert(manager);
+        assert(path);
+
+        /* Marks the mount unit backing 'path' as possibly out of date, so that its state is re-derived
+         * from /proc/self/mountinfo. Returns -ENOENT if no such unit exists. */
+
+        r = unit_name_from_path(path, ".mount", &unit_name);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to generate unit name from path \"%s\", ignoring: %m", path);
+
+        unit = manager_get_unit(manager, unit_name);
+        if (!unit)
+                return -ENOENT;
+
+        MOUNT(unit)->invalidated_state = true;
+        return 0;
+}
+
+static void mount_reset_failed(Unit *u) {
+        Mount *m = MOUNT(u);
+
+        assert(m);
+
+        /* Forget any earlier failure: a failed unit moves back to dead, and all recorded results are
+         * cleared. */
+
+        if (m->state == MOUNT_FAILED)
+                mount_set_state(m, MOUNT_DEAD);
+
+        m->result = m->reload_result = m->clean_result = MOUNT_SUCCESS;
+}
+
+static PidRef* mount_control_pid(Unit *u) {
+        Mount *m = ASSERT_PTR(MOUNT(u));
+
+        /* Expose the PID of the control process currently running for this unit (if any). */
+        return &m->control_pid;
+}
+
+static int mount_clean(Unit *u, ExecCleanMask mask) {
+        _cleanup_strv_free_ char **l = NULL;
+        Mount *m = MOUNT(u);
+        int r;
+
+        assert(m);
+        assert(mask != 0);
+
+        /* Removes the directories selected by 'mask' for this unit by forking off a removal process that
+         * is tracked like a control process. Only permitted while the unit is dead. */
+
+        if (m->state != MOUNT_DEAD)
+                return -EBUSY;
+
+        r = exec_context_get_clean_directories(&m->exec_context, u->manager->prefix, mask, &l);
+        if (r < 0)
+                return r;
+
+        /* Nothing to clean for the requested mask. */
+        if (strv_isempty(l))
+                return -EUNATCH;
+
+        mount_unwatch_control_pid(m);
+        m->clean_result = MOUNT_SUCCESS;
+        m->control_command = NULL;
+        m->control_command_id = _MOUNT_EXEC_COMMAND_INVALID;
+
+        r = mount_arm_timer(m, /* relative= */ true, m->exec_context.timeout_clean_usec);
+        if (r < 0) {
+                log_unit_warning_errno(u, r, "Failed to install timer: %m");
+                goto fail;
+        }
+
+        r = unit_fork_and_watch_rm_rf(u, l, &m->control_pid);
+        if (r < 0) {
+                log_unit_warning_errno(u, r, "Failed to spawn cleaning task: %m");
+                goto fail;
+        }
+
+        mount_set_state(m, MOUNT_CLEANING);
+        return 0;
+
+fail:
+        m->clean_result = MOUNT_FAILURE_RESOURCES;
+        m->timer_event_source = sd_event_source_disable_unref(m->timer_event_source);
+        return r;
+}
+
+/* Reports which directory types (as declared by the exec context) can be cleaned for this unit. */
+static int mount_can_clean(Unit *u, ExecCleanMask *ret) {
+        Mount *m = MOUNT(u);
+
+        assert(m);
+
+        return exec_context_get_clean_mask(&m->exec_context, ret);
+}
+
+/* Checks the start rate limit before a start job runs; on a hit the unit is failed right away. */
+static int mount_can_start(Unit *u) {
+        Mount *m = MOUNT(u);
+        int r;
+
+        assert(m);
+
+        r = unit_test_start_limit(u);
+        if (r < 0) {
+                mount_enter_dead(m, MOUNT_FAILURE_START_LIMIT_HIT);
+                return r;
+        }
+
+        return 1;
+}
+
+static int mount_subsystem_ratelimited(Manager *m) {
+        assert(m);
+
+        /* Whether processing of mount table events is currently being held back by the ratelimit. Without
+         * a monitoring event source there is nothing to ratelimit. */
+        return m->mount_event_source ? sd_event_source_is_ratelimited(m->mount_event_source) : false;
+}
+
+char* mount_get_what_escaped(const Mount *m) {
+        const char *what = NULL;
+
+        assert(m);
+
+        /* Returns the mount source with invalid UTF-8 escaped, preferring what the kernel reports over
+         * what the fragment configures. Empty string if neither is known; NULL only on OOM. */
+
+        if (m->from_proc_self_mountinfo && m->parameters_proc_self_mountinfo.what)
+                what = m->parameters_proc_self_mountinfo.what;
+        else if (m->from_fragment && m->parameters_fragment.what)
+                what = m->parameters_fragment.what;
+
+        if (!what)
+                return strdup("");
+
+        return utf8_escape_invalid(what);
+}
+
+char* mount_get_options_escaped(const Mount *m) {
+        const char *options = NULL;
+
+        assert(m);
+
+        /* Returns the mount options with invalid UTF-8 escaped, preferring what the kernel reports over
+         * what the fragment configures. Empty string if neither is known; NULL only on OOM. */
+
+        if (m->from_proc_self_mountinfo && m->parameters_proc_self_mountinfo.options)
+                options = m->parameters_proc_self_mountinfo.options;
+        else if (m->from_fragment && m->parameters_fragment.options)
+                options = m->parameters_fragment.options;
+
+        if (!options)
+                return strdup("");
+
+        return utf8_escape_invalid(options);
+}
+
+/* Returns the file system type, preferring what the kernel reports over what the fragment configures;
+ * NULL if neither is known. */
+const char* mount_get_fstype(const Mount *m) {
+        assert(m);
+
+        if (m->from_proc_self_mountinfo && m->parameters_proc_self_mountinfo.fstype)
+                return m->parameters_proc_self_mountinfo.fstype;
+
+        if (m->from_fragment && m->parameters_fragment.fstype)
+                return m->parameters_fragment.fstype;
+
+        return NULL;
+}
+
+static const char* const mount_exec_command_table[_MOUNT_EXEC_COMMAND_MAX] = {
+ [MOUNT_EXEC_MOUNT] = "ExecMount",
+ [MOUNT_EXEC_UNMOUNT] = "ExecUnmount",
+ [MOUNT_EXEC_REMOUNT] = "ExecRemount",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(mount_exec_command, MountExecCommand);
+
+static const char* const mount_result_table[_MOUNT_RESULT_MAX] = {
+ [MOUNT_SUCCESS] = "success",
+ [MOUNT_FAILURE_RESOURCES] = "resources",
+ [MOUNT_FAILURE_TIMEOUT] = "timeout",
+ [MOUNT_FAILURE_EXIT_CODE] = "exit-code",
+ [MOUNT_FAILURE_SIGNAL] = "signal",
+ [MOUNT_FAILURE_CORE_DUMP] = "core-dump",
+ [MOUNT_FAILURE_START_LIMIT_HIT] = "start-limit-hit",
+ [MOUNT_FAILURE_PROTOCOL] = "protocol",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(mount_result, MountResult);
+
+/* The virtual function table hooking mount units into the generic unit machinery. */
+const UnitVTable mount_vtable = {
+        .object_size = sizeof(Mount),
+        .exec_context_offset = offsetof(Mount, exec_context),
+        .cgroup_context_offset = offsetof(Mount, cgroup_context),
+        .kill_context_offset = offsetof(Mount, kill_context),
+        .exec_runtime_offset = offsetof(Mount, exec_runtime),
+
+        .sections =
+                "Unit\0"
+                "Mount\0"
+                "Install\0",
+        .private_section = "Mount",
+
+        .can_transient = true,
+        .can_fail = true,
+        .exclude_from_switch_root_serialization = true,
+
+        .init = mount_init,
+        .load = mount_load,
+        .done = mount_done,
+
+        .coldplug = mount_coldplug,
+        .catchup = mount_catchup,
+
+        .dump = mount_dump,
+
+        .start = mount_start,
+        .stop = mount_stop,
+        .reload = mount_reload,
+
+        .clean = mount_clean,
+        .can_clean = mount_can_clean,
+
+        .serialize = mount_serialize,
+        .deserialize_item = mount_deserialize_item,
+
+        .active_state = mount_active_state,
+        .sub_state_to_string = mount_sub_state_to_string,
+
+        .will_restart = unit_will_restart_default,
+
+        .may_gc = mount_may_gc,
+        .is_extrinsic = mount_is_extrinsic,
+
+        .sigchld_event = mount_sigchld_event,
+
+        .reset_failed = mount_reset_failed,
+
+        .control_pid = mount_control_pid,
+
+        .bus_set_property = bus_mount_set_property,
+        .bus_commit_properties = bus_mount_commit_properties,
+
+        .get_timeout = mount_get_timeout,
+
+        /* Enumeration: the perpetual root mount plus everything in /proc/self/mountinfo. */
+        .enumerate_perpetual = mount_enumerate_perpetual,
+        .enumerate = mount_enumerate,
+        .shutdown = mount_shutdown,
+        .subsystem_ratelimited = mount_subsystem_ratelimited,
+
+        .status_message_formats = {
+                .starting_stopping = {
+                        [0] = "Mounting %s...",
+                        [1] = "Unmounting %s...",
+                },
+                .finished_start_job = {
+                        [JOB_DONE]       = "Mounted %s.",
+                        [JOB_FAILED]     = "Failed to mount %s.",
+                        [JOB_TIMEOUT]    = "Timed out mounting %s.",
+                },
+                .finished_stop_job = {
+                        [JOB_DONE]       = "Unmounted %s.",
+                        [JOB_FAILED]     = "Failed unmounting %s.",
+                        [JOB_TIMEOUT]    = "Timed out unmounting %s.",
+                },
+        },
+
+        .can_start = mount_can_start,
+
+        .notify_plymouth = true,
+};
diff --git a/src/core/mount.h b/src/core/mount.h
new file mode 100644
index 0000000..6712c16
--- /dev/null
+++ b/src/core/mount.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct Mount Mount;
+
+#include "dynamic-user.h"
+#include "kill.h"
+#include "pidref.h"
+#include "unit.h"
+
+/* Which of the mount unit's command slots is being executed. */
+typedef enum MountExecCommand {
+        MOUNT_EXEC_MOUNT,
+        MOUNT_EXEC_UNMOUNT,
+        MOUNT_EXEC_REMOUNT,
+        _MOUNT_EXEC_COMMAND_MAX,
+        _MOUNT_EXEC_COMMAND_INVALID = -EINVAL,
+} MountExecCommand;
+
+/* How the most recent operation on a mount unit ended. */
+typedef enum MountResult {
+        MOUNT_SUCCESS,
+        MOUNT_FAILURE_RESOURCES, /* a bit of a misnomer, just our catch-all error for errnos we didn't expect */
+        MOUNT_FAILURE_TIMEOUT,
+        MOUNT_FAILURE_EXIT_CODE,
+        MOUNT_FAILURE_SIGNAL,
+        MOUNT_FAILURE_CORE_DUMP,
+        MOUNT_FAILURE_START_LIMIT_HIT,
+        MOUNT_FAILURE_PROTOCOL,
+        _MOUNT_RESULT_MAX,
+        _MOUNT_RESULT_INVALID = -EINVAL,
+} MountResult;
+
+/* One set of mount configuration data, coming either from the unit file fragment or from a
+ * /proc/self/mountinfo entry. */
+typedef struct MountParameters {
+        char *what;    /* source device/path */
+        char *options; /* mount options string */
+        char *fstype;  /* file system type */
+} MountParameters;
+
+/* Used while looking for mount points that vanished or got added from/to /proc/self/mountinfo */
+typedef enum MountProcFlags {
+        MOUNT_PROC_IS_MOUNTED = 1 << 0,   /* mount point is present in /proc/self/mountinfo */
+        MOUNT_PROC_JUST_MOUNTED = 1 << 1, /* ...and wasn't there before (or we were still waiting for it) */
+        MOUNT_PROC_JUST_CHANGED = 1 << 2, /* ...and its parameters changed since the last pass */
+} MountProcFlags;
+
+struct Mount {
+        Unit meta;
+
+        /* The mount point path. */
+        char *where;
+
+        /* The two possible configuration sources; which of them carries valid data is tracked by the
+         * from_* bits below. */
+        MountParameters parameters_proc_self_mountinfo;
+        MountParameters parameters_fragment;
+
+        bool invalidated_state:1; /* Set when the 'state' of the mount unit may be outdated, and we need to
+                                   * re-read /proc/self/mountinfo. */
+        bool from_proc_self_mountinfo:1;
+        bool from_fragment:1;
+
+        /* Scratch flags used while /proc/self/mountinfo is rescanned; reset after every pass. */
+        MountProcFlags proc_flags;
+
+        bool sloppy_options;
+
+        bool lazy_unmount;
+        bool force_unmount;
+
+        bool read_write_only;
+
+        /* Outcomes of the last start/stop, reload and clean operation, respectively. */
+        MountResult result;
+        MountResult reload_result;
+        MountResult clean_result;
+
+        mode_t directory_mode;
+
+        usec_t timeout_usec;
+
+        ExecCommand exec_command[_MOUNT_EXEC_COMMAND_MAX];
+
+        ExecContext exec_context;
+        KillContext kill_context;
+        CGroupContext cgroup_context;
+
+        ExecRuntime *exec_runtime;
+
+        MountState state, deserialized_state;
+
+        /* The control process (mount/umount/remount or cleanup task) currently running, if any. */
+        ExecCommand* control_command;
+        MountExecCommand control_command_id;
+        PidRef control_pid;
+
+        /* Times out the currently running operation. */
+        sd_event_source *timer_event_source;
+
+        unsigned n_retry_umount;
+};
+
+extern const UnitVTable mount_vtable;
+
+void mount_fd_event(Manager *m, int events);
+
+int mount_invalidate_state_by_path(Manager *manager, const char *path);
+
+char* mount_get_what_escaped(const Mount *m);
+char* mount_get_options_escaped(const Mount *m);
+const char* mount_get_fstype(const Mount *m);
+
+const char* mount_exec_command_to_string(MountExecCommand i) _const_;
+MountExecCommand mount_exec_command_from_string(const char *s) _pure_;
+
+const char* mount_result_to_string(MountResult i) _const_;
+MountResult mount_result_from_string(const char *s) _pure_;
+
+DEFINE_CAST(MOUNT, Mount);
diff --git a/src/core/namespace.c b/src/core/namespace.c
new file mode 100644
index 0000000..88681aa
--- /dev/null
+++ b/src/core/namespace.c
@@ -0,0 +1,3047 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <linux/loop.h>
+#include <sched.h>
+#include <stdio.h>
+#include <sys/file.h>
+#include <sys/mount.h>
+#include <unistd.h>
+#if WANT_LINUX_FS_H
+#include <linux/fs.h>
+#endif
+
+#include "alloc-util.h"
+#include "base-filesystem.h"
+#include "chase.h"
+#include "dev-setup.h"
+#include "devnum-util.h"
+#include "env-util.h"
+#include "escape.h"
+#include "extension-util.h"
+#include "fd-util.h"
+#include "format-util.h"
+#include "glyph-util.h"
+#include "label-util.h"
+#include "list.h"
+#include "lock-util.h"
+#include "loop-util.h"
+#include "loopback-setup.h"
+#include "missing_syscall.h"
+#include "mkdir-label.h"
+#include "mount-util.h"
+#include "mountpoint-util.h"
+#include "namespace-util.h"
+#include "namespace.h"
+#include "nsflags.h"
+#include "nulstr-util.h"
+#include "os-util.h"
+#include "path-util.h"
+#include "selinux-util.h"
+#include "socket-util.h"
+#include "sort-util.h"
+#include "stat-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "tmpfile-util.h"
+#include "umask-util.h"
+#include "user-util.h"
+
+#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC)
+
+/* The kind of operation a MountEntry requests; the enum order encodes the priority between conflicting
+ * entries for the same path. */
+typedef enum MountMode {
+        /* This is ordered by priority! */
+        MOUNT_INACCESSIBLE,
+        MOUNT_OVERLAY,
+        MOUNT_IMAGE,
+        MOUNT_BIND,
+        MOUNT_BIND_RECURSIVE,
+        MOUNT_PRIVATE_TMP,
+        MOUNT_PRIVATE_TMP_READ_ONLY,
+        MOUNT_PRIVATE_DEV,
+        MOUNT_BIND_DEV,
+        MOUNT_EMPTY_DIR,
+        MOUNT_PRIVATE_SYSFS,
+        MOUNT_BIND_SYSFS,
+        MOUNT_PROCFS,
+        MOUNT_READ_ONLY,
+        MOUNT_READ_WRITE,
+        MOUNT_NOEXEC,
+        MOUNT_EXEC,
+        MOUNT_TMPFS,
+        MOUNT_RUN,
+        MOUNT_EXTENSION_DIRECTORY, /* Bind-mounted outside the root directory, and used by subsequent mounts */
+        MOUNT_EXTENSION_IMAGE,     /* Mounted outside the root directory, and used by subsequent mounts */
+        MOUNT_MQUEUEFS,
+        MOUNT_READ_WRITE_IMPLICIT, /* Should have the lowest priority. */
+        _MOUNT_MODE_MAX,
+        _MOUNT_MODE_INVALID = -EINVAL,
+} MountMode;
+
+/* Processing state of a MountEntry while the namespace is being assembled. */
+typedef enum MountEntryState {
+        MOUNT_PENDING,
+        MOUNT_APPLIED,
+        MOUNT_SKIPPED,
+        _MOUNT_ENTRY_STATE_MAX,
+        _MOUNT_ENTRY_STATE_INVALID = -EINVAL,
+} MountEntryState;
+
+/* One mount operation to apply while setting up a sandboxed mount namespace. For path/source/options two
+ * storage variants exist each: a *_const pointer into static/stack memory and a *_malloc heap copy; the
+ * accessors below prefer the heap variant when set. */
+typedef struct MountEntry {
+        const char *path_const;            /* Memory allocated on stack or static */
+        MountMode mode;
+        bool ignore:1;                     /* Ignore if path does not exist? */
+        bool has_prefix:1;                 /* Already is prefixed by the root dir? */
+        bool read_only:1;                  /* Shall this mount point be read-only? */
+        bool nosuid:1;                     /* Shall set MS_NOSUID on the mount itself */
+        bool noexec:1;                     /* Shall set MS_NOEXEC on the mount itself */
+        bool exec:1;                       /* Shall clear MS_NOEXEC on the mount itself */
+        MountEntryState state;             /* Whether it was already processed or skipped */
+        char *path_malloc;                 /* Use this instead of 'path_const' if we had to allocate memory */
+        const char *unprefixed_path_const; /* If the path was amended with a prefix, these will save the original */
+        char *unprefixed_path_malloc;
+        const char *source_const;          /* The source path, for bind mounts or images */
+        char *source_malloc;
+        const char *options_const;         /* Mount options for tmpfs */
+        char *options_malloc;
+        unsigned long flags;               /* Mount flags used by EMPTY_DIR and TMPFS. Do not include MS_RDONLY here, but please use read_only. */
+        unsigned n_followed;
+        LIST_HEAD(MountOptions, image_options_const);
+        char **overlay_layers;
+} MountEntry;
+
+/* A dynamically sized array of MountEntry items. */
+typedef struct MountList {
+        MountEntry *mounts;
+        size_t n_mounts;
+} MountList;
+
+/* If MountAPIVFS= is used, let's mount /sys, /proc, /dev and /run into the it, but only as a fallback if the user hasn't mounted
+ * something there already. These mounts are hence overridden by any other explicitly configured mounts. */
+static const MountEntry apivfs_table[] = {
+ { "/proc", MOUNT_PROCFS, false },
+ { "/dev", MOUNT_BIND_DEV, false },
+ { "/sys", MOUNT_BIND_SYSFS, false },
+ { "/run", MOUNT_RUN, false, .options_const = "mode=0755" TMPFS_LIMITS_RUN, .flags = MS_NOSUID|MS_NODEV|MS_STRICTATIME },
+};
+
+/* ProtectKernelTunables= option and the related filesystem APIs */
+static const MountEntry protect_kernel_tunables_proc_table[] = {
+ { "/proc/acpi", MOUNT_READ_ONLY, true },
+ { "/proc/apm", MOUNT_READ_ONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
+ { "/proc/asound", MOUNT_READ_ONLY, true },
+ { "/proc/bus", MOUNT_READ_ONLY, true },
+ { "/proc/fs", MOUNT_READ_ONLY, true },
+ { "/proc/irq", MOUNT_READ_ONLY, true },
+ { "/proc/kallsyms", MOUNT_INACCESSIBLE, true },
+ { "/proc/kcore", MOUNT_INACCESSIBLE, true },
+ { "/proc/latency_stats", MOUNT_READ_ONLY, true },
+ { "/proc/mtrr", MOUNT_READ_ONLY, true },
+ { "/proc/scsi", MOUNT_READ_ONLY, true },
+ { "/proc/sys", MOUNT_READ_ONLY, true },
+ { "/proc/sysrq-trigger", MOUNT_READ_ONLY, true },
+ { "/proc/timer_stats", MOUNT_READ_ONLY, true },
+};
+
+static const MountEntry protect_kernel_tunables_sys_table[] = {
+ { "/sys", MOUNT_READ_ONLY, false },
+ { "/sys/fs/bpf", MOUNT_READ_ONLY, true },
+ { "/sys/fs/cgroup", MOUNT_READ_WRITE_IMPLICIT, false }, /* READ_ONLY is set by ProtectControlGroups= option */
+ { "/sys/fs/selinux", MOUNT_READ_WRITE_IMPLICIT, true },
+ { "/sys/kernel/debug", MOUNT_READ_ONLY, true },
+ { "/sys/kernel/tracing", MOUNT_READ_ONLY, true },
+};
+
+/* ProtectKernelModules= option */
+static const MountEntry protect_kernel_modules_table[] = {
+ { "/usr/lib/modules", MOUNT_INACCESSIBLE, true },
+};
+
+/* ProtectKernelLogs= option */
+static const MountEntry protect_kernel_logs_proc_table[] = {
+ { "/proc/kmsg", MOUNT_INACCESSIBLE, true },
+};
+
+static const MountEntry protect_kernel_logs_dev_table[] = {
+ { "/dev/kmsg", MOUNT_INACCESSIBLE, true },
+};
+
+/*
+ * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
+ * system should be protected by ProtectSystem=
+ */
+static const MountEntry protect_home_read_only_table[] = {
+ { "/home", MOUNT_READ_ONLY, true },
+ { "/run/user", MOUNT_READ_ONLY, true },
+ { "/root", MOUNT_READ_ONLY, true },
+};
+
+/* ProtectHome=tmpfs table */
+static const MountEntry protect_home_tmpfs_table[] = {
+ { "/home", MOUNT_TMPFS, true, .read_only = true, .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME },
+ { "/run/user", MOUNT_TMPFS, true, .read_only = true, .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME },
+ { "/root", MOUNT_TMPFS, true, .read_only = true, .options_const = "mode=0700" TMPFS_LIMITS_EMPTY_OR_ALMOST, .flags = MS_NODEV|MS_STRICTATIME },
+};
+
+/* ProtectHome=yes table */
+static const MountEntry protect_home_yes_table[] = {
+ { "/home", MOUNT_INACCESSIBLE, true },
+ { "/run/user", MOUNT_INACCESSIBLE, true },
+ { "/root", MOUNT_INACCESSIBLE, true },
+};
+
+/* ProtectSystem=yes table */
+static const MountEntry protect_system_yes_table[] = {
+ { "/usr", MOUNT_READ_ONLY, false },
+ { "/boot", MOUNT_READ_ONLY, true },
+ { "/efi", MOUNT_READ_ONLY, true },
+};
+
+/* ProtectSystem=full includes ProtectSystem=yes */
+static const MountEntry protect_system_full_table[] = {
+ { "/usr", MOUNT_READ_ONLY, false },
+ { "/boot", MOUNT_READ_ONLY, true },
+ { "/efi", MOUNT_READ_ONLY, true },
+ { "/etc", MOUNT_READ_ONLY, false },
+};
+
+/* ProtectSystem=strict table. In this strict mode, we mount everything read-only, except for /proc, /dev,
+ * /sys which are the kernel API VFS, which are left writable, but PrivateDevices= + ProtectKernelTunables=
+ * protect those, and these options should be fully orthogonal. (And of course /home and friends are also
+ * left writable, as ProtectHome= shall manage those, orthogonally).
+ */
+static const MountEntry protect_system_strict_table[] = {
+ { "/", MOUNT_READ_ONLY, false },
+ { "/proc", MOUNT_READ_WRITE_IMPLICIT, false }, /* ProtectKernelTunables= */
+ { "/sys", MOUNT_READ_WRITE_IMPLICIT, false }, /* ProtectKernelTunables= */
+ { "/dev", MOUNT_READ_WRITE_IMPLICIT, false }, /* PrivateDevices= */
+ { "/home", MOUNT_READ_WRITE_IMPLICIT, true }, /* ProtectHome= */
+ { "/run/user", MOUNT_READ_WRITE_IMPLICIT, true }, /* ProtectHome= */
+ { "/root", MOUNT_READ_WRITE_IMPLICIT, true }, /* ProtectHome= */
+};
+
+/* ProtectHostname=yes table */
+static const MountEntry protect_hostname_table[] = {
+ { "/proc/sys/kernel/hostname", MOUNT_READ_ONLY, false },
+ { "/proc/sys/kernel/domainname", MOUNT_READ_ONLY, false },
+};
+
+static const char * const mount_mode_table[_MOUNT_MODE_MAX] = {
+ [MOUNT_INACCESSIBLE] = "inaccessible",
+ [MOUNT_OVERLAY] = "overlay",
+ [MOUNT_IMAGE] = "image",
+ [MOUNT_BIND] = "bind",
+ [MOUNT_BIND_RECURSIVE] = "bind-recursive",
+ [MOUNT_PRIVATE_TMP] = "private-tmp",
+ [MOUNT_PRIVATE_TMP_READ_ONLY] = "private-tmp-read-only",
+ [MOUNT_PRIVATE_DEV] = "private-dev",
+ [MOUNT_BIND_DEV] = "bind-dev",
+ [MOUNT_EMPTY_DIR] = "empty-dir",
+ [MOUNT_PRIVATE_SYSFS] = "private-sysfs",
+ [MOUNT_BIND_SYSFS] = "bind-sysfs",
+ [MOUNT_PROCFS] = "procfs",
+ [MOUNT_READ_ONLY] = "read-only",
+ [MOUNT_READ_WRITE] = "read-write",
+ [MOUNT_NOEXEC] = "noexec",
+ [MOUNT_EXEC] = "exec",
+ [MOUNT_TMPFS] = "tmpfs",
+ [MOUNT_RUN] = "run",
+ [MOUNT_EXTENSION_DIRECTORY] = "extension-directory",
+ [MOUNT_EXTENSION_IMAGE] = "extension-image",
+ [MOUNT_MQUEUEFS] = "mqueuefs",
+ [MOUNT_READ_WRITE_IMPLICIT] = "read-write-implicit",
+};
+
+/* Helper struct for naming simplicity and reusability */
+static const struct {
+ const char *level_env;
+ const char *level_env_print;
+} image_class_info[_IMAGE_CLASS_MAX] = {
+ [IMAGE_SYSEXT] = {
+ .level_env = "SYSEXT_LEVEL",
+ .level_env_print = " SYSEXT_LEVEL=",
+ },
+ [IMAGE_CONFEXT] = {
+ .level_env = "CONFEXT_LEVEL",
+ .level_env_print = " CONFEXT_LEVEL=",
+ }
+};
+
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(mount_mode, MountMode);
+
+static const char *mount_entry_path(const MountEntry *p) {
+        assert(p);
+
+        /* Returns the path of this bind mount. If the malloc()-allocated ->path_malloc field is set we
+         * return that, otherwise the stack/static ->path_const field is returned. */
+
+        return p->path_malloc ?: p->path_const;
+}
+
+static const char *mount_entry_unprefixed_path(const MountEntry *p) {
+        assert(p);
+
+        /* Returns the path as it was before prefix_where_needed() ran; falls back to the current path if
+         * no prefixing ever happened. */
+
+        if (p->unprefixed_path_malloc)
+                return p->unprefixed_path_malloc;
+        if (p->unprefixed_path_const)
+                return p->unprefixed_path_const;
+
+        return mount_entry_path(p);
+}
+
+static void mount_entry_consume_prefix(MountEntry *p, char *new_path) {
+        assert(p);
+        assert(p->path_malloc || p->path_const);
+        assert(new_path);
+
+        /* Saves current path in unprefixed_ variable, and takes over ownership of new_path */
+
+        /* Move the heap path (if any) into the unprefixed slot, freeing whatever was there before. */
+        free_and_replace(p->unprefixed_path_malloc, p->path_malloc);
+        /* If we didn't have a path on the heap, then it's a static one */
+        if (!p->unprefixed_path_malloc)
+                p->unprefixed_path_const = p->path_const;
+        p->path_malloc = new_path;
+        p->has_prefix = true;
+}
+
+static bool mount_entry_read_only(const MountEntry *p) {
+ assert(p);
+
+ /* True if the entry was explicitly flagged read-only, or its mode is implicitly read-only. */
+ if (p->read_only)
+ return true;
+
+ return IN_SET(p->mode, MOUNT_READ_ONLY, MOUNT_INACCESSIBLE, MOUNT_PRIVATE_TMP_READ_ONLY);
+}
+
+static bool mount_entry_noexec(const MountEntry *p) {
+ assert(p);
+
+ /* True if the entry was explicitly flagged noexec, or its mode is implicitly noexec. */
+ if (p->noexec)
+ return true;
+
+ return IN_SET(p->mode, MOUNT_NOEXEC, MOUNT_INACCESSIBLE, MOUNT_PRIVATE_SYSFS, MOUNT_BIND_SYSFS, MOUNT_PROCFS);
+}
+
+static bool mount_entry_exec(const MountEntry *p) {
+ assert(p);
+
+ /* True if the entry was explicitly flagged exec, or its mode requests it. */
+ if (p->exec)
+ return true;
+
+ return p->mode == MOUNT_EXEC;
+}
+
+static const char *mount_entry_source(const MountEntry *p) {
+ assert(p);
+
+ /* Returns the mount source: the heap-allocated field if set, the static one otherwise. */
+ if (p->source_malloc)
+ return p->source_malloc;
+
+ return p->source_const;
+}
+
+static const char *mount_entry_options(const MountEntry *p) {
+ assert(p);
+
+ /* Returns the mount option string: the heap-allocated field if set, the static one otherwise. */
+ if (p->options_malloc)
+ return p->options_malloc;
+
+ return p->options_const;
+}
+
+static void mount_entry_done(MountEntry *p) {
+ assert(p);
+
+ /* Releases all heap allocations owned by this entry and resets the pointers, so
+ * that calling this again is safe. The frees are independent of each other. */
+
+ p->overlay_layers = strv_free(p->overlay_layers);
+ p->options_malloc = mfree(p->options_malloc);
+ p->source_malloc = mfree(p->source_malloc);
+ p->unprefixed_path_malloc = mfree(p->unprefixed_path_malloc);
+ p->path_malloc = mfree(p->path_malloc);
+}
+
+static void mount_list_done(MountList *ml) {
+ assert(ml);
+
+ /* Frees every entry in the list, then the backing array itself. */
+
+ FOREACH_ARRAY(m, ml->mounts, ml->n_mounts)
+ mount_entry_done(m);
+
+ ml->n_mounts = 0;
+ ml->mounts = mfree(ml->mounts);
+}
+
+static MountEntry *mount_list_extend(MountList *ml) {
+ assert(ml);
+
+ /* Grows the array by one zero-initialized entry and returns a pointer to it,
+ * or NULL on allocation failure. */
+
+ if (!GREEDY_REALLOC0(ml->mounts, ml->n_mounts + 1))
+ return NULL;
+
+ return &ml->mounts[ml->n_mounts++];
+}
+
+static int append_access_mounts(MountList *ml, char **strv, MountMode mode, bool forcibly_require_prefix) {
+ assert(ml);
+
+ /* Adds a list of user-supplied READ_WRITE/READ_WRITE_IMPLICIT/READ_ONLY/INACCESSIBLE entries */
+
+ STRV_FOREACH(i, strv) {
+ const char *e = *i;
+ bool ignore = false, needs_prefix = false;
+
+ /* A leading "-" means "ignore if missing", a leading "+" means "prefix with the
+ * root directory". Note the order: "-+" is recognized, "+-" is not. */
+ if (e[0] == '-') {
+ ignore = true;
+ e++;
+ }
+ if (e[0] == '+') {
+ needs_prefix = true;
+ e++;
+ }
+
+ if (!path_is_absolute(e))
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Path is not absolute: %s", e);
+
+ MountEntry *me = mount_list_extend(ml);
+ if (!me)
+ return log_oom_debug();
+
+ *me = (MountEntry) {
+ .path_const = e,
+ .mode = mode,
+ .ignore = ignore,
+ .has_prefix = !needs_prefix && !forcibly_require_prefix,
+ };
+ }
+
+ return 0;
+}
+
+static int append_empty_dir_mounts(MountList *ml, char **strv) {
+ assert(ml);
+
+ /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
+ * "/private/" boundary directories for DynamicUser=1. */
+
+ for (char **i = strv; i && *i; i++) {
+ MountEntry *me = mount_list_extend(ml);
+ if (!me)
+ return log_oom_debug();
+
+ *me = (MountEntry) {
+ .path_const = *i,
+ .mode = MOUNT_EMPTY_DIR,
+ .ignore = false,
+ .read_only = true, /* nothing is supposed to be written here */
+ .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
+ .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST,
+ };
+ }
+
+ return 0;
+}
+
+static int append_bind_mounts(MountList *ml, const BindMount *binds, size_t n) {
+ assert(ml);
+ assert(binds || n == 0);
+
+ /* Adds the BindPaths=/BindReadOnlyPaths= entries to the mount list. */
+
+ for (size_t i = 0; i < n; i++) {
+ const BindMount *b = binds + i;
+
+ MountEntry *me = mount_list_extend(ml);
+ if (!me)
+ return log_oom_debug();
+
+ *me = (MountEntry) {
+ .path_const = b->destination,
+ .mode = b->recursive ? MOUNT_BIND_RECURSIVE : MOUNT_BIND,
+ .source_const = b->source,
+ .read_only = b->read_only,
+ .nosuid = b->nosuid,
+ .ignore = b->ignore_enoent,
+ };
+ }
+
+ return 0;
+}
+
+static int append_mount_images(MountList *ml, const MountImage *mount_images, size_t n) {
+ assert(ml);
+ assert(mount_images || n == 0);
+
+ /* Adds the MountImages= entries to the mount list. */
+
+ for (size_t i = 0; i < n; i++) {
+ const MountImage *m = mount_images + i;
+
+ MountEntry *me = mount_list_extend(ml);
+ if (!me)
+ return log_oom_debug();
+
+ *me = (MountEntry) {
+ .path_const = m->destination,
+ .mode = MOUNT_IMAGE,
+ .source_const = m->source,
+ .image_options_const = m->mount_options,
+ .ignore = m->ignore_enoent,
+ };
+ }
+
+ return 0;
+}
+
+/* Appends the mounts needed for ExtensionImages=/ExtensionDirectories=: first one mount per
+ * image (index 0..n-1) and per directory (continuing the counter) below 'extension_dir' —
+ * these serve only as overlay sources, not visible to the unit — then one overlay mount per
+ * entry of 'hierarchies' (e.g. /usr, /opt) below 'root', with the collected per-hierarchy
+ * layers as lowerdir=. Returns 0 on success, negative errno on failure. */
+static int append_extensions(
+ MountList *ml,
+ const char *root,
+ const char *extension_dir,
+ char **hierarchies,
+ const MountImage *mount_images,
+ size_t n,
+ char **extension_directories) {
+
+ char ***overlays = NULL;
+ size_t n_overlays = 0;
+ int r;
+
+ assert(ml);
+
+ if (n == 0 && strv_isempty(extension_directories))
+ return 0;
+
+ assert(extension_dir);
+
+ n_overlays = strv_length(hierarchies);
+ if (n_overlays == 0)
+ return 0;
+
+ /* Prepare a list of overlays, that will have as each element a strv containing all the layers that
+ * will later be concatenated as a lowerdir= parameter for the mount operation.
+ * The overlays vector will have the same number of elements and will correspond to the
+ * hierarchies vector, so they can be iterated upon together. */
+ overlays = new0(char**, n_overlays);
+ if (!overlays)
+ return -ENOMEM;
+
+ /* Frees each strv and the array itself on any return below, unless an element's
+ * ownership was transferred to a MountEntry via TAKE_PTR() first. */
+ CLEANUP_ARRAY(overlays, n_overlays, strv_free_many);
+
+ /* First, prepare a mount for each image, but these won't be visible to the unit, instead
+ * they will be mounted in our propagate directory, and used as a source for the overlay. */
+ for (size_t i = 0; i < n; i++) {
+ _cleanup_free_ char *mount_point = NULL;
+ const MountImage *m = mount_images + i;
+
+ if (asprintf(&mount_point, "%s/%zu", extension_dir, i) < 0)
+ return -ENOMEM;
+
+ for (size_t j = 0; hierarchies && hierarchies[j]; ++j) {
+ char *prefixed_hierarchy = path_join(mount_point, hierarchies[j]);
+ if (!prefixed_hierarchy)
+ return -ENOMEM;
+
+ r = strv_consume(&overlays[j], TAKE_PTR(prefixed_hierarchy));
+ if (r < 0)
+ return r;
+ }
+
+ MountEntry *me = mount_list_extend(ml);
+ if (!me)
+ return -ENOMEM;
+
+ *me = (MountEntry) {
+ .path_malloc = TAKE_PTR(mount_point),
+ .image_options_const = m->mount_options,
+ .ignore = m->ignore_enoent,
+ .source_const = m->source,
+ .mode = MOUNT_EXTENSION_IMAGE,
+ .has_prefix = true,
+ };
+ }
+
+ /* Secondly, extend the lowerdir= parameters with each ExtensionDirectory.
+ * Bind mount them in the same location as the ExtensionImages, so that we
+ * can check that they are valid trees (extension-release.d). */
+ STRV_FOREACH(extension_directory, extension_directories) {
+ _cleanup_free_ char *mount_point = NULL, *source = NULL;
+ const char *e = *extension_directory;
+ bool ignore_enoent = false;
+
+ /* Pick up the counter where the ExtensionImages left it. */
+ if (asprintf(&mount_point, "%s/%zu", extension_dir, n++) < 0)
+ return -ENOMEM;
+
+ /* Look for any prefixes */
+ if (startswith(e, "-")) {
+ e++;
+ ignore_enoent = true;
+ }
+ /* Ignore this for now */
+ if (startswith(e, "+"))
+ e++;
+
+ source = strdup(e);
+ if (!source)
+ return -ENOMEM;
+
+ for (size_t j = 0; hierarchies && hierarchies[j]; ++j) {
+ char *prefixed_hierarchy = path_join(mount_point, hierarchies[j]);
+ if (!prefixed_hierarchy)
+ return -ENOMEM;
+
+ r = strv_consume(&overlays[j], TAKE_PTR(prefixed_hierarchy));
+ if (r < 0)
+ return r;
+ }
+
+ MountEntry *me = mount_list_extend(ml);
+ if (!me)
+ return -ENOMEM;
+
+ *me = (MountEntry) {
+ .path_malloc = TAKE_PTR(mount_point),
+ .source_malloc = TAKE_PTR(source),
+ .mode = MOUNT_EXTENSION_DIRECTORY,
+ .ignore = ignore_enoent,
+ .has_prefix = true,
+ .read_only = true,
+ };
+ }
+
+ /* Then, for each hierarchy, prepare an overlay with the list of lowerdir= strings
+ * set up earlier. */
+ for (size_t i = 0; hierarchies && hierarchies[i]; ++i) {
+ _cleanup_free_ char *prefixed_hierarchy = NULL;
+
+ prefixed_hierarchy = path_join(root, hierarchies[i]);
+ if (!prefixed_hierarchy)
+ return -ENOMEM;
+
+ MountEntry *me = mount_list_extend(ml);
+ if (!me)
+ return -ENOMEM;
+
+ *me = (MountEntry) {
+ .path_malloc = TAKE_PTR(prefixed_hierarchy),
+ .overlay_layers = TAKE_PTR(overlays[i]),
+ .mode = MOUNT_OVERLAY,
+ .has_prefix = true,
+ .ignore = true, /* If the source image doesn't set the ignore bit it will fail earlier. */
+ };
+ }
+
+ return 0;
+}
+
+static int append_tmpfs_mounts(MountList *ml, const TemporaryFileSystem *tmpfs, size_t n) {
+ assert(ml);
+ assert(tmpfs || n == 0);
+
+ /* Adds the TemporaryFileSystem= entries, parsing each entry's option string into
+ * mount flags plus the leftover textual options. */
+
+ FOREACH_ARRAY(t, tmpfs, n) {
+ _cleanup_free_ char *o = NULL, *str = NULL;
+ unsigned long flags;
+ bool ro;
+ int r;
+
+ if (!path_is_absolute(t->path))
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Path is not absolute: %s", t->path);
+
+ str = strjoin("mode=0755" NESTED_TMPFS_LIMITS ",", t->options);
+ if (!str)
+ return -ENOMEM;
+
+ r = mount_option_mangle(str, MS_NODEV|MS_STRICTATIME, &flags, &o);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to parse mount option '%s': %m", str);
+
+ /* Read-only is recorded in the entry rather than in the mount flags. */
+ ro = FLAGS_SET(flags, MS_RDONLY);
+ flags &= ~MS_RDONLY;
+
+ MountEntry *me = mount_list_extend(ml);
+ if (!me)
+ return log_oom_debug();
+
+ *me = (MountEntry) {
+ .path_const = t->path,
+ .mode = MOUNT_TMPFS,
+ .read_only = ro,
+ .options_malloc = TAKE_PTR(o),
+ .flags = flags,
+ };
+ }
+
+ return 0;
+}
+
+static int append_static_mounts(MountList *ml, const MountEntry *mounts, size_t n, bool ignore_protect) {
+ assert(ml);
+ assert(mounts || n == 0);
+
+ /* Adds a list of static pre-defined entries. Only path, mode and the ignore bit are
+ * copied over; 'ignore_protect' forces the ignore bit for all of them. */
+
+ for (size_t i = 0; i < n; i++) {
+ const MountEntry *m = mounts + i;
+
+ MountEntry *me = mount_list_extend(ml);
+ if (!me)
+ return log_oom_debug();
+
+ *me = (MountEntry) {
+ .path_const = mount_entry_path(m),
+ .mode = m->mode,
+ .ignore = m->ignore || ignore_protect,
+ };
+ }
+
+ return 0;
+}
+
+static int append_protect_home(MountList *ml, ProtectHome protect_home, bool ignore_protect) {
+ const MountEntry *table;
+ size_t n;
+
+ assert(ml);
+
+ /* Maps the ProtectHome= setting to its static mount table and appends it. */
+
+ switch (protect_home) {
+
+ case PROTECT_HOME_NO:
+ return 0;
+
+ case PROTECT_HOME_READ_ONLY:
+ table = protect_home_read_only_table;
+ n = ELEMENTSOF(protect_home_read_only_table);
+ break;
+
+ case PROTECT_HOME_TMPFS:
+ table = protect_home_tmpfs_table;
+ n = ELEMENTSOF(protect_home_tmpfs_table);
+ break;
+
+ case PROTECT_HOME_YES:
+ table = protect_home_yes_table;
+ n = ELEMENTSOF(protect_home_yes_table);
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ return append_static_mounts(ml, table, n, ignore_protect);
+}
+
+static int append_protect_system(MountList *ml, ProtectSystem protect_system, bool ignore_protect) {
+ const MountEntry *table;
+ size_t n;
+
+ assert(ml);
+
+ /* Maps the ProtectSystem= setting to its static mount table and appends it. */
+
+ switch (protect_system) {
+
+ case PROTECT_SYSTEM_NO:
+ return 0;
+
+ case PROTECT_SYSTEM_STRICT:
+ table = protect_system_strict_table;
+ n = ELEMENTSOF(protect_system_strict_table);
+ break;
+
+ case PROTECT_SYSTEM_YES:
+ table = protect_system_yes_table;
+ n = ELEMENTSOF(protect_system_yes_table);
+ break;
+
+ case PROTECT_SYSTEM_FULL:
+ table = protect_system_full_table;
+ n = ELEMENTSOF(protect_system_full_table);
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ return append_static_mounts(ml, table, n, ignore_protect);
+}
+
+/* Ordering for sorting the mount list: extension images/directories first, then by path
+ * (prefixes before their children), then by mode as a tie-breaker. drop_duplicates()
+ * relies on this order ("first one wins"). */
+static int mount_path_compare(const MountEntry *a, const MountEntry *b) {
+ int d;
+
+ /* ExtensionImages/Directories will be used by other mounts as a base, so sort them first
+ * regardless of the prefix - they are set up in the propagate directory anyway */
+ d = -CMP(a->mode == MOUNT_EXTENSION_IMAGE, b->mode == MOUNT_EXTENSION_IMAGE);
+ if (d != 0)
+ return d;
+ d = -CMP(a->mode == MOUNT_EXTENSION_DIRECTORY, b->mode == MOUNT_EXTENSION_DIRECTORY);
+ if (d != 0)
+ return d;
+
+ /* If the paths are not equal, then order prefixes first */
+ d = path_compare(mount_entry_path(a), mount_entry_path(b));
+ if (d != 0)
+ return d;
+
+ /* If the paths are equal, check the mode */
+ return CMP((int) a->mode, (int) b->mode);
+}
+
+static int prefix_where_needed(MountList *ml, const char *root_directory) {
+ assert(ml);
+
+ /* Prefixes all paths in the bind mount table with the root directory, for every
+ * entry that was not marked as already carrying the prefix. */
+
+ FOREACH_ARRAY(me, ml->mounts, ml->n_mounts) {
+ char *joined;
+
+ if (me->has_prefix)
+ continue;
+
+ joined = path_join(root_directory, mount_entry_path(me));
+ if (!joined)
+ return -ENOMEM;
+
+ /* Takes ownership of 'joined' and saves the old path as the unprefixed one. */
+ mount_entry_consume_prefix(me, joined);
+ }
+
+ return 0;
+}
+
+static void drop_duplicates(MountList *ml) {
+ MountEntry *f, *t, *previous;
+
+ assert(ml);
+
+ /* Drops duplicate entries. Expects that the array is properly ordered already. */
+
+ /* In-place compaction: 'f' scans every entry, 't' is the write cursor for surviving
+ * entries, 'previous' points at the last survivor. */
+ for (f = ml->mounts, t = ml->mounts, previous = NULL; f < ml->mounts + ml->n_mounts; f++) {
+
+ /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
+ * above. Note that we only drop duplicates that haven't been mounted yet. */
+ if (previous &&
+ path_equal(mount_entry_path(f), mount_entry_path(previous)) &&
+ f->state == MOUNT_PENDING && previous->state == MOUNT_PENDING) {
+ log_debug("%s (%s) is duplicate.", mount_entry_path(f), mount_mode_to_string(f->mode));
+ /* Propagate the flags to the remaining entry */
+ previous->read_only = previous->read_only || mount_entry_read_only(f);
+ previous->noexec = previous->noexec || mount_entry_noexec(f);
+ previous->exec = previous->exec || mount_entry_exec(f);
+ /* Free the dropped entry's allocations before it is overwritten. */
+ mount_entry_done(f);
+ continue;
+ }
+
+ *t = *f;
+ previous = t;
+ t++;
+ }
+
+ ml->n_mounts = t - ml->mounts;
+}
+
+static void drop_inaccessible(MountList *ml) {
+ MountEntry *f, *t;
+ const char *clear = NULL;
+
+ assert(ml);
+
+ /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
+ * ordered already. */
+
+ /* In-place compaction: 'f' scans, 't' writes the surviving entries. 'clear' holds the
+ * path of the most recently seen INACCESSIBLE entry; everything below it is dropped. */
+ for (f = ml->mounts, t = ml->mounts; f < ml->mounts + ml->n_mounts; f++) {
+
+ /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
+ * it, as inaccessible paths really should drop the entire subtree. */
+ if (clear && path_startswith(mount_entry_path(f), clear)) {
+ log_debug("%s is masked by %s.", mount_entry_path(f), clear);
+ mount_entry_done(f);
+ continue;
+ }
+
+ clear = f->mode == MOUNT_INACCESSIBLE ? mount_entry_path(f) : NULL;
+
+ *t = *f;
+ t++;
+ }
+
+ ml->n_mounts = t - ml->mounts;
+}
+
+static void drop_nop(MountList *ml) {
+ MountEntry *f, *t;
+
+ assert(ml);
+
+ /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
+ * list is ordered by prefixes. */
+
+ /* In-place compaction: 'f' scans, 't' writes the surviving entries. */
+ for (f = ml->mounts, t = ml->mounts; f < ml->mounts + ml->n_mounts; f++) {
+
+ /* Only suppress such subtrees for READ_ONLY, READ_WRITE and READ_WRITE_IMPLICIT entries */
+ if (IN_SET(f->mode, MOUNT_READ_ONLY, MOUNT_READ_WRITE, MOUNT_READ_WRITE_IMPLICIT)) {
+ MountEntry *found = NULL;
+
+ /* Now let's find the first parent of the entry we are looking at. Walk the
+ * already-accepted entries backwards from the write cursor, i.e. from the
+ * nearest candidate prefix outwards. */
+ for (MountEntry *p = PTR_SUB1(t, ml->mounts); p; p = PTR_SUB1(p, ml->mounts))
+ if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
+ found = p;
+ break;
+ }
+
+ /* We found it, let's see if it's the same mode, if so, we can drop this entry */
+ if (found && found->mode == f->mode) {
+ log_debug("%s (%s) is made redundant by %s (%s)",
+ mount_entry_path(f), mount_mode_to_string(f->mode),
+ mount_entry_path(found), mount_mode_to_string(found->mode));
+ mount_entry_done(f);
+ continue;
+ }
+ }
+
+ *t = *f;
+ t++;
+ }
+
+ ml->n_mounts = t - ml->mounts;
+}
+
+static void drop_outside_root(MountList *ml, const char *root_directory) {
+ MountEntry *f, *t;
+
+ assert(ml);
+
+ /* Drops all mounts that are outside of the root directory, compacting the array in place. */
+
+ if (!root_directory) /* Nothing to do */
+ return;
+
+ for (f = ml->mounts, t = ml->mounts; f < ml->mounts + ml->n_mounts; f++) {
+ bool keep;
+
+ /* ExtensionImages/Directories bases are opened in /run/systemd/unit-extensions on the host,
+ * hence they are never considered outside of the root. */
+ keep = IN_SET(f->mode, MOUNT_EXTENSION_IMAGE, MOUNT_EXTENSION_DIRECTORY) ||
+ path_startswith(mount_entry_path(f), root_directory);
+
+ if (!keep) {
+ log_debug("%s is outside of root directory.", mount_entry_path(f));
+ mount_entry_done(f);
+ continue;
+ }
+
+ *t++ = *f;
+ }
+
+ ml->n_mounts = t - ml->mounts;
+}
+
+/* Clones device node 'd' into the tmpfs rooted at 'temporary_mount': first via mknod() (while
+ * *make_devnode is true), falling back to a bind mount of the original node if mknod() yields
+ * EPERM (clearing *make_devnode so later calls skip straight to the fallback). Also creates the
+ * /dev/char/MAJOR:MINOR (or /dev/block/...) symlink. Returns -ENXIO if the source node does not
+ * exist, another negative errno on failure, 0 on success. */
+static int clone_device_node(
+ const char *d,
+ const char *temporary_mount,
+ bool *make_devnode) {
+
+ _cleanup_free_ char *sl = NULL;
+ const char *dn, *bn, *t;
+ struct stat st;
+ int r;
+
+ if (stat(d, &st) < 0) {
+ if (errno == ENOENT) {
+ log_debug_errno(errno, "Device node '%s' to clone does not exist, ignoring.", d);
+ /* -ENXIO lets the caller distinguish "source missing" from real errors. */
+ return -ENXIO;
+ }
+
+ return log_debug_errno(errno, "Failed to stat() device node '%s' to clone, ignoring: %m", d);
+ }
+
+ if (!S_ISBLK(st.st_mode) &&
+ !S_ISCHR(st.st_mode))
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Device node '%s' to clone is not a device node, ignoring.",
+ d);
+
+ dn = strjoina(temporary_mount, d);
+
+ /* First, try to create device node properly */
+ if (*make_devnode) {
+ /* The prepare/clear pair brackets exactly the mknod() call, so only the new
+ * node gets the SELinux label of the original path. */
+ mac_selinux_create_file_prepare(d, st.st_mode);
+ r = mknod(dn, st.st_mode, st.st_rdev);
+ mac_selinux_create_file_clear();
+ if (r >= 0)
+ goto add_symlink;
+ if (errno != EPERM)
+ return log_debug_errno(errno, "mknod failed for %s: %m", d);
+
+ /* This didn't work, let's not try this again for the next iterations. */
+ *make_devnode = false;
+ }
+
+ /* We're about to fall back to bind-mounting the device node. So create a dummy bind-mount target.
+ * Do not prepare device-node SELinux label (see issue 13762) */
+ r = mknod(dn, S_IFREG, 0);
+ if (r < 0 && errno != EEXIST)
+ return log_debug_errno(errno, "mknod() fallback failed for '%s': %m", d);
+
+ /* Fallback to bind-mounting: The assumption here is that all used device nodes carry standard
+ * properties. Specifically, the devices nodes we bind-mount should either be owned by root:root or
+ * root:tty (e.g. /dev/tty, /dev/ptmx) and should not carry ACLs. */
+ r = mount_nofollow_verbose(LOG_DEBUG, d, dn, NULL, MS_BIND, NULL);
+ if (r < 0)
+ return r;
+
+add_symlink:
+ bn = path_startswith(d, "/dev/");
+ if (!bn)
+ return 0;
+
+ /* Create symlinks like /dev/char/1:9 → ../urandom */
+ if (asprintf(&sl, "%s/dev/%s/" DEVNUM_FORMAT_STR,
+ temporary_mount,
+ S_ISCHR(st.st_mode) ? "char" : "block",
+ DEVNUM_FORMAT_VAL(st.st_rdev)) < 0)
+ return log_oom_debug();
+
+ (void) mkdir_parents(sl, 0755);
+
+ t = strjoina("../", bn);
+ /* A failure to create the symlink is not fatal. */
+ if (symlink(t, sl) < 0)
+ log_debug_errno(errno, "Failed to symlink '%s' to '%s', ignoring: %m", t, sl);
+
+ return 0;
+}
+
+static char *settle_runtime_dir(RuntimeScope scope) {
+ char *d;
+
+ /* Returns the runtime directory for the given scope as a freshly allocated string:
+ * /run/user/$UID for the user scope, /run/ otherwise. NULL on allocation failure. */
+
+ if (scope == RUNTIME_SCOPE_USER) {
+ if (asprintf(&d, "/run/user/" UID_FMT, geteuid()) < 0)
+ return NULL;
+
+ return d;
+ }
+
+ return strdup("/run/");
+}
+
+static int create_temporary_mount_point(RuntimeScope scope, char **ret) {
+ _cleanup_free_ char *runtime_dir = NULL, *d = NULL;
+
+ assert(ret);
+
+ /* Creates a fresh mkdtemp() directory below the scope's runtime directory and
+ * returns its path in *ret. Returns 0 on success, negative errno on failure. */
+
+ runtime_dir = settle_runtime_dir(scope);
+ if (!runtime_dir)
+ return log_oom_debug();
+
+ d = path_join(runtime_dir, "systemd/namespace-XXXXXX");
+ if (!d)
+ return log_oom_debug();
+
+ if (!mkdtemp(d))
+ return log_debug_errno(errno, "Failed to create temporary directory '%s': %m", d);
+
+ *ret = TAKE_PTR(d);
+ return 0;
+}
+
+/* Builds a private, minimal /dev on a tmpfs in a temporary directory — cloning a fixed set of
+ * device nodes, bind-mounting pts/shm/mqueue/hugepages from the host and creating the usual
+ * symlinks — then MS_MOVEs it over the entry's path. Returns 1 on success, negative errno on
+ * failure (after tearing down the temporary mounts). */
+static int mount_private_dev(MountEntry *m, RuntimeScope scope) {
+ static const char devnodes[] =
+ "/dev/null\0"
+ "/dev/zero\0"
+ "/dev/full\0"
+ "/dev/random\0"
+ "/dev/urandom\0"
+ "/dev/tty\0";
+
+ _cleanup_free_ char *temporary_mount = NULL;
+ const char *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
+ /* Cleared by clone_device_node() once mknod() fails with EPERM, so later nodes go
+ * straight to the bind-mount fallback. */
+ bool can_mknod = true;
+ int r;
+
+ assert(m);
+
+ r = create_temporary_mount_point(scope, &temporary_mount);
+ if (r < 0)
+ return r;
+
+ dev = strjoina(temporary_mount, "/dev");
+ (void) mkdir(dev, 0755);
+ r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=0755" TMPFS_LIMITS_PRIVATE_DEV);
+ if (r < 0)
+ goto fail;
+
+ r = label_fix_full(AT_FDCWD, dev, "/dev", 0);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to fix label of '%s' as /dev: %m", dev);
+ goto fail;
+ }
+
+ devpts = strjoina(temporary_mount, "/dev/pts");
+ (void) mkdir(devpts, 0755);
+ r = mount_nofollow_verbose(LOG_DEBUG, "/dev/pts", devpts, NULL, MS_BIND, NULL);
+ if (r < 0)
+ goto fail;
+
+ /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx.
+ * When /dev/ptmx a device node, /dev/pts/ptmx has 000 permissions making it inaccessible.
+ * Thus, in that case make a clone.
+ * In nspawn and other containers it will be a symlink, in that case make it a symlink. */
+ r = is_symlink("/dev/ptmx");
+ if (r < 0) {
+ log_debug_errno(r, "Failed to detect whether /dev/ptmx is a symlink or not: %m");
+ goto fail;
+ } else if (r > 0) {
+ devptmx = strjoina(temporary_mount, "/dev/ptmx");
+ if (symlink("pts/ptmx", devptmx) < 0) {
+ r = log_debug_errno(errno, "Failed to create a symlink '%s' to pts/ptmx: %m", devptmx);
+ goto fail;
+ }
+ } else {
+ r = clone_device_node("/dev/ptmx", temporary_mount, &can_mknod);
+ if (r < 0)
+ goto fail;
+ }
+
+ devshm = strjoina(temporary_mount, "/dev/shm");
+ (void) mkdir(devshm, 0755);
+ r = mount_nofollow_verbose(LOG_DEBUG, "/dev/shm", devshm, NULL, MS_BIND, NULL);
+ if (r < 0)
+ goto fail;
+
+ /* mqueue and hugepages bind mounts are best-effort: failures are ignored. */
+ devmqueue = strjoina(temporary_mount, "/dev/mqueue");
+ (void) mkdir(devmqueue, 0755);
+ (void) mount_nofollow_verbose(LOG_DEBUG, "/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
+
+ devhugepages = strjoina(temporary_mount, "/dev/hugepages");
+ (void) mkdir(devhugepages, 0755);
+ (void) mount_nofollow_verbose(LOG_DEBUG, "/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
+
+ devlog = strjoina(temporary_mount, "/dev/log");
+ if (symlink("/run/systemd/journal/dev-log", devlog) < 0)
+ log_debug_errno(errno, "Failed to create a symlink '%s' to /run/systemd/journal/dev-log, ignoring: %m", devlog);
+
+ NULSTR_FOREACH(d, devnodes) {
+ r = clone_device_node(d, temporary_mount, &can_mknod);
+ /* ENXIO means the *source* is not a device file, skip creation in that case */
+ if (r < 0 && r != -ENXIO)
+ goto fail;
+ }
+
+ r = dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
+ if (r < 0)
+ log_debug_errno(r, "Failed to set up basic device tree at '%s', ignoring: %m", temporary_mount);
+
+ /* Create the /dev directory if missing. It is more likely to be missing when the service is started
+ * with RootDirectory. This is consistent with mount units creating the mount points when missing. */
+ (void) mkdir_p_label(mount_entry_path(m), 0755);
+
+ /* Unmount everything in old /dev */
+ r = umount_recursive(mount_entry_path(m), 0);
+ if (r < 0)
+ log_debug_errno(r, "Failed to unmount directories below '%s', ignoring: %m", mount_entry_path(m));
+
+ /* Atomically move the fully assembled /dev into place. */
+ r = mount_nofollow_verbose(LOG_DEBUG, dev, mount_entry_path(m), NULL, MS_MOVE, NULL);
+ if (r < 0)
+ goto fail;
+
+ (void) rmdir(dev);
+ (void) rmdir(temporary_mount);
+
+ return 1;
+
+fail:
+ /* Tear down in reverse: the bind mounts first, then the tmpfs, then the directories.
+ * The pointers are non-NULL only for stages that were reached. */
+ if (devpts)
+ (void) umount_verbose(LOG_DEBUG, devpts, UMOUNT_NOFOLLOW);
+
+ if (devshm)
+ (void) umount_verbose(LOG_DEBUG, devshm, UMOUNT_NOFOLLOW);
+
+ if (devhugepages)
+ (void) umount_verbose(LOG_DEBUG, devhugepages, UMOUNT_NOFOLLOW);
+
+ if (devmqueue)
+ (void) umount_verbose(LOG_DEBUG, devmqueue, UMOUNT_NOFOLLOW);
+
+ (void) umount_verbose(LOG_DEBUG, dev, UMOUNT_NOFOLLOW);
+ (void) rmdir(dev);
+ (void) rmdir(temporary_mount);
+
+ return r;
+}
+
+static int mount_bind_dev(const MountEntry *m) {
+ const char *where;
+ int r;
+
+ assert(m);
+
+ /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the
+ * service's /dev. This is only used when RootDirectory= is set. */
+
+ where = mount_entry_path(m);
+
+ (void) mkdir_p_label(where, 0755);
+
+ r = path_is_mount_point(where, NULL, 0);
+ if (r < 0)
+ return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
+ if (r > 0)
+ return 0; /* make this a NOP if /dev is already a mount point */
+
+ r = mount_nofollow_verbose(LOG_DEBUG, "/dev", where, NULL, MS_BIND|MS_REC, NULL);
+ if (r < 0)
+ return r;
+
+ return 1; /* we mounted something */
+}
+
+static int mount_bind_sysfs(const MountEntry *m) {
+ const char *where;
+ int r;
+
+ assert(m);
+
+ /* Bind mounts the host's /sys into the service's /sys. */
+
+ where = mount_entry_path(m);
+
+ (void) mkdir_p_label(where, 0755);
+
+ r = path_is_mount_point(where, NULL, 0);
+ if (r < 0)
+ return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
+ if (r > 0)
+ return 0; /* make this a NOP if /sys is already a mount point */
+
+ /* Bind mount the host's version so that we get all child mounts of it, too. */
+ r = mount_nofollow_verbose(LOG_DEBUG, "/sys", where, NULL, MS_BIND|MS_REC, NULL);
+ if (r < 0)
+ return r;
+
+ return 1; /* we mounted something */
+}
+
+/* Mounts a fresh instance of the API filesystem 'fstype' (proc or sysfs) on 'entry_path',
+ * first assembling it in a temporary directory and MS_MOVEing it over. If mounting a new
+ * instance is not permitted, falls back to reusing an existing mount, or finally to
+ * bind-mounting 'bind_source' from the host. Returns 1 if something was mounted, 0 if an
+ * existing mount was kept, negative errno on failure. */
+static int mount_private_apivfs(
+ const char *fstype,
+ const char *entry_path,
+ const char *bind_source,
+ const char *opts,
+ RuntimeScope scope) {
+
+ _cleanup_(rmdir_and_freep) char *temporary_mount = NULL;
+ int r;
+
+ assert(fstype);
+ assert(entry_path);
+ assert(bind_source);
+
+ (void) mkdir_p_label(entry_path, 0755);
+
+ /* First, check if we have enough privileges to mount a new instance. Note, a new sysfs instance
+ * cannot be mounted on an already existing mount. Let's use a temporary place. */
+ r = create_temporary_mount_point(scope, &temporary_mount);
+ if (r < 0)
+ return r;
+
+ r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
+ if (r == -EINVAL && opts)
+ /* If this failed with EINVAL then this likely means the textual hidepid= stuff for procfs is
+ * not supported by the kernel, and thus the per-instance hidepid= neither, which means we
+ * really don't want to use it, since it would affect our host's /proc mount. Hence let's
+ * gracefully fallback to a classic, unrestricted version. */
+ r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, /* opts = */ NULL);
+ if (ERRNO_IS_NEG_PRIVILEGE(r)) {
+ /* When we do not have enough privileges to mount a new instance, fall back to use an
+ * existing mount. */
+
+ r = path_is_mount_point(entry_path, /* root = */ NULL, /* flags = */ 0);
+ if (r < 0)
+ return log_debug_errno(r, "Unable to determine whether '%s' is already mounted: %m", entry_path);
+ if (r > 0)
+ return 0; /* Use the current mount as is. */
+
+ /* We lack permissions to mount a new instance, and it is not already mounted. But we can
+ * access the host's, so as a final fallback bind-mount it to the destination, as most likely
+ * we are inside a user manager in an unprivileged user namespace. */
+ r = mount_nofollow_verbose(LOG_DEBUG, bind_source, entry_path, /* fstype = */ NULL, MS_BIND|MS_REC, /* opts = */ NULL);
+ if (r < 0)
+ return r;
+
+ return 1;
+
+ } else if (r < 0)
+ return r;
+
+ /* OK. We have a new mount instance. Let's clear an existing mount and its submounts. */
+ r = umount_recursive(entry_path, /* flags = */ 0);
+ if (r < 0)
+ log_debug_errno(r, "Failed to unmount directories below '%s', ignoring: %m", entry_path);
+
+ /* Then, move the new mount instance. */
+ r = mount_nofollow_verbose(LOG_DEBUG, temporary_mount, entry_path, /* fstype = */ NULL, MS_MOVE, /* opts = */ NULL);
+ if (r < 0)
+ return r;
+
+ /* We mounted a new instance now. Let's bind mount the children over now. This matters for nspawn
+ * where a bunch of files are overmounted, in particular the boot id. */
+ (void) bind_mount_submounts(bind_source, entry_path);
+ return 1;
+}
+
+/* Mounts a fresh sysfs instance on the entry's path, with the privilege fallbacks
+ * implemented by mount_private_apivfs(). */
+static int mount_private_sysfs(const MountEntry *m, const NamespaceParameters *p) {
+ assert(m);
+ assert(p);
+ return mount_private_apivfs("sysfs", mount_entry_path(m), "/sys", /* opts = */ NULL, p->runtime_scope);
+}
+
+/* Mounts a fresh procfs instance on the entry's path, applying hidepid=/subset= per the
+ * ProtectProc=/ProcSubset= settings when the kernel supports them. */
+static int mount_procfs(const MountEntry *m, const NamespaceParameters *p) {
+ _cleanup_free_ char *opts = NULL;
+
+ assert(m);
+ assert(p);
+
+ if (p->protect_proc != PROTECT_PROC_DEFAULT ||
+ p->proc_subset != PROC_SUBSET_ALL) {
+
+ /* Starting with kernel 5.8 procfs' hidepid= logic is truly per-instance (previously it
+ * pretended to be per-instance but actually was per-namespace), hence let's make use of it
+ * if requested. To make sure this logic succeeds only on kernels where hidepid= is
+ * per-instance, we'll exclusively use the textual value for hidepid=, since support was
+ * added in the same commit: if it's supported it is thus also per-instance. */
+
+ const char *hpv = p->protect_proc == PROTECT_PROC_DEFAULT ?
+ "off" :
+ protect_proc_to_string(p->protect_proc);
+
+ /* hidepid= support was added in 5.8, so we can use fsconfig()/fsopen() (which were added in
+ * 5.2) to check if hidepid= is supported. This avoids a noisy dmesg log by the kernel when
+ * trying to use hidepid= on systems where it isn't supported. The same applies for subset=.
+ * fsopen()/fsconfig() was also backported on some distros which allows us to detect
+ * hidepid=/subset= support in even more scenarios. */
+
+ /* != 0 also covers the "cannot determine" (negative) case: then we optimistically try. */
+ if (mount_option_supported("proc", "hidepid", hpv) != 0) {
+ opts = strjoin("hidepid=", hpv);
+ if (!opts)
+ return -ENOMEM;
+ }
+
+ if (p->proc_subset == PROC_SUBSET_PID &&
+ mount_option_supported("proc", "subset", "pid") != 0)
+ if (!strextend_with_separator(&opts, ",", "subset=pid"))
+ return -ENOMEM;
+ }
+
+ /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in
+ * one. i.e we don't reuse existing mounts here under any condition, we want a new instance owned by
+ * our user namespace and with our hidepid= settings applied. Hence, let's get rid of everything
+ * mounted on /proc/ first. */
+ return mount_private_apivfs("proc", mount_entry_path(m), "/proc", opts, p->runtime_scope);
+}
+
+static int mount_tmpfs(const MountEntry *m) {
+ const char *where, *inner;
+ int r;
+
+ assert(m);
+
+ /* Mounts a fresh tmpfs on the entry's path, first flushing out anything already
+ * mounted below it. */
+
+ where = mount_entry_path(m);
+ inner = mount_entry_unprefixed_path(m);
+
+ (void) mkdir_p_label(where, 0755);
+ (void) umount_recursive(where, 0);
+
+ r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", where, "tmpfs", m->flags, mount_entry_options(m));
+ if (r < 0)
+ return r;
+
+ /* Relabel the mount point according to the path it has inside the root directory. */
+ r = label_fix_full(AT_FDCWD, where, inner, 0);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to fix label of '%s' as '%s': %m", where, inner);
+
+ return 1;
+}
+
+static int mount_run(const MountEntry *m) {
+ int r;
+
+ assert(m);
+
+ /* Like mount_tmpfs(), but a NOP if something is already mounted on /run. */
+
+ r = path_is_mount_point(mount_entry_path(m), NULL, 0);
+ if (r > 0) /* make this a NOP if /run is already a mount point */
+ return 0;
+ if (r < 0 && r != -ENOENT)
+ return log_debug_errno(r, "Unable to determine whether /run is already mounted: %m");
+
+ return mount_tmpfs(m);
+}
+
+static int mount_mqueuefs(const MountEntry *m) {
+ const char *where;
+ int r;
+
+ assert(m);
+
+ /* Mounts a fresh mqueue instance on the entry's path, first flushing out anything
+ * already mounted below it. */
+
+ where = mount_entry_path(m);
+
+ (void) mkdir_p_label(where, 0755);
+ (void) umount_recursive(where, 0);
+
+ r = mount_nofollow_verbose(LOG_DEBUG, "mqueue", where, "mqueue", m->flags, mount_entry_options(m));
+ if (r < 0)
+ return r;
+
+ return 1;
+}
+
+/* Dissects and mounts the image backing this entry (MountImages= or ExtensionImages=) on the
+ * entry's path. For extension images the host's os-release data is loaded first so the
+ * extension-release compatibility check can run against it. Returns 1 on success, 0 if the
+ * image is missing and the entry is marked ignore, negative errno otherwise. */
+static int mount_image(
+ const MountEntry *m,
+ const char *root_directory,
+ const ImagePolicy *image_policy) {
+
+ _cleanup_free_ char *host_os_release_id = NULL, *host_os_release_version_id = NULL,
+ *host_os_release_sysext_level = NULL, *host_os_release_confext_level = NULL,
+ *extension_name = NULL;
+ int r;
+
+ assert(m);
+
+ r = path_extract_filename(mount_entry_source(m), &extension_name);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to extract extension name from %s: %m", mount_entry_source(m));
+
+ if (m->mode == MOUNT_EXTENSION_IMAGE) {
+ r = parse_os_release(
+ empty_to_root(root_directory),
+ "ID", &host_os_release_id,
+ "VERSION_ID", &host_os_release_version_id,
+ image_class_info[IMAGE_SYSEXT].level_env, &host_os_release_sysext_level,
+ image_class_info[IMAGE_CONFEXT].level_env, &host_os_release_confext_level,
+ NULL);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to acquire 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory));
+ if (isempty(host_os_release_id))
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "'ID' field not found or empty in 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory));
+ }
+
+ r = verity_dissect_and_mount(
+ /* src_fd= */ -1,
+ mount_entry_source(m),
+ mount_entry_path(m),
+ m->image_options_const,
+ image_policy,
+ host_os_release_id,
+ host_os_release_version_id,
+ host_os_release_sysext_level,
+ host_os_release_confext_level,
+ /* required_sysext_scope= */ NULL,
+ /* ret_image= */ NULL);
+ if (r == -ENOENT && m->ignore)
+ return 0;
+ /* -ESTALE is how the extension-release compatibility mismatch is reported. */
+ if (r == -ESTALE && host_os_release_id)
+ return log_error_errno(r, // FIXME: this should not be logged ad LOG_ERR, as it will result in duplicate logging.
+ "Failed to mount image %s, extension-release metadata does not match the lower layer's: ID=%s%s%s%s%s%s%s",
+ mount_entry_source(m),
+ host_os_release_id,
+ host_os_release_version_id ? " VERSION_ID=" : "",
+ strempty(host_os_release_version_id),
+ host_os_release_sysext_level ? image_class_info[IMAGE_SYSEXT].level_env_print : "",
+ strempty(host_os_release_sysext_level),
+ host_os_release_confext_level ? image_class_info[IMAGE_CONFEXT].level_env_print : "",
+ strempty(host_os_release_confext_level));
+ if (r < 0)
+ return log_debug_errno(r, "Failed to mount image %s on %s: %m", mount_entry_source(m), mount_entry_path(m));
+
+ return 1;
+}
+
+/* Mounts a read-only overlayfs on the entry's path, using m->overlay_layers as lower layers and the
+ * entry's own path as the bottom-most layer. Layers that do not exist as directories are silently
+ * skipped. Returns 1 on success, 0 if nothing was mounted (no layer exists, or -ENOENT with
+ * ->ignore set), negative errno otherwise. */
+static int mount_overlay(const MountEntry *m) {
+ _cleanup_free_ char *options = NULL, *layers = NULL;
+ int r;
+
+ assert(m);
+
+ /* Extension hierarchies are optional (e.g.: confext might not have /opt) so check if they actually
+ * exist in an image before attempting to create an overlay with them, otherwise the mount will
+ * fail. We can't check before this, as the images will not be mounted until now. */
+
+ /* Note that lowerdir= parameters are in 'reverse' order, so the top-most directory in the overlay
+ * comes first in the list. */
+ STRV_FOREACH_BACKWARDS(o, m->overlay_layers) {
+ _cleanup_free_ char *escaped = NULL;
+
+ r = is_dir(*o, /* follow= */ false);
+ if (r <= 0) {
+ if (r != -ENOENT)
+ log_debug_errno(r,
+ "Failed to check whether overlay layer source path '%s' exists, ignoring: %m",
+ *o);
+ continue;
+ }
+
+ /* ',' and ':' are option/layer separators in overlayfs mount options, hence escape them. */
+ escaped = shell_escape(*o, ",:");
+ if (!escaped)
+ return log_oom_debug();
+
+ if (!strextend_with_separator(&layers, ":", escaped))
+ return log_oom_debug();
+ }
+
+ if (!layers) {
+ log_debug("None of the overlays specified in '%s' exist at the source, skipping.",
+ mount_entry_options(m));
+ return 0; /* Only the root is set? Then there's nothing to overlay */
+ }
+
+ options = strjoin("lowerdir=", layers, ":", mount_entry_path(m)); /* The root goes in last */
+ if (!options)
+ return log_oom_debug();
+
+ (void) mkdir_p_label(mount_entry_path(m), 0755);
+
+ r = mount_nofollow_verbose(LOG_DEBUG, "overlay", mount_entry_path(m), "overlay", MS_RDONLY, options);
+ if (r == -ENOENT && m->ignore)
+ return 0;
+ if (r < 0)
+ return r;
+
+ return 1;
+}
+
+/* Resolves one symlink step of the entry's path, rewriting the entry in place. Returns > 0 if the
+ * path is already fully normalized, 0 if one step was resolved (caller should re-sort and retry),
+ * -ELOOP if too many steps were followed, other negative errno on chase() failure. */
+static int follow_symlink(
+                const char *root_directory,
+                MountEntry *m) {
+
+        _cleanup_free_ char *target = NULL;
+        int r;
+
+        /* Consistency fix: every sibling helper in this file asserts its entry argument. */
+        assert(m);
+
+        /* Let's chase symlinks, but only one step at a time. That's because depending where the symlink points we
+         * might need to change the order in which we mount stuff. Hence: let's normalize piecemeal, and do one step at
+         * a time by specifying CHASE_STEP. This function returns 0 if we resolved one step, and > 0 if we reached the
+         * end and already have a fully normalized name. */
+
+        r = chase(mount_entry_path(m), root_directory, CHASE_STEP|CHASE_NONEXISTENT, &target, NULL);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to chase symlinks '%s': %m", mount_entry_path(m));
+        if (r > 0) /* Reached the end, nothing more to resolve */
+                return 1;
+
+        if (m->n_followed >= CHASE_MAX) /* put a boundary on things */
+                return log_debug_errno(SYNTHETIC_ERRNO(ELOOP),
+                                       "Symlink loop on '%s'.",
+                                       mount_entry_path(m));
+
+        log_debug("Followed mount entry path symlink %s %s %s.",
+                  mount_entry_path(m), special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), target);
+
+        mount_entry_consume_prefix(m, TAKE_PTR(target));
+
+        m->n_followed++;
+
+        return 0;
+}
+
+/* Establishes a single mount entry inside the (not yet pivoted) namespace root. Dispatches on
+ * m->mode: special modes (tmpfs, /dev, /sys, /proc, images, overlays, …) are delegated to their
+ * helpers; the remaining modes fall through to a plain (recursive) bind mount at the bottom. */
+static int apply_one_mount(
+ const char *root_directory,
+ MountEntry *m,
+ const NamespaceParameters *p) {
+
+ _cleanup_free_ char *inaccessible = NULL;
+ bool rbind = true, make = false;
+ const char *what;
+ int r;
+
+ /* Return 1 when the mount should be post-processed (remounted r/o, etc.), 0 otherwise. In most
+ * cases post-processing is the right thing, the typical exception is when the mount is gracefully
+ * skipped. */
+
+ assert(m);
+ assert(p);
+
+ log_debug("Applying namespace mount on %s", mount_entry_path(m));
+
+ switch (m->mode) {
+
+ case MOUNT_INACCESSIBLE: {
+ _cleanup_free_ char *runtime_dir = NULL;
+ struct stat target;
+
+ /* First, get rid of everything that is below if there
+ * is anything... Then, overmount it with an
+ * inaccessible path. */
+ (void) umount_recursive(mount_entry_path(m), 0);
+
+ if (lstat(mount_entry_path(m), &target) < 0) {
+ if (errno == ENOENT && m->ignore)
+ return 0;
+
+ return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m",
+ mount_entry_path(m));
+ }
+
+ /* We don't pass the literal runtime scope through here but one based purely on our UID. This
+ * means that the root user's --user services will use the host's inaccessible inodes rather
+ * then root's private ones. This is preferable since it means device nodes that are
+ * overmounted to make them inaccessible will be overmounted with a device node, rather than
+ * an AF_UNIX socket inode. */
+ runtime_dir = settle_runtime_dir(geteuid() == 0 ? RUNTIME_SCOPE_SYSTEM : RUNTIME_SCOPE_USER);
+ if (!runtime_dir)
+ return log_oom_debug();
+
+ /* NOTE(review): the original error in 'r' is discarded here and ELOOP reported instead
+ * — confirm this is intentional (the message suggests the only expected failure is an
+ * unsupported file type such as a symlink). */
+ r = mode_to_inaccessible_node(runtime_dir, target.st_mode, &inaccessible);
+ if (r < 0)
+ return log_debug_errno(SYNTHETIC_ERRNO(ELOOP),
+ "File type not supported for inaccessible mounts. Note that symlinks are not allowed");
+ what = inaccessible;
+ break;
+ }
+
+ case MOUNT_READ_ONLY:
+ case MOUNT_READ_WRITE:
+ case MOUNT_READ_WRITE_IMPLICIT:
+ case MOUNT_EXEC:
+ case MOUNT_NOEXEC:
+ r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
+ if (r == -ENOENT && m->ignore)
+ return 0;
+ if (r < 0)
+ return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m",
+ mount_entry_path(m));
+ if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY
+ * and MS_NOEXEC bits for the mount point if needed. */
+ return 1;
+ /* This isn't a mount point yet, let's make it one. */
+ what = mount_entry_path(m);
+ break;
+
+ case MOUNT_EXTENSION_DIRECTORY: {
+ _cleanup_free_ char *host_os_release_id = NULL, *host_os_release_version_id = NULL,
+ *host_os_release_level = NULL, *extension_name = NULL;
+ _cleanup_strv_free_ char **extension_release = NULL;
+ ImageClass class = IMAGE_SYSEXT;
+
+ r = path_extract_filename(mount_entry_source(m), &extension_name);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to extract extension name from %s: %m", mount_entry_source(m));
+
+ /* Probe as a sysext first; if there is no sysext extension-release file, retry as a
+ * confext to detect the image class. */
+ r = load_extension_release_pairs(mount_entry_source(m), IMAGE_SYSEXT, extension_name, /* relax_extension_release_check= */ false, &extension_release);
+ if (r == -ENOENT) {
+ r = load_extension_release_pairs(mount_entry_source(m), IMAGE_CONFEXT, extension_name, /* relax_extension_release_check= */ false, &extension_release);
+ if (r >= 0)
+ class = IMAGE_CONFEXT;
+ }
+ if (r < 0)
+ return log_debug_errno(r, "Failed to acquire 'extension-release' data of extension tree %s: %m", mount_entry_source(m));
+
+ r = parse_os_release(
+ empty_to_root(root_directory),
+ "ID", &host_os_release_id,
+ "VERSION_ID", &host_os_release_version_id,
+ image_class_info[class].level_env, &host_os_release_level,
+ NULL);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to acquire 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory));
+ if (isempty(host_os_release_id))
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "'ID' field not found or empty in 'os-release' data of OS tree '%s': %m", empty_to_root(root_directory));
+
+ /* NOTE(review): extension_release was already loaded above while detecting the class;
+ * reloading it here overwrites the previous strv (which the _cleanup_ handler then never
+ * frees) and repeats the work — confirm whether this second load is intentional. */
+ r = load_extension_release_pairs(mount_entry_source(m), class, extension_name, /* relax_extension_release_check= */ false, &extension_release);
+ if (r == -ENOENT && m->ignore)
+ return 0;
+ if (r < 0)
+ return log_debug_errno(r, "Failed to parse directory %s extension-release metadata: %m", extension_name);
+
+ r = extension_release_validate(
+ extension_name,
+ host_os_release_id,
+ host_os_release_version_id,
+ host_os_release_level,
+ /* host_extension_scope */ NULL, /* Leave empty, we need to accept both system and portable */
+ extension_release,
+ class);
+ if (r == 0)
+ return log_debug_errno(SYNTHETIC_ERRNO(ESTALE), "Directory %s extension-release metadata does not match the root's", extension_name);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to compare directory %s extension-release metadata with the root's os-release: %m", extension_name);
+
+ _fallthrough_;
+ }
+
+ case MOUNT_BIND:
+ rbind = false;
+
+ _fallthrough_;
+ case MOUNT_BIND_RECURSIVE: {
+ _cleanup_free_ char *chased = NULL;
+
+ /* Since mount() will always follow symlinks we chase the symlinks on our own first. Note
+ * that bind mount source paths are always relative to the host root, hence we pass NULL as
+ * root directory to chase() here. */
+
+ r = chase(mount_entry_source(m), NULL, CHASE_TRAIL_SLASH, &chased, NULL);
+ if (r == -ENOENT && m->ignore) {
+ log_debug_errno(r, "Path %s does not exist, ignoring.", mount_entry_source(m));
+ return 0;
+ }
+ if (r < 0)
+ return log_debug_errno(r, "Failed to follow symlinks on %s: %m", mount_entry_source(m));
+
+ log_debug("Followed source symlinks %s %s %s.",
+ mount_entry_source(m), special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), chased);
+
+ free_and_replace(m->source_malloc, chased);
+
+ what = mount_entry_source(m);
+ make = true;
+ break;
+ }
+
+ case MOUNT_EMPTY_DIR:
+ case MOUNT_TMPFS:
+ return mount_tmpfs(m);
+
+ case MOUNT_PRIVATE_TMP:
+ case MOUNT_PRIVATE_TMP_READ_ONLY:
+ what = mount_entry_source(m);
+ make = true;
+ break;
+
+ case MOUNT_PRIVATE_DEV:
+ return mount_private_dev(m, p->runtime_scope);
+
+ case MOUNT_BIND_DEV:
+ return mount_bind_dev(m);
+
+ case MOUNT_PRIVATE_SYSFS:
+ return mount_private_sysfs(m, p);
+
+ case MOUNT_BIND_SYSFS:
+ return mount_bind_sysfs(m);
+
+ case MOUNT_PROCFS:
+ return mount_procfs(m, p);
+
+ case MOUNT_RUN:
+ return mount_run(m);
+
+ case MOUNT_MQUEUEFS:
+ return mount_mqueuefs(m);
+
+ case MOUNT_IMAGE:
+ return mount_image(m, NULL, p->mount_image_policy);
+
+ case MOUNT_EXTENSION_IMAGE:
+ return mount_image(m, root_directory, p->extension_image_policy);
+
+ case MOUNT_OVERLAY:
+ return mount_overlay(m);
+
+ default:
+ assert_not_reached();
+ }
+
+ assert(what);
+
+ /* Common tail for all modes that reduce to a bind mount ('what' chosen above). */
+ r = mount_nofollow_verbose(LOG_DEBUG, what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL);
+ if (r < 0) {
+ bool try_again = false;
+
+ if (r == -ENOENT && make) {
+ int q;
+
+ /* Hmm, either the source or the destination are missing. Let's see if we can create
+ the destination, then try again. */
+
+ (void) mkdir_parents(mount_entry_path(m), 0755);
+
+ q = make_mount_point_inode_from_path(what, mount_entry_path(m), 0755);
+ if (q < 0) {
+ if (q != -EEXIST) // FIXME: this shouldn't be logged at LOG_WARNING, but be bubbled up, and logged there to avoid duplicate logging
+ log_warning_errno(q, "Failed to create destination mount point node '%s', ignoring: %m",
+ mount_entry_path(m));
+ } else
+ try_again = true;
+ }
+
+ if (try_again)
+ r = mount_nofollow_verbose(LOG_DEBUG, what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m)); // FIXME: this should not be logged here, but be bubbled up, to avoid duplicate logging
+ }
+
+ log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
+ return 1;
+}
+
+/* Second-pass post-processing: remounts an applied entry read-only and/or nosuid as configured.
+ * deny_list contains mount paths that a recursive remount must not descend into. Returns 0 on
+ * success or when there is nothing to do, negative errno on failure. */
+static int make_read_only(const MountEntry *m, char **deny_list, FILE *proc_self_mountinfo) {
+        unsigned long flags = 0, mask = 0;
+        bool recursive;
+        int r;
+
+        assert(m);
+        assert(proc_self_mountinfo);
+
+        /* Only entries actually established in the first pass get post-processed. */
+        if (m->state != MOUNT_APPLIED)
+                return 0;
+
+        if (mount_entry_read_only(m) || m->mode == MOUNT_PRIVATE_DEV) {
+                flags |= MS_RDONLY;
+                mask |= MS_RDONLY;
+        }
+
+        if (m->nosuid) {
+                flags |= MS_NOSUID;
+                mask |= MS_NOSUID;
+        }
+
+        if (mask == 0) /* No change requested? */
+                return 0;
+
+        /* We generally apply these changes recursively, except for /dev, and the cases we know there's
+         * nothing further down. Set /dev readonly, but not submounts like /dev/shm. Also, we only set the
+         * per-mount read-only flag. We can't set it on the superblock, if we are inside a user namespace
+         * and running Linux <= 4.17. */
+        recursive = mount_entry_read_only(m) && !IN_SET(m->mode, MOUNT_EMPTY_DIR, MOUNT_TMPFS);
+
+        r = recursive ?
+                bind_remount_recursive_with_mountinfo(mount_entry_path(m), flags, mask, deny_list, proc_self_mountinfo) :
+                bind_remount_one_with_mountinfo(mount_entry_path(m), flags, mask, proc_self_mountinfo);
+
+        /* Note that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked
+         * read-only already stays this way. This improves compatibility with container managers, where we
+         * won't attempt to undo read-only mounts already applied. */
+
+        if (r == -ENOENT && m->ignore)
+                return 0;
+        if (r < 0)
+                return log_debug_errno(r, "Failed to re-mount '%s'%s: %m", mount_entry_path(m),
+                                       recursive ? " and its submounts" : "");
+        return 0;
+}
+
+/* Third-pass post-processing: toggles the MS_NOEXEC bit on an applied entry according to its
+ * exec/noexec markers. deny_list contains mount paths a recursive remount must not descend into.
+ * Returns 0 on success or when there is nothing to do, negative errno on failure. */
+static int make_noexec(const MountEntry *m, char **deny_list, FILE *proc_self_mountinfo) {
+        unsigned long flags = 0, mask = 0;
+        bool recursive;
+        int r;
+
+        assert(m);
+        assert(proc_self_mountinfo);
+
+        /* Only entries actually established in the first pass get post-processed. */
+        if (m->state != MOUNT_APPLIED)
+                return 0;
+
+        if (mount_entry_noexec(m)) {
+                flags |= MS_NOEXEC;
+                mask |= MS_NOEXEC;
+        } else if (mount_entry_exec(m))
+                mask |= MS_NOEXEC; /* flags stays zero, i.e. MS_NOEXEC gets cleared */
+
+        if (mask == 0) /* No change requested? */
+                return 0;
+
+        /* Apply recursively, except in the cases where we know there is nothing further down. */
+        recursive = !IN_SET(m->mode, MOUNT_EMPTY_DIR, MOUNT_TMPFS);
+
+        r = recursive ?
+                bind_remount_recursive_with_mountinfo(mount_entry_path(m), flags, mask, deny_list, proc_self_mountinfo) :
+                bind_remount_one_with_mountinfo(mount_entry_path(m), flags, mask, proc_self_mountinfo);
+
+        if (r == -ENOENT && m->ignore)
+                return 0;
+        if (r < 0)
+                return log_debug_errno(r, "Failed to re-mount '%s'%s: %m", mount_entry_path(m),
+                                       recursive ? " and its submounts" : "");
+        return 0;
+}
+
+/* Fourth-pass post-processing: unconditionally sets MS_NOSUID on an applied entry (no deny list).
+ * Returns 0 on success or when the entry was not applied, negative errno on failure. */
+static int make_nosuid(const MountEntry *m, FILE *proc_self_mountinfo) {
+        bool recursive;
+        int r;
+
+        assert(m);
+        assert(proc_self_mountinfo);
+
+        /* Only entries actually established in the first pass get post-processed. */
+        if (m->state != MOUNT_APPLIED)
+                return 0;
+
+        /* Apply recursively, except in the cases where we know there is nothing further down. */
+        recursive = !IN_SET(m->mode, MOUNT_EMPTY_DIR, MOUNT_TMPFS);
+
+        r = recursive ?
+                bind_remount_recursive_with_mountinfo(mount_entry_path(m), MS_NOSUID, MS_NOSUID, NULL, proc_self_mountinfo) :
+                bind_remount_one_with_mountinfo(mount_entry_path(m), MS_NOSUID, MS_NOSUID, proc_self_mountinfo);
+        if (r == -ENOENT && m->ignore)
+                return 0;
+        if (r < 0)
+                return log_debug_errno(r, "Failed to re-mount '%s'%s: %m", mount_entry_path(m),
+                                       recursive ? " and its submounts" : "");
+        return 0;
+}
+
+/* Returns true if the API VFS instances (/proc, /sys, …) shall be mounted for this namespace.
+ * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=, since to protect the API VFS
+ * mounts, they need to be around in the first place... */
+static bool namespace_parameters_mount_apivfs(const NamespaceParameters *p) {
+        assert(p);
+
+        if (p->mount_apivfs)
+                return true;
+
+        if (p->protect_control_groups || p->protect_kernel_tunables)
+                return true;
+
+        if (p->protect_proc != PROTECT_PROC_DEFAULT)
+                return true;
+
+        return p->proc_subset != PROC_SUBSET_ALL;
+}
+
+/* Walks all mount entries and drops any unused mounts. This affects all mounts:
+ * - that are implicitly protected by a path that has been rendered inaccessible
+ * - whose immediate parent requests the same protection mode as the mount itself
+ * - that are outside of the relevant root directory
+ * - which are duplicates
+ */
+static void drop_unused_mounts(MountList *ml, const char *root_directory) {
+        assert(ml);
+        assert(root_directory);
+        assert(ml->mounts || ml->n_mounts == 0);
+
+        /* Bring the list into a normalized path order before filtering it. */
+        typesafe_qsort(ml->mounts, ml->n_mounts, mount_path_compare);
+
+        drop_duplicates(ml);
+        drop_outside_root(ml, root_directory);
+        drop_inaccessible(ml);
+        drop_nop(ml);
+}
+
+/* Creates the configured symlinks below 'root'. strv_symlinks is a flat list of
+ * (source, destination) pairs; both halves are joined onto 'root' before use. Returns 0 on
+ * success, negative errno on failure. */
+static int create_symlinks_from_tuples(const char *root, char **strv_symlinks) {
+        int r;
+
+        STRV_FOREACH_PAIR(src, dst, strv_symlinks) {
+                _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
+
+                src_abs = path_join(root, *src);
+                if (!src_abs)
+                        return -ENOMEM;
+
+                dst_abs = path_join(root, *dst);
+                if (!dst_abs)
+                        return -ENOMEM;
+
+                /* Make sure the directory the link shall live in exists first. */
+                r = mkdir_parents_label(dst_abs, 0755);
+                if (r < 0)
+                        return log_debug_errno(
+                                        r,
+                                        "Failed to create parent directory for symlink '%s': %m",
+                                        dst_abs);
+
+                r = symlink_idempotent(src_abs, dst_abs, true);
+                if (r < 0)
+                        return log_debug_errno(
+                                        r,
+                                        "Failed to create symlink from '%s' to '%s': %m",
+                                        src_abs,
+                                        dst_abs);
+        }
+
+        return 0;
+}
+
+static void mount_entry_path_debug_string(const char *root, MountEntry *m, char **error_path) {
+        const char *p;
+
+        assert(m);
+
+        /* Create a string suitable for debugging logs, stripping for example the local working directory.
+         * For example, with a BindPaths=/var/bar that does not exist on the host:
+         *
+         * Before:
+         * foo.service: Failed to set up mount namespacing: /run/systemd/unit-root/var/bar: No such file or directory
+         * After:
+         * foo.service: Failed to set up mount namespacing: /var/bar: No such file or directory
+         *
+         * Note that this is an error path, so no OOM check is done on purpose. */
+
+        if (!error_path)
+                return;
+
+        p = mount_entry_path(m);
+        if (!p) {
+                *error_path = NULL;
+                return;
+        }
+
+        /* Strip the root prefix, if the path carries one. */
+        if (root) {
+                const char *e = startswith(p, root);
+                if (e)
+                        p = e;
+        }
+
+        *error_path = strdup(p);
+}
+
+/* Establishes all mounts in the list inside the current (already unshared) mount namespace, in
+ * four rounds: (1) apply every pending mount, re-sorting and restarting whenever a symlinked mount
+ * point was rewritten; (2) flip read-only bits; (3) flip noexec bits; (4) optionally set nosuid.
+ * On failure *error_path (if non-NULL) is set to a debug string for the offending path. Returns
+ * 1 on success, 0 if the list is empty, negative errno on failure. */
+static int apply_mounts(
+ MountList *ml,
+ const char *root,
+ const NamespaceParameters *p,
+ char **error_path) {
+
+ _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
+ _cleanup_free_ char **deny_list = NULL;
+ int r;
+
+ assert(ml);
+ assert(root);
+ assert(p);
+
+ if (ml->n_mounts == 0) /* Shortcut: nothing to do */
+ return 0;
+
+ /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of
+ * /proc. For example, this is the case with the option: 'InaccessiblePaths=/proc'. */
+ proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
+ if (!proc_self_mountinfo) {
+ r = -errno;
+
+ if (error_path)
+ *error_path = strdup("/proc/self/mountinfo");
+
+ return log_debug_errno(r, "Failed to open /proc/self/mountinfo: %m");
+ }
+
+ /* First round, establish all mounts we need */
+ for (;;) {
+ bool again = false;
+
+ FOREACH_ARRAY(m, ml->mounts, ml->n_mounts) {
+
+ if (m->state != MOUNT_PENDING)
+ continue;
+
+ /* ExtensionImages/Directories are first opened in the propagate directory, not in the root_directory */
+ r = follow_symlink(!IN_SET(m->mode, MOUNT_EXTENSION_IMAGE, MOUNT_EXTENSION_DIRECTORY) ? root : NULL, m);
+ if (r < 0) {
+ mount_entry_path_debug_string(root, m, error_path);
+ return r;
+ }
+ if (r == 0) {
+ /* We hit a symlinked mount point. The entry got rewritten and might
+ * point to a very different place now. Let's normalize the changed
+ * list, and start from the beginning. After all to mount the entry
+ * at the new location we might need some other mounts first */
+ again = true;
+ break;
+ }
+
+ /* Returns 1 if the mount should be post-processed, 0 otherwise */
+ r = apply_one_mount(root, m, p);
+ if (r < 0) {
+ mount_entry_path_debug_string(root, m, error_path);
+ return r;
+ }
+ m->state = r == 0 ? MOUNT_SKIPPED : MOUNT_APPLIED;
+ }
+
+ if (!again)
+ break;
+
+ drop_unused_mounts(ml, root);
+ }
+
+ /* Now that all filesystems have been set up, but before the
+ * read-only switches are flipped, create the exec dirs and other symlinks.
+ * Note that when /var/lib is not empty/tmpfs, these symlinks will already
+ * exist, which means this will be a no-op. */
+ r = create_symlinks_from_tuples(root, p->symlinks);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to set up symlinks inside mount namespace: %m");
+
+ /* Create a deny list we can pass to bind_mount_recursive() */
+ deny_list = new(char*, ml->n_mounts+1);
+ if (!deny_list)
+ return -ENOMEM;
+ /* The entries are borrowed from the mount list; only the array itself is freed on return. */
+ for (size_t j = 0; j < ml->n_mounts; j++)
+ deny_list[j] = (char*) mount_entry_path(ml->mounts+j);
+ deny_list[ml->n_mounts] = NULL;
+
+ /* Second round, flip the ro bits if necessary. */
+ FOREACH_ARRAY(m, ml->mounts, ml->n_mounts) {
+ r = make_read_only(m, deny_list, proc_self_mountinfo);
+ if (r < 0) {
+ mount_entry_path_debug_string(root, m, error_path);
+ return r;
+ }
+ }
+
+ /* Third round, flip the noexec bits with a simplified deny list. */
+ /* NOTE(review): slots whose entry is not MOUNT_EXEC/MOUNT_NOEXEC keep their values from the
+ * second round here, so the reused array still contains all paths rather than a "simplified"
+ * subset — confirm whether the non-matching slots should be cleared. */
+ for (size_t j = 0; j < ml->n_mounts; j++)
+ if (IN_SET((ml->mounts+j)->mode, MOUNT_EXEC, MOUNT_NOEXEC))
+ deny_list[j] = (char*) mount_entry_path(ml->mounts+j);
+ deny_list[ml->n_mounts] = NULL;
+
+ FOREACH_ARRAY(m, ml->mounts, ml->n_mounts) {
+ r = make_noexec(m, deny_list, proc_self_mountinfo);
+ if (r < 0) {
+ mount_entry_path_debug_string(root, m, error_path);
+ return r;
+ }
+ }
+
+ /* Fourth round, flip the nosuid bits without a deny list. */
+ if (p->mount_nosuid)
+ FOREACH_ARRAY(m, ml->mounts, ml->n_mounts) {
+ r = make_nosuid(m, proc_self_mountinfo);
+ if (r < 0) {
+ mount_entry_path_debug_string(root, m, error_path);
+ return r;
+ }
+ }
+
+ return 1;
+}
+
+/* Determines whether the root directory is going to be read-only given the configured settings. */
+static bool root_read_only(
+                char **read_only_paths,
+                ProtectSystem protect_system) {
+
+        return protect_system == PROTECT_SYSTEM_STRICT ||
+                prefixed_path_strv_contains(read_only_paths, "/");
+}
+
+/* Determines whether the /home directory is going to be read-only given the configured settings.
+ * Yes, this is a bit sloppy, since we don't bother checking for cases where / is affected by
+ * multiple settings. */
+static bool home_read_only(
+                char** read_only_paths,
+                char** inaccessible_paths,
+                char** empty_directories,
+                const BindMount *bind_mounts,
+                size_t n_bind_mounts,
+                const TemporaryFileSystem *temporary_filesystems,
+                size_t n_temporary_filesystems,
+                ProtectHome protect_home) {
+
+        if (protect_home != PROTECT_HOME_NO)
+                return true;
+
+        if (prefixed_path_strv_contains(read_only_paths, "/home") ||
+            prefixed_path_strv_contains(inaccessible_paths, "/home") ||
+            prefixed_path_strv_contains(empty_directories, "/home"))
+                return true;
+
+        FOREACH_ARRAY(t, temporary_filesystems, n_temporary_filesystems)
+                if (path_equal(t->path, "/home"))
+                        return true;
+
+        /* If /home is overmounted with some dir from the host it's not writable. */
+        FOREACH_ARRAY(b, bind_mounts, n_bind_mounts)
+                if (path_equal(b->destination, "/home"))
+                        return true;
+
+        return false;
+}
+
+int setup_namespace(const NamespaceParameters *p, char **error_path) {
+
+ _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
+ _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
+ _cleanup_strv_free_ char **hierarchies = NULL;
+ _cleanup_(mount_list_done) MountList ml = {};
+ bool require_prefix = false;
+ const char *root;
+ DissectImageFlags dissect_image_flags =
+ DISSECT_IMAGE_GENERIC_ROOT |
+ DISSECT_IMAGE_REQUIRE_ROOT |
+ DISSECT_IMAGE_DISCARD_ON_LOOP |
+ DISSECT_IMAGE_RELAX_VAR_CHECK |
+ DISSECT_IMAGE_FSCK |
+ DISSECT_IMAGE_USR_NO_ROOT |
+ DISSECT_IMAGE_GROWFS |
+ DISSECT_IMAGE_ADD_PARTITION_DEVICES |
+ DISSECT_IMAGE_PIN_PARTITION_DEVICES;
+ int r;
+
+ assert(p);
+
+ /* Make sure that all mknod(), mkdir() calls we do are unaffected by the umask, and the access modes
+ * we configure take effect */
+ BLOCK_WITH_UMASK(0000);
+
+ bool setup_propagate = !isempty(p->propagate_dir) && !isempty(p->incoming_dir);
+ unsigned long mount_propagation_flag = p->mount_propagation_flag != 0 ? p->mount_propagation_flag : MS_SHARED;
+
+ if (p->root_image) {
+ /* Make the whole image read-only if we can determine that we only access it in a read-only fashion. */
+ if (root_read_only(p->read_only_paths,
+ p->protect_system) &&
+ home_read_only(p->read_only_paths, p->inaccessible_paths, p->empty_directories,
+ p->bind_mounts, p->n_bind_mounts, p->temporary_filesystems, p->n_temporary_filesystems,
+ p->protect_home) &&
+ strv_isempty(p->read_write_paths))
+ dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
+
+ SET_FLAG(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE, p->verity && p->verity->data_path);
+
+ r = loop_device_make_by_path(
+ p->root_image,
+ FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_DEVICE_READ_ONLY) ? O_RDONLY : -1 /* < 0 means writable if possible, read-only as fallback */,
+ /* sector_size= */ UINT32_MAX,
+ FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
+ LOCK_SH,
+ &loop_device);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to create loop device for root image: %m");
+
+ r = dissect_loop_device(
+ loop_device,
+ p->verity,
+ p->root_image_options,
+ p->root_image_policy,
+ dissect_image_flags,
+ &dissected_image);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to dissect image: %m");
+
+ r = dissected_image_load_verity_sig_partition(
+ dissected_image,
+ loop_device->fd,
+ p->verity);
+ if (r < 0)
+ return r;
+
+ r = dissected_image_decrypt(
+ dissected_image,
+ NULL,
+ p->verity,
+ dissect_image_flags);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to decrypt dissected image: %m");
+ }
+
+ if (p->root_directory)
+ root = p->root_directory;
+ else {
+ /* /run/systemd should have been created by PID 1 early on already, but in some cases, like
+ * when running tests (test-execute), it might not have been created yet so let's make sure
+ * we create it if it doesn't already exist. */
+ (void) mkdir_p_label("/run/systemd", 0755);
+
+ /* Always create the mount namespace in a temporary directory, instead of operating directly
+ * in the root. The temporary directory prevents any mounts from being potentially obscured
+ * my other mounts we already applied. We use the same mount point for all images, which is
+ * safe, since they all live in their own namespaces after all, and hence won't see each
+ * other. (Note: this directory is also created by PID 1 early on, we create it here for
+ * similar reasons as /run/systemd/ first.) */
+ root = "/run/systemd/mount-rootfs";
+ (void) mkdir_label(root, 0555);
+
+ require_prefix = true;
+ }
+
+ if (p->n_extension_images > 0 || !strv_isempty(p->extension_directories)) {
+ /* Hierarchy population needs to be done for sysext and confext extension images */
+ r = parse_env_extension_hierarchies(&hierarchies, "SYSTEMD_SYSEXT_AND_CONFEXT_HIERARCHIES");
+ if (r < 0)
+ return r;
+ }
+
+ r = append_access_mounts(&ml, p->read_write_paths, MOUNT_READ_WRITE, require_prefix);
+ if (r < 0)
+ return r;
+
+ r = append_access_mounts(&ml, p->read_only_paths, MOUNT_READ_ONLY, require_prefix);
+ if (r < 0)
+ return r;
+
+ r = append_access_mounts(&ml, p->inaccessible_paths, MOUNT_INACCESSIBLE, require_prefix);
+ if (r < 0)
+ return r;
+
+ r = append_access_mounts(&ml, p->exec_paths, MOUNT_EXEC, require_prefix);
+ if (r < 0)
+ return r;
+
+ r = append_access_mounts(&ml, p->no_exec_paths, MOUNT_NOEXEC, require_prefix);
+ if (r < 0)
+ return r;
+
+ r = append_empty_dir_mounts(&ml, p->empty_directories);
+ if (r < 0)
+ return r;
+
+ r = append_bind_mounts(&ml, p->bind_mounts, p->n_bind_mounts);
+ if (r < 0)
+ return r;
+
+ r = append_tmpfs_mounts(&ml, p->temporary_filesystems, p->n_temporary_filesystems);
+ if (r < 0)
+ return r;
+
+ if (p->tmp_dir) {
+ bool ro = streq(p->tmp_dir, RUN_SYSTEMD_EMPTY);
+
+ MountEntry *me = mount_list_extend(&ml);
+ if (!me)
+ return log_oom_debug();
+
+ *me = (MountEntry) {
+ .path_const = "/tmp",
+ .mode = ro ? MOUNT_PRIVATE_TMP_READ_ONLY : MOUNT_PRIVATE_TMP,
+ .source_const = p->tmp_dir,
+ };
+ }
+
+ if (p->var_tmp_dir) {
+ bool ro = streq(p->var_tmp_dir, RUN_SYSTEMD_EMPTY);
+
+ MountEntry *me = mount_list_extend(&ml);
+ if (!me)
+ return log_oom_debug();
+
+ *me = (MountEntry) {
+ .path_const = "/var/tmp",
+ .mode = ro ? MOUNT_PRIVATE_TMP_READ_ONLY : MOUNT_PRIVATE_TMP,
+ .source_const = p->var_tmp_dir,
+ };
+ }
+
+ r = append_mount_images(&ml, p->mount_images, p->n_mount_images);
+ if (r < 0)
+ return r;
+
+ r = append_extensions(&ml, root, p->extension_dir, hierarchies, p->extension_images, p->n_extension_images, p->extension_directories);
+ if (r < 0)
+ return r;
+
+ if (p->private_dev) {
+ MountEntry *me = mount_list_extend(&ml);
+ if (!me)
+ return log_oom_debug();
+
+ *me = (MountEntry) {
+ .path_const = "/dev",
+ .mode = MOUNT_PRIVATE_DEV,
+ .flags = DEV_MOUNT_OPTIONS,
+ };
+ }
+
+ /* In case /proc is successfully mounted with pid tree subset only (ProcSubset=pid), the protective
+ mounts to non-pid /proc paths would fail. But the pid only option may have failed gracefully, so
+ let's try the mounts but it's not fatal if they don't succeed. */
+ bool ignore_protect_proc = p->ignore_protect_paths || p->proc_subset == PROC_SUBSET_PID;
+ if (p->protect_kernel_tunables) {
+ r = append_static_mounts(&ml,
+ protect_kernel_tunables_proc_table,
+ ELEMENTSOF(protect_kernel_tunables_proc_table),
+ ignore_protect_proc);
+ if (r < 0)
+ return r;
+
+ r = append_static_mounts(&ml,
+ protect_kernel_tunables_sys_table,
+ ELEMENTSOF(protect_kernel_tunables_sys_table),
+ p->ignore_protect_paths);
+ if (r < 0)
+ return r;
+ }
+
+ if (p->protect_kernel_modules) {
+ r = append_static_mounts(&ml,
+ protect_kernel_modules_table,
+ ELEMENTSOF(protect_kernel_modules_table),
+ p->ignore_protect_paths);
+ if (r < 0)
+ return r;
+ }
+
+ if (p->protect_kernel_logs) {
+ r = append_static_mounts(&ml,
+ protect_kernel_logs_proc_table,
+ ELEMENTSOF(protect_kernel_logs_proc_table),
+ ignore_protect_proc);
+ if (r < 0)
+ return r;
+
+ r = append_static_mounts(&ml,
+ protect_kernel_logs_dev_table,
+ ELEMENTSOF(protect_kernel_logs_dev_table),
+ p->ignore_protect_paths);
+ if (r < 0)
+ return r;
+ }
+
+ if (p->protect_control_groups) {
+ MountEntry *me = mount_list_extend(&ml);
+ if (!me)
+ return log_oom_debug();
+
+ *me = (MountEntry) {
+ .path_const = "/sys/fs/cgroup",
+ .mode = MOUNT_READ_ONLY,
+ };
+ }
+
+ r = append_protect_home(&ml, p->protect_home, p->ignore_protect_paths);
+ if (r < 0)
+ return r;
+
+ r = append_protect_system(&ml, p->protect_system, false);
+ if (r < 0)
+ return r;
+
+ if (namespace_parameters_mount_apivfs(p)) {
+ r = append_static_mounts(&ml,
+ apivfs_table,
+ ELEMENTSOF(apivfs_table),
+ p->ignore_protect_paths);
+ if (r < 0)
+ return r;
+ }
+
+ /* Note, if proc is mounted with subset=pid then neither of the two paths will exist, i.e. they are
+ * implicitly protected by the mount option. */
+ if (p->protect_hostname) {
+ r = append_static_mounts(
+ &ml,
+ protect_hostname_table,
+ ELEMENTSOF(protect_hostname_table),
+ ignore_protect_proc);
+ if (r < 0)
+ return r;
+ }
+
+ if (p->private_network) {
+ MountEntry *me = mount_list_extend(&ml);
+ if (!me)
+ return log_oom_debug();
+
+ *me = (MountEntry) {
+ .path_const = "/sys",
+ .mode = MOUNT_PRIVATE_SYSFS,
+ };
+ }
+
+ if (p->private_ipc) {
+ MountEntry *me = mount_list_extend(&ml);
+ if (!me)
+ return log_oom_debug();
+
+ *me = (MountEntry) {
+ .path_const = "/dev/mqueue",
+ .mode = MOUNT_MQUEUEFS,
+ .flags = MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
+ };
+ }
+
+ if (p->creds_path) {
+ /* If our service has a credentials store configured, then bind that one in, but hide
+ * everything else. */
+
+ MountEntry *me = mount_list_extend(&ml);
+ if (!me)
+ return log_oom_debug();
+
+ *me = (MountEntry) {
+ .path_const = "/run/credentials",
+ .mode = MOUNT_TMPFS,
+ .read_only = true,
+ .options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST,
+ .flags = MS_NODEV|MS_STRICTATIME|MS_NOSUID|MS_NOEXEC,
+ };
+
+ me = mount_list_extend(&ml);
+ if (!me)
+ return log_oom_debug();
+
+ *me = (MountEntry) {
+ .path_const = p->creds_path,
+ .mode = MOUNT_BIND,
+ .read_only = true,
+ .source_const = p->creds_path,
+ .ignore = true,
+ };
+ } else {
+ /* If our service has no credentials store configured, then make the whole credentials tree
+ * inaccessible wholesale. */
+
+ MountEntry *me = mount_list_extend(&ml);
+ if (!me)
+ return log_oom_debug();
+
+ *me = (MountEntry) {
+ .path_const = "/run/credentials",
+ .mode = MOUNT_INACCESSIBLE,
+ .ignore = true,
+ };
+ }
+
+ if (p->log_namespace) {
+ _cleanup_free_ char *q = NULL;
+
+ q = strjoin("/run/systemd/journal.", p->log_namespace);
+ if (!q)
+ return log_oom_debug();
+
+ MountEntry *me = mount_list_extend(&ml);
+ if (!me)
+ return log_oom_debug();
+
+ *me = (MountEntry) {
+ .path_const = "/run/systemd/journal",
+ .mode = MOUNT_BIND_RECURSIVE,
+ .read_only = true,
+ .source_malloc = TAKE_PTR(q),
+ };
+ }
+
+ /* Will be used to add bind mounts at runtime */
+ if (setup_propagate) {
+ MountEntry *me = mount_list_extend(&ml);
+ if (!me)
+ return log_oom_debug();
+
+ *me = (MountEntry) {
+ .source_const = p->propagate_dir,
+ .path_const = p->incoming_dir,
+ .mode = MOUNT_BIND,
+ .read_only = true,
+ };
+ }
+
+ if (p->notify_socket) {
+ MountEntry *me = mount_list_extend(&ml);
+ if (!me)
+ return log_oom_debug();
+
+ *me = (MountEntry) {
+ .path_const = p->notify_socket,
+ .source_const = p->notify_socket,
+ .mode = MOUNT_BIND,
+ .read_only = true,
+ };
+ }
+
+ if (p->host_os_release_stage) {
+ MountEntry *me = mount_list_extend(&ml);
+ if (!me)
+ return log_oom_debug();
+
+ *me = (MountEntry) {
+ .path_const = "/run/host/.os-release-stage/",
+ .source_const = p->host_os_release_stage,
+ .mode = MOUNT_BIND,
+ .read_only = true,
+ .ignore = true, /* Live copy, don't hard-fail if it goes missing */
+ };
+ }
+
+ /* Prepend the root directory where that's necessary */
+ r = prefix_where_needed(&ml, root);
+ if (r < 0)
+ return r;
+
+ drop_unused_mounts(&ml, root);
+
+ /* All above is just preparation, figuring out what to do. Let's now actually start doing something. */
+
+ if (unshare(CLONE_NEWNS) < 0) {
+ r = log_debug_errno(errno, "Failed to unshare the mount namespace: %m");
+
+ if (ERRNO_IS_PRIVILEGE(r) ||
+ ERRNO_IS_NOT_SUPPORTED(r))
+ /* If the kernel doesn't support namespaces, or when there's a MAC or seccomp filter
+ * in place that doesn't allow us to create namespaces (or a missing cap), then
+ * propagate a recognizable error back, which the caller can use to detect this case
+ * (and only this) and optionally continue without namespacing applied. */
+ return -ENOANO;
+
+ return r;
+ }
+
+ /* Create the source directory to allow runtime propagation of mounts */
+ if (setup_propagate)
+ (void) mkdir_p(p->propagate_dir, 0600);
+
+ if (p->n_extension_images > 0 || !strv_isempty(p->extension_directories))
+ /* ExtensionImages/Directories mountpoint directories will be created while parsing the
+ * mounts to create, so have the parent ready */
+ (void) mkdir_p(p->extension_dir, 0600);
+
+ /* Remount / as SLAVE so that nothing now mounted in the namespace
+ * shows up in the parent */
+ if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
+ return log_debug_errno(errno, "Failed to remount '/' as SLAVE: %m");
+
+ if (p->root_image) {
+ /* A root image is specified, mount it to the right place */
+ r = dissected_image_mount(
+ dissected_image,
+ root,
+ /* uid_shift= */ UID_INVALID,
+ /* uid_range= */ UID_INVALID,
+ /* userns_fd= */ -EBADF,
+ dissect_image_flags);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to mount root image: %m");
+
+ /* Now release the block device lock, so that udevd is free to call BLKRRPART on the device
+ * if it likes. */
+ r = loop_device_flock(loop_device, LOCK_UN);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to release lock on loopback block device: %m");
+
+ r = dissected_image_relinquish(dissected_image);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to relinquish dissected image: %m");
+
+ } else if (p->root_directory) {
+
+ /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
+ r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to detect that %s is a mount point or not: %m", root);
+ if (r == 0) {
+ r = mount_nofollow_verbose(LOG_DEBUG, root, root, NULL, MS_BIND|MS_REC, NULL);
+ if (r < 0)
+ return r;
+ }
+
+ } else {
+ /* Let's mount the main root directory to the root directory to use */
+ r = mount_nofollow_verbose(LOG_DEBUG, "/", root, NULL, MS_BIND|MS_REC, NULL);
+ if (r < 0)
+ return r;
+ }
+
+ /* Try to set up the new root directory before mounting anything else there. */
+ if (p->root_image || p->root_directory)
+ (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
+
+ /* Now make the magic happen */
+ r = apply_mounts(&ml, root, p, error_path);
+ if (r < 0)
+ return r;
+
+ /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
+ r = mount_switch_root(root, /* mount_propagation_flag = */ 0);
+ if (r == -EINVAL && p->root_directory) {
+ /* If we are using root_directory and we don't have privileges (ie: user manager in a user
+ * namespace) and the root_directory is already a mount point in the parent namespace,
+ * MS_MOVE will fail as we don't have permission to change it (with EINVAL rather than
+ * EPERM). Attempt to bind-mount it over itself (like we do above if it's not already a
+ * mount point) and try again. */
+ r = mount_nofollow_verbose(LOG_DEBUG, root, root, NULL, MS_BIND|MS_REC, NULL);
+ if (r < 0)
+ return r;
+ r = mount_switch_root(root, /* mount_propagation_flag = */ 0);
+ }
+ if (r < 0)
+ return log_debug_errno(r, "Failed to mount root with MS_MOVE: %m");
+
+ /* Remount / as the desired mode. Note that this will not reestablish propagation from our side to
+ * the host, since what's disconnected is disconnected. */
+ if (mount(NULL, "/", NULL, mount_propagation_flag | MS_REC, NULL) < 0)
+ return log_debug_errno(errno, "Failed to remount '/' with desired mount flags: %m");
+
+ /* bind_mount_in_namespace() will MS_MOVE into that directory, and that's only supported for
+ * non-shared mounts. This needs to happen after remounting / or it will fail. */
+ if (setup_propagate && mount(NULL, p->incoming_dir, NULL, MS_SLAVE, NULL) < 0)
+ return log_debug_errno(errno, "Failed to remount %s with MS_SLAVE: %m", p->incoming_dir);
+
+ return 0;
+}
+
+/* Frees the owned strings of each of the 'n' BindMount entries, then the array itself.
+ * b may be NULL only if n is 0. */
+void bind_mount_free_many(BindMount *b, size_t n) {
+ assert(b || n == 0);
+
+ for (size_t i = 0; i < n; i++) {
+ free(b[i].source);
+ free(b[i].destination);
+ }
+
+ free(b);
+}
+
+/* Appends a deep copy of *item to the dynamic array *b of *n elements, growing it by one.
+ * Returns 0 on success, -ENOMEM on allocation failure (in which case *b and *n are unchanged). */
+int bind_mount_add(BindMount **b, size_t *n, const BindMount *item) {
+ _cleanup_free_ char *s = NULL, *d = NULL;
+ BindMount *c;
+
+ assert(b);
+ assert(n);
+ assert(item);
+
+ /* Copy the strings first, so that on failure nothing has been committed to the array yet. */
+ s = strdup(item->source);
+ if (!s)
+ return -ENOMEM;
+
+ d = strdup(item->destination);
+ if (!d)
+ return -ENOMEM;
+
+ c = reallocarray(*b, *n + 1, sizeof(BindMount));
+ if (!c)
+ return -ENOMEM;
+
+ *b = c;
+
+ /* Commit: transfer string ownership into the new slot and bump the counter. */
+ c[(*n) ++] = (BindMount) {
+ .source = TAKE_PTR(s),
+ .destination = TAKE_PTR(d),
+ .read_only = item->read_only,
+ .nosuid = item->nosuid,
+ .recursive = item->recursive,
+ .ignore_enoent = item->ignore_enoent,
+ };
+
+ return 0;
+}
+
+/* Frees all *n MountImage entries (strings and mount-option lists) plus the array, resets *n
+ * to 0 and returns NULL, so callers can write 'm = mount_image_free_many(m, &n);'. Note this
+ * differs from bind_mount_free_many(), which takes n by value and returns nothing. */
+MountImage* mount_image_free_many(MountImage *m, size_t *n) {
+ assert(n);
+ assert(m || *n == 0);
+
+ for (size_t i = 0; i < *n; i++) {
+ free(m[i].source);
+ free(m[i].destination);
+ mount_options_free_all(m[i].mount_options);
+ }
+
+ free(m);
+ *n = 0;
+ return NULL;
+}
+
+/* Appends a deep copy of *item (source, optional destination, and the whole linked list of
+ * MountOptions) to the dynamic array *m of *n elements. Returns 0 on success, -ENOMEM on
+ * allocation failure, in which case *m and *n are unchanged. */
+int mount_image_add(MountImage **m, size_t *n, const MountImage *item) {
+ _cleanup_free_ char *s = NULL, *d = NULL;
+ _cleanup_(mount_options_free_allp) MountOptions *options = NULL;
+ MountImage *c;
+
+ assert(m);
+ assert(n);
+ assert(item);
+
+ s = strdup(item->source);
+ if (!s)
+ return -ENOMEM;
+
+ /* destination is optional (unused for extension images, per the struct definition). */
+ if (item->destination) {
+ d = strdup(item->destination);
+ if (!d)
+ return -ENOMEM;
+ }
+
+ /* Deep-copy the per-partition mount option list, preserving order via LIST_APPEND. */
+ LIST_FOREACH(mount_options, i, item->mount_options) {
+ _cleanup_(mount_options_free_allp) MountOptions *o = NULL;
+
+ o = new(MountOptions, 1);
+ if (!o)
+ return -ENOMEM;
+
+ *o = (MountOptions) {
+ .partition_designator = i->partition_designator,
+ .options = strdup(i->options),
+ };
+ if (!o->options)
+ return -ENOMEM;
+
+ LIST_APPEND(mount_options, options, TAKE_PTR(o));
+ }
+
+ c = reallocarray(*m, *n + 1, sizeof(MountImage));
+ if (!c)
+ return -ENOMEM;
+
+ *m = c;
+
+ /* Commit: hand ownership of all copies to the new array slot. */
+ c[(*n) ++] = (MountImage) {
+ .source = TAKE_PTR(s),
+ .destination = TAKE_PTR(d),
+ .mount_options = TAKE_PTR(options),
+ .ignore_enoent = item->ignore_enoent,
+ .type = item->type,
+ };
+
+ return 0;
+}
+
+/* Frees the path/options strings of each of the 'n' TemporaryFileSystem entries, then the array. */
+void temporary_filesystem_free_many(TemporaryFileSystem *t, size_t n) {
+ assert(t || n == 0);
+
+ for (size_t i = 0; i < n; i++) {
+ free(t[i].path);
+ free(t[i].options);
+ }
+
+ free(t);
+}
+
+/* Appends a new TemporaryFileSystem entry (copied path, optional copied options) to the dynamic
+ * array *t of *n elements. An empty/NULL options string is stored as NULL. Returns 0 on
+ * success, -ENOMEM on allocation failure (in which case *t and *n are unchanged). */
+int temporary_filesystem_add(
+ TemporaryFileSystem **t,
+ size_t *n,
+ const char *path,
+ const char *options) {
+
+ _cleanup_free_ char *p = NULL, *o = NULL;
+ TemporaryFileSystem *c;
+
+ assert(t);
+ assert(n);
+ assert(path);
+
+ p = strdup(path);
+ if (!p)
+ return -ENOMEM;
+
+ if (!isempty(options)) {
+ o = strdup(options);
+ if (!o)
+ return -ENOMEM;
+ }
+
+ c = reallocarray(*t, *n + 1, sizeof(TemporaryFileSystem));
+ if (!c)
+ return -ENOMEM;
+
+ *t = c;
+
+ c[(*n) ++] = (TemporaryFileSystem) {
+ .path = TAKE_PTR(p),
+ .options = TAKE_PTR(o),
+ };
+
+ return 0;
+}
+
+/* Ensures the tmp prefix directory (e.g. /tmp or /var/tmp) exists with mode 01777
+ * (world-writable + sticky). To avoid a window where the directory exists with the wrong
+ * mode, it is first created under a random temporary name, chmod'ed, and then renamed into
+ * place. Returns 0 on success (including when somebody else created it concurrently),
+ * negative errno on failure. */
+static int make_tmp_prefix(const char *prefix) {
+ _cleanup_free_ char *t = NULL;
+ _cleanup_close_ int fd = -EBADF;
+ int r;
+
+ /* Don't do anything unless we know the dir is actually missing */
+ r = access(prefix, F_OK);
+ if (r >= 0)
+ return 0;
+ if (errno != ENOENT)
+ return -errno;
+
+ /* Parent directories get plain 0755; umask is cleared so the mode is applied exactly. */
+ WITH_UMASK(000)
+ r = mkdir_parents(prefix, 0755);
+ if (r < 0)
+ return r;
+
+ r = tempfn_random(prefix, NULL, &t);
+ if (r < 0)
+ return r;
+
+ /* umask will corrupt this access mode, but that doesn't matter, we need to call chmod() anyway for
+ * the suid bit, below. */
+ fd = open_mkdir_at(AT_FDCWD, t, O_EXCL|O_CLOEXEC, 0777);
+ if (fd < 0)
+ return fd;
+
+ r = RET_NERRNO(fchmod(fd, 01777));
+ if (r < 0) {
+ (void) rmdir(t);
+ return r;
+ }
+
+ r = RET_NERRNO(rename(t, prefix));
+ if (r < 0) {
+ (void) rmdir(t);
+ return r == -EEXIST ? 0 : r; /* it's fine if someone else created the dir by now */
+ }
+
+ return 0;
+
+}
+
+/* Creates a per-service private tmp directory "<prefix>/systemd-private-<bootid>-<id>-XXXXXX"
+ * (mode 0700 via umask 0077) containing a world-writable sticky "tmp" subdirectory. On success
+ * *path gets the private directory and, if tmp_path is non-NULL, *tmp_path gets the inner
+ * "tmp" path. If the prefix is read-only or out of space (EROFS / disk-space errno), falls
+ * back to returning RUN_SYSTEMD_EMPTY in *path so the service observes a read-only /tmp
+ * instead of failing. Returns 0 on success, negative errno otherwise. */
+static int setup_one_tmp_dir(const char *id, const char *prefix, char **path, char **tmp_path) {
+ _cleanup_free_ char *x = NULL;
+ _cleanup_free_ char *y = NULL;
+ sd_id128_t boot_id;
+ bool rw = true;
+ int r;
+
+ assert(id);
+ assert(prefix);
+ assert(path);
+
+ /* We include the boot id in the directory so that after a
+ * reboot we can easily identify obsolete directories. */
+
+ r = sd_id128_get_boot(&boot_id);
+ if (r < 0)
+ return r;
+
+ x = strjoin(prefix, "/systemd-private-", SD_ID128_TO_STRING(boot_id), "-", id, "-XXXXXX");
+ if (!x)
+ return -ENOMEM;
+
+ r = make_tmp_prefix(prefix);
+ if (r < 0)
+ return r;
+
+ /* mkdtemp() replaces the XXXXXX suffix in place; umask 0077 keeps the dir private. */
+ WITH_UMASK(0077)
+ if (!mkdtemp(x)) {
+ if (errno == EROFS || ERRNO_IS_DISK_SPACE(errno))
+ rw = false;
+ else
+ return -errno;
+ }
+
+ if (rw) {
+ y = strjoin(x, "/tmp");
+ if (!y)
+ return -ENOMEM;
+
+ /* The inner "tmp" dir gets the classic 01777 sticky world-writable mode. */
+ WITH_UMASK(0000)
+ if (mkdir(y, 0777 | S_ISVTX) < 0)
+ return -errno;
+
+ /* Apply the MAC label the prefix (e.g. /tmp) would mandate for this path. */
+ r = label_fix_full(AT_FDCWD, y, prefix, 0);
+ if (r < 0)
+ return r;
+
+ if (tmp_path)
+ *tmp_path = TAKE_PTR(y);
+ } else {
+ /* Trouble: we failed to create the directory. Instead of failing, let's simulate /tmp being
+ * read-only. This way the service will get the EROFS result as if it was writing to the real
+ * file system. */
+ WITH_UMASK(0000)
+ r = mkdir_p(RUN_SYSTEMD_EMPTY, 0500);
+ if (r < 0)
+ return r;
+
+ r = free_and_strdup(&x, RUN_SYSTEMD_EMPTY);
+ if (r < 0)
+ return r;
+ }
+
+ *path = TAKE_PTR(x);
+ return 0;
+}
+
+/* Sets up both private tmp directories for the unit 'id': one under /tmp and one under
+ * /var/tmp. On success ownership of both paths transfers to the caller via *tmp_dir and
+ * *var_tmp_dir; on failure the already-created /tmp directory (and its inner "tmp" subdir)
+ * is cleaned up by the _cleanup_ handlers. Returns 0 on success, negative errno on failure. */
+int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
+ _cleanup_(namespace_cleanup_tmpdirp) char *a = NULL;
+ _cleanup_(rmdir_and_freep) char *a_tmp = NULL;
+ char *b;
+ int r;
+
+ assert(id);
+ assert(tmp_dir);
+ assert(var_tmp_dir);
+
+ r = setup_one_tmp_dir(id, "/tmp", &a, &a_tmp);
+ if (r < 0)
+ return r;
+
+ r = setup_one_tmp_dir(id, "/var/tmp", &b, NULL);
+ if (r < 0)
+ return r;
+
+ /* Both succeeded: disarm the cleanup of the inner /tmp subdir and hand everything over. */
+ a_tmp = mfree(a_tmp); /* avoid rmdir */
+ *tmp_dir = TAKE_PTR(a);
+ *var_tmp_dir = TAKE_PTR(b);
+
+ return 0;
+}
+
+/* Joins (or creates) a namespace of type 'nsflag' shared via the given storage socketpair.
+ * If a namespace fd was already stashed in the socketpair, this process joins it with setns()
+ * and 0 is returned; otherwise a fresh namespace is created with unshare(), its fd is stashed
+ * back into the socketpair for the next caller, and 1 is returned. Negative errno on failure. */
+int setup_shareable_ns(int ns_storage_socket[static 2], unsigned long nsflag) {
+ _cleanup_close_ int ns = -EBADF;
+ const char *ns_name, *ns_path;
+ int r;
+
+ assert(ns_storage_socket);
+ assert(ns_storage_socket[0] >= 0);
+ assert(ns_storage_socket[1] >= 0);
+
+ ns_name = ASSERT_PTR(namespace_single_flag_to_string(nsflag));
+
+ /* We use the passed socketpair as a storage buffer for our namespace reference fd. Whatever process
+ * runs this first shall create a new namespace, all others should just join it. To serialize that we
+ * use a file lock on the socket pair.
+ *
+ * It's a bit crazy, but hey, works great! */
+
+ r = posix_lock(ns_storage_socket[0], LOCK_EX);
+ if (r < 0)
+ return r;
+
+ CLEANUP_POSIX_UNLOCK(ns_storage_socket[0]);
+
+ /* MSG_PEEK keeps the fd queued in the socket so later callers can fetch it too;
+ * MSG_DONTWAIT makes this return -EAGAIN when nothing is stored yet. */
+ ns = receive_one_fd(ns_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
+ if (ns >= 0) {
+ /* Yay, found something, so let's join the namespace */
+ r = RET_NERRNO(setns(ns, nsflag));
+ if (r < 0)
+ return r;
+
+ return 0;
+ }
+
+ if (ns != -EAGAIN)
+ return ns;
+
+ /* Nothing stored yet, so let's create a new namespace. */
+
+ if (unshare(nsflag) < 0)
+ return -errno;
+
+ /* A fresh network namespace has no interfaces; bring up at least the loopback device. */
+ if (nsflag == CLONE_NEWNET)
+ (void) loopback_setup();
+
+ ns_path = strjoina("/proc/self/ns/", ns_name);
+ ns = open(ns_path, O_RDONLY|O_CLOEXEC|O_NOCTTY);
+ if (ns < 0)
+ return -errno;
+
+ r = send_one_fd(ns_storage_socket[1], ns, MSG_DONTWAIT);
+ if (r < 0)
+ return r;
+
+ return 1;
+}
+
+/* Pre-seeds the shared-namespace storage socketpair with a namespace fd opened from 'path'
+ * (e.g. a bind-mounted /proc/<pid>/ns/<type> file), after validating the fd really refers to
+ * a namespace of type 'nsflag'. Returns 0 if a namespace fd was already stored, 1 if this
+ * call stored one, -EINVAL if the file is not a namespace of the requested type, other
+ * negative errno on failure. Meant to run before setup_shareable_ns(). */
+int open_shareable_ns_path(int ns_storage_socket[static 2], const char *path, unsigned long nsflag) {
+ _cleanup_close_ int ns = -EBADF;
+ int r;
+
+ assert(ns_storage_socket);
+ assert(ns_storage_socket[0] >= 0);
+ assert(ns_storage_socket[1] >= 0);
+ assert(path);
+
+ /* If the storage socket doesn't contain a ns fd yet, open one via the file system and store it in
+ * it. This is supposed to be called ahead of time, i.e. before setup_shareable_ns() which will
+ * allocate a new anonymous ns if needed. */
+
+ r = posix_lock(ns_storage_socket[0], LOCK_EX);
+ if (r < 0)
+ return r;
+
+ CLEANUP_POSIX_UNLOCK(ns_storage_socket[0]);
+
+ /* MSG_PEEK: leave any stored fd in place for other consumers; -EAGAIN means empty. */
+ ns = receive_one_fd(ns_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
+ if (ns >= 0)
+ return 0;
+ if (ns != -EAGAIN)
+ return ns;
+
+ /* Nothing stored yet. Open the file from the file system. */
+
+ ns = open(path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
+ if (ns < 0)
+ return -errno;
+
+ r = fd_is_ns(ns, nsflag);
+ if (r == 0)
+ return -EINVAL;
+ if (r < 0 && r != -EUCLEAN) /* EUCLEAN: we don't know */
+ return r;
+
+ r = send_one_fd(ns_storage_socket[1], ns, MSG_DONTWAIT);
+ if (r < 0)
+ return r;
+
+ return 1;
+}
+
+/* Returns true if the running kernel supports the given namespace type, determined by
+ * checking whether the corresponding /proc/self/ns/<type> entry exists. */
+bool ns_type_supported(NamespaceType type) {
+ const char *t, *ns_proc;
+
+ t = namespace_type_to_string(type);
+ if (!t) /* Don't know how to translate this? Then it's not supported */
+ return false;
+
+ ns_proc = strjoina("/proc/self/ns/", t);
+ return access(ns_proc, F_OK) == 0;
+}
+
+/* String tables mapping the config-file enum settings (ProtectHome=, ProtectSystem=,
+ * ProtectProc=, ProcSubset=) and namespace types to their textual names. The
+ * DEFINE_STRING_TABLE_LOOKUP* macros generate the matching *_to_string()/*_from_string()
+ * helpers declared in namespace.h; the _WITH_BOOLEAN variants additionally accept the
+ * usual boolean spellings, mapping truth to the given "yes" value. */
+static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
+ [PROTECT_HOME_NO] = "no",
+ [PROTECT_HOME_YES] = "yes",
+ [PROTECT_HOME_READ_ONLY] = "read-only",
+ [PROTECT_HOME_TMPFS] = "tmpfs",
+};
+
+DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_home, ProtectHome, PROTECT_HOME_YES);
+
+static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
+ [PROTECT_SYSTEM_NO] = "no",
+ [PROTECT_SYSTEM_YES] = "yes",
+ [PROTECT_SYSTEM_FULL] = "full",
+ [PROTECT_SYSTEM_STRICT] = "strict",
+};
+
+DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_system, ProtectSystem, PROTECT_SYSTEM_YES);
+
+/* These are the kernel's short namespace names as used under /proc/self/ns/. */
+static const char* const namespace_type_table[] = {
+ [NAMESPACE_MOUNT] = "mnt",
+ [NAMESPACE_CGROUP] = "cgroup",
+ [NAMESPACE_UTS] = "uts",
+ [NAMESPACE_IPC] = "ipc",
+ [NAMESPACE_USER] = "user",
+ [NAMESPACE_PID] = "pid",
+ [NAMESPACE_NET] = "net",
+ [NAMESPACE_TIME] = "time",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);
+
+static const char* const protect_proc_table[_PROTECT_PROC_MAX] = {
+ [PROTECT_PROC_DEFAULT] = "default",
+ [PROTECT_PROC_NOACCESS] = "noaccess",
+ [PROTECT_PROC_INVISIBLE] = "invisible",
+ [PROTECT_PROC_PTRACEABLE] = "ptraceable",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(protect_proc, ProtectProc);
+
+static const char* const proc_subset_table[_PROC_SUBSET_MAX] = {
+ [PROC_SUBSET_ALL] = "all",
+ [PROC_SUBSET_PID] = "pid",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(proc_subset, ProcSubset);
diff --git a/src/core/namespace.h b/src/core/namespace.h
new file mode 100644
index 0000000..921716b
--- /dev/null
+++ b/src/core/namespace.h
@@ -0,0 +1,200 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+/***
+ Copyright © 2016 Djalal Harouni
+***/
+
+typedef struct NamespaceParameters NamespaceParameters;
+typedef struct BindMount BindMount;
+typedef struct TemporaryFileSystem TemporaryFileSystem;
+typedef struct MountImage MountImage;
+
+#include <stdbool.h>
+
+#include "dissect-image.h"
+#include "fs-util.h"
+#include "macro.h"
+#include "namespace-util.h"
+#include "runtime-scope.h"
+#include "string-util.h"
+
+typedef enum ProtectHome {
+ PROTECT_HOME_NO,
+ PROTECT_HOME_YES,
+ PROTECT_HOME_READ_ONLY,
+ PROTECT_HOME_TMPFS,
+ _PROTECT_HOME_MAX,
+ _PROTECT_HOME_INVALID = -EINVAL,
+} ProtectHome;
+
+typedef enum ProtectSystem {
+ PROTECT_SYSTEM_NO,
+ PROTECT_SYSTEM_YES,
+ PROTECT_SYSTEM_FULL,
+ PROTECT_SYSTEM_STRICT,
+ _PROTECT_SYSTEM_MAX,
+ _PROTECT_SYSTEM_INVALID = -EINVAL,
+} ProtectSystem;
+
+typedef enum ProtectProc {
+ PROTECT_PROC_DEFAULT,
+ PROTECT_PROC_NOACCESS, /* hidepid=noaccess */
+ PROTECT_PROC_INVISIBLE, /* hidepid=invisible */
+ PROTECT_PROC_PTRACEABLE, /* hidepid=ptraceable */
+ _PROTECT_PROC_MAX,
+ _PROTECT_PROC_INVALID = -EINVAL,
+} ProtectProc;
+
+typedef enum ProcSubset {
+ PROC_SUBSET_ALL,
+ PROC_SUBSET_PID, /* subset=pid */
+ _PROC_SUBSET_MAX,
+ _PROC_SUBSET_INVALID = -EINVAL,
+} ProcSubset;
+
+struct BindMount {
+ char *source;
+ char *destination;
+ bool read_only;
+ bool nosuid;
+ bool recursive;
+ bool ignore_enoent;
+};
+
+struct TemporaryFileSystem {
+ char *path;
+ char *options;
+};
+
+typedef enum MountImageType {
+ MOUNT_IMAGE_DISCRETE,
+ MOUNT_IMAGE_EXTENSION,
+ _MOUNT_IMAGE_TYPE_MAX,
+ _MOUNT_IMAGE_TYPE_INVALID = -EINVAL,
+} MountImageType;
+
+struct MountImage {
+ char *source;
+ char *destination; /* Unused if MountImageType == MOUNT_IMAGE_EXTENSION */
+ LIST_HEAD(MountOptions, mount_options);
+ bool ignore_enoent;
+ MountImageType type;
+};
+
+struct NamespaceParameters {
+ RuntimeScope runtime_scope;
+
+ const char *root_directory;
+ const char *root_image;
+ const MountOptions *root_image_options;
+ const ImagePolicy *root_image_policy;
+
+ char **read_write_paths;
+ char **read_only_paths;
+ char **inaccessible_paths;
+
+ char **exec_paths;
+ char **no_exec_paths;
+
+ char **empty_directories;
+ char **symlinks;
+
+ const BindMount *bind_mounts;
+ size_t n_bind_mounts;
+
+ const TemporaryFileSystem *temporary_filesystems;
+ size_t n_temporary_filesystems;
+
+ const MountImage *mount_images;
+ size_t n_mount_images;
+ const ImagePolicy *mount_image_policy;
+
+ const char *tmp_dir;
+ const char *var_tmp_dir;
+
+ const char *creds_path;
+ const char *log_namespace;
+
+ unsigned long mount_propagation_flag;
+ VeritySettings *verity;
+
+ const MountImage *extension_images;
+ size_t n_extension_images;
+ const ImagePolicy *extension_image_policy;
+ char **extension_directories;
+
+ const char *propagate_dir;
+ const char *incoming_dir;
+
+ const char *extension_dir;
+ const char *notify_socket;
+ const char *host_os_release_stage;
+
+ bool ignore_protect_paths;
+
+ bool protect_control_groups;
+ bool protect_kernel_tunables;
+ bool protect_kernel_modules;
+ bool protect_kernel_logs;
+ bool protect_hostname;
+
+ bool private_dev;
+ bool private_network;
+ bool private_ipc;
+
+ bool mount_apivfs;
+ bool mount_nosuid;
+
+ ProtectHome protect_home;
+ ProtectSystem protect_system;
+ ProtectProc protect_proc;
+ ProcSubset proc_subset;
+};
+
+int setup_namespace(const NamespaceParameters *p, char **error_path);
+
+#define RUN_SYSTEMD_EMPTY "/run/systemd/empty"
+
+/* Cleanup helper for private tmp dir paths: removes the directory (unless it is the shared
+ * read-only RUN_SYSTEMD_EMPTY sentinel, which must never be deleted) and frees the string,
+ * returning NULL. PROTECT_ERRNO keeps the rmdir() from clobbering the caller's errno, since
+ * this runs from _cleanup_ handlers on error paths. */
+static inline char* namespace_cleanup_tmpdir(char *p) {
+ PROTECT_ERRNO;
+ if (!streq_ptr(p, RUN_SYSTEMD_EMPTY))
+ (void) rmdir(p);
+ return mfree(p);
+}
+
+int setup_tmp_dirs(
+ const char *id,
+ char **tmp_dir,
+ char **var_tmp_dir);
+
+int setup_shareable_ns(int ns_storage_socket[static 2], unsigned long nsflag);
+int open_shareable_ns_path(int netns_storage_socket[static 2], const char *path, unsigned long nsflag);
+
+const char* protect_home_to_string(ProtectHome p) _const_;
+ProtectHome protect_home_from_string(const char *s) _pure_;
+
+const char* protect_system_to_string(ProtectSystem p) _const_;
+ProtectSystem protect_system_from_string(const char *s) _pure_;
+
+const char* protect_proc_to_string(ProtectProc i) _const_;
+ProtectProc protect_proc_from_string(const char *s) _pure_;
+
+const char* proc_subset_to_string(ProcSubset i) _const_;
+ProcSubset proc_subset_from_string(const char *s) _pure_;
+
+void bind_mount_free_many(BindMount *b, size_t n);
+int bind_mount_add(BindMount **b, size_t *n, const BindMount *item);
+
+void temporary_filesystem_free_many(TemporaryFileSystem *t, size_t n);
+int temporary_filesystem_add(TemporaryFileSystem **t, size_t *n,
+ const char *path, const char *options);
+
+MountImage* mount_image_free_many(MountImage *m, size_t *n);
+int mount_image_add(MountImage **m, size_t *n, const MountImage *item);
+
+const char* namespace_type_to_string(NamespaceType t) _const_;
+NamespaceType namespace_type_from_string(const char *s) _pure_;
+
+bool ns_type_supported(NamespaceType type);
diff --git a/src/core/org.freedesktop.systemd1.conf b/src/core/org.freedesktop.systemd1.conf
new file mode 100644
index 0000000..52034e0
--- /dev/null
+++ b/src/core/org.freedesktop.systemd1.conf
@@ -0,0 +1,452 @@
+<?xml version="1.0"?> <!--*-nxml-*-->
+<!DOCTYPE busconfig PUBLIC "-//freedesktop//DTD D-BUS Bus Configuration 1.0//EN"
+ "https://www.freedesktop.org/standards/dbus/1.0/busconfig.dtd">
+
+<!--
+ SPDX-License-Identifier: LGPL-2.1-or-later
+
+ This file is part of systemd.
+
+ systemd is free software; you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or
+ (at your option) any later version.
+-->
+
+<busconfig>
+
+ <policy user="root">
+ <allow own="org.freedesktop.systemd1"/>
+
+ <!-- Root clients can do everything -->
+ <allow send_destination="org.freedesktop.systemd1"/>
+ <allow receive_sender="org.freedesktop.systemd1"/>
+
+ <!-- systemd may receive activator requests -->
+ <allow receive_interface="org.freedesktop.systemd1.Activator"
+ receive_member="ActivationRequest"/>
+ </policy>
+
+ <policy context="default">
+ <deny send_destination="org.freedesktop.systemd1"/>
+
+ <!-- Completely open to anyone: org.freedesktop.DBus.* interfaces -->
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.DBus.Introspectable"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.DBus.Peer"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.DBus.Properties"
+ send_member="Get"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.DBus.Properties"
+ send_member="GetAll"/>
+
+ <!-- Completely open to anyone: org.freedesktop.systemd1.Manager interface -->
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="GetUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="GetUnitByPID"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="GetUnitByInvocationID"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="GetUnitByControlGroup"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="GetUnitByPIDFD"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="LoadUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="GetUnitProcesses"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="GetJob"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="GetJobAfter"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="GetJobBefore"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="ListUnits"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="ListUnitsFiltered"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="ListUnitsByPatterns"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="ListUnitsByNames"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="ListJobs"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="Subscribe"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="Unsubscribe"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="Dump"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="DumpByFileDescriptor"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="DumpUnitsMatchingPatterns"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="DumpUnitsMatchingPatternsByFileDescriptor"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="ListUnitFiles"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="ListUnitFilesByPatterns"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="GetUnitFileState"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="GetDefaultTarget"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="GetUnitFileLinks"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="LookupDynamicUserByName"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="LookupDynamicUserByUID"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="GetDynamicUsers"/>
+
+ <!-- Completely open to anyone: org.freedesktop.systemd1.Unit interface -->
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Service"
+ send_member="GetProcesses"/>
+
+ <!-- Completely open to anyone: org.freedesktop.systemd1.Slice interface -->
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Slice"
+ send_member="GetProcesses"/>
+
+ <!-- Completely open to anyone: org.freedesktop.systemd1.Scope interface -->
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Scope"
+ send_member="GetProcesses"/>
+
+ <!-- Completely open to anyone: org.freedesktop.systemd1.Socket interface -->
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Socket"
+ send_member="GetProcesses"/>
+
+ <!-- Completely open to anyone: org.freedesktop.systemd1.Mount interface -->
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Mount"
+ send_member="GetProcesses"/>
+
+ <!-- Completely open to anyone: org.freedesktop.systemd1.Swap interface -->
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Swap"
+ send_member="GetProcesses"/>
+
+ <!-- Managed via polkit or other criteria: org.freedesktop.systemd1.Manager interface -->
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="StartUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="StartUnitReplace"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="StopUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="ReloadUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="RestartUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="TryRestartUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="ReloadOrRestartUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="ReloadOrTryRestartUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="BindMountUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="MountImageUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="KillUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="QueueSignalUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="ResetFailedUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="SetUnitProperties"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="RefUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="UnrefUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="StartTransientUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="AttachProcessesToUnit"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="CancelJob"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="ClearJobs"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="ResetFailed"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="Reload"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="Reexecute"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="EnableUnitFiles"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="EnableUnitFilesWithFlags"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="DisableUnitFiles"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="DisableUnitFilesWithFlags"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="DisableUnitFilesWithFlagsAndInstallInfo"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="ReenableUnitFiles"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="LinkUnitFiles"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="PresetUnitFiles"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="PresetUnitFilesWithMode"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="MaskUnitFiles"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="UnmaskUnitFiles"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="RevertUnitFiles"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="SetDefaultTarget"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="PresetAllUnitFiles"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="AddDependencyUnitFiles"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Manager"
+ send_member="SetShowStatus"/>
+
+ <!-- Managed via polkit or other criteria: org.freedesktop.systemd1.Job interface -->
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Job"
+ send_member="Cancel"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Job"
+ send_member="GetAfter"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Job"
+ send_member="GetBefore"/>
+
+ <!-- Managed via polkit or other criteria: org.freedesktop.systemd1.Unit interface -->
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Unit"
+ send_member="Start"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Unit"
+ send_member="Stop"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Unit"
+ send_member="Reload"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Unit"
+ send_member="Restart"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Unit"
+ send_member="TryRestart"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Unit"
+ send_member="ReloadOrRestart"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Unit"
+ send_member="ReloadOrTryRestart"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Unit"
+ send_member="Kill"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Unit"
+ send_member="QueueSignal"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Unit"
+ send_member="ResetFailed"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Unit"
+ send_member="SetProperties"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Unit"
+ send_member="Ref"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Unit"
+ send_member="Unref"/>
+
+ <!-- Managed via polkit or other criteria: org.freedesktop.systemd1.Service interface -->
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Service"
+ send_member="AttachProcesses"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Service"
+ send_member="BindMount"/>
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Service"
+ send_member="MountImage"/>
+
+ <!-- Managed via polkit or other criteria: org.freedesktop.systemd1.Scope interface -->
+
+ <allow send_destination="org.freedesktop.systemd1"
+ send_interface="org.freedesktop.systemd1.Scope"
+ send_member="AttachProcesses"/>
+
+ <allow receive_sender="org.freedesktop.systemd1"/>
+ </policy>
+
+</busconfig>
diff --git a/src/core/org.freedesktop.systemd1.policy.in b/src/core/org.freedesktop.systemd1.policy.in
new file mode 100644
index 0000000..0083e0b
--- /dev/null
+++ b/src/core/org.freedesktop.systemd1.policy.in
@@ -0,0 +1,83 @@
+<?xml version="1.0" encoding="UTF-8"?> <!--*-nxml-*-->
+<!DOCTYPE policyconfig PUBLIC "-//freedesktop//DTD PolicyKit Policy Configuration 1.0//EN"
+ "https://www.freedesktop.org/standards/PolicyKit/1/policyconfig.dtd">
+
+<!--
+ SPDX-License-Identifier: LGPL-2.1-or-later
+
+ This file is part of systemd.
+
+ systemd is free software; you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or
+ (at your option) any later version.
+-->
+
+<policyconfig>
+
+ <vendor>The systemd Project</vendor>
+ <vendor_url>https://systemd.io</vendor_url>
+
+ <action id="org.freedesktop.systemd1.reply-password">
+ <description gettext-domain="systemd">Send passphrase back to system</description>
+ <message gettext-domain="systemd">Authentication is required to send the entered passphrase back to the system.</message>
+ <defaults>
+ <allow_any>no</allow_any>
+ <allow_inactive>no</allow_inactive>
+ <allow_active>auth_admin_keep</allow_active>
+ </defaults>
+ <annotate key="org.freedesktop.policykit.exec.path">{{LIBEXECDIR}}/systemd-reply-password</annotate>
+ </action>
+
+ <action id="org.freedesktop.systemd1.manage-units">
+ <description gettext-domain="systemd">Manage system services or other units</description>
+ <message gettext-domain="systemd">Authentication is required to manage system services or other units.</message>
+ <defaults>
+ <allow_any>auth_admin</allow_any>
+ <allow_inactive>auth_admin</allow_inactive>
+ <allow_active>auth_admin_keep</allow_active>
+ </defaults>
+ </action>
+
+ <action id="org.freedesktop.systemd1.manage-unit-files">
+ <description gettext-domain="systemd">Manage system service or unit files</description>
+ <message gettext-domain="systemd">Authentication is required to manage system service or unit files.</message>
+ <defaults>
+ <allow_any>auth_admin</allow_any>
+ <allow_inactive>auth_admin</allow_inactive>
+ <allow_active>auth_admin_keep</allow_active>
+ </defaults>
+ <annotate key="org.freedesktop.policykit.imply">org.freedesktop.systemd1.reload-daemon org.freedesktop.systemd1.manage-units</annotate>
+ </action>
+
+ <action id="org.freedesktop.systemd1.set-environment">
+ <description gettext-domain="systemd">Set or unset system and service manager environment variables</description>
+ <message gettext-domain="systemd">Authentication is required to set or unset system and service manager environment variables.</message>
+ <defaults>
+ <allow_any>auth_admin</allow_any>
+ <allow_inactive>auth_admin</allow_inactive>
+ <allow_active>auth_admin_keep</allow_active>
+ </defaults>
+ </action>
+
+ <action id="org.freedesktop.systemd1.reload-daemon">
+ <description gettext-domain="systemd">Reload the systemd state</description>
+ <message gettext-domain="systemd">Authentication is required to reload the systemd state.</message>
+ <defaults>
+ <allow_any>auth_admin</allow_any>
+ <allow_inactive>auth_admin</allow_inactive>
+ <allow_active>auth_admin_keep</allow_active>
+ </defaults>
+ </action>
+
+ <action id="org.freedesktop.systemd1.bypass-dump-ratelimit">
+ <description gettext-domain="systemd">Dump the systemd state without rate limits</description>
+ <message gettext-domain="systemd">Authentication is required to dump the systemd state without rate limits.</message>
+ <defaults>
+ <allow_any>auth_admin</allow_any>
+ <allow_inactive>auth_admin</allow_inactive>
+ <allow_active>auth_admin_keep</allow_active>
+ </defaults>
+ </action>
+
+</policyconfig>
diff --git a/src/core/org.freedesktop.systemd1.service b/src/core/org.freedesktop.systemd1.service
new file mode 100644
index 0000000..082125f
--- /dev/null
+++ b/src/core/org.freedesktop.systemd1.service
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+#
+# This file is part of systemd.
+#
+# systemd is free software; you can redistribute it and/or modify it
+# under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or
+# (at your option) any later version.
+
+[D-BUS Service]
+Name=org.freedesktop.systemd1
+Exec=/bin/false
+User=root
diff --git a/src/core/path.c b/src/core/path.c
new file mode 100644
index 0000000..ef00c20
--- /dev/null
+++ b/src/core/path.c
@@ -0,0 +1,1075 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <sys/epoll.h>
+#include <sys/inotify.h>
+#include <unistd.h>
+
+#include "bus-error.h"
+#include "bus-util.h"
+#include "dbus-path.h"
+#include "dbus-unit.h"
+#include "escape.h"
+#include "event-util.h"
+#include "fd-util.h"
+#include "glob-util.h"
+#include "inotify-util.h"
+#include "macro.h"
+#include "mkdir-label.h"
+#include "path.h"
+#include "path-util.h"
+#include "serialize.h"
+#include "special.h"
+#include "stat-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "unit-name.h"
+#include "unit.h"
+
+static const UnitActiveState state_translation_table[_PATH_STATE_MAX] = {
+ [PATH_DEAD] = UNIT_INACTIVE,
+ [PATH_WAITING] = UNIT_ACTIVE,
+ [PATH_RUNNING] = UNIT_ACTIVE,
+ [PATH_FAILED] = UNIT_FAILED,
+};
+
+static int path_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+
+int path_spec_watch(PathSpec *s, sd_event_io_handler_t handler) {
+ static const int flags_table[_PATH_TYPE_MAX] = {
+ [PATH_EXISTS] = IN_DELETE_SELF|IN_MOVE_SELF|IN_ATTRIB,
+ [PATH_EXISTS_GLOB] = IN_DELETE_SELF|IN_MOVE_SELF|IN_ATTRIB,
+ [PATH_CHANGED] = IN_DELETE_SELF|IN_MOVE_SELF|IN_ATTRIB|IN_CLOSE_WRITE|IN_CREATE|IN_DELETE|IN_MOVED_FROM|IN_MOVED_TO,
+ [PATH_MODIFIED] = IN_DELETE_SELF|IN_MOVE_SELF|IN_ATTRIB|IN_CLOSE_WRITE|IN_CREATE|IN_DELETE|IN_MOVED_FROM|IN_MOVED_TO|IN_MODIFY,
+ [PATH_DIRECTORY_NOT_EMPTY] = IN_DELETE_SELF|IN_MOVE_SELF|IN_ATTRIB|IN_CREATE|IN_MOVED_TO,
+ };
+
+ bool exists = false;
+ char *slash, *oldslash = NULL;
+ int r;
+
+ assert(s);
+ assert(s->unit);
+ assert(handler);
+
+ path_spec_unwatch(s);
+
+ s->inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
+ if (s->inotify_fd < 0) {
+ r = log_error_errno(errno, "Failed to allocate inotify fd: %m");
+ goto fail;
+ }
+
+ r = sd_event_add_io(s->unit->manager->event, &s->event_source, s->inotify_fd, EPOLLIN, handler, s);
+ if (r < 0) {
+ log_error_errno(r, "Failed to add inotify fd to event loop: %m");
+ goto fail;
+ }
+
+ (void) sd_event_source_set_description(s->event_source, "path");
+
+ /* This function assumes the path was passed through path_simplify()! */
+ assert(!strstr(s->path, "//"));
+
+ for (slash = strchr(s->path, '/'); ; slash = strchr(slash+1, '/')) {
+ bool incomplete = false;
+ int flags, wd = -1;
+ char tmp, *cut;
+
+ if (slash) {
+ cut = slash + (slash == s->path);
+ tmp = *cut;
+ *cut = '\0';
+
+ flags = IN_MOVE_SELF | IN_DELETE_SELF | IN_ATTRIB | IN_CREATE | IN_MOVED_TO;
+ } else {
+ cut = NULL;
+ flags = flags_table[s->type];
+ }
+
+ /* If this is a symlink watch both the symlink inode and where it points to. If the inode is
+ * not a symlink both calls will install the same watch, which is redundant and doesn't
+ * hurt. */
+ for (int follow_symlink = 0; follow_symlink < 2; follow_symlink ++) {
+ uint32_t f = flags;
+
+ SET_FLAG(f, IN_DONT_FOLLOW, !follow_symlink);
+
+ wd = inotify_add_watch(s->inotify_fd, s->path, f);
+ if (wd < 0) {
+ if (IN_SET(errno, EACCES, ENOENT)) {
+ incomplete = true; /* This is an expected error, let's accept this
+ * quietly: we have an incomplete watch for
+ * now. */
+ break;
+ }
+
+ /* This second call to inotify_add_watch() should fail like the previous one
+ * and is done for logging the error in a comprehensive way. */
+ wd = inotify_add_watch_and_warn(s->inotify_fd, s->path, f);
+ if (wd < 0) {
+ if (cut)
+ *cut = tmp;
+
+ r = wd;
+ goto fail;
+ }
+
+ /* Hmm, we succeeded in adding the watch this time... let's continue. */
+ }
+ }
+
+ if (incomplete) {
+ if (cut)
+ *cut = tmp;
+
+ break;
+ }
+
+ exists = true;
+
+ /* Path exists, we don't need to watch parent too closely. */
+ if (oldslash) {
+ char *cut2 = oldslash + (oldslash == s->path);
+ char tmp2 = *cut2;
+ *cut2 = '\0';
+
+ (void) inotify_add_watch(s->inotify_fd, s->path, IN_MOVE_SELF);
+ /* Error is ignored, the worst can happen is we get spurious events. */
+
+ *cut2 = tmp2;
+ }
+
+ if (cut)
+ *cut = tmp;
+
+ if (slash)
+ oldslash = slash;
+ else {
+ /* whole path has been iterated over */
+ s->primary_wd = wd;
+ break;
+ }
+ }
+
+ if (!exists) {
+ r = log_error_errno(errno, "Failed to add watch on any of the components of %s: %m", s->path);
+ /* either EACCESS or ENOENT */
+ goto fail;
+ }
+
+ return 0;
+
+fail:
+ path_spec_unwatch(s);
+ return r;
+}
+
+void path_spec_unwatch(PathSpec *s) {
+ assert(s);
+
+ s->event_source = sd_event_source_disable_unref(s->event_source);
+ s->inotify_fd = safe_close(s->inotify_fd);
+}
+
+int path_spec_fd_event(PathSpec *s, uint32_t revents) {
+ union inotify_event_buffer buffer;
+ ssize_t l;
+
+ assert(s);
+
+ if (revents != EPOLLIN)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Got invalid poll event on inotify.");
+
+ l = read(s->inotify_fd, &buffer, sizeof(buffer));
+ if (l < 0) {
+ if (ERRNO_IS_TRANSIENT(errno))
+ return 0;
+
+ return log_error_errno(errno, "Failed to read inotify event: %m");
+ }
+
+ if (IN_SET(s->type, PATH_CHANGED, PATH_MODIFIED))
+ FOREACH_INOTIFY_EVENT_WARN(e, buffer, l)
+ if (s->primary_wd == e->wd)
+ return 1;
+
+ return 0;
+}
+
+static bool path_spec_check_good(PathSpec *s, bool initial, bool from_trigger_notify, char **ret_trigger_path) {
+ _cleanup_free_ char *trigger = NULL;
+ bool b, good = false;
+
+ assert(s);
+ assert(ret_trigger_path);
+
+ switch (s->type) {
+
+ case PATH_EXISTS:
+ good = access(s->path, F_OK) >= 0;
+ break;
+
+ case PATH_EXISTS_GLOB:
+ good = glob_first(s->path, &trigger) > 0;
+ break;
+
+ case PATH_DIRECTORY_NOT_EMPTY: {
+ int k;
+
+ k = dir_is_empty(s->path, /* ignore_hidden_or_backup= */ true);
+ good = !(IN_SET(k, -ENOENT, -ENOTDIR) || k > 0);
+ break;
+ }
+
+ case PATH_CHANGED:
+ case PATH_MODIFIED:
+ b = access(s->path, F_OK) >= 0;
+ good = !initial && !from_trigger_notify && b != s->previous_exists;
+ s->previous_exists = b;
+ break;
+
+ default:
+ ;
+ }
+
+ if (good) {
+ if (!trigger) {
+ trigger = strdup(s->path);
+ if (!trigger)
+ (void) log_oom_debug();
+ }
+ *ret_trigger_path = TAKE_PTR(trigger);
+ }
+
+ return good;
+}
+
+static void path_spec_mkdir(PathSpec *s, mode_t mode) {
+ int r;
+
+ if (IN_SET(s->type, PATH_EXISTS, PATH_EXISTS_GLOB))
+ return;
+
+ r = mkdir_p_label(s->path, mode);
+ if (r < 0)
+ log_warning_errno(r, "mkdir(%s) failed: %m", s->path);
+}
+
+static void path_spec_dump(PathSpec *s, FILE *f, const char *prefix) {
+ const char *type;
+
+ assert_se(type = path_type_to_string(s->type));
+ fprintf(f, "%s%s: %s\n", prefix, type, s->path);
+}
+
+void path_spec_done(PathSpec *s) {
+ assert(s);
+ assert(s->inotify_fd == -EBADF);
+
+ free(s->path);
+}
+
+static void path_init(Unit *u) {
+ Path *p = PATH(u);
+
+ assert(u);
+ assert(u->load_state == UNIT_STUB);
+
+ p->directory_mode = 0755;
+
+ p->trigger_limit = RATELIMIT_OFF;
+}
+
+void path_free_specs(Path *p) {
+ PathSpec *s;
+
+ assert(p);
+
+ while ((s = LIST_POP(spec, p->specs))) {
+ path_spec_unwatch(s);
+ path_spec_done(s);
+ free(s);
+ }
+}
+
+static void path_done(Unit *u) {
+ Path *p = PATH(u);
+
+ assert(p);
+
+ p->trigger_notify_event_source = sd_event_source_disable_unref(p->trigger_notify_event_source);
+ path_free_specs(p);
+}
+
+static int path_add_mount_dependencies(Path *p) {
+ int r;
+
+ assert(p);
+
+ LIST_FOREACH(spec, s, p->specs) {
+ r = unit_require_mounts_for(UNIT(p), s->path, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+static int path_verify(Path *p) {
+ assert(p);
+ assert(UNIT(p)->load_state == UNIT_LOADED);
+
+ if (!p->specs)
+ return log_unit_error_errno(UNIT(p), SYNTHETIC_ERRNO(ENOEXEC), "Path unit lacks path setting. Refusing.");
+
+ return 0;
+}
+
+static int path_add_default_dependencies(Path *p) {
+ int r;
+
+ assert(p);
+
+ if (!UNIT(p)->default_dependencies)
+ return 0;
+
+ r = unit_add_dependency_by_name(UNIT(p), UNIT_BEFORE, SPECIAL_PATHS_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+ if (r < 0)
+ return r;
+
+ if (MANAGER_IS_SYSTEM(UNIT(p)->manager)) {
+ r = unit_add_two_dependencies_by_name(UNIT(p), UNIT_AFTER, UNIT_REQUIRES, SPECIAL_SYSINIT_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+ if (r < 0)
+ return r;
+ }
+
+ return unit_add_two_dependencies_by_name(UNIT(p), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+}
+
+static int path_add_trigger_dependencies(Path *p) {
+ Unit *x;
+ int r;
+
+ assert(p);
+
+ if (UNIT_TRIGGER(UNIT(p)))
+ return 0;
+
+ r = unit_load_related_unit(UNIT(p), ".service", &x);
+ if (r < 0)
+ return r;
+
+ return unit_add_two_dependencies(UNIT(p), UNIT_BEFORE, UNIT_TRIGGERS, x, true, UNIT_DEPENDENCY_IMPLICIT);
+}
+
+static int path_add_extras(Path *p) {
+ int r;
+
+ assert(p);
+
+ /* To avoid getting pid1 in a busy-loop state (eg: unmet condition on associated service),
+ * set a default trigger limit if the user didn't specify any. */
+ if (p->trigger_limit.interval == USEC_INFINITY)
+ p->trigger_limit.interval = 2 * USEC_PER_SEC;
+
+ if (p->trigger_limit.burst == UINT_MAX)
+ p->trigger_limit.burst = 200;
+
+ r = path_add_trigger_dependencies(p);
+ if (r < 0)
+ return r;
+
+ r = path_add_mount_dependencies(p);
+ if (r < 0)
+ return r;
+
+ return path_add_default_dependencies(p);
+}
+
+static int path_load(Unit *u) {
+ Path *p = PATH(u);
+ int r;
+
+ assert(u);
+ assert(u->load_state == UNIT_STUB);
+
+ r = unit_load_fragment_and_dropin(u, true);
+ if (r < 0)
+ return r;
+
+ if (u->load_state != UNIT_LOADED)
+ return 0;
+
+ r = path_add_extras(p);
+ if (r < 0)
+ return r;
+
+ return path_verify(p);
+}
+
+static void path_dump(Unit *u, FILE *f, const char *prefix) {
+ Path *p = PATH(u);
+ Unit *trigger;
+
+ assert(p);
+ assert(f);
+
+ trigger = UNIT_TRIGGER(u);
+
+ fprintf(f,
+ "%sPath State: %s\n"
+ "%sResult: %s\n"
+ "%sUnit: %s\n"
+ "%sMakeDirectory: %s\n"
+ "%sDirectoryMode: %04o\n"
+ "%sTriggerLimitIntervalSec: %s\n"
+ "%sTriggerLimitBurst: %u\n",
+ prefix, path_state_to_string(p->state),
+ prefix, path_result_to_string(p->result),
+ prefix, trigger ? trigger->id : "n/a",
+ prefix, yes_no(p->make_directory),
+ prefix, p->directory_mode,
+ prefix, FORMAT_TIMESPAN(p->trigger_limit.interval, USEC_PER_SEC),
+ prefix, p->trigger_limit.burst);
+
+ LIST_FOREACH(spec, s, p->specs)
+ path_spec_dump(s, f, prefix);
+}
+
+static void path_unwatch(Path *p) {
+ assert(p);
+
+ LIST_FOREACH(spec, s, p->specs)
+ path_spec_unwatch(s);
+}
+
+static int path_watch(Path *p) {
+ int r;
+
+ assert(p);
+
+ LIST_FOREACH(spec, s, p->specs) {
+ r = path_spec_watch(s, path_dispatch_io);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+static void path_set_state(Path *p, PathState state) {
+ PathState old_state;
+ assert(p);
+
+ if (p->state != state)
+ bus_unit_send_pending_change_signal(UNIT(p), false);
+
+ old_state = p->state;
+ p->state = state;
+
+ if (!IN_SET(state, PATH_WAITING, PATH_RUNNING))
+ path_unwatch(p);
+
+ if (state != old_state)
+ log_unit_debug(UNIT(p), "Changed %s -> %s", path_state_to_string(old_state), path_state_to_string(state));
+
+ unit_notify(UNIT(p), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true);
+}
+
+static void path_enter_waiting(Path *p, bool initial, bool from_trigger_notify);
+
+static int path_coldplug(Unit *u) {
+ Path *p = PATH(u);
+
+ assert(p);
+ assert(p->state == PATH_DEAD);
+
+ if (p->deserialized_state != p->state) {
+
+ if (IN_SET(p->deserialized_state, PATH_WAITING, PATH_RUNNING))
+ path_enter_waiting(p, true, false);
+ else
+ path_set_state(p, p->deserialized_state);
+ }
+
+ return 0;
+}
+
+static void path_enter_dead(Path *p, PathResult f) {
+ assert(p);
+
+ if (p->result == PATH_SUCCESS)
+ p->result = f;
+
+ unit_log_result(UNIT(p), p->result == PATH_SUCCESS, path_result_to_string(p->result));
+ path_set_state(p, p->result != PATH_SUCCESS ? PATH_FAILED : PATH_DEAD);
+}
+
+static void path_enter_running(Path *p, char *trigger_path) {
+ _cleanup_(activation_details_unrefp) ActivationDetails *details = NULL;
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ Unit *trigger;
+ Job *job;
+ int r;
+
+ assert(p);
+
+ /* Don't start job if we are supposed to go down */
+ if (unit_stop_pending(UNIT(p)))
+ return;
+
+ if (!ratelimit_below(&p->trigger_limit)) {
+ log_unit_warning(UNIT(p), "Trigger limit hit, refusing further activation.");
+ path_enter_dead(p, PATH_FAILURE_TRIGGER_LIMIT_HIT);
+ return;
+ }
+
+ trigger = UNIT_TRIGGER(UNIT(p));
+ if (!trigger) {
+ log_unit_error(UNIT(p), "Unit to trigger vanished.");
+ goto fail;
+ }
+
+ details = activation_details_new(UNIT(p));
+ if (!details) {
+ log_oom();
+ goto fail;
+ }
+
+ r = free_and_strdup(&(ACTIVATION_DETAILS_PATH(details))->trigger_path_filename, trigger_path);
+ if (r < 0) {
+ log_oom();
+ goto fail;
+ }
+
+ r = manager_add_job(UNIT(p)->manager, JOB_START, trigger, JOB_REPLACE, NULL, &error, &job);
+ if (r < 0) {
+ log_unit_warning(UNIT(p), "Failed to queue unit startup job: %s", bus_error_message(&error, r));
+ goto fail;
+ }
+
+ job_set_activation_details(job, details);
+
+ path_set_state(p, PATH_RUNNING);
+ path_unwatch(p);
+
+ return;
+
+fail:
+ path_enter_dead(p, PATH_FAILURE_RESOURCES);
+}
+
+static bool path_check_good(Path *p, bool initial, bool from_trigger_notify, char **ret_trigger_path) {
+ assert(p);
+ assert(ret_trigger_path);
+
+ LIST_FOREACH(spec, s, p->specs)
+ if (path_spec_check_good(s, initial, from_trigger_notify, ret_trigger_path))
+ return true;
+
+ return false;
+}
+
+static void path_enter_waiting(Path *p, bool initial, bool from_trigger_notify) {
+ _cleanup_free_ char *trigger_path = NULL;
+ Unit *trigger;
+ int r;
+
+ if (p->trigger_notify_event_source)
+ (void) event_source_disable(p->trigger_notify_event_source);
+
+ /* If the triggered unit is already running, so are we */
+ trigger = UNIT_TRIGGER(UNIT(p));
+ if (trigger && !UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(trigger))) {
+ path_set_state(p, PATH_RUNNING);
+ path_unwatch(p);
+ return;
+ }
+
+ if (path_check_good(p, initial, from_trigger_notify, &trigger_path)) {
+ log_unit_debug(UNIT(p), "Got triggered.");
+ path_enter_running(p, trigger_path);
+ return;
+ }
+
+ r = path_watch(p);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(p), r, "Failed to enter waiting state: %m");
+ path_enter_dead(p, PATH_FAILURE_RESOURCES);
+ return;
+ }
+
+ /* Hmm, so now we have created inotify watches, but the file
+ * might have appeared/been removed by now, so we must
+ * recheck */
+
+ if (path_check_good(p, false, from_trigger_notify, &trigger_path)) {
+ log_unit_debug(UNIT(p), "Got triggered.");
+ path_enter_running(p, trigger_path);
+ return;
+ }
+
+ path_set_state(p, PATH_WAITING);
+}
+
+static void path_mkdir(Path *p) {
+ assert(p);
+
+ if (!p->make_directory)
+ return;
+
+ LIST_FOREACH(spec, s, p->specs)
+ path_spec_mkdir(s, p->directory_mode);
+}
+
+static int path_start(Unit *u) {
+ Path *p = PATH(u);
+ int r;
+
+ assert(p);
+ assert(IN_SET(p->state, PATH_DEAD, PATH_FAILED));
+
+ r = unit_test_trigger_loaded(u);
+ if (r < 0)
+ return r;
+
+ r = unit_acquire_invocation_id(u);
+ if (r < 0)
+ return r;
+
+ path_mkdir(p);
+
+ p->result = PATH_SUCCESS;
+ path_enter_waiting(p, true, false);
+
+ return 1;
+}
+
+static int path_stop(Unit *u) {
+ Path *p = PATH(u);
+
+ assert(p);
+ assert(IN_SET(p->state, PATH_WAITING, PATH_RUNNING));
+
+ path_enter_dead(p, PATH_SUCCESS);
+ return 1;
+}
+
+static int path_serialize(Unit *u, FILE *f, FDSet *fds) {
+ Path *p = PATH(u);
+
+ assert(u);
+ assert(f);
+ assert(fds);
+
+ (void) serialize_item(f, "state", path_state_to_string(p->state));
+ (void) serialize_item(f, "result", path_result_to_string(p->result));
+
+ LIST_FOREACH(spec, s, p->specs) {
+ const char *type;
+ _cleanup_free_ char *escaped = NULL;
+
+ escaped = cescape(s->path);
+ if (!escaped)
+ return log_oom();
+
+ assert_se(type = path_type_to_string(s->type));
+ (void) serialize_item_format(f, "path-spec", "%s %i %s",
+ type,
+ s->previous_exists,
+ escaped);
+ }
+
+ (void) serialize_ratelimit(f, "trigger-ratelimit", &p->trigger_limit);
+
+ return 0;
+}
+
+static int path_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
+ Path *p = PATH(u);
+
+ assert(u);
+ assert(key);
+ assert(value);
+ assert(fds);
+
+ if (streq(key, "state")) {
+ PathState state;
+
+ state = path_state_from_string(value);
+ if (state < 0)
+ log_unit_debug(u, "Failed to parse state value: %s", value);
+ else
+ p->deserialized_state = state;
+
+ } else if (streq(key, "result")) {
+ PathResult f;
+
+ f = path_result_from_string(value);
+ if (f < 0)
+ log_unit_debug(u, "Failed to parse result value: %s", value);
+ else if (f != PATH_SUCCESS)
+ p->result = f;
+
+ } else if (streq(key, "path-spec")) {
+ int previous_exists, skip = 0;
+ _cleanup_free_ char *type_str = NULL;
+
+ if (sscanf(value, "%ms %i %n", &type_str, &previous_exists, &skip) < 2)
+ log_unit_debug(u, "Failed to parse path-spec value: %s", value);
+ else {
+ _cleanup_free_ char *unescaped = NULL;
+ ssize_t l;
+ PathType type;
+
+ type = path_type_from_string(type_str);
+ if (type < 0) {
+ log_unit_warning(u, "Unknown path type \"%s\", ignoring.", type_str);
+ return 0;
+ }
+
+ l = cunescape(value+skip, 0, &unescaped);
+ if (l < 0) {
+ log_unit_warning_errno(u, l, "Failed to unescape serialize path: %m");
+ return 0;
+ }
+
+ LIST_FOREACH(spec, s, p->specs)
+ if (s->type == type &&
+ path_equal(s->path, unescaped)) {
+
+ s->previous_exists = previous_exists;
+ break;
+ }
+ }
+
+ } else if (streq(key, "trigger-ratelimit"))
+ deserialize_ratelimit(&p->trigger_limit, key, value);
+
+ else
+ log_unit_debug(u, "Unknown serialization key: %s", key);
+
+ return 0;
+}
+
+static UnitActiveState path_active_state(Unit *u) {
+ assert(u);
+
+ return state_translation_table[PATH(u)->state];
+}
+
+static const char *path_sub_state_to_string(Unit *u) {
+ assert(u);
+
+ return path_state_to_string(PATH(u)->state);
+}
+
+static int path_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+ PathSpec *s = userdata, *found = NULL;
+ Path *p;
+ int changed;
+
+ assert(s);
+ assert(s->unit);
+ assert(fd >= 0);
+
+ p = PATH(s->unit);
+
+ if (!IN_SET(p->state, PATH_WAITING, PATH_RUNNING))
+ return 0;
+
+ LIST_FOREACH(spec, i, p->specs)
+ if (path_spec_owns_inotify_fd(i, fd)) {
+ found = i;
+ break;
+ }
+
+ if (!found) {
+ log_error("Got event on unknown fd.");
+ goto fail;
+ }
+
+ changed = path_spec_fd_event(found, revents);
+ if (changed < 0)
+ goto fail;
+
+ if (changed)
+ path_enter_running(p, found->path);
+ else
+ path_enter_waiting(p, false, false);
+
+ return 0;
+
+fail:
+ path_enter_dead(p, PATH_FAILURE_RESOURCES);
+ return 0;
+}
+
+static void path_trigger_notify_impl(Unit *u, Unit *other, bool on_defer);
+
+static int path_trigger_notify_on_defer(sd_event_source *s, void *userdata) {
+ Path *p = ASSERT_PTR(userdata);
+ Unit *trigger;
+
+ assert(s);
+
+ trigger = UNIT_TRIGGER(UNIT(p));
+ if (!trigger) {
+ log_unit_error(UNIT(p), "Unit to trigger vanished.");
+ path_enter_dead(p, PATH_FAILURE_RESOURCES);
+ return 0;
+ }
+
+ path_trigger_notify_impl(UNIT(p), trigger, /* on_defer = */ true);
+ return 0;
+}
+
+static void path_trigger_notify_impl(Unit *u, Unit *other, bool on_defer) {
+ Path *p = PATH(u);
+ int r;
+
+ assert(u);
+ assert(other);
+
+ /* Invoked whenever the unit we trigger changes state or gains or loses a job */
+
+ /* Filter out invocations with bogus state */
+ assert(UNIT_IS_LOAD_COMPLETE(other->load_state));
+
+ /* Don't propagate state changes from the triggered unit if we are already down */
+ if (!IN_SET(p->state, PATH_WAITING, PATH_RUNNING))
+ return;
+
+ /* Propagate start limit hit state */
+ if (other->start_limit_hit) {
+ path_enter_dead(p, PATH_FAILURE_UNIT_START_LIMIT_HIT);
+ return;
+ }
+
+ /* Don't propagate anything if there's still a job queued */
+ if (other->job)
+ return;
+
+ if (p->state == PATH_RUNNING &&
+ UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(other))) {
+ if (!on_defer)
+ log_unit_debug(u, "Got notified about unit deactivation.");
+ } else if (p->state == PATH_WAITING &&
+ !UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(other))) {
+ if (!on_defer)
+ log_unit_debug(u, "Got notified about unit activation.");
+ } else
+ return;
+
+ if (on_defer) {
+ path_enter_waiting(p, /* initial = */ false, /* from_trigger_notify = */ true);
+ return;
+ }
+
+ /* Do not call path_enter_waiting() directly from path_trigger_notify(), as this may be called by
+ * job_install() -> job_finish_and_invalidate() -> unit_trigger_notify(), and path_enter_waiting()
+ * may install another job and will trigger assertion in job_install().
+ * https://github.com/systemd/systemd/issues/24577#issuecomment-1522628906
+ * Hence, first setup defer event source here, and call path_enter_waiting() slightly later. */
+ if (p->trigger_notify_event_source) {
+ r = sd_event_source_set_enabled(p->trigger_notify_event_source, SD_EVENT_ONESHOT);
+ if (r < 0) {
+ log_unit_warning_errno(u, r, "Failed to enable event source for triggering notify: %m");
+ path_enter_dead(p, PATH_FAILURE_RESOURCES);
+ return;
+ }
+ } else {
+ r = sd_event_add_defer(u->manager->event, &p->trigger_notify_event_source, path_trigger_notify_on_defer, p);
+ if (r < 0) {
+ log_unit_warning_errno(u, r, "Failed to allocate event source for triggering notify: %m");
+ path_enter_dead(p, PATH_FAILURE_RESOURCES);
+ return;
+ }
+
+ (void) sd_event_source_set_description(p->trigger_notify_event_source, "path-trigger-notify");
+ }
+}
+
+/* UnitVTable.trigger_notify hook: forwards to the shared implementation, marking that
+ * we are NOT being invoked from the deferred event source (see the comment in
+ * path_trigger_notify_impl() about avoiding re-entrant job installation). */
+static void path_trigger_notify(Unit *u, Unit *other) {
+ path_trigger_notify_impl(u, other, /* on_defer = */ false);
+}
+
+/* UnitVTable.reset_failed hook: clear a previous failure so the unit may be started
+ * again. A PATH_FAILED unit goes back to PATH_DEAD and the stored result is reset. */
+static void path_reset_failed(Unit *u) {
+ Path *p = PATH(u);
+
+ assert(p);
+
+ if (p->state == PATH_FAILED)
+ path_set_state(p, PATH_DEAD);
+
+ p->result = PATH_SUCCESS;
+}
+
+/* UnitVTable.can_start hook: enforce the unit's start rate limit. On a hit the unit is
+ * put into the failed state with PATH_FAILURE_START_LIMIT_HIT and the (negative) error
+ * is propagated; otherwise returns 1 (start may proceed). */
+static int path_can_start(Unit *u) {
+ Path *p = PATH(u);
+ int r;
+
+ assert(p);
+
+ r = unit_test_start_limit(u);
+ if (r < 0) {
+ path_enter_dead(p, PATH_FAILURE_START_LIMIT_HIT);
+ return r;
+ }
+
+ return 1;
+}
+
+/* ActivationDetailsVTable.done hook: free the per-path activation payload. */
+static void activation_details_path_done(ActivationDetails *details) {
+ ActivationDetailsPath *p = ASSERT_PTR(ACTIVATION_DETAILS_PATH(details));
+
+ p->trigger_path_filename = mfree(p->trigger_path_filename);
+}
+
+/* Serialize the triggering filename (if any) so it survives daemon-reload/reexec. */
+static void activation_details_path_serialize(ActivationDetails *details, FILE *f) {
+ ActivationDetailsPath *p = ASSERT_PTR(ACTIVATION_DETAILS_PATH(details));
+
+ assert(f);
+
+ if (p->trigger_path_filename)
+ (void) serialize_item(f, "activation-details-path-filename", p->trigger_path_filename);
+}
+
+/* Counterpart to the serializer above. Only the single key
+ * "activation-details-path-filename" is understood; anything else is -EINVAL. */
+static int activation_details_path_deserialize(const char *key, const char *value, ActivationDetails **details) {
+ int r;
+
+ assert(key);
+ assert(value);
+
+ if (!details || !*details)
+ return -EINVAL;
+
+ ActivationDetailsPath *p = ACTIVATION_DETAILS_PATH(*details);
+ if (!p)
+ return -EINVAL;
+
+ if (!streq(key, "activation-details-path-filename"))
+ return -EINVAL;
+
+ r = free_and_strdup(&p->trigger_path_filename, value);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+/* Append "TRIGGER_PATH=<file>" to the environment block handed to the triggered unit.
+ * Returns the number of variables added (0 when no filename is recorded). */
+static int activation_details_path_append_env(ActivationDetails *details, char ***strv) {
+ ActivationDetailsPath *p = ACTIVATION_DETAILS_PATH(details);
+ char *s;
+ int r;
+
+ assert(details);
+ assert(strv);
+ assert(p);
+
+ if (isempty(p->trigger_path_filename))
+ return 0;
+
+ s = strjoin("TRIGGER_PATH=", p->trigger_path_filename);
+ if (!s)
+ return -ENOMEM;
+
+ r = strv_consume(strv, TAKE_PTR(s));
+ if (r < 0)
+ return r;
+
+ return 1; /* Return the number of variables added to the env block */
+}
+
+/* Same information as append_env, but as a key/value pair ("trigger_path", <file>)
+ * appended as two consecutive strv entries, e.g. for D-Bus property export. */
+static int activation_details_path_append_pair(ActivationDetails *details, char ***strv) {
+ ActivationDetailsPath *p = ACTIVATION_DETAILS_PATH(details);
+ int r;
+
+ assert(details);
+ assert(strv);
+ assert(p);
+
+ if (isempty(p->trigger_path_filename))
+ return 0;
+
+ r = strv_extend(strv, "trigger_path");
+ if (r < 0)
+ return r;
+
+ r = strv_extend(strv, p->trigger_path_filename);
+ if (r < 0)
+ return r;
+
+ return 1; /* Return the number of pairs added to the env block */
+}
+
+/* String names for PathType; these are the directive names used in [Path] sections. */
+static const char* const path_type_table[_PATH_TYPE_MAX] = {
+ [PATH_EXISTS] = "PathExists",
+ [PATH_EXISTS_GLOB] = "PathExistsGlob",
+ [PATH_DIRECTORY_NOT_EMPTY] = "DirectoryNotEmpty",
+ [PATH_CHANGED] = "PathChanged",
+ [PATH_MODIFIED] = "PathModified",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(path_type, PathType);
+
+/* String names for PathResult, as exposed via the unit's "Result" property. */
+static const char* const path_result_table[_PATH_RESULT_MAX] = {
+ [PATH_SUCCESS] = "success",
+ [PATH_FAILURE_RESOURCES] = "resources",
+ [PATH_FAILURE_START_LIMIT_HIT] = "start-limit-hit",
+ [PATH_FAILURE_UNIT_START_LIMIT_HIT] = "unit-start-limit-hit",
+ [PATH_FAILURE_TRIGGER_LIMIT_HIT] = "trigger-limit-hit",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(path_result, PathResult);
+
+/* Unit type implementation table for .path units: wires the static functions above
+ * into the generic unit state machine. */
+const UnitVTable path_vtable = {
+ .object_size = sizeof(Path),
+
+ .sections =
+ "Unit\0"
+ "Path\0"
+ "Install\0",
+ .private_section = "Path",
+
+ .can_transient = true,
+ .can_fail = true,
+ .can_trigger = true,
+
+ .init = path_init,
+ .done = path_done,
+ .load = path_load,
+
+ .coldplug = path_coldplug,
+
+ .dump = path_dump,
+
+ .start = path_start,
+ .stop = path_stop,
+
+ .serialize = path_serialize,
+ .deserialize_item = path_deserialize_item,
+
+ .active_state = path_active_state,
+ .sub_state_to_string = path_sub_state_to_string,
+
+ .trigger_notify = path_trigger_notify,
+
+ .reset_failed = path_reset_failed,
+
+ .bus_set_property = bus_path_set_property,
+
+ .can_start = path_can_start,
+};
+
+/* Implementation table for the path-specific ActivationDetails payload. */
+const ActivationDetailsVTable activation_details_path_vtable = {
+ .object_size = sizeof(ActivationDetailsPath),
+
+ .done = activation_details_path_done,
+ .serialize = activation_details_path_serialize,
+ .deserialize = activation_details_path_deserialize,
+ .append_env = activation_details_path_append_env,
+ .append_pair = activation_details_path_append_pair,
+};
diff --git a/src/core/path.h b/src/core/path.h
new file mode 100644
index 0000000..cb5b662
--- /dev/null
+++ b/src/core/path.h
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct Path Path;
+typedef struct PathSpec PathSpec;
+typedef struct ActivationDetailsPath ActivationDetailsPath;
+
+#include "unit.h"
+
+/* Kind of condition a single PathSpec watches for; one per [Path] directive. */
+typedef enum PathType {
+ PATH_EXISTS,
+ PATH_EXISTS_GLOB,
+ PATH_DIRECTORY_NOT_EMPTY,
+ PATH_CHANGED,
+ PATH_MODIFIED,
+ _PATH_TYPE_MAX,
+ _PATH_TYPE_INVALID = -EINVAL,
+} PathType;
+
+/* One watched path of a Path unit: owns an inotify fd plus the event source that
+ * dispatches it. A Path unit keeps a list of these (see Path.specs below). */
+typedef struct PathSpec {
+ Unit *unit;
+
+ char *path;
+
+ sd_event_source *event_source;
+
+ LIST_FIELDS(struct PathSpec, spec);
+
+ PathType type;
+ int inotify_fd;
+ /* watch descriptor for the deepest existing component of the path */
+ int primary_wd;
+
+ /* whether the path existed last time we checked; used to detect transitions */
+ bool previous_exists;
+} PathSpec;
+
+int path_spec_watch(PathSpec *s, sd_event_io_handler_t handler);
+void path_spec_unwatch(PathSpec *s);
+int path_spec_fd_event(PathSpec *s, uint32_t events);
+void path_spec_done(PathSpec *s);
+
+/* Returns true if the given fd is the inotify fd owned by this spec. */
+static inline bool path_spec_owns_inotify_fd(PathSpec *s, int fd) {
+ return s->inotify_fd == fd;
+}
+
+/* Final result of a Path unit's lifetime, reported via the "Result" property. */
+typedef enum PathResult {
+ PATH_SUCCESS,
+ PATH_FAILURE_RESOURCES,
+ PATH_FAILURE_START_LIMIT_HIT,
+ PATH_FAILURE_UNIT_START_LIMIT_HIT,
+ PATH_FAILURE_TRIGGER_LIMIT_HIT,
+ _PATH_RESULT_MAX,
+ _PATH_RESULT_INVALID = -EINVAL,
+} PathResult;
+
+/* Instance data of a .path unit. */
+struct Path {
+ Unit meta;
+
+ LIST_HEAD(PathSpec, specs);
+
+ PathState state, deserialized_state;
+
+ /* MakeDirectory= / DirectoryMode= settings */
+ bool make_directory;
+ mode_t directory_mode;
+
+ PathResult result;
+
+ /* rate limit on how often the watched conditions may trigger the target unit */
+ RateLimit trigger_limit;
+
+ /* defer source used to decouple path_enter_waiting() from trigger_notify (see .c) */
+ sd_event_source *trigger_notify_event_source;
+};
+
+/* ActivationDetails payload recording which file actually caused the trigger. */
+struct ActivationDetailsPath {
+ ActivationDetails meta;
+ char *trigger_path_filename;
+};
+
+void path_free_specs(Path *p);
+
+extern const UnitVTable path_vtable;
+extern const ActivationDetailsVTable activation_details_path_vtable;
+
+const char* path_type_to_string(PathType i) _const_;
+PathType path_type_from_string(const char *s) _pure_;
+
+const char* path_result_to_string(PathResult i) _const_;
+PathResult path_result_from_string(const char *s) _pure_;
+
+DEFINE_CAST(PATH, Path);
+DEFINE_ACTIVATION_DETAILS_CAST(ACTIVATION_DETAILS_PATH, ActivationDetailsPath, PATH);
diff --git a/src/core/restrict-ifaces.c b/src/core/restrict-ifaces.c
new file mode 100644
index 0000000..4dd8656
--- /dev/null
+++ b/src/core/restrict-ifaces.c
@@ -0,0 +1,200 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "fd-util.h"
+#include "restrict-ifaces.h"
+#include "netlink-util.h"
+
+#if BPF_FRAMEWORK
+/* libbpf, clang and llc compile time dependencies are satisfied */
+
+#include "bpf-dlopen.h"
+#include "bpf-link.h"
+#include "bpf-util.h"
+#include "bpf/restrict_ifaces/restrict-ifaces-skel.h"
+
+/* Destructor wrapper around the skeleton's generated __destroy(), returning NULL so it
+ * fits the DEFINE_TRIVIAL_CLEANUP_FUNC/_cleanup_ pattern used below. */
+static struct restrict_ifaces_bpf *restrict_ifaces_bpf_free(struct restrict_ifaces_bpf *obj) {
+ restrict_ifaces_bpf__destroy(obj);
+ return NULL;
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(struct restrict_ifaces_bpf *, restrict_ifaces_bpf_free);
+
+/* Open and load the restrict-ifaces BPF skeleton and populate its map with the
+ * ifindexes of the configured interface names. 'u' may be NULL (used by the
+ * feasibility probe in restrict_network_interfaces_supported()), in which case
+ * failures are logged at reduced severity. Interfaces that cannot be resolved to an
+ * ifindex are skipped with a warning rather than failing the whole setup. On success
+ * ownership of the loaded object is transferred to *ret_object. */
+static int prepare_restrict_ifaces_bpf(
+ Unit* u,
+ bool is_allow_list,
+ const Set *restrict_network_interfaces,
+ struct restrict_ifaces_bpf **ret_object) {
+
+ _cleanup_(restrict_ifaces_bpf_freep) struct restrict_ifaces_bpf *obj = NULL;
+ _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
+ char *iface;
+ int r, map_fd;
+
+ assert(ret_object);
+
+ obj = restrict_ifaces_bpf__open();
+ if (!obj)
+ return log_unit_full_errno(u, u ? LOG_ERR : LOG_DEBUG, errno, "restrict-interfaces: Failed to open BPF object: %m");
+
+ /* Size the map before load; it must hold at least one entry even when the set is empty. */
+ r = sym_bpf_map__set_max_entries(obj->maps.sd_restrictif, MAX(set_size(restrict_network_interfaces), 1u));
+ if (r != 0)
+ return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, r,
+ "restrict-interfaces: Failed to resize BPF map '%s': %m",
+ sym_bpf_map__name(obj->maps.sd_restrictif));
+
+ /* rodata must be set before __load(), after which it becomes read-only in the kernel. */
+ obj->rodata->is_allow_list = is_allow_list;
+
+ r = restrict_ifaces_bpf__load(obj);
+ if (r != 0)
+ return log_unit_full_errno(u, u ? LOG_ERR : LOG_DEBUG, r, "restrict-interfaces: Failed to load BPF object: %m");
+
+ map_fd = sym_bpf_map__fd(obj->maps.sd_restrictif);
+
+ SET_FOREACH(iface, restrict_network_interfaces) {
+ uint8_t dummy = 0;
+ int ifindex;
+
+ ifindex = rtnl_resolve_interface(&rtnl, iface);
+ if (ifindex < 0) {
+ log_unit_warning_errno(u, ifindex,
+ "restrict-interfaces: Couldn't find index of network interface '%s', ignoring: %m",
+ iface);
+ continue;
+ }
+
+ if (sym_bpf_map_update_elem(map_fd, &ifindex, &dummy, BPF_ANY))
+ return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, errno,
+ "restrict-interfaces: Failed to update BPF map '%s' fd: %m",
+ sym_bpf_map__name(obj->maps.sd_restrictif));
+ }
+
+ *ret_object = TAKE_PTR(obj);
+ return 0;
+}
+
+/* Probe (once, result cached in a static) whether RestrictNetworkInterfaces= can work
+ * on this system: cgroup-BPF must be available, the cgroup_skb program type must be
+ * supported, and a trial load + link-probe of the skeleton must succeed. Returns a
+ * boolean-like int; never negative. */
+int restrict_network_interfaces_supported(void) {
+ _cleanup_(restrict_ifaces_bpf_freep) struct restrict_ifaces_bpf *obj = NULL;
+ static int supported = -1;
+ int r;
+
+ if (supported >= 0)
+ return supported;
+
+ if (!cgroup_bpf_supported())
+ return (supported = false);
+
+ if (!compat_libbpf_probe_bpf_prog_type(BPF_PROG_TYPE_CGROUP_SKB, /*opts=*/NULL)) {
+ log_debug("restrict-interfaces: BPF program type cgroup_skb is not supported");
+ return (supported = false);
+ }
+
+ /* Trial run with a NULL unit and empty interface set, purely to see if it loads. */
+ r = prepare_restrict_ifaces_bpf(NULL, true, NULL, &obj);
+ if (r < 0) {
+ log_debug_errno(r, "restrict-interfaces: Failed to load BPF object: %m");
+ return (supported = false);
+ }
+
+ return (supported = bpf_can_link_program(obj->progs.sd_restrictif_i));
+}
+
+/* Attach the ingress and egress restrict-ifaces programs to the unit's cgroup.
+ * Both links must be created before either is committed to the unit, so that a
+ * failure on the second leaves no half-installed state (the _cleanup_ handlers
+ * release the first). Returns 0 and does nothing if the unit has no cgroup context
+ * or no RestrictNetworkInterfaces= setting.
+ * NOTE(review): cg_get_path() is called before the early-exit on
+ * !cc->restrict_network_interfaces, so the path lookup happens even when nothing
+ * will be installed — harmless, but could be reordered. */
+static int restrict_network_interfaces_install_impl(Unit *u) {
+ _cleanup_(bpf_link_freep) struct bpf_link *egress_link = NULL, *ingress_link = NULL;
+ _cleanup_(restrict_ifaces_bpf_freep) struct restrict_ifaces_bpf *obj = NULL;
+ _cleanup_free_ char *cgroup_path = NULL;
+ _cleanup_close_ int cgroup_fd = -EBADF;
+ CGroupContext *cc;
+ int r;
+
+ cc = unit_get_cgroup_context(u);
+ if (!cc)
+ return 0;
+
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &cgroup_path);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "restrict-interfaces: Failed to get cgroup path: %m");
+
+ if (!cc->restrict_network_interfaces)
+ return 0;
+
+ r = prepare_restrict_ifaces_bpf(u,
+ cc->restrict_network_interfaces_is_allow_list,
+ cc->restrict_network_interfaces,
+ &obj);
+ if (r < 0)
+ return r;
+
+ cgroup_fd = open(cgroup_path, O_RDONLY | O_CLOEXEC | O_DIRECTORY, 0);
+ if (cgroup_fd < 0)
+ return -errno;
+
+ ingress_link = sym_bpf_program__attach_cgroup(obj->progs.sd_restrictif_i, cgroup_fd);
+ r = sym_libbpf_get_error(ingress_link);
+ if (r != 0)
+ return log_unit_error_errno(u, r, "restrict-interfaces: Failed to create ingress cgroup link: %m");
+
+ egress_link = sym_bpf_program__attach_cgroup(obj->progs.sd_restrictif_e, cgroup_fd);
+ r = sym_libbpf_get_error(egress_link);
+ if (r != 0)
+ return log_unit_error_errno(u, r, "restrict-interfaces: Failed to create egress cgroup link: %m");
+
+ u->restrict_ifaces_ingress_bpf_link = TAKE_PTR(ingress_link);
+ u->restrict_ifaces_egress_bpf_link = TAKE_PTR(egress_link);
+
+ return 0;
+}
+
+/* Public entry point: install the programs, then unconditionally close the fdset of
+ * link fds restored from a previous daemon-reload/reexec, so stale links are released
+ * whether or not installation succeeded. (The field name "initial_restric_ifaces..."
+ * carries a historical typo — missing 't' — defined in unit.h; kept as-is here.) */
+int restrict_network_interfaces_install(Unit *u) {
+ int r = restrict_network_interfaces_install_impl(u);
+ fdset_close(u->initial_restric_ifaces_link_fds);
+ return r;
+}
+
+/* Serialize both BPF link fds under the same key "restrict-ifaces-bpf-fd"; on
+ * deserialization they are simply collected into one fdset (order is irrelevant,
+ * the links only need to stay pinned across the reexec). */
+int serialize_restrict_network_interfaces(Unit *u, FILE *f, FDSet *fds) {
+ int r;
+
+ assert(u);
+
+ r = bpf_serialize_link(f, fds, "restrict-ifaces-bpf-fd", u->restrict_ifaces_ingress_bpf_link);
+ if (r < 0)
+ return r;
+
+ return bpf_serialize_link(f, fds, "restrict-ifaces-bpf-fd", u->restrict_ifaces_egress_bpf_link);
+}
+
+/* Stash a link fd restored from serialization into the unit's fdset, allocating the
+ * set on first use. The fds are closed later by restrict_network_interfaces_install(). */
+int restrict_network_interfaces_add_initial_link_fd(Unit *u, int fd) {
+ int r;
+
+ assert(u);
+
+ if (!u->initial_restric_ifaces_link_fds) {
+ u->initial_restric_ifaces_link_fds = fdset_new();
+ if (!u->initial_restric_ifaces_link_fds)
+ return log_oom();
+ }
+
+ r = fdset_put(u->initial_restric_ifaces_link_fds, fd);
+ if (r < 0)
+ return log_unit_error_errno(u, r,
+ "restrict-interfaces: Failed to put restrict-ifaces-bpf-fd %d to restored fdset: %m", fd);
+
+ return 0;
+}
+
+#else /* ! BPF_FRAMEWORK */
+/* Stubs used when systemd is built without libbpf/clang support: the feature reports
+ * unsupported, installation fails with EOPNOTSUPP, and (de)serialization is a no-op. */
+int restrict_network_interfaces_supported(void) {
+ return 0;
+}
+
+int restrict_network_interfaces_install(Unit *u) {
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "restrict-interfaces: Failed to install; BPF programs built from source code are not supported: %m");
+}
+
+int serialize_restrict_network_interfaces(Unit *u, FILE *f, FDSet *fds) {
+ return 0;
+}
+
+int restrict_network_interfaces_add_initial_link_fd(Unit *u, int fd) {
+ return 0;
+}
+#endif
diff --git a/src/core/restrict-ifaces.h b/src/core/restrict-ifaces.h
new file mode 100644
index 0000000..6e7a824
--- /dev/null
+++ b/src/core/restrict-ifaces.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "fdset.h"
+#include "unit.h"
+
+typedef struct Unit Unit;
+
+int restrict_network_interfaces_supported(void);
+int restrict_network_interfaces_install(Unit *u);
+
+int serialize_restrict_network_interfaces(Unit *u, FILE *f, FDSet *fds);
+
+/* Add BPF link fd created before daemon-reload or daemon-reexec.
+ * FDs will be closed at the end of restrict_network_interfaces_install. */
+int restrict_network_interfaces_add_initial_link_fd(Unit *u, int fd);
diff --git a/src/core/scope.c b/src/core/scope.c
new file mode 100644
index 0000000..e4c27da
--- /dev/null
+++ b/src/core/scope.c
@@ -0,0 +1,829 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "cgroup-setup.h"
+#include "dbus-scope.h"
+#include "dbus-unit.h"
+#include "exit-status.h"
+#include "load-dropin.h"
+#include "log.h"
+#include "process-util.h"
+#include "random-util.h"
+#include "scope.h"
+#include "serialize.h"
+#include "special.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit-name.h"
+#include "unit.h"
+#include "user-util.h"
+
+/* Maps scope-specific states onto the generic unit activity states. Note that
+ * SCOPE_ABANDONED still counts as UNIT_ACTIVE. */
+static const UnitActiveState state_translation_table[_SCOPE_STATE_MAX] = {
+ [SCOPE_DEAD] = UNIT_INACTIVE,
+ [SCOPE_START_CHOWN] = UNIT_ACTIVATING,
+ [SCOPE_RUNNING] = UNIT_ACTIVE,
+ [SCOPE_ABANDONED] = UNIT_ACTIVE,
+ [SCOPE_STOP_SIGTERM] = UNIT_DEACTIVATING,
+ [SCOPE_STOP_SIGKILL] = UNIT_DEACTIVATING,
+ [SCOPE_FAILED] = UNIT_FAILED,
+};
+
+static int scope_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata);
+
+/* UnitVTable.init hook: set per-type defaults before configuration is loaded. */
+static void scope_init(Unit *u) {
+ Scope *s = SCOPE(u);
+
+ assert(u);
+ assert(u->load_state == UNIT_STUB);
+
+ s->runtime_max_usec = USEC_INFINITY;
+ s->timeout_stop_usec = u->manager->defaults.timeout_stop_usec;
+ u->ignore_on_isolate = true;
+ s->user = s->group = NULL;
+ /* resolved later in scope_add_extras() if still unset */
+ s->oom_policy = _OOM_POLICY_INVALID;
+}
+
+/* UnitVTable.done hook: release all dynamically allocated per-scope state. */
+static void scope_done(Unit *u) {
+ Scope *s = SCOPE(u);
+
+ assert(u);
+
+ s->controller = mfree(s->controller);
+ s->controller_track = sd_bus_track_unref(s->controller_track);
+
+ s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+
+ s->user = mfree(s->user);
+ s->group = mfree(s->group);
+}
+
+/* Absolute monotonic deadline for RuntimeMaxSec=, plus an optional random extra
+ * (RuntimeRandomizedExtraSec=) to spread simultaneous expirations. */
+static usec_t scope_running_timeout(Scope *s) {
+ usec_t delta = 0;
+
+ assert(s);
+
+ if (s->runtime_rand_extra_usec != 0) {
+ delta = random_u64_range(s->runtime_rand_extra_usec);
+ log_unit_debug(UNIT(s), "Adding delta of %s sec to timeout", FORMAT_TIMESPAN(delta, USEC_PER_SEC));
+ }
+
+ return usec_add(usec_add(UNIT(s)->active_enter_timestamp.monotonic,
+ s->runtime_max_usec),
+ delta);
+}
+
+/* Arm (or re-arm) the scope's single timer event source; 'relative' selects whether
+ * 'usec' is an offset from now or an absolute monotonic timestamp. */
+static int scope_arm_timer(Scope *s, bool relative, usec_t usec) {
+ assert(s);
+
+ return unit_arm_timer(UNIT(s), &s->timer_event_source, relative, usec, scope_dispatch_timer);
+}
+
+/* Central state transition: disables the timer for states that don't need one, stops
+ * watching PIDs once a terminal state is entered, and notifies the generic unit
+ * machinery (which emits D-Bus signals, runs jobs, etc.). */
+static void scope_set_state(Scope *s, ScopeState state) {
+ ScopeState old_state;
+ assert(s);
+
+ if (s->state != state)
+ bus_unit_send_pending_change_signal(UNIT(s), false);
+
+ old_state = s->state;
+ s->state = state;
+
+ /* Only these states have a pending timeout; drop the timer otherwise. */
+ if (!IN_SET(state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL, SCOPE_START_CHOWN, SCOPE_RUNNING))
+ s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+
+ if (!IN_SET(old_state, SCOPE_DEAD, SCOPE_FAILED) && IN_SET(state, SCOPE_DEAD, SCOPE_FAILED)) {
+ unit_unwatch_all_pids(UNIT(s));
+ unit_dequeue_rewatch_pids(UNIT(s));
+ }
+
+ if (state != old_state)
+ log_debug("%s changed %s -> %s", UNIT(s)->id, scope_state_to_string(old_state), scope_state_to_string(state));
+
+ unit_notify(UNIT(s), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true);
+}
+
+/* Add the implicit Conflicts=/Before= on shutdown.target (unless DefaultDependencies=no). */
+static int scope_add_default_dependencies(Scope *s) {
+ int r;
+
+ assert(s);
+
+ if (!UNIT(s)->default_dependencies)
+ return 0;
+
+ /* Make sure scopes are unloaded on shutdown */
+ r = unit_add_two_dependencies_by_name(
+ UNIT(s),
+ UNIT_BEFORE, UNIT_CONFLICTS,
+ SPECIAL_SHUTDOWN_TARGET, true,
+ UNIT_DEPENDENCY_DEFAULT);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+/* Post-load validation: a scope must come with PIDs, except for init.scope and while
+ * the manager is reloading (PIDs are then re-acquired from serialization). */
+static int scope_verify(Scope *s) {
+ assert(s);
+ assert(UNIT(s)->load_state == UNIT_LOADED);
+
+ if (set_isempty(UNIT(s)->pids) &&
+ !MANAGER_IS_RELOADING(UNIT(s)->manager) &&
+ !unit_has_name(UNIT(s), SPECIAL_INIT_SCOPE))
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOENT), "Scope has no PIDs. Refusing.");
+
+ return 0;
+}
+
+/* Synthesize init.scope (the scope encapsulating PID 1) in-memory instead of loading
+ * it from disk. Returns 1 when the unit is init.scope, 0 otherwise. */
+static int scope_load_init_scope(Unit *u) {
+ assert(u);
+
+ if (!unit_has_name(u, SPECIAL_INIT_SCOPE))
+ return 0;
+
+ u->transient = true;
+ u->perpetual = true;
+
+ /* init.scope is a bit special, as it has to stick around forever. Because of its special semantics we
+ * synthesize it here, instead of relying on the unit file on disk. */
+
+ u->default_dependencies = false;
+
+ /* Prettify things, if we can. */
+ if (!u->description)
+ u->description = strdup("System and Service Manager");
+ if (!u->documentation)
+ (void) strv_extend(&u->documentation, "man:systemd(1)");
+
+ return 1;
+}
+
+/* Finish loading: patch contexts, pick the default slice, resolve OOMPolicy= (delegated
+ * scopes default to "continue") and derive memory_oom_group from it. */
+static int scope_add_extras(Scope *s) {
+ int r;
+
+ r = unit_patch_contexts(UNIT(s));
+ if (r < 0)
+ return r;
+
+ r = unit_set_default_slice(UNIT(s));
+ if (r < 0)
+ return r;
+
+ if (s->oom_policy < 0)
+ s->oom_policy = s->cgroup_context.delegate ? OOM_CONTINUE : UNIT(s)->manager->defaults.oom_policy;
+
+ s->cgroup_context.memory_oom_group = s->oom_policy == OOM_KILL;
+
+ return scope_add_default_dependencies(s);
+}
+
+/* UnitVTable.load hook: scopes are created programmatically (transient), so anything
+ * else is rejected with -ENOENT — except during reload, where previously-created
+ * scopes are re-materialized from serialized state. */
+static int scope_load(Unit *u) {
+ Scope *s = SCOPE(u);
+ int r;
+
+ assert(s);
+ assert(u->load_state == UNIT_STUB);
+
+ if (!u->transient && !MANAGER_IS_RELOADING(u->manager))
+ /* Refuse to load non-transient scope units, but allow them while reloading. */
+ return -ENOENT;
+
+ r = scope_load_init_scope(u);
+ if (r < 0)
+ return r;
+
+ r = unit_load_fragment_and_dropin(u, false);
+ if (r < 0)
+ return r;
+
+ if (u->load_state != UNIT_LOADED)
+ return 0;
+
+ r = scope_add_extras(s);
+ if (r < 0)
+ return r;
+
+ return scope_verify(s);
+}
+
+/* Deadline to re-arm after deserialization, depending on which state we were in:
+ * the runtime limit while running, the stop timeout while killing, infinity otherwise. */
+static usec_t scope_coldplug_timeout(Scope *s) {
+ assert(s);
+
+ switch (s->deserialized_state) {
+
+ case SCOPE_RUNNING:
+ return scope_running_timeout(s);
+
+ case SCOPE_STOP_SIGKILL:
+ case SCOPE_STOP_SIGTERM:
+ return usec_add(UNIT(s)->state_change_timestamp.monotonic, s->timeout_stop_usec);
+
+ default:
+ return USEC_INFINITY;
+ }
+}
+
+/* UnitVTable.coldplug hook: after deserialization, re-arm the timer, re-watch the
+ * deserialized PIDs (or enqueue a cgroup rescan when none were serialized), re-track
+ * the D-Bus controller and finally jump to the deserialized state. */
+static int scope_coldplug(Unit *u) {
+ Scope *s = SCOPE(u);
+ int r;
+
+ assert(s);
+ assert(s->state == SCOPE_DEAD);
+
+ if (s->deserialized_state == s->state)
+ return 0;
+
+ r = scope_arm_timer(s, /* relative= */ false, scope_coldplug_timeout(s));
+ if (r < 0)
+ return r;
+
+ if (!IN_SET(s->deserialized_state, SCOPE_DEAD, SCOPE_FAILED)) {
+ if (u->pids) {
+ PidRef *pid;
+
+ SET_FOREACH(pid, u->pids) {
+ r = unit_watch_pidref(u, pid, /* exclusive= */ false);
+ if (r < 0 && r != -EEXIST)
+ return r;
+ }
+ } else
+ (void) unit_enqueue_rewatch_pids(u);
+ }
+
+ bus_scope_track_controller(s);
+
+ scope_set_state(s, s->deserialized_state);
+ return 0;
+}
+
+/* UnitVTable.dump hook: human-readable state dump for "systemd-analyze dump" etc. */
+static void scope_dump(Unit *u, FILE *f, const char *prefix) {
+ Scope *s = SCOPE(u);
+
+ assert(s);
+ assert(f);
+
+ fprintf(f,
+ "%sScope State: %s\n"
+ "%sResult: %s\n"
+ "%sRuntimeMaxSec: %s\n"
+ "%sRuntimeRandomizedExtraSec: %s\n"
+ "%sOOMPolicy: %s\n",
+ prefix, scope_state_to_string(s->state),
+ prefix, scope_result_to_string(s->result),
+ prefix, FORMAT_TIMESPAN(s->runtime_max_usec, USEC_PER_SEC),
+ prefix, FORMAT_TIMESPAN(s->runtime_rand_extra_usec, USEC_PER_SEC),
+ prefix, oom_policy_to_string(s->oom_policy));
+
+ cgroup_context_dump(UNIT(s), f, prefix);
+ kill_context_dump(&s->kill_context, f, prefix);
+}
+
+/* Enter the terminal state. Only the FIRST failure recorded in s->result sticks;
+ * later calls cannot overwrite it (same pattern as other unit types). */
+static void scope_enter_dead(Scope *s, ScopeResult f) {
+ assert(s);
+
+ if (s->result == SCOPE_SUCCESS)
+ s->result = f;
+
+ unit_log_result(UNIT(s), s->result == SCOPE_SUCCESS, scope_result_to_string(s->result));
+ scope_set_state(s, s->result != SCOPE_SUCCESS ? SCOPE_FAILED : SCOPE_DEAD);
+}
+
+/* Begin (or escalate) termination. For SIGTERM the registered controller, if any, is
+ * asked to stop the scope instead of signalling directly. If anything was signalled
+ * (or delegated), arm the stop timeout; if nothing was left to kill, either escalate
+ * SIGTERM -> SIGKILL (recursively) or go straight to dead. */
+static void scope_enter_signal(Scope *s, ScopeState state, ScopeResult f) {
+ bool skip_signal = false;
+ int r;
+
+ assert(s);
+
+ if (s->result == SCOPE_SUCCESS)
+ s->result = f;
+
+ /* Before sending any signal, make sure we track all members of this cgroup */
+ (void) unit_watch_all_pids(UNIT(s));
+
+ /* Also, enqueue a job that we recheck all our PIDs a bit later, given that it's likely some processes have
+ * died now */
+ (void) unit_enqueue_rewatch_pids(UNIT(s));
+
+ /* If we have a controller set let's ask the controller nicely to terminate the scope, instead of us going
+ * directly into SIGTERM berserk mode */
+ if (state == SCOPE_STOP_SIGTERM)
+ skip_signal = bus_scope_send_request_stop(s) > 0;
+
+ if (skip_signal)
+ r = 1; /* wait */
+ else {
+ r = unit_kill_context(
+ UNIT(s),
+ &s->kill_context,
+ state != SCOPE_STOP_SIGTERM ? KILL_KILL :
+ s->was_abandoned ? KILL_TERMINATE_AND_LOG :
+ KILL_TERMINATE,
+ /* main_pid= */ NULL,
+ /* control_pid= */ NULL,
+ /* main_pid_alien= */ false);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m");
+ goto fail;
+ }
+ }
+
+ if (r > 0) {
+ r = scope_arm_timer(s, /* relative= */ true, s->timeout_stop_usec);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to install timer: %m");
+ goto fail;
+ }
+
+ scope_set_state(s, state);
+ } else if (state == SCOPE_STOP_SIGTERM)
+ scope_enter_signal(s, SCOPE_STOP_SIGKILL, SCOPE_SUCCESS);
+ else
+ scope_enter_dead(s, SCOPE_SUCCESS);
+
+ return;
+
+fail:
+ scope_enter_dead(s, SCOPE_FAILURE_RESOURCES);
+}
+
+/* For delegated scopes with User=: fork a helper ("(sd-chown-cgroup)") that resolves
+ * the configured user/group and chowns the scope's cgroup to them, then enter
+ * SCOPE_START_CHOWN and wait for the helper via SIGCHLD (see scope_sigchld_event()).
+ * Returns 1 on success (start in progress), negative on error. */
+static int scope_enter_start_chown(Scope *s) {
+ _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+ Unit *u = UNIT(s);
+ int r;
+
+ assert(s);
+ assert(s->user);
+
+ r = scope_arm_timer(s, /* relative= */ true, u->manager->defaults.timeout_start_usec);
+ if (r < 0)
+ return r;
+
+ r = unit_fork_helper_process(u, "(sd-chown-cgroup)", &pidref);
+ if (r < 0)
+ goto fail;
+
+ if (r == 0) {
+ /* Child: resolve credentials and adjust cgroup ownership, then exit. */
+ uid_t uid = UID_INVALID;
+ gid_t gid = GID_INVALID;
+
+ if (!isempty(s->user)) {
+ const char *user = s->user;
+
+ r = get_user_creds(&user, &uid, &gid, NULL, NULL, 0);
+ if (r < 0) {
+ log_unit_error_errno(UNIT(s), r, "Failed to resolve user \"%s\": %m", user);
+ _exit(EXIT_USER);
+ }
+ }
+
+ if (!isempty(s->group)) {
+ const char *group = s->group;
+
+ r = get_group_creds(&group, &gid, 0);
+ if (r < 0) {
+ log_unit_error_errno(UNIT(s), r, "Failed to resolve group \"%s\": %m", group);
+ _exit(EXIT_GROUP);
+ }
+ }
+
+ r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, uid, gid);
+ if (r < 0) {
+ log_unit_error_errno(UNIT(s), r, "Failed to adjust control group access: %m");
+ _exit(EXIT_CGROUP);
+ }
+
+ _exit(EXIT_SUCCESS);
+ }
+
+ r = unit_watch_pidref(UNIT(s), &pidref, /* exclusive= */ true);
+ if (r < 0)
+ goto fail;
+
+ scope_set_state(s, SCOPE_START_CHOWN);
+
+ return 1;
+fail:
+ s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+ return r;
+}
+
+/* Transition to SCOPE_RUNNING: acquire an invocation ID, migrate the registered PIDs
+ * into the scope's cgroup (failing if none survive the move), arm the runtime limit
+ * and hand PID tracking over to cgroup-based watching. Returns 1 on success. */
+static int scope_enter_running(Scope *s) {
+ Unit *u = UNIT(s);
+ int r;
+
+ assert(s);
+
+ (void) bus_scope_track_controller(s);
+
+ r = unit_acquire_invocation_id(u);
+ if (r < 0)
+ return r;
+
+ unit_export_state_files(u);
+
+ r = unit_attach_pids_to_cgroup(u, u->pids, NULL);
+ if (r < 0) {
+ log_unit_warning_errno(u, r, "Failed to add PIDs to scope's control group: %m");
+ goto fail;
+ }
+ if (r == 0) {
+ r = log_unit_warning_errno(u, SYNTHETIC_ERRNO(ECHILD), "No PIDs left to attach to the scope's control group, refusing.");
+ goto fail;
+ }
+ log_unit_debug(u, "%i %s added to scope's control group.", r, r == 1 ? "process" : "processes");
+
+ s->result = SCOPE_SUCCESS;
+
+ scope_set_state(s, SCOPE_RUNNING);
+
+ /* Set the maximum runtime timeout. */
+ scope_arm_timer(s, /* relative= */ false, scope_running_timeout(s));
+
+ /* On unified we use proper notifications hence we can unwatch the PIDs
+ * we just attached to the scope. This can also be done on legacy as
+ * we're going to update the list of the processes we watch with the
+ * PIDs currently in the scope anyway. */
+ unit_unwatch_all_pids(u);
+
+ /* Start watching the PIDs currently in the scope (legacy hierarchy only) */
+ (void) unit_enqueue_rewatch_pids(u);
+ return 1;
+
+fail:
+ scope_enter_dead(s, SCOPE_FAILURE_RESOURCES);
+ return r;
+}
+
+/* UnitVTable.start hook. init.scope and failed scopes refuse to start (-EPERM), a
+ * scope still tearing down asks the job engine to retry (-EAGAIN). Delegated scopes
+ * with User= first go through the chown helper, everything else runs directly. */
+static int scope_start(Unit *u) {
+ Scope *s = SCOPE(u);
+
+ assert(s);
+
+ if (unit_has_name(u, SPECIAL_INIT_SCOPE))
+ return -EPERM;
+
+ if (s->state == SCOPE_FAILED)
+ return -EPERM;
+
+ /* We can't fulfill this right now, please try again later */
+ if (IN_SET(s->state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL))
+ return -EAGAIN;
+
+ assert(s->state == SCOPE_DEAD);
+
+ if (!u->transient && !MANAGER_IS_RELOADING(u->manager))
+ return -ENOENT;
+
+ (void) unit_realize_cgroup(u);
+ (void) unit_reset_accounting(u);
+
+ /* We check only for User= option to keep behavior consistent with logic for service units,
+ * i.e. having 'Delegate=true Group=foo' w/o specifying User= has no effect. */
+ if (s->user && unit_cgroup_delegate(u))
+ return scope_enter_start_chown(s);
+
+ return scope_enter_running(s);
+}
+
+/* UnitVTable.stop hook: idempotent if a stop is already in flight; otherwise begin
+ * the SIGTERM phase. Returns 1 (stop in progress) or 0 (already stopping). */
+static int scope_stop(Unit *u) {
+ Scope *s = SCOPE(u);
+
+ assert(s);
+
+ if (IN_SET(s->state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL))
+ return 0;
+
+ assert(IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED));
+
+ scope_enter_signal(s, SCOPE_STOP_SIGTERM, SCOPE_SUCCESS);
+ return 1;
+}
+
+/* UnitVTable.reset_failed hook: SCOPE_FAILED -> SCOPE_DEAD, and clear the result. */
+static void scope_reset_failed(Unit *u) {
+ Scope *s = SCOPE(u);
+
+ assert(s);
+
+ if (s->state == SCOPE_FAILED)
+ scope_set_state(s, SCOPE_DEAD);
+
+ s->result = SCOPE_SUCCESS;
+}
+
+/* UnitVTable.get_timeout hook: report the absolute time of the pending timer, if any.
+ * Returns 1 with *timeout set, 0 when no (finite) timeout is armed.
+ * NOTE(review): unlike the other hooks here there is no assert(s); harmless, since
+ * SCOPE(u) of a valid scope unit is non-NULL, but inconsistent. */
+static int scope_get_timeout(Unit *u, usec_t *timeout) {
+ Scope *s = SCOPE(u);
+ usec_t t;
+ int r;
+
+ if (!s->timer_event_source)
+ return 0;
+
+ r = sd_event_source_get_time(s->timer_event_source, &t);
+ if (r < 0)
+ return r;
+ if (t == USEC_INFINITY)
+ return 0;
+
+ *timeout = t;
+ return 1;
+}
+
+/* UnitVTable.serialize hook: persist state, abandonment flag, the controller bus name
+ * and every tracked PID across daemon-reload/reexec. */
+static int scope_serialize(Unit *u, FILE *f, FDSet *fds) {
+ Scope *s = SCOPE(u);
+ PidRef *pid;
+
+ assert(s);
+ assert(f);
+ assert(fds);
+
+ (void) serialize_item(f, "state", scope_state_to_string(s->state));
+ (void) serialize_bool(f, "was-abandoned", s->was_abandoned);
+
+ if (s->controller)
+ (void) serialize_item(f, "controller", s->controller);
+
+ SET_FOREACH(pid, u->pids)
+ serialize_pidref(f, fds, "pids", pid);
+
+ return 0;
+}
+
+/* Counterpart to scope_serialize(). Unparsable values are logged and skipped rather
+ * than treated as fatal, so a partially corrupt serialization degrades gracefully. */
+static int scope_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
+ Scope *s = SCOPE(u);
+ int r;
+
+ assert(u);
+ assert(key);
+ assert(value);
+ assert(fds);
+
+ if (streq(key, "state")) {
+ ScopeState state;
+
+ state = scope_state_from_string(value);
+ if (state < 0)
+ log_unit_debug(u, "Failed to parse state value: %s", value);
+ else
+ s->deserialized_state = state;
+
+ } else if (streq(key, "was-abandoned")) {
+ int k;
+
+ k = parse_boolean(value);
+ if (k < 0)
+ log_unit_debug(u, "Failed to parse boolean value: %s", value);
+ else
+ s->was_abandoned = k;
+ } else if (streq(key, "controller")) {
+
+ r = free_and_strdup(&s->controller, value);
+ if (r < 0)
+ return log_oom();
+
+ } else if (streq(key, "pids")) {
+ _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+
+ if (deserialize_pidref(fds, value, &pidref) >= 0) {
+ r = unit_watch_pidref(u, &pidref, /* exclusive= */ false);
+ if (r < 0)
+ log_unit_debug(u, "Failed to watch PID, ignoring: %s", value);
+ }
+ } else
+ log_unit_debug(u, "Unknown serialization key: %s", key);
+
+ return 0;
+}
+
+/* Called when the scope's cgroup becomes empty: all processes are gone, so finish
+ * deactivation (or tear down an abandoned/running scope) as success. */
+static void scope_notify_cgroup_empty_event(Unit *u) {
+ Scope *s = SCOPE(u);
+ assert(u);
+
+ log_unit_debug(u, "cgroup is empty");
+
+ if (IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL))
+ scope_enter_dead(s, SCOPE_SUCCESS);
+}
+
+/* React to an OOM-kill in the scope's cgroup (either kernel OOM killer or
+ * systemd-oomd). With OOMPolicy=continue this is only logged; otherwise escalate the
+ * stop sequence and record SCOPE_FAILURE_OOM_KILL as the result. */
+static void scope_notify_cgroup_oom_event(Unit *u, bool managed_oom) {
+ Scope *s = SCOPE(u);
+
+ if (managed_oom)
+ log_unit_debug(u, "Process(es) of control group were killed by systemd-oomd.");
+ else
+ log_unit_debug(u, "Process of control group was killed by the OOM killer.");
+
+ if (s->oom_policy == OOM_CONTINUE)
+ return;
+
+ switch (s->state) {
+
+ case SCOPE_START_CHOWN:
+ case SCOPE_RUNNING:
+ scope_enter_signal(s, SCOPE_STOP_SIGTERM, SCOPE_FAILURE_OOM_KILL);
+ break;
+
+ case SCOPE_STOP_SIGTERM:
+ scope_enter_signal(s, SCOPE_STOP_SIGKILL, SCOPE_FAILURE_OOM_KILL);
+ break;
+
+ case SCOPE_STOP_SIGKILL:
+ if (s->result == SCOPE_SUCCESS)
+ s->result = SCOPE_FAILURE_OOM_KILL;
+ break;
+ /* SCOPE_DEAD, SCOPE_ABANDONED, and SCOPE_FAILED end up in default */
+ default:
+ ;
+ }
+}
+
+/* SIGCHLD handler: in SCOPE_START_CHOWN the reaped child is the chown helper, whose
+ * exit status decides between proceeding to running and failing the scope. In all
+ * other states a child's death just prompts a rescan for new PIDs to watch. */
+static void scope_sigchld_event(Unit *u, pid_t pid, int code, int status) {
+ Scope *s = SCOPE(u);
+
+ assert(s);
+
+ if (s->state == SCOPE_START_CHOWN) {
+ if (!is_clean_exit(code, status, EXIT_CLEAN_COMMAND, NULL))
+ scope_enter_dead(s, SCOPE_FAILURE_RESOURCES);
+ else
+ scope_enter_running(s);
+ return;
+ }
+
+ /* If we get a SIGCHLD event for one of the processes we were interested in, then we look for others to
+ * watch, under the assumption that we'll sooner or later get a SIGCHLD for them, as the original
+ * process we watched was probably the parent of them, and they are hence now our children. */
+
+ (void) unit_enqueue_rewatch_pids(u);
+}
+
+/* Timer callback: interpretation of the timeout depends on the current state —
+ * runtime limit (RUNNING), stop timeout with optional SIGKILL escalation
+ * (STOP_SIGTERM/SIGKILL), or user-lookup timeout (START_CHOWN). */
+static int scope_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) {
+ Scope *s = SCOPE(userdata);
+
+ assert(s);
+ assert(s->timer_event_source == source);
+
+ switch (s->state) {
+
+ case SCOPE_RUNNING:
+ log_unit_warning(UNIT(s), "Scope reached runtime time limit. Stopping.");
+ scope_enter_signal(s, SCOPE_STOP_SIGTERM, SCOPE_FAILURE_TIMEOUT);
+ break;
+
+ case SCOPE_STOP_SIGTERM:
+ if (s->kill_context.send_sigkill) {
+ log_unit_warning(UNIT(s), "Stopping timed out. Killing.");
+ scope_enter_signal(s, SCOPE_STOP_SIGKILL, SCOPE_FAILURE_TIMEOUT);
+ } else {
+ log_unit_warning(UNIT(s), "Stopping timed out. Skipping SIGKILL.");
+ scope_enter_dead(s, SCOPE_FAILURE_TIMEOUT);
+ }
+
+ break;
+
+ case SCOPE_STOP_SIGKILL:
+ log_unit_warning(UNIT(s), "Still around after SIGKILL. Ignoring.");
+ scope_enter_dead(s, SCOPE_FAILURE_TIMEOUT);
+ break;
+
+ case SCOPE_START_CHOWN:
+ log_unit_warning(UNIT(s), "User lookup timed out. Entering failed state.");
+ scope_enter_dead(s, SCOPE_FAILURE_TIMEOUT);
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ return 0;
+}
+
+/* D-Bus-triggered Abandon(): the controller gives up on the scope, which stays active
+ * (SCOPE_ABANDONED) while we take over watching its remaining processes. Refused for
+ * init.scope (-EPERM) and for scopes not running/already abandoned (-ESTALE). */
+int scope_abandon(Scope *s) {
+ assert(s);
+
+ if (unit_has_name(UNIT(s), SPECIAL_INIT_SCOPE))
+ return -EPERM;
+
+ if (!IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED))
+ return -ESTALE;
+
+ s->was_abandoned = true;
+
+ s->controller = mfree(s->controller);
+ s->controller_track = sd_bus_track_unref(s->controller_track);
+
+ scope_set_state(s, SCOPE_ABANDONED);
+
+ /* The client is no longer watching the remaining processes, so let's step in here, under the assumption that
+ * the remaining processes will be sooner or later reassigned to us as parent. */
+ (void) unit_enqueue_rewatch_pids(UNIT(s));
+
+ return 0;
+}
+
+/* UnitVTable.active_state hook: translate via the table at the top of the file. */
+static UnitActiveState scope_active_state(Unit *u) {
+ assert(u);
+
+ return state_translation_table[SCOPE(u)->state];
+}
+
+/* UnitVTable.sub_state_to_string hook: scope-specific state name. */
+static const char *scope_sub_state_to_string(Unit *u) {
+ assert(u);
+
+ return scope_state_to_string(SCOPE(u)->state);
+}
+
+/* UnitVTable.enumerate_perpetual hook: unconditionally create init.scope. */
+static void scope_enumerate_perpetual(Manager *m) {
+ Unit *u;
+ int r;
+
+ assert(m);
+
+ /* Let's unconditionally add the "init.scope" special unit
+ * that encapsulates PID 1. Note that PID 1 already is in the
+ * cgroup for this, we hence just need to allocate the object
+ * for it and that's it. */
+
+ u = manager_get_unit(m, SPECIAL_INIT_SCOPE);
+ if (!u) {
+ r = unit_new_for_name(m, sizeof(Scope), SPECIAL_INIT_SCOPE, &u);
+ if (r < 0) {
+ log_error_errno(r, "Failed to allocate the special " SPECIAL_INIT_SCOPE " unit: %m");
+ return;
+ }
+ }
+
+ u->transient = true;
+ u->perpetual = true;
+ SCOPE(u)->deserialized_state = SCOPE_RUNNING;
+
+ unit_add_to_load_queue(u);
+ unit_add_to_dbus_queue(u);
+ /* Enqueue an explicit cgroup realization here. Unlike other cgroups this one already exists and is
+ * populated (by us, after all!) already, even when we are not in a reload cycle. Hence we cannot
+ * apply the settings at creation time anymore, but let's at least apply them asynchronously. */
+ unit_add_to_cgroup_realize_queue(u);
+}
+
+/* String names for ScopeResult, as exposed via the unit's "Result" property. */
+static const char* const scope_result_table[_SCOPE_RESULT_MAX] = {
+ [SCOPE_SUCCESS] = "success",
+ [SCOPE_FAILURE_RESOURCES] = "resources",
+ [SCOPE_FAILURE_TIMEOUT] = "timeout",
+ [SCOPE_FAILURE_OOM_KILL] = "oom-kill",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(scope_result, ScopeResult);
+
+/* Unit type implementation table for .scope units. Note .once_only: a scope cannot
+ * be started again after it stopped. */
+const UnitVTable scope_vtable = {
+ .object_size = sizeof(Scope),
+ .cgroup_context_offset = offsetof(Scope, cgroup_context),
+ .kill_context_offset = offsetof(Scope, kill_context),
+
+ .sections =
+ "Unit\0"
+ "Scope\0"
+ "Install\0",
+ .private_section = "Scope",
+
+ .can_transient = true,
+ .can_delegate = true,
+ .can_fail = true,
+ .once_only = true,
+ .can_set_managed_oom = true,
+
+ .init = scope_init,
+ .load = scope_load,
+ .done = scope_done,
+
+ .coldplug = scope_coldplug,
+
+ .dump = scope_dump,
+
+ .start = scope_start,
+ .stop = scope_stop,
+
+ .freeze = unit_freeze_vtable_common,
+ .thaw = unit_thaw_vtable_common,
+
+ .get_timeout = scope_get_timeout,
+
+ .serialize = scope_serialize,
+ .deserialize_item = scope_deserialize_item,
+
+ .active_state = scope_active_state,
+ .sub_state_to_string = scope_sub_state_to_string,
+
+ .sigchld_event = scope_sigchld_event,
+
+ .reset_failed = scope_reset_failed,
+
+ .notify_cgroup_empty = scope_notify_cgroup_empty_event,
+ .notify_cgroup_oom = scope_notify_cgroup_oom_event,
+
+ .bus_set_property = bus_scope_set_property,
+ .bus_commit_properties = bus_scope_commit_properties,
+
+ .enumerate_perpetual = scope_enumerate_perpetual,
+};
diff --git a/src/core/scope.h b/src/core/scope.h
new file mode 100644
index 0000000..c9574a3
--- /dev/null
+++ b/src/core/scope.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct Scope Scope;
+
+#include "cgroup.h"
+#include "kill.h"
+#include "unit.h"
+
+typedef enum ScopeResult {
+ SCOPE_SUCCESS,
+ SCOPE_FAILURE_RESOURCES,
+ SCOPE_FAILURE_TIMEOUT,
+ SCOPE_FAILURE_OOM_KILL,
+ _SCOPE_RESULT_MAX,
+ _SCOPE_RESULT_INVALID = -EINVAL,
+} ScopeResult;
+
+struct Scope {
+ Unit meta;
+
+ CGroupContext cgroup_context;
+ KillContext kill_context;
+
+ ScopeState state, deserialized_state;
+ ScopeResult result;
+
+ usec_t runtime_max_usec;
+ usec_t runtime_rand_extra_usec;
+ usec_t timeout_stop_usec;
+
+ char *controller;
+ sd_bus_track *controller_track;
+
+ bool was_abandoned;
+
+ sd_event_source *timer_event_source;
+
+ char *user;
+ char *group;
+
+ OOMPolicy oom_policy;
+};
+
+extern const UnitVTable scope_vtable;
+
+int scope_abandon(Scope *s);
+
+const char* scope_result_to_string(ScopeResult i) _const_;
+ScopeResult scope_result_from_string(const char *s) _pure_;
+
+DEFINE_CAST(SCOPE, Scope);
diff --git a/src/core/selinux-access.c b/src/core/selinux-access.c
new file mode 100644
index 0000000..62181a6
--- /dev/null
+++ b/src/core/selinux-access.c
@@ -0,0 +1,288 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "selinux-access.h"
+
+#if HAVE_SELINUX
+
+#include <errno.h>
+#include <selinux/avc.h>
+#include <selinux/selinux.h>
+#if HAVE_AUDIT
+#include <libaudit.h>
+#endif
+
+#include "sd-bus.h"
+
+#include "alloc-util.h"
+#include "audit-fd.h"
+#include "bus-util.h"
+#include "errno-util.h"
+#include "format-util.h"
+#include "log.h"
+#include "path-util.h"
+#include "selinux-util.h"
+#include "stdio-util.h"
+#include "strv.h"
+
+static bool initialized = false;
+
+struct audit_info {
+ sd_bus_creds *creds;
+ const char *path;
+ const char *cmdline;
+ const char *function;
+};
+
+/*
+ Any time an access gets denied this callback will be called
+ with the audit data. We then need to just copy the audit data into the msgbuf.
+*/
+static int audit_callback(
+ void *auditdata,
+ security_class_t cls,
+ char *msgbuf,
+ size_t msgbufsize) {
+
+ const struct audit_info *audit = auditdata;
+ uid_t uid = 0, login_uid = 0;
+ gid_t gid = 0;
+ char login_uid_buf[DECIMAL_STR_MAX(uid_t) + 1] = "n/a";
+ char uid_buf[DECIMAL_STR_MAX(uid_t) + 1] = "n/a";
+ char gid_buf[DECIMAL_STR_MAX(gid_t) + 1] = "n/a";
+
+ if (sd_bus_creds_get_audit_login_uid(audit->creds, &login_uid) >= 0)
+ xsprintf(login_uid_buf, UID_FMT, login_uid);
+ if (sd_bus_creds_get_euid(audit->creds, &uid) >= 0)
+ xsprintf(uid_buf, UID_FMT, uid);
+ if (sd_bus_creds_get_egid(audit->creds, &gid) >= 0)
+ xsprintf(gid_buf, GID_FMT, gid);
+
+ (void) snprintf(msgbuf, msgbufsize,
+ "auid=%s uid=%s gid=%s%s%s%s%s%s%s%s%s%s",
+ login_uid_buf, uid_buf, gid_buf,
+ audit->path ? " path=\"" : "", strempty(audit->path), audit->path ? "\"" : "",
+ audit->cmdline ? " cmdline=\"" : "", strempty(audit->cmdline), audit->cmdline ? "\"" : "",
+ audit->function ? " function=\"" : "", strempty(audit->function), audit->function ? "\"" : "");
+
+ return 0;
+}
+
+static int callback_type_to_priority(int type) {
+ switch (type) {
+
+ case SELINUX_ERROR:
+ return LOG_ERR;
+
+ case SELINUX_WARNING:
+ return LOG_WARNING;
+
+ case SELINUX_INFO:
+ return LOG_INFO;
+
+ case SELINUX_AVC:
+ default:
+ return LOG_NOTICE;
+ }
+}
+
+/*
+ libselinux uses this callback when access gets denied or other
+ events happen. If audit is turned on, messages will be reported
+ using audit netlink, otherwise they will be logged using the usual
+ channels.
+
+ Code copied from dbus and modified.
+*/
+_printf_(2, 3) static int log_callback(int type, const char *fmt, ...) {
+ va_list ap;
+ const char *fmt2;
+
+#if HAVE_AUDIT
+ int fd;
+
+ fd = get_audit_fd();
+
+ if (fd >= 0) {
+ _cleanup_free_ char *buf = NULL;
+ int r;
+
+ va_start(ap, fmt);
+ r = vasprintf(&buf, fmt, ap);
+ va_end(ap);
+
+ if (r >= 0) {
+ if (type == SELINUX_AVC)
+ audit_log_user_avc_message(get_audit_fd(), AUDIT_USER_AVC, buf, NULL, NULL, NULL, getuid());
+ else if (type == SELINUX_ERROR)
+ audit_log_user_avc_message(get_audit_fd(), AUDIT_USER_SELINUX_ERR, buf, NULL, NULL, NULL, getuid());
+
+ return 0;
+ }
+ }
+#endif
+
+ fmt2 = strjoina("selinux: ", fmt);
+
+ va_start(ap, fmt);
+
+ DISABLE_WARNING_FORMAT_NONLITERAL;
+ log_internalv(LOG_AUTH | callback_type_to_priority(type),
+ 0, PROJECT_FILE, __LINE__, __func__,
+ fmt2, ap);
+ REENABLE_WARNING;
+ va_end(ap);
+
+ return 0;
+}
+
+static int access_init(sd_bus_error *error) {
+ int r;
+
+ if (!mac_selinux_use())
+ return 0;
+
+ if (initialized)
+ return 1;
+
+ if (avc_open(NULL, 0) != 0) {
+ r = -errno; /* Save original errno for later */
+
+ bool enforce = security_getenforce() != 0;
+ log_full_errno(enforce ? LOG_ERR : LOG_WARNING, r, "Failed to open the SELinux AVC: %m");
+
+ /* If enforcement isn't on, then let's suppress this error, and just don't do any AVC checks.
+ * The warning we printed is hence all the admin will see. */
+ if (!enforce)
+ return 0;
+
+ /* Return an access denied error based on the original errno, if we couldn't load the AVC but
+ * enforcing mode was on, or we couldn't determine whether it is one. */
+ errno = -r;
+ return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Failed to open the SELinux AVC: %m");
+ }
+
+ selinux_set_callback(SELINUX_CB_AUDIT, (union selinux_callback) { .func_audit = audit_callback });
+ selinux_set_callback(SELINUX_CB_LOG, (union selinux_callback) { .func_log = log_callback });
+
+ initialized = true;
+ return 1;
+}
+
+/*
+ This function communicates with the kernel to check whether or not it should
+ allow the access.
+ If the machine is in permissive mode it will return ok. Audit messages will
+ still be generated if the access would be denied in enforcing mode.
+*/
+int mac_selinux_access_check_internal(
+ sd_bus_message *message,
+ const char *unit_path,
+ const char *unit_context,
+ const char *permission,
+ const char *function,
+ sd_bus_error *error) {
+
+ _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+ const char *tclass, *scon, *acon;
+ _cleanup_free_ char *cl = NULL;
+ _cleanup_freecon_ char *fcon = NULL;
+ char **cmdline = NULL;
+ bool enforce;
+ int r = 0;
+
+ assert(message);
+ assert(permission);
+ assert(function);
+ assert(error);
+
+ r = access_init(error);
+ if (r <= 0)
+ return r;
+
+ /* delay call until we checked in `access_init()` if SELinux is actually enabled */
+ enforce = mac_selinux_enforcing();
+
+ r = sd_bus_query_sender_creds(
+ message,
+ SD_BUS_CREDS_PID|SD_BUS_CREDS_EUID|SD_BUS_CREDS_EGID|
+ SD_BUS_CREDS_CMDLINE|SD_BUS_CREDS_AUDIT_LOGIN_UID|
+ SD_BUS_CREDS_SELINUX_CONTEXT|
+ SD_BUS_CREDS_AUGMENT /* get more bits from /proc */,
+ &creds);
+ if (r < 0)
+ return r;
+
+ /* The SELinux context is something we really should have gotten directly from the message or sender,
+ * and not be an augmented field. If it was augmented we cannot use it for authorization, since this
+ * is racy and vulnerable. Let's add an extra check, just in case, even though this really shouldn't
+ * be possible. */
+ assert_return((sd_bus_creds_get_augmented_mask(creds) & SD_BUS_CREDS_SELINUX_CONTEXT) == 0, -EPERM);
+
+ r = sd_bus_creds_get_selinux_context(creds, &scon);
+ if (r < 0)
+ return r;
+
+ if (unit_context) {
+ /* Nice! The unit comes with a SELinux context read from the unit file */
+ acon = unit_context;
+ tclass = "service";
+ } else {
+ /* If no unit context is known, use our own */
+ if (getcon_raw(&fcon) < 0) {
+ log_warning_errno(errno, "SELinux getcon_raw() failed%s (perm=%s): %m",
+ enforce ? "" : ", ignoring",
+ permission);
+ if (!enforce)
+ return 0;
+
+ return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Failed to get current context: %m");
+ }
+ if (!fcon) {
+ if (!enforce)
+ return 0;
+
+ return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "We appear not to have any SELinux context: %m");
+ }
+
+ acon = fcon;
+ tclass = "system";
+ }
+
+ sd_bus_creds_get_cmdline(creds, &cmdline);
+ cl = strv_join(cmdline, " ");
+
+ struct audit_info audit_info = {
+ .creds = creds,
+ .path = unit_path,
+ .cmdline = cl,
+ .function = function,
+ };
+
+ r = selinux_check_access(scon, acon, tclass, permission, &audit_info);
+ if (r < 0) {
+ errno = -(r = errno_or_else(EPERM));
+
+ if (enforce)
+ sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "SELinux policy denies access: %m");
+ }
+
+ log_full_errno_zerook(LOG_DEBUG, r,
+ "SELinux access check scon=%s tcon=%s tclass=%s perm=%s state=%s function=%s path=%s cmdline=%s: %m",
+ scon, acon, tclass, permission, enforce ? "enforcing" : "permissive", function, strna(unit_path), strna(empty_to_null(cl)));
+ return enforce ? r : 0;
+}
+
+#else /* HAVE_SELINUX */
+
+int mac_selinux_access_check_internal(
+ sd_bus_message *message,
+ const char *unit_path,
+ const char *unit_label,
+ const char *permission,
+ const char *function,
+ sd_bus_error *error) {
+
+ return 0;
+}
+
+#endif /* HAVE_SELINUX */
diff --git a/src/core/selinux-access.h b/src/core/selinux-access.h
new file mode 100644
index 0000000..dc8da9e
--- /dev/null
+++ b/src/core/selinux-access.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+
+#include "manager.h"
+
+int mac_selinux_access_check_internal(sd_bus_message *message, const char *unit_path, const char *unit_label, const char *permission, const char *function, sd_bus_error *error);
+
+#define mac_selinux_access_check(message, permission, error) \
+ mac_selinux_access_check_internal((message), NULL, NULL, (permission), __func__, (error))
+
+#define mac_selinux_unit_access_check(unit, message, permission, error) \
+ mac_selinux_access_check_internal((message), (unit)->fragment_path, (unit)->access_selinux_context, (permission), __func__, (error))
diff --git a/src/core/selinux-setup.c b/src/core/selinux-setup.c
new file mode 100644
index 0000000..bc1a249
--- /dev/null
+++ b/src/core/selinux-setup.c
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#if HAVE_SELINUX
+#include <selinux/selinux.h>
+#endif
+
+#include "sd-messages.h"
+
+#include "initrd-util.h"
+#include "log.h"
+#include "macro.h"
+#include "selinux-setup.h"
+#include "selinux-util.h"
+#include "string-util.h"
+#include "time-util.h"
+
+#if HAVE_SELINUX
+_printf_(2,3)
+static int null_log(int type, const char *fmt, ...) {
+ return 0;
+}
+#endif
+
+int mac_selinux_setup(bool *loaded_policy) {
+
+#if HAVE_SELINUX
+ int enforce = 0;
+ usec_t before_load, after_load;
+ char *con;
+ int r;
+ bool initialized;
+
+ assert(loaded_policy);
+
+ /* Turn off all of SELinux' own logging, we want to do that */
+ selinux_set_callback(SELINUX_CB_LOG, (const union selinux_callback) { .func_log = null_log });
+
+ /* Don't load policy in the initrd if we don't appear to have it. For the real root, we check below
+ * if we've already loaded policy, and return gracefully. */
+ if (in_initrd() && access(selinux_path(), F_OK) < 0)
+ return 0;
+
+ /* Already initialized by somebody else? */
+ r = getcon_raw(&con);
+ /* getcon_raw can return 0, and still give us a NULL pointer if /proc/self/attr/current is
+ * empty. SELinux guarantees this won't happen, but that file isn't specific to SELinux, and may be
+ * provided by some other arbitrary LSM with different semantics. */
+ if (r == 0 && con) {
+ initialized = !streq(con, "kernel");
+ freecon(con);
+ } else
+ initialized = false;
+
+ /* Make sure we have no fds open while loading the policy and
+ * transitioning */
+ log_close();
+
+ /* Now load the policy */
+ before_load = now(CLOCK_MONOTONIC);
+ r = selinux_init_load_policy(&enforce);
+ if (r == 0) {
+ _cleanup_(mac_selinux_freep) char *label = NULL;
+
+ mac_selinux_retest();
+
+ /* Transition to the new context */
+ r = mac_selinux_get_create_label_from_exe(SYSTEMD_BINARY_PATH, &label);
+ if (r < 0 || !label) {
+ log_open();
+ log_error("Failed to compute init label, ignoring.");
+ } else {
+ r = setcon_raw(label);
+
+ log_open();
+ if (r < 0)
+ log_error("Failed to transition into init label '%s', ignoring.", label);
+ }
+
+ after_load = now(CLOCK_MONOTONIC);
+
+ log_info("Successfully loaded SELinux policy in %s.",
+ FORMAT_TIMESPAN(after_load - before_load, 0));
+
+ *loaded_policy = true;
+
+ } else {
+ log_open();
+
+ if (enforce > 0) {
+ if (!initialized)
+ return log_struct_errno(LOG_EMERG, SYNTHETIC_ERRNO(EIO),
+ LOG_MESSAGE("Failed to load SELinux policy: %m"),
+ "MESSAGE_ID=" SD_MESSAGE_SELINUX_FAILED_STR);
+
+ log_warning("Failed to load new SELinux policy. Continuing with old policy.");
+ } else
+ log_debug("Unable to load SELinux policy. Ignoring.");
+ }
+#endif
+
+ return 0;
+}
diff --git a/src/core/selinux-setup.h b/src/core/selinux-setup.h
new file mode 100644
index 0000000..cdff51d
--- /dev/null
+++ b/src/core/selinux-setup.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+int mac_selinux_setup(bool *loaded_policy);
diff --git a/src/core/service.c b/src/core/service.c
new file mode 100644
index 0000000..060ac08
--- /dev/null
+++ b/src/core/service.c
@@ -0,0 +1,5161 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <math.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "sd-messages.h"
+
+#include "alloc-util.h"
+#include "async.h"
+#include "bus-error.h"
+#include "bus-kernel.h"
+#include "bus-util.h"
+#include "chase.h"
+#include "constants.h"
+#include "dbus-service.h"
+#include "dbus-unit.h"
+#include "devnum-util.h"
+#include "env-util.h"
+#include "escape.h"
+#include "exit-status.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "load-dropin.h"
+#include "load-fragment.h"
+#include "log.h"
+#include "manager.h"
+#include "missing_audit.h"
+#include "open-file.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "random-util.h"
+#include "serialize.h"
+#include "service.h"
+#include "signal-util.h"
+#include "special.h"
+#include "stdio-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit-name.h"
+#include "unit.h"
+#include "utf8.h"
+
+#define service_spawn(...) service_spawn_internal(__func__, __VA_ARGS__)
+
+static const UnitActiveState state_translation_table[_SERVICE_STATE_MAX] = {
+ [SERVICE_DEAD] = UNIT_INACTIVE,
+ [SERVICE_CONDITION] = UNIT_ACTIVATING,
+ [SERVICE_START_PRE] = UNIT_ACTIVATING,
+ [SERVICE_START] = UNIT_ACTIVATING,
+ [SERVICE_START_POST] = UNIT_ACTIVATING,
+ [SERVICE_RUNNING] = UNIT_ACTIVE,
+ [SERVICE_EXITED] = UNIT_ACTIVE,
+ [SERVICE_RELOAD] = UNIT_RELOADING,
+ [SERVICE_RELOAD_SIGNAL] = UNIT_RELOADING,
+ [SERVICE_RELOAD_NOTIFY] = UNIT_RELOADING,
+ [SERVICE_STOP] = UNIT_DEACTIVATING,
+ [SERVICE_STOP_WATCHDOG] = UNIT_DEACTIVATING,
+ [SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING,
+ [SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING,
+ [SERVICE_STOP_POST] = UNIT_DEACTIVATING,
+ [SERVICE_FINAL_WATCHDOG] = UNIT_DEACTIVATING,
+ [SERVICE_FINAL_SIGTERM] = UNIT_DEACTIVATING,
+ [SERVICE_FINAL_SIGKILL] = UNIT_DEACTIVATING,
+ [SERVICE_FAILED] = UNIT_FAILED,
+ [SERVICE_DEAD_BEFORE_AUTO_RESTART] = UNIT_INACTIVE,
+ [SERVICE_FAILED_BEFORE_AUTO_RESTART] = UNIT_FAILED,
+ [SERVICE_DEAD_RESOURCES_PINNED] = UNIT_INACTIVE,
+ [SERVICE_AUTO_RESTART] = UNIT_ACTIVATING,
+ [SERVICE_AUTO_RESTART_QUEUED] = UNIT_ACTIVATING,
+ [SERVICE_CLEANING] = UNIT_MAINTENANCE,
+};
+
+/* For Type=idle we never want to delay any other jobs, hence we
+ * consider idle jobs active as soon as we start working on them */
+static const UnitActiveState state_translation_table_idle[_SERVICE_STATE_MAX] = {
+ [SERVICE_DEAD] = UNIT_INACTIVE,
+ [SERVICE_CONDITION] = UNIT_ACTIVE,
+ [SERVICE_START_PRE] = UNIT_ACTIVE,
+ [SERVICE_START] = UNIT_ACTIVE,
+ [SERVICE_START_POST] = UNIT_ACTIVE,
+ [SERVICE_RUNNING] = UNIT_ACTIVE,
+ [SERVICE_EXITED] = UNIT_ACTIVE,
+ [SERVICE_RELOAD] = UNIT_RELOADING,
+ [SERVICE_RELOAD_SIGNAL] = UNIT_RELOADING,
+ [SERVICE_RELOAD_NOTIFY] = UNIT_RELOADING,
+ [SERVICE_STOP] = UNIT_DEACTIVATING,
+ [SERVICE_STOP_WATCHDOG] = UNIT_DEACTIVATING,
+ [SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING,
+ [SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING,
+ [SERVICE_STOP_POST] = UNIT_DEACTIVATING,
+ [SERVICE_FINAL_WATCHDOG] = UNIT_DEACTIVATING,
+ [SERVICE_FINAL_SIGTERM] = UNIT_DEACTIVATING,
+ [SERVICE_FINAL_SIGKILL] = UNIT_DEACTIVATING,
+ [SERVICE_FAILED] = UNIT_FAILED,
+ [SERVICE_DEAD_BEFORE_AUTO_RESTART] = UNIT_INACTIVE,
+ [SERVICE_FAILED_BEFORE_AUTO_RESTART] = UNIT_FAILED,
+ [SERVICE_DEAD_RESOURCES_PINNED] = UNIT_INACTIVE,
+ [SERVICE_AUTO_RESTART] = UNIT_ACTIVATING,
+ [SERVICE_AUTO_RESTART_QUEUED] = UNIT_ACTIVATING,
+ [SERVICE_CLEANING] = UNIT_MAINTENANCE,
+};
+
+static int service_dispatch_inotify_io(sd_event_source *source, int fd, uint32_t events, void *userdata);
+static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata);
+static int service_dispatch_watchdog(sd_event_source *source, usec_t usec, void *userdata);
+static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t events, void *userdata);
+
+static void service_enter_signal(Service *s, ServiceState state, ServiceResult f);
+static void service_enter_reload_by_notify(Service *s);
+
+static void service_init(Unit *u) {
+ Service *s = SERVICE(u);
+
+ assert(u);
+ assert(u->load_state == UNIT_STUB);
+
+ s->timeout_start_usec = u->manager->defaults.timeout_start_usec;
+ s->timeout_stop_usec = u->manager->defaults.timeout_stop_usec;
+ s->timeout_abort_usec = u->manager->defaults.timeout_abort_usec;
+ s->timeout_abort_set = u->manager->defaults.timeout_abort_set;
+ s->restart_usec = u->manager->defaults.restart_usec;
+ s->restart_max_delay_usec = USEC_INFINITY;
+ s->runtime_max_usec = USEC_INFINITY;
+ s->type = _SERVICE_TYPE_INVALID;
+ s->socket_fd = -EBADF;
+ s->stdin_fd = s->stdout_fd = s->stderr_fd = -EBADF;
+ s->guess_main_pid = true;
+ s->main_pid = PIDREF_NULL;
+ s->control_pid = PIDREF_NULL;
+ s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID;
+
+ s->exec_context.keyring_mode = MANAGER_IS_SYSTEM(u->manager) ?
+ EXEC_KEYRING_PRIVATE : EXEC_KEYRING_INHERIT;
+
+ s->notify_access_override = _NOTIFY_ACCESS_INVALID;
+
+ s->watchdog_original_usec = USEC_INFINITY;
+
+ s->oom_policy = _OOM_POLICY_INVALID;
+ s->reload_begin_usec = USEC_INFINITY;
+ s->reload_signal = SIGHUP;
+
+ s->fd_store_preserve_mode = EXEC_PRESERVE_RESTART;
+}
+
+static void service_unwatch_control_pid(Service *s) {
+ assert(s);
+
+ if (!pidref_is_set(&s->control_pid))
+ return;
+
+ unit_unwatch_pidref(UNIT(s), &s->control_pid);
+ pidref_done(&s->control_pid);
+}
+
+static void service_unwatch_main_pid(Service *s) {
+ assert(s);
+
+ if (!pidref_is_set(&s->main_pid))
+ return;
+
+ unit_unwatch_pidref(UNIT(s), &s->main_pid);
+ pidref_done(&s->main_pid);
+}
+
+static void service_unwatch_pid_file(Service *s) {
+ if (!s->pid_file_pathspec)
+ return;
+
+ log_unit_debug(UNIT(s), "Stopping watch for PID file %s", s->pid_file_pathspec->path);
+ path_spec_unwatch(s->pid_file_pathspec);
+ path_spec_done(s->pid_file_pathspec);
+ s->pid_file_pathspec = mfree(s->pid_file_pathspec);
+}
+
+static int service_set_main_pidref(Service *s, PidRef *pidref) {
+ int r;
+
+ assert(s);
+
+ /* Takes ownership of the specified pidref on success, but not on failure. */
+
+ if (!pidref_is_set(pidref))
+ return -ESRCH;
+
+ if (pidref->pid <= 1)
+ return -EINVAL;
+
+ if (pidref_is_self(pidref))
+ return -EINVAL;
+
+ if (pidref_equal(&s->main_pid, pidref) && s->main_pid_known) {
+ pidref_done(pidref);
+ return 0;
+ }
+
+ if (!pidref_equal(&s->main_pid, pidref)) {
+ service_unwatch_main_pid(s);
+ exec_status_start(&s->main_exec_status, pidref->pid);
+ }
+
+ s->main_pid = TAKE_PIDREF(*pidref);
+ s->main_pid_known = true;
+
+ r = pidref_is_my_child(&s->main_pid);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "Can't determine if process "PID_FMT" is our child, assuming it is not: %m", s->main_pid.pid);
+ else if (r == 0)
+ log_unit_warning(UNIT(s), "Supervising process "PID_FMT" which is not our child. We'll most likely not notice when it exits.", s->main_pid.pid);
+
+ s->main_pid_alien = r <= 0;
+ return 0;
+}
+
+void service_release_socket_fd(Service *s) {
+ assert(s);
+
+ if (s->socket_fd < 0 && !UNIT_ISSET(s->accept_socket) && !s->socket_peer)
+ return;
+
+ log_unit_debug(UNIT(s), "Closing connection socket.");
+
+ /* Undo the effect of service_set_socket_fd(). */
+
+ s->socket_fd = asynchronous_close(s->socket_fd);
+
+ if (UNIT_ISSET(s->accept_socket)) {
+ socket_connection_unref(SOCKET(UNIT_DEREF(s->accept_socket)));
+ unit_ref_unset(&s->accept_socket);
+ }
+
+ s->socket_peer = socket_peer_unref(s->socket_peer);
+}
+
+static void service_override_notify_access(Service *s, NotifyAccess notify_access_override) {
+ assert(s);
+
+ s->notify_access_override = notify_access_override;
+
+ log_unit_debug(UNIT(s), "notify_access=%s", notify_access_to_string(s->notify_access));
+ log_unit_debug(UNIT(s), "notify_access_override=%s", notify_access_to_string(s->notify_access_override));
+}
+
+static void service_stop_watchdog(Service *s) {
+ assert(s);
+
+ s->watchdog_event_source = sd_event_source_disable_unref(s->watchdog_event_source);
+ s->watchdog_timestamp = DUAL_TIMESTAMP_NULL;
+}
+
+static void service_start_watchdog(Service *s) {
+ usec_t watchdog_usec;
+ int r;
+
+ assert(s);
+
+ watchdog_usec = service_get_watchdog_usec(s);
+ if (!timestamp_is_set(watchdog_usec)) {
+ service_stop_watchdog(s);
+ return;
+ }
+
+ if (s->watchdog_event_source) {
+ r = sd_event_source_set_time(s->watchdog_event_source, usec_add(s->watchdog_timestamp.monotonic, watchdog_usec));
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to reset watchdog timer: %m");
+ return;
+ }
+
+ r = sd_event_source_set_enabled(s->watchdog_event_source, SD_EVENT_ONESHOT);
+ } else {
+ r = sd_event_add_time(
+ UNIT(s)->manager->event,
+ &s->watchdog_event_source,
+ CLOCK_MONOTONIC,
+ usec_add(s->watchdog_timestamp.monotonic, watchdog_usec), 0,
+ service_dispatch_watchdog, s);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to add watchdog timer: %m");
+ return;
+ }
+
+ (void) sd_event_source_set_description(s->watchdog_event_source, "service-watchdog");
+
+ /* Let's process everything else which might be a sign
+ * of living before we consider a service died. */
+ r = sd_event_source_set_priority(s->watchdog_event_source, SD_EVENT_PRIORITY_IDLE);
+ }
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "Failed to install watchdog timer: %m");
+}
+
+usec_t service_restart_usec_next(Service *s) {
+ unsigned n_restarts_next;
+
+ assert(s);
+
+ /* When the service state is in SERVICE_*_BEFORE_AUTO_RESTART or SERVICE_AUTO_RESTART, we still need
+ * to add 1 to s->n_restarts manually, because s->n_restarts is not updated until a restart job is
+ * enqueued, i.e. state has transitioned to SERVICE_AUTO_RESTART_QUEUED. */
+ n_restarts_next = s->n_restarts + (s->state == SERVICE_AUTO_RESTART_QUEUED ? 0 : 1);
+
+ if (n_restarts_next <= 1 ||
+ s->restart_steps == 0 ||
+ s->restart_usec == 0 ||
+ s->restart_max_delay_usec == USEC_INFINITY ||
+ s->restart_usec >= s->restart_max_delay_usec)
+ return s->restart_usec;
+
+ if (n_restarts_next > s->restart_steps)
+ return s->restart_max_delay_usec;
+
+ /* Enforced in service_verify() and above */
+ assert(s->restart_max_delay_usec > s->restart_usec);
+
+ /* r_i / r_0 = (r_n / r_0) ^ (i / n)
+ * where,
+ * r_0 : initial restart usec (s->restart_usec),
+ * r_i : i-th restart usec (value),
+ * r_n : maximum restart usec (s->restart_max_delay_usec),
+ * i : index of the next step (n_restarts_next - 1)
+ * n : num maximum steps (s->restart_steps) */
+ return (usec_t) (s->restart_usec * powl((long double) s->restart_max_delay_usec / s->restart_usec,
+ (long double) (n_restarts_next - 1) / s->restart_steps));
+}
+
+static void service_extend_event_source_timeout(Service *s, sd_event_source *source, usec_t extended) {
+ usec_t current;
+ int r;
+
+ assert(s);
+
+ /* Extends the specified event source timer to at least the specified time, unless it is already later
+ * anyway. */
+
+ if (!source)
+ return;
+
+ r = sd_event_source_get_time(source, &current);
+ if (r < 0) {
+ const char *desc;
+ (void) sd_event_source_get_description(source, &desc);
+ log_unit_warning_errno(UNIT(s), r, "Failed to retrieve timeout time for event source '%s', ignoring: %m", strna(desc));
+ return;
+ }
+
+ if (current >= extended) /* Current timeout is already longer, ignore this. */
+ return;
+
+ r = sd_event_source_set_time(source, extended);
+ if (r < 0) {
+ const char *desc;
+ (void) sd_event_source_get_description(source, &desc);
+ log_unit_warning_errno(UNIT(s), r, "Failed to set timeout time for event source '%s', ignoring: %m", strna(desc));
+ }
+}
+
+static void service_extend_timeout(Service *s, usec_t extend_timeout_usec) {
+ usec_t extended;
+
+ assert(s);
+
+ if (!timestamp_is_set(extend_timeout_usec))
+ return;
+
+ extended = usec_add(now(CLOCK_MONOTONIC), extend_timeout_usec);
+
+ service_extend_event_source_timeout(s, s->timer_event_source, extended);
+ service_extend_event_source_timeout(s, s->watchdog_event_source, extended);
+}
+
+static void service_reset_watchdog(Service *s) {
+ assert(s);
+
+ dual_timestamp_now(&s->watchdog_timestamp);
+ service_start_watchdog(s);
+}
+
+static void service_override_watchdog_timeout(Service *s, usec_t watchdog_override_usec) {
+ assert(s);
+
+ s->watchdog_override_enable = true;
+ s->watchdog_override_usec = watchdog_override_usec;
+ service_reset_watchdog(s);
+
+ log_unit_debug(UNIT(s), "watchdog_usec="USEC_FMT, s->watchdog_usec);
+ log_unit_debug(UNIT(s), "watchdog_override_usec="USEC_FMT, s->watchdog_override_usec);
+}
+
+static ServiceFDStore* service_fd_store_unlink(ServiceFDStore *fs) {
+ if (!fs)
+ return NULL;
+
+ if (fs->service) {
+ assert(fs->service->n_fd_store > 0);
+ LIST_REMOVE(fd_store, fs->service->fd_store, fs);
+ fs->service->n_fd_store--;
+ }
+
+ sd_event_source_disable_unref(fs->event_source);
+
+ free(fs->fdname);
+ asynchronous_close(fs->fd);
+ return mfree(fs);
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(ServiceFDStore*, service_fd_store_unlink);
+
+static void service_release_fd_store(Service *s) {
+ assert(s);
+
+ if (!s->fd_store)
+ return;
+
+ log_unit_debug(UNIT(s), "Releasing all stored fds");
+
+ while (s->fd_store)
+ service_fd_store_unlink(s->fd_store);
+
+ assert(s->n_fd_store == 0);
+}
+
+static void service_release_stdio_fd(Service *s) {
+ assert(s);
+
+ if (s->stdin_fd < 0 && s->stdout_fd < 0 && s->stderr_fd < 0)
+ return;
+
+ log_unit_debug(UNIT(s), "Releasing stdin/stdout/stderr file descriptors.");
+
+ s->stdin_fd = asynchronous_close(s->stdin_fd);
+ s->stdout_fd = asynchronous_close(s->stdout_fd);
+ s->stderr_fd = asynchronous_close(s->stderr_fd);
+}
+static void service_done(Unit *u) {
+ Service *s = SERVICE(u);
+
+ assert(s);
+
+ open_file_free_many(&s->open_files);
+
+ s->pid_file = mfree(s->pid_file);
+ s->status_text = mfree(s->status_text);
+
+ s->exec_runtime = exec_runtime_free(s->exec_runtime);
+ exec_command_free_array(s->exec_command, _SERVICE_EXEC_COMMAND_MAX);
+ s->control_command = NULL;
+ s->main_command = NULL;
+
+ exit_status_set_free(&s->restart_prevent_status);
+ exit_status_set_free(&s->restart_force_status);
+ exit_status_set_free(&s->success_status);
+
+ /* This will leak a process, but at least no memory or any of our resources */
+ service_unwatch_main_pid(s);
+ service_unwatch_control_pid(s);
+ service_unwatch_pid_file(s);
+
+ if (s->bus_name) {
+ unit_unwatch_bus_name(u, s->bus_name);
+ s->bus_name = mfree(s->bus_name);
+ }
+
+ s->bus_name_owner = mfree(s->bus_name_owner);
+
+ s->usb_function_descriptors = mfree(s->usb_function_descriptors);
+ s->usb_function_strings = mfree(s->usb_function_strings);
+
+ service_stop_watchdog(s);
+
+ s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+ s->exec_fd_event_source = sd_event_source_disable_unref(s->exec_fd_event_source);
+
+ s->bus_name_pid_lookup_slot = sd_bus_slot_unref(s->bus_name_pid_lookup_slot);
+
+ service_release_socket_fd(s);
+ service_release_stdio_fd(s);
+ service_release_fd_store(s);
+}
+
+static int on_fd_store_io(sd_event_source *e, int fd, uint32_t revents, void *userdata) {
+ ServiceFDStore *fs = ASSERT_PTR(userdata);
+
+ assert(e);
+
+ /* If we get either EPOLLHUP or EPOLLERR, it's time to remove this entry from the fd store */
+ log_unit_debug(UNIT(fs->service),
+ "Received %s on stored fd %d (%s), closing.",
+ revents & EPOLLERR ? "EPOLLERR" : "EPOLLHUP",
+ fs->fd, strna(fs->fdname));
+ service_fd_store_unlink(fs);
+ return 0;
+}
+
+static int service_add_fd_store(Service *s, int fd_in, const char *name, bool do_poll) {
+ _cleanup_(service_fd_store_unlinkp) ServiceFDStore *fs = NULL;
+ _cleanup_(asynchronous_closep) int fd = ASSERT_FD(fd_in);
+ struct stat st;
+ int r;
+
+ /* fd is always consumed even if the function fails. */
+
+ assert(s);
+
+ if (fstat(fd, &st) < 0)
+ return -errno;
+
+ log_unit_debug(UNIT(s), "Trying to stash fd for dev=" DEVNUM_FORMAT_STR "/inode=%" PRIu64, DEVNUM_FORMAT_VAL(st.st_dev), (uint64_t) st.st_ino);
+
+ if (s->n_fd_store >= s->n_fd_store_max)
+ /* Our store is full. Use this errno rather than E[NM]FILE to distinguish from the case
+ * where systemd itself hits the file limit. */
+ return log_unit_debug_errno(UNIT(s), SYNTHETIC_ERRNO(EXFULL), "Hit fd store limit.");
+
+ LIST_FOREACH(fd_store, i, s->fd_store) {
+ r = same_fd(i->fd, fd);
+ if (r < 0)
+ return r;
+ if (r > 0) {
+ log_unit_debug(UNIT(s), "Suppressing duplicate fd %i in fd store.", fd);
+ return 0; /* fd already included */
+ }
+ }
+
+ fs = new(ServiceFDStore, 1);
+ if (!fs)
+ return -ENOMEM;
+
+ *fs = (ServiceFDStore) {
+ .fd = TAKE_FD(fd),
+ .do_poll = do_poll,
+ .fdname = strdup(name ?: "stored"),
+ };
+
+ if (!fs->fdname)
+ return -ENOMEM;
+
+ if (do_poll) {
+ r = sd_event_add_io(UNIT(s)->manager->event, &fs->event_source, fs->fd, 0, on_fd_store_io, fs);
+ if (r < 0 && r != -EPERM) /* EPERM indicates fds that aren't pollable, which is OK */
+ return r;
+ else if (r >= 0)
+ (void) sd_event_source_set_description(fs->event_source, "service-fd-store");
+ }
+
+ fs->service = s;
+ LIST_PREPEND(fd_store, s->fd_store, fs);
+ s->n_fd_store++;
+
+ log_unit_debug(UNIT(s), "Added fd %i (%s) to fd store.", fs->fd, fs->fdname);
+
+ TAKE_PTR(fs);
+ return 1; /* fd newly stored */
+}
+
+static int service_add_fd_store_set(Service *s, FDSet *fds, const char *name, bool do_poll) {
+ int r;
+
+ assert(s);
+
+ for (;;) {
+ int fd;
+
+ fd = fdset_steal_first(fds);
+ if (fd < 0)
+ break;
+
+ r = service_add_fd_store(s, fd, name, do_poll);
+ if (r == -EXFULL)
+ return log_unit_warning_errno(UNIT(s), r,
+ "Cannot store more fds than FileDescriptorStoreMax=%u, closing remaining.",
+ s->n_fd_store_max);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r, "Failed to add fd to store: %m");
+ }
+
+ return 0;
+}
+
+static void service_remove_fd_store(Service *s, const char *name) {
+ assert(s);
+ assert(name);
+
+ LIST_FOREACH(fd_store, fs, s->fd_store) {
+ if (!streq(fs->fdname, name))
+ continue;
+
+ log_unit_debug(UNIT(s), "Got explicit request to remove fd %i (%s), closing.", fs->fd, name);
+ service_fd_store_unlink(fs);
+ }
+}
+
+static usec_t service_running_timeout(Service *s) {
+ usec_t delta = 0;
+
+ assert(s);
+
+ if (s->runtime_rand_extra_usec != 0) {
+ delta = random_u64_range(s->runtime_rand_extra_usec);
+ log_unit_debug(UNIT(s), "Adding delta of %s sec to timeout", FORMAT_TIMESPAN(delta, USEC_PER_SEC));
+ }
+
+ return usec_add(usec_add(UNIT(s)->active_enter_timestamp.monotonic,
+ s->runtime_max_usec),
+ delta);
+}
+
+static int service_arm_timer(Service *s, bool relative, usec_t usec) {
+ assert(s);
+
+ return unit_arm_timer(UNIT(s), &s->timer_event_source, relative, usec, service_dispatch_timer);
+}
+
+/* Validates a fully loaded service unit. Configurations that are outright
+ * inconsistent are refused with ENOEXEC; settings that are merely ineffective
+ * only produce warnings. Returns 0 when the unit is acceptable. */
+static int service_verify(Service *s) {
+ assert(s);
+ assert(UNIT(s)->load_state == UNIT_LOADED);
+
+ /* Every configured command line must have a usable executable and a non-empty argv. */
+ for (ServiceExecCommand c = 0; c < _SERVICE_EXEC_COMMAND_MAX; c++)
+ LIST_FOREACH(command, command, s->exec_command[c]) {
+ if (!path_is_absolute(command->path) && !filename_is_valid(command->path))
+ /* BUGFIX: the setting name belongs to the "%s=" placeholder and the
+ * path to the quoted "%s" — the two arguments were previously swapped. */
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC),
+ "Service %s= binary path \"%s\" is neither a valid executable name nor an absolute path. Refusing.",
+ service_exec_command_to_string(c),
+ command->path);
+ if (strv_isempty(command->argv))
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC),
+ "Service has an empty argv in %s=. Refusing.",
+ service_exec_command_to_string(c));
+ }
+
+ if (!s->exec_command[SERVICE_EXEC_START] && !s->exec_command[SERVICE_EXEC_STOP] &&
+ UNIT(s)->success_action == EMERGENCY_ACTION_NONE)
+ /* FailureAction= only makes sense if one of the start or stop commands is specified.
+ * SuccessAction= will be executed unconditionally if no commands are specified. Hence,
+ * either a command or SuccessAction= are required. */
+
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has no ExecStart=, ExecStop=, or SuccessAction=. Refusing.");
+
+ if (s->type != SERVICE_ONESHOT && !s->exec_command[SERVICE_EXEC_START])
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has no ExecStart= setting, which is only allowed for Type=oneshot services. Refusing.");
+
+ if (!s->remain_after_exit && !s->exec_command[SERVICE_EXEC_START] && UNIT(s)->success_action == EMERGENCY_ACTION_NONE)
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has no ExecStart= and no SuccessAction= settings and does not have RemainAfterExit=yes set. Refusing.");
+
+ if (s->type != SERVICE_ONESHOT && s->exec_command[SERVICE_EXEC_START]->command_next)
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has more than one ExecStart= setting, which is only allowed for Type=oneshot services. Refusing.");
+
+ /* Type=oneshot is incompatible with automatic restarts on success and with cgroup-based exit. */
+ if (s->type == SERVICE_ONESHOT && IN_SET(s->restart, SERVICE_RESTART_ALWAYS, SERVICE_RESTART_ON_SUCCESS))
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has Restart= set to either always or on-success, which isn't allowed for Type=oneshot services. Refusing.");
+
+ if (s->type == SERVICE_ONESHOT && !exit_status_set_is_empty(&s->restart_force_status))
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has RestartForceExitStatus= set, which isn't allowed for Type=oneshot services. Refusing.");
+
+ if (s->type == SERVICE_ONESHOT && s->exit_type == SERVICE_EXIT_CGROUP)
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has ExitType=cgroup set, which isn't allowed for Type=oneshot services. Refusing.");
+
+ if (s->type == SERVICE_DBUS && !s->bus_name)
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service is of type D-Bus but no D-Bus service name has been specified. Refusing.");
+
+ if (s->exec_context.pam_name && !IN_SET(s->kill_context.kill_mode, KILL_CONTROL_GROUP, KILL_MIXED))
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service has PAM enabled. Kill mode must be set to 'control-group' or 'mixed'. Refusing.");
+
+ /* From here on: ineffective-but-harmless combinations, warn only. */
+ if (s->usb_function_descriptors && !s->usb_function_strings)
+ log_unit_warning(UNIT(s), "Service has USBFunctionDescriptors= setting, but no USBFunctionStrings=. Ignoring.");
+
+ if (!s->usb_function_descriptors && s->usb_function_strings)
+ log_unit_warning(UNIT(s), "Service has USBFunctionStrings= setting, but no USBFunctionDescriptors=. Ignoring.");
+
+ if (s->runtime_max_usec != USEC_INFINITY && s->type == SERVICE_ONESHOT)
+ log_unit_warning(UNIT(s), "RuntimeMaxSec= has no effect in combination with Type=oneshot. Ignoring.");
+
+ if (s->runtime_max_usec == USEC_INFINITY && s->runtime_rand_extra_usec != 0)
+ log_unit_warning(UNIT(s), "Service has RuntimeRandomizedExtraSec= setting, but no RuntimeMaxSec=. Ignoring.");
+
+ if (s->exit_type == SERVICE_EXIT_CGROUP && cg_unified() < CGROUP_UNIFIED_SYSTEMD)
+ log_unit_warning(UNIT(s), "Service has ExitType=cgroup set, but we are running with legacy cgroups v1, which might not work correctly. Continuing.");
+
+ if (s->restart_max_delay_usec == USEC_INFINITY && s->restart_steps > 0)
+ log_unit_warning(UNIT(s), "Service has RestartSteps= but no RestartMaxDelaySec= setting. Ignoring.");
+
+ if (s->restart_max_delay_usec != USEC_INFINITY && s->restart_steps == 0)
+ log_unit_warning(UNIT(s), "Service has RestartMaxDelaySec= but no RestartSteps= setting. Ignoring.");
+
+ /* Keep RestartSec= consistent with the configured maximum delay. */
+ if (s->restart_max_delay_usec < s->restart_usec) {
+ log_unit_warning(UNIT(s), "RestartMaxDelaySec= has a value smaller than RestartSec=, resetting RestartSec= to RestartMaxDelaySec=.");
+ s->restart_usec = s->restart_max_delay_usec;
+ }
+
+ return 0;
+}
+
+/* Adds the implicit dependencies implied by DefaultDependencies=yes:
+ * Requires=/After= on sysinit.target (system) or basic.target (user),
+ * After= on basic.target, and Before=/Conflicts= on shutdown.target. */
+static int service_add_default_dependencies(Service *s) {
+ int r;
+
+ assert(s);
+
+ if (!UNIT(s)->default_dependencies)
+ return 0;
+
+ /* Add a number of automatic dependencies useful for the
+ * majority of services. */
+
+ if (MANAGER_IS_SYSTEM(UNIT(s)->manager)) {
+ /* First, pull in the really early boot stuff, and
+ * require it, so that we fail if we can't acquire
+ * it. */
+
+ r = unit_add_two_dependencies_by_name(UNIT(s), UNIT_AFTER, UNIT_REQUIRES, SPECIAL_SYSINIT_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+ if (r < 0)
+ return r;
+ } else {
+
+ /* In the --user instance there's no sysinit.target,
+ * in that case require basic.target instead. */
+
+ r = unit_add_dependency_by_name(UNIT(s), UNIT_REQUIRES, SPECIAL_BASIC_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+ if (r < 0)
+ return r;
+ }
+
+ /* Second, if the rest of the base system is in the same
+ * transaction, order us after it, but do not pull it in or
+ * even require it. */
+ r = unit_add_dependency_by_name(UNIT(s), UNIT_AFTER, SPECIAL_BASIC_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+ if (r < 0)
+ return r;
+
+ /* Third, add us in for normal shutdown. */
+ return unit_add_two_dependencies_by_name(UNIT(s), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+}
+
+/* Adjusts the default stdin/stdout/stderr settings based on other configuration:
+ * switches stdin to EXEC_INPUT_DATA when StandardInputData= is set, and replaces
+ * EXEC_OUTPUT_INHERIT with the manager defaults for non-stream stdin types. */
+static void service_fix_stdio(Service *s) {
+ assert(s);
+
+ /* Note that EXEC_INPUT_NULL and EXEC_OUTPUT_INHERIT play a special role here: they are both the
+ * default value that is subject to automatic overriding triggered by other settings and an explicit
+ * choice the user can make. We don't distinguish between these cases currently. */
+
+ if (s->exec_context.std_input == EXEC_INPUT_NULL &&
+ s->exec_context.stdin_data_size > 0)
+ s->exec_context.std_input = EXEC_INPUT_DATA;
+
+ if (IN_SET(s->exec_context.std_input,
+ EXEC_INPUT_TTY,
+ EXEC_INPUT_TTY_FORCE,
+ EXEC_INPUT_TTY_FAIL,
+ EXEC_INPUT_SOCKET,
+ EXEC_INPUT_NAMED_FD))
+ return;
+
+ /* We assume these listed inputs refer to bidirectional streams, and hence duplicating them from
+ * stdin to stdout/stderr makes sense and hence leaving EXEC_OUTPUT_INHERIT in place makes sense,
+ * too. Outputs such as regular files or sealed data memfds otoh don't really make sense to be
+ * duplicated for both input and output at the same time (since they then would cause a feedback
+ * loop), hence override EXEC_OUTPUT_INHERIT with the default stderr/stdout setting. */
+
+ if (s->exec_context.std_error == EXEC_OUTPUT_INHERIT &&
+ s->exec_context.std_output == EXEC_OUTPUT_INHERIT)
+ s->exec_context.std_error = UNIT(s)->manager->defaults.std_error;
+
+ if (s->exec_context.std_output == EXEC_OUTPUT_INHERIT)
+ s->exec_context.std_output = UNIT(s)->manager->defaults.std_output;
+}
+
+/* Sets up everything related to BusName=: for Type=dbus services adds a
+ * Requires=/After= dependency on dbus.socket, and in all cases starts watching
+ * the configured bus name. Refuses (EEXIST) when another service already
+ * watches the same name. */
+static int service_setup_bus_name(Service *s) {
+ int r;
+
+ assert(s);
+
+ /* If s->bus_name is not set, then the unit will be refused by service_verify() later. */
+ if (!s->bus_name)
+ return 0;
+
+ if (s->type == SERVICE_DBUS) {
+ r = unit_add_dependency_by_name(UNIT(s), UNIT_REQUIRES, SPECIAL_DBUS_SOCKET, true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r, "Failed to add dependency on " SPECIAL_DBUS_SOCKET ": %m");
+
+ /* We always want to be ordered against dbus.socket if both are in the transaction. */
+ r = unit_add_dependency_by_name(UNIT(s), UNIT_AFTER, SPECIAL_DBUS_SOCKET, true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r, "Failed to add dependency on " SPECIAL_DBUS_SOCKET ": %m");
+ }
+
+ r = unit_watch_bus_name(UNIT(s), s->bus_name);
+ if (r == -EEXIST)
+ return log_unit_error_errno(UNIT(s), r, "Two services allocated for the same bus name %s, refusing operation.", s->bus_name);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r, "Cannot watch bus name %s: %m", s->bus_name);
+
+ return 0;
+}
+
+/* Applies all automatic configuration for a freshly loaded service unit:
+ * infers the service Type=, fixes up stdio/timeouts, patches the exec/cgroup
+ * contexts, picks the slice, enables the notify socket where needed, resolves
+ * the OOM policy and adds default/bus-name dependencies. */
+static int service_add_extras(Service *s) {
+ int r;
+
+ assert(s);
+
+ if (s->type == _SERVICE_TYPE_INVALID) {
+ /* Figure out a type automatically */
+ if (s->bus_name)
+ s->type = SERVICE_DBUS;
+ else if (s->exec_command[SERVICE_EXEC_START])
+ s->type = SERVICE_SIMPLE;
+ else
+ s->type = SERVICE_ONESHOT;
+ }
+
+ /* Oneshot services have disabled start timeout by default */
+ if (s->type == SERVICE_ONESHOT && !s->start_timeout_defined)
+ s->timeout_start_usec = USEC_INFINITY;
+
+ service_fix_stdio(s);
+
+ r = unit_patch_contexts(UNIT(s));
+ if (r < 0)
+ return r;
+
+ r = unit_add_exec_dependencies(UNIT(s), &s->exec_context);
+ if (r < 0)
+ return r;
+
+ r = unit_set_default_slice(UNIT(s));
+ if (r < 0)
+ return r;
+
+ /* If the service needs the notify socket, let's enable it automatically. */
+ if (s->notify_access == NOTIFY_NONE &&
+ (IN_SET(s->type, SERVICE_NOTIFY, SERVICE_NOTIFY_RELOAD) || s->watchdog_usec > 0 || s->n_fd_store_max > 0))
+ s->notify_access = NOTIFY_MAIN;
+
+ /* If no OOM policy was explicitly set, then default to the configured default OOM policy. Except
+ * when delegation is on, in that case we assume the payload knows better what to do and can process
+ * things in a more focused way. */
+ if (s->oom_policy < 0)
+ s->oom_policy = s->cgroup_context.delegate ? OOM_CONTINUE : UNIT(s)->manager->defaults.oom_policy;
+
+ /* Let the kernel do the killing if that's requested. */
+ s->cgroup_context.memory_oom_group = s->oom_policy == OOM_KILL;
+
+ r = service_add_default_dependencies(s);
+ if (r < 0)
+ return r;
+
+ r = service_setup_bus_name(s);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+/* Unit vtable load() callback: loads fragment and drop-ins, then, if the unit
+ * ended up in UNIT_LOADED state, applies the automatic extras and verifies the
+ * resulting configuration. */
+static int service_load(Unit *u) {
+ Service *s = SERVICE(u);
+ int r;
+
+ r = unit_load_fragment_and_dropin(u, true);
+ if (r < 0)
+ return r;
+
+ if (u->load_state != UNIT_LOADED)
+ return 0;
+
+ /* This is a new unit? Then let's add in some extras */
+ r = service_add_extras(s);
+ if (r < 0)
+ return r;
+
+ return service_verify(s);
+}
+
+/* Dumps one line per fd store entry (name, inode type, device/inode numbers,
+ * resolved path, access mode) to 'f'. Entries that cannot be stat'ed or whose
+ * flags cannot be read are skipped with a debug log. */
+static void service_dump_fdstore(Service *s, FILE *f, const char *prefix) {
+ assert(s);
+ assert(f);
+ assert(prefix);
+
+ LIST_FOREACH(fd_store, i, s->fd_store) {
+ _cleanup_free_ char *path = NULL;
+ struct stat st;
+ int flags;
+
+ if (fstat(i->fd, &st) < 0) {
+ log_debug_errno(errno, "Failed to stat fdstore entry: %m");
+ continue;
+ }
+
+ flags = fcntl(i->fd, F_GETFL);
+ if (flags < 0) {
+ log_debug_errno(errno, "Failed to get fdstore entry flags: %m");
+ continue;
+ }
+
+ /* Best effort; 'path' stays NULL (printed as n/a) on failure. */
+ (void) fd_get_path(i->fd, &path);
+
+ fprintf(f,
+ "%s%s '%s' (type=%s; dev=" DEVNUM_FORMAT_STR "; inode=%" PRIu64 "; rdev=" DEVNUM_FORMAT_STR "; path=%s; access=%s)\n",
+ prefix, i == s->fd_store ? "File Descriptor Store Entry:" : " ",
+ i->fdname,
+ inode_type_to_string(st.st_mode),
+ DEVNUM_FORMAT_VAL(st.st_dev),
+ (uint64_t) st.st_ino,
+ DEVNUM_FORMAT_VAL(st.st_rdev),
+ strna(path),
+ accmode_to_string(flags));
+ }
+}
+
+/* Unit vtable dump() callback: writes a human-readable description of the
+ * service's state and configuration to 'f', each line prefixed with 'prefix'.
+ * Optional sections (PIDs, bus name, fd store, open files, ...) are emitted
+ * only when set. */
+static void service_dump(Unit *u, FILE *f, const char *prefix) {
+ Service *s = SERVICE(u);
+ const char *prefix2;
+
+ assert(s);
+
+ prefix = strempty(prefix);
+ /* prefix2 adds one extra indent level for nested command lists. */
+ prefix2 = strjoina(prefix, "\t");
+
+ fprintf(f,
+ "%sService State: %s\n"
+ "%sResult: %s\n"
+ "%sReload Result: %s\n"
+ "%sClean Result: %s\n"
+ "%sPermissionsStartOnly: %s\n"
+ "%sRootDirectoryStartOnly: %s\n"
+ "%sRemainAfterExit: %s\n"
+ "%sGuessMainPID: %s\n"
+ "%sType: %s\n"
+ "%sRestart: %s\n"
+ "%sNotifyAccess: %s\n"
+ "%sNotifyState: %s\n"
+ "%sOOMPolicy: %s\n"
+ "%sReloadSignal: %s\n",
+ prefix, service_state_to_string(s->state),
+ prefix, service_result_to_string(s->result),
+ prefix, service_result_to_string(s->reload_result),
+ prefix, service_result_to_string(s->clean_result),
+ prefix, yes_no(s->permissions_start_only),
+ prefix, yes_no(s->root_directory_start_only),
+ prefix, yes_no(s->remain_after_exit),
+ prefix, yes_no(s->guess_main_pid),
+ prefix, service_type_to_string(s->type),
+ prefix, service_restart_to_string(s->restart),
+ prefix, notify_access_to_string(service_get_notify_access(s)),
+ prefix, notify_state_to_string(s->notify_state),
+ prefix, oom_policy_to_string(s->oom_policy),
+ prefix, signal_to_string(s->reload_signal));
+
+ if (pidref_is_set(&s->control_pid))
+ fprintf(f,
+ "%sControl PID: "PID_FMT"\n",
+ prefix, s->control_pid.pid);
+
+ if (pidref_is_set(&s->main_pid))
+ fprintf(f,
+ "%sMain PID: "PID_FMT"\n"
+ "%sMain PID Known: %s\n"
+ "%sMain PID Alien: %s\n",
+ prefix, s->main_pid.pid,
+ prefix, yes_no(s->main_pid_known),
+ prefix, yes_no(s->main_pid_alien));
+
+ if (s->pid_file)
+ fprintf(f,
+ "%sPIDFile: %s\n",
+ prefix, s->pid_file);
+
+ if (s->bus_name)
+ fprintf(f,
+ "%sBusName: %s\n"
+ "%sBus Name Good: %s\n",
+ prefix, s->bus_name,
+ prefix, yes_no(s->bus_name_good));
+
+ if (UNIT_ISSET(s->accept_socket))
+ fprintf(f,
+ "%sAccept Socket: %s\n",
+ prefix, UNIT_DEREF(s->accept_socket)->id);
+
+ fprintf(f,
+ "%sRestartSec: %s\n"
+ "%sRestartSteps: %u\n"
+ "%sRestartMaxDelaySec: %s\n"
+ "%sTimeoutStartSec: %s\n"
+ "%sTimeoutStopSec: %s\n"
+ "%sTimeoutStartFailureMode: %s\n"
+ "%sTimeoutStopFailureMode: %s\n",
+ prefix, FORMAT_TIMESPAN(s->restart_usec, USEC_PER_SEC),
+ prefix, s->restart_steps,
+ prefix, FORMAT_TIMESPAN(s->restart_max_delay_usec, USEC_PER_SEC),
+ prefix, FORMAT_TIMESPAN(s->timeout_start_usec, USEC_PER_SEC),
+ prefix, FORMAT_TIMESPAN(s->timeout_stop_usec, USEC_PER_SEC),
+ prefix, service_timeout_failure_mode_to_string(s->timeout_start_failure_mode),
+ prefix, service_timeout_failure_mode_to_string(s->timeout_stop_failure_mode));
+
+ if (s->timeout_abort_set)
+ fprintf(f,
+ "%sTimeoutAbortSec: %s\n",
+ prefix, FORMAT_TIMESPAN(s->timeout_abort_usec, USEC_PER_SEC));
+
+ fprintf(f,
+ "%sRuntimeMaxSec: %s\n"
+ "%sRuntimeRandomizedExtraSec: %s\n"
+ "%sWatchdogSec: %s\n",
+ prefix, FORMAT_TIMESPAN(s->runtime_max_usec, USEC_PER_SEC),
+ prefix, FORMAT_TIMESPAN(s->runtime_rand_extra_usec, USEC_PER_SEC),
+ prefix, FORMAT_TIMESPAN(s->watchdog_usec, USEC_PER_SEC));
+
+ kill_context_dump(&s->kill_context, f, prefix);
+ exec_context_dump(&s->exec_context, f, prefix);
+
+ for (ServiceExecCommand c = 0; c < _SERVICE_EXEC_COMMAND_MAX; c++) {
+ if (!s->exec_command[c])
+ continue;
+
+ fprintf(f, "%s-> %s:\n",
+ prefix, service_exec_command_to_string(c));
+
+ exec_command_dump_list(s->exec_command[c], f, prefix2);
+ }
+
+ if (s->status_text)
+ fprintf(f, "%sStatus Text: %s\n",
+ prefix, s->status_text);
+
+ if (s->n_fd_store_max > 0)
+ fprintf(f,
+ "%sFile Descriptor Store Max: %u\n"
+ "%sFile Descriptor Store Pin: %s\n"
+ "%sFile Descriptor Store Current: %zu\n",
+ prefix, s->n_fd_store_max,
+ prefix, exec_preserve_mode_to_string(s->fd_store_preserve_mode),
+ prefix, s->n_fd_store);
+
+ service_dump_fdstore(s, f, prefix);
+
+ if (s->open_files)
+ LIST_FOREACH(open_files, of, s->open_files) {
+ _cleanup_free_ char *ofs = NULL;
+ int r;
+
+ r = open_file_to_string(of, &ofs);
+ if (r < 0) {
+ log_debug_errno(r,
+ "Failed to convert OpenFile= setting to string, ignoring: %m");
+ continue;
+ }
+
+ fprintf(f, "%sOpen File: %s\n", prefix, ofs);
+ }
+
+ cgroup_context_dump(UNIT(s), f, prefix);
+}
+
+static int service_is_suitable_main_pid(Service *s, PidRef *pid, int prio) {
+ Unit *owner;
+ int r;
+
+ assert(s);
+ assert(pidref_is_set(pid));
+
+ /* Checks whether the specified PID is suitable as main PID for this service. Returns negative if
+ * not, 0 if the PID is questionable but should be accepted if the source of configuration is
+ * trusted, > 0 if the PID is good. Messages are logged at priority 'prio'. */
+
+ /* Never accept ourselves (the manager) or PID 1 as main PID. */
+ if (pidref_is_self(pid) || pid->pid == 1)
+ return log_unit_full_errno(UNIT(s), prio, SYNTHETIC_ERRNO(EPERM), "New main PID "PID_FMT" is the manager, refusing.", pid->pid);
+
+ if (pidref_equal(pid, &s->control_pid))
+ return log_unit_full_errno(UNIT(s), prio, SYNTHETIC_ERRNO(EPERM), "New main PID "PID_FMT" is the control process, refusing.", pid->pid);
+
+ r = pidref_is_alive(pid);
+ if (r < 0)
+ return log_unit_full_errno(UNIT(s), prio, r, "Failed to check if main PID "PID_FMT" exists or is a zombie: %m", pid->pid);
+ if (r == 0)
+ return log_unit_full_errno(UNIT(s), prio, SYNTHETIC_ERRNO(ESRCH), "New main PID "PID_FMT" does not exist or is a zombie.", pid->pid);
+
+ owner = manager_get_unit_by_pidref(UNIT(s)->manager, pid);
+ if (owner == UNIT(s)) {
+ log_unit_debug(UNIT(s), "New main PID "PID_FMT" belongs to service, we are happy.", pid->pid);
+ return 1; /* Yay, it's definitely a good PID */
+ }
+
+ return 0; /* Hmm it's a suspicious PID, let's accept it if configuration source is trusted */
+}
+
+/* Reads and validates PIDFile=: chases the path (first with CHASE_SAFE, then in
+ * relaxed mode), parses the PID, and — unless it already matches the known main
+ * PID — accepts it only if it is suitable (see service_is_suitable_main_pid())
+ * or the PID file is root-owned. On acceptance the main PID is updated and
+ * watched. Returns 1 if a new main PID was installed, 0 if unchanged,
+ * negative on failure. */
+static int service_load_pid_file(Service *s, bool may_warn) {
+ _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+ bool questionable_pid_file = false;
+ _cleanup_free_ char *k = NULL;
+ _cleanup_close_ int fd = -EBADF;
+ int r, prio;
+
+ assert(s);
+
+ if (!s->pid_file)
+ return -ENOENT;
+
+ /* Log loudly only when the caller says warnings are appropriate. */
+ prio = may_warn ? LOG_INFO : LOG_DEBUG;
+
+ r = chase(s->pid_file, NULL, CHASE_SAFE, NULL, &fd);
+ if (r == -ENOLINK) {
+ log_unit_debug_errno(UNIT(s), r,
+ "Potentially unsafe symlink chain, will now retry with relaxed checks: %s", s->pid_file);
+
+ /* Remember that the path was unsafe; a questionable PID read from it is rejected below. */
+ questionable_pid_file = true;
+
+ r = chase(s->pid_file, NULL, 0, NULL, &fd);
+ }
+ if (r < 0)
+ return log_unit_full_errno(UNIT(s), prio, r,
+ "Can't open PID file %s (yet?) after %s: %m", s->pid_file, service_state_to_string(s->state));
+
+ /* Let's read the PID file now that we chased it down. But we need to convert the O_PATH fd
+ * chase() returned us into a proper fd first. */
+ r = read_one_line_file(FORMAT_PROC_FD_PATH(fd), &k);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r,
+ "Can't convert PID files %s O_PATH file descriptor to proper file descriptor: %m",
+ s->pid_file);
+
+ r = pidref_set_pidstr(&pidref, k);
+ if (r < 0)
+ return log_unit_full_errno(UNIT(s), prio, r, "Failed to parse PID from file %s: %m", s->pid_file);
+
+ if (s->main_pid_known && pidref_equal(&pidref, &s->main_pid))
+ return 0;
+
+ r = service_is_suitable_main_pid(s, &pidref, prio);
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ struct stat st;
+
+ if (questionable_pid_file)
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(EPERM),
+ "Refusing to accept PID outside of service control group, acquired through unsafe symlink chain: %s", s->pid_file);
+
+ /* Hmm, it's not clear if the new main PID is safe. Let's allow this if the PID file is owned by root */
+
+ if (fstat(fd, &st) < 0)
+ return log_unit_error_errno(UNIT(s), errno, "Failed to fstat() PID file O_PATH fd: %m");
+
+ if (st.st_uid != 0)
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(EPERM),
+ "New main PID "PID_FMT" does not belong to service, and PID file is not owned by root. Refusing.", pidref.pid);
+
+ log_unit_debug(UNIT(s), "New main PID "PID_FMT" does not belong to service, but we'll accept it since PID file is owned by root.", pidref.pid);
+ }
+
+ if (s->main_pid_known) {
+ log_unit_debug(UNIT(s), "Main PID changing: "PID_FMT" -> "PID_FMT, s->main_pid.pid, pidref.pid);
+
+ service_unwatch_main_pid(s);
+ s->main_pid_known = false;
+ } else
+ log_unit_debug(UNIT(s), "Main PID loaded: "PID_FMT, pidref.pid);
+
+ r = service_set_main_pidref(s, &pidref);
+ if (r < 0)
+ return r;
+
+ r = unit_watch_pidref(UNIT(s), &s->main_pid, /* exclusive= */ false);
+ if (r < 0) /* FIXME: we need to do something here */
+ return log_unit_warning_errno(UNIT(s), r, "Failed to watch PID "PID_FMT" for service: %m", s->main_pid.pid);
+
+ return 1;
+}
+
+/* Best-effort heuristic (GuessMainPID=) that tries to discover the main PID via
+ * unit_search_main_pid() when it is not reliably known. All failures are
+ * tolerated silently or with a warning. */
+static void service_search_main_pid(Service *s) {
+ _cleanup_(pidref_done) PidRef pid = PIDREF_NULL;
+ int r;
+
+ assert(s);
+
+ /* If we know it anyway, don't ever fall back to unreliable heuristics */
+ if (s->main_pid_known)
+ return;
+
+ if (!s->guess_main_pid)
+ return;
+
+ assert(!pidref_is_set(&s->main_pid));
+
+ if (unit_search_main_pid(UNIT(s), &pid) < 0)
+ return;
+
+ log_unit_debug(UNIT(s), "Main PID guessed: "PID_FMT, pid.pid);
+ if (service_set_main_pidref(s, &pid) < 0)
+ return;
+
+ r = unit_watch_pidref(UNIT(s), &s->main_pid, /* exclusive= */ false);
+ if (r < 0)
+ /* FIXME: we need to do something here */
+ log_unit_warning_errno(UNIT(s), r, "Failed to watch PID "PID_FMT" from: %m", s->main_pid.pid);
+}
+
+/* Central state transition function: records the new state, tears down
+ * whatever resources (timers, PID watches, event sources, watchdog) are not
+ * needed in that state, and notifies the unit core via unit_notify(). */
+static void service_set_state(Service *s, ServiceState state) {
+ ServiceState old_state;
+ const UnitActiveState *table;
+
+ assert(s);
+
+ if (s->state != state)
+ bus_unit_send_pending_change_signal(UNIT(s), false);
+
+ /* Type=idle services map their states to UnitActiveState differently. */
+ table = s->type == SERVICE_IDLE ? state_translation_table_idle : state_translation_table;
+
+ old_state = s->state;
+ s->state = state;
+
+ service_unwatch_pid_file(s);
+
+ /* Drop the timer unless the new state is one that is bounded by a timeout. */
+ if (!IN_SET(state,
+ SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
+ SERVICE_RUNNING,
+ SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY,
+ SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
+ SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL,
+ SERVICE_AUTO_RESTART,
+ SERVICE_CLEANING))
+ s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+
+ /* Stop tracking the main process in states where no main process exists. */
+ if (!IN_SET(state,
+ SERVICE_START, SERVICE_START_POST,
+ SERVICE_RUNNING,
+ SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY,
+ SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
+ SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) {
+ service_unwatch_main_pid(s);
+ s->main_command = NULL;
+ }
+
+ /* Likewise for the control process. */
+ if (!IN_SET(state,
+ SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
+ SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY,
+ SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
+ SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL,
+ SERVICE_CLEANING)) {
+ service_unwatch_control_pid(s);
+ s->control_command = NULL;
+ s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID;
+ }
+
+ if (IN_SET(state,
+ SERVICE_DEAD, SERVICE_FAILED,
+ SERVICE_DEAD_BEFORE_AUTO_RESTART, SERVICE_FAILED_BEFORE_AUTO_RESTART, SERVICE_AUTO_RESTART, SERVICE_AUTO_RESTART_QUEUED,
+ SERVICE_DEAD_RESOURCES_PINNED)) {
+ unit_unwatch_all_pids(UNIT(s));
+ unit_dequeue_rewatch_pids(UNIT(s));
+ }
+
+ if (state != SERVICE_START)
+ s->exec_fd_event_source = sd_event_source_disable_unref(s->exec_fd_event_source);
+
+ if (!IN_SET(state, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY))
+ service_stop_watchdog(s);
+
+ /* For the inactive states unit_notify() will trim the cgroup,
+ * but for exit we have to do that ourselves... */
+ if (state == SERVICE_EXITED && !MANAGER_IS_RELOADING(UNIT(s)->manager))
+ unit_prune_cgroup(UNIT(s));
+
+ if (old_state != state)
+ log_unit_debug(UNIT(s), "Changed %s -> %s", service_state_to_string(old_state), service_state_to_string(state));
+
+ unit_notify(UNIT(s), table[old_state], table[state], s->reload_result == SERVICE_SUCCESS);
+}
+
+/* Computes the absolute monotonic deadline to re-arm after deserialization,
+ * based on which timeout applies to the deserialized state (start, stop,
+ * abort, restart or clean timeout). USEC_INFINITY means no timeout. */
+static usec_t service_coldplug_timeout(Service *s) {
+ assert(s);
+
+ switch (s->deserialized_state) {
+
+ case SERVICE_CONDITION:
+ case SERVICE_START_PRE:
+ case SERVICE_START:
+ case SERVICE_START_POST:
+ case SERVICE_RELOAD:
+ case SERVICE_RELOAD_SIGNAL:
+ case SERVICE_RELOAD_NOTIFY:
+ return usec_add(UNIT(s)->state_change_timestamp.monotonic, s->timeout_start_usec);
+
+ case SERVICE_RUNNING:
+ return service_running_timeout(s);
+
+ case SERVICE_STOP:
+ case SERVICE_STOP_SIGTERM:
+ case SERVICE_STOP_SIGKILL:
+ case SERVICE_STOP_POST:
+ case SERVICE_FINAL_SIGTERM:
+ case SERVICE_FINAL_SIGKILL:
+ return usec_add(UNIT(s)->state_change_timestamp.monotonic, s->timeout_stop_usec);
+
+ case SERVICE_STOP_WATCHDOG:
+ case SERVICE_FINAL_WATCHDOG:
+ return usec_add(UNIT(s)->state_change_timestamp.monotonic, service_timeout_abort_usec(s));
+
+ case SERVICE_AUTO_RESTART:
+ return usec_add(UNIT(s)->inactive_enter_timestamp.monotonic, service_restart_usec_next(s));
+
+ case SERVICE_CLEANING:
+ return usec_add(UNIT(s)->state_change_timestamp.monotonic, s->exec_context.timeout_clean_usec);
+
+ default:
+ return USEC_INFINITY;
+ }
+}
+
+/* Unit vtable coldplug() callback: after deserialization, re-arms the timer,
+ * re-watches main/control PIDs where the deserialized state implies live
+ * processes, restores the exec runtime, restarts the watchdog, bumps the
+ * per-source socket connection count, and finally enters the deserialized
+ * state. */
+static int service_coldplug(Unit *u) {
+ Service *s = SERVICE(u);
+ int r;
+
+ assert(s);
+ assert(s->state == SERVICE_DEAD);
+
+ if (s->deserialized_state == s->state)
+ return 0;
+
+ r = service_arm_timer(s, /* relative= */ false, service_coldplug_timeout(s));
+ if (r < 0)
+ return r;
+
+ /* Re-watch the main PID in states where a main process can exist. */
+ if (pidref_is_set(&s->main_pid) &&
+ pidref_is_unwaited(&s->main_pid) > 0 &&
+ (IN_SET(s->deserialized_state,
+ SERVICE_START, SERVICE_START_POST,
+ SERVICE_RUNNING,
+ SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY,
+ SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
+ SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL))) {
+ r = unit_watch_pidref(UNIT(s), &s->main_pid, /* exclusive= */ false);
+ if (r < 0)
+ return r;
+ }
+
+ /* Likewise for the control PID. */
+ if (pidref_is_set(&s->control_pid) &&
+ pidref_is_unwaited(&s->control_pid) > 0 &&
+ IN_SET(s->deserialized_state,
+ SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
+ SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY,
+ SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
+ SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL,
+ SERVICE_CLEANING)) {
+ r = unit_watch_pidref(UNIT(s), &s->control_pid, /* exclusive= */ false);
+ if (r < 0)
+ return r;
+ }
+
+ if (!IN_SET(s->deserialized_state,
+ SERVICE_DEAD, SERVICE_FAILED,
+ SERVICE_DEAD_BEFORE_AUTO_RESTART, SERVICE_FAILED_BEFORE_AUTO_RESTART, SERVICE_AUTO_RESTART, SERVICE_AUTO_RESTART_QUEUED,
+ SERVICE_CLEANING,
+ SERVICE_DEAD_RESOURCES_PINNED)) {
+ (void) unit_enqueue_rewatch_pids(u);
+ (void) unit_setup_exec_runtime(u);
+ }
+
+ if (IN_SET(s->deserialized_state, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY))
+ service_start_watchdog(s);
+
+ if (UNIT_ISSET(s->accept_socket)) {
+ Socket* socket = SOCKET(UNIT_DEREF(s->accept_socket));
+
+ if (socket->max_connections_per_source > 0) {
+ SocketPeer *peer;
+
+ /* Make a best-effort attempt at bumping the connection count */
+ if (socket_acquire_peer(socket, s->socket_fd, &peer) > 0) {
+ socket_peer_unref(s->socket_peer);
+ s->socket_peer = peer;
+ }
+ }
+ }
+
+ service_set_state(s, s->deserialized_state);
+ return 0;
+}
+
+/* Collects all fds to pass to the spawned process: either the single
+ * per-connection socket fd, or the fds of all triggering socket units, plus
+ * everything currently in the fd store. On success fills in the fd array,
+ * the parallel NULL-terminated name array and the socket/storage counts.
+ * Note: returned fds are borrowed, not duplicated. */
+static int service_collect_fds(
+ Service *s,
+ int **fds,
+ char ***fd_names,
+ size_t *n_socket_fds,
+ size_t *n_storage_fds) {
+
+ _cleanup_strv_free_ char **rfd_names = NULL;
+ _cleanup_free_ int *rfds = NULL;
+ size_t rn_socket_fds = 0, rn_storage_fds = 0;
+ int r;
+
+ assert(s);
+ assert(fds);
+ assert(fd_names);
+ assert(n_socket_fds);
+ assert(n_storage_fds);
+
+ if (s->socket_fd >= 0) {
+
+ /* Pass the per-connection socket */
+
+ rfds = newdup(int, &s->socket_fd, 1);
+ if (!rfds)
+ return -ENOMEM;
+
+ rfd_names = strv_new("connection");
+ if (!rfd_names)
+ return -ENOMEM;
+
+ rn_socket_fds = 1;
+ } else {
+ Unit *u;
+
+ /* Pass all our configured sockets for singleton services */
+
+ UNIT_FOREACH_DEPENDENCY(u, UNIT(s), UNIT_ATOM_TRIGGERED_BY) {
+ _cleanup_free_ int *cfds = NULL;
+ Socket *sock;
+ int cn_fds;
+
+ if (u->type != UNIT_SOCKET)
+ continue;
+
+ sock = SOCKET(u);
+
+ cn_fds = socket_collect_fds(sock, &cfds);
+ if (cn_fds < 0)
+ return cn_fds;
+
+ if (cn_fds <= 0)
+ continue;
+
+ /* First socket: take the array as-is; later ones: append. */
+ if (!rfds) {
+ rfds = TAKE_PTR(cfds);
+ rn_socket_fds = cn_fds;
+ } else {
+ int *t;
+
+ t = reallocarray(rfds, rn_socket_fds + cn_fds, sizeof(int));
+ if (!t)
+ return -ENOMEM;
+
+ memcpy(t + rn_socket_fds, cfds, cn_fds * sizeof(int));
+
+ rfds = t;
+ rn_socket_fds += cn_fds;
+ }
+
+ /* All fds of one socket unit share the same FileDescriptorName=. */
+ r = strv_extend_n(&rfd_names, socket_fdname(sock), cn_fds);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ /* Append the fd store entries after the socket fds. */
+ if (s->n_fd_store > 0) {
+ size_t n_fds;
+ char **nl;
+ int *t;
+
+ t = reallocarray(rfds, rn_socket_fds + s->n_fd_store, sizeof(int));
+ if (!t)
+ return -ENOMEM;
+
+ rfds = t;
+
+ /* +1 for the terminating NULL of the name strv. */
+ nl = reallocarray(rfd_names, rn_socket_fds + s->n_fd_store + 1, sizeof(char *));
+ if (!nl)
+ return -ENOMEM;
+
+ rfd_names = nl;
+ n_fds = rn_socket_fds;
+
+ LIST_FOREACH(fd_store, fs, s->fd_store) {
+ rfds[n_fds] = fs->fd;
+ rfd_names[n_fds] = strdup(strempty(fs->fdname));
+ if (!rfd_names[n_fds])
+ return -ENOMEM;
+
+ rn_storage_fds++;
+ n_fds++;
+ }
+
+ rfd_names[n_fds] = NULL;
+ }
+
+ *fds = TAKE_PTR(rfds);
+ *fd_names = TAKE_PTR(rfd_names);
+ *n_socket_fds = rn_socket_fds;
+ *n_storage_fds = rn_storage_fds;
+
+ return 0;
+}
+
+/* Wraps the read end of the exec_fd pipe in an sd-event IO source dispatched
+ * via service_dispatch_exec_io(); the source takes ownership of 'fd'. */
+static int service_allocate_exec_fd_event_source(
+ Service *s,
+ int fd,
+ sd_event_source **ret_event_source) {
+
+ _cleanup_(sd_event_source_unrefp) sd_event_source *source = NULL;
+ int r;
+
+ assert(s);
+ assert(fd >= 0);
+ assert(ret_event_source);
+
+ r = sd_event_add_io(UNIT(s)->manager->event, &source, fd, 0, service_dispatch_exec_io, s);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r, "Failed to allocate exec_fd event source: %m");
+
+ /* This is a bit lower priority than SIGCHLD, as that carries a lot more interesting failure information */
+
+ r = sd_event_source_set_priority(source, SD_EVENT_PRIORITY_NORMAL-3);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r, "Failed to adjust priority of exec_fd event source: %m");
+
+ (void) sd_event_source_set_description(source, "service exec_fd");
+
+ /* Let the event source close the fd when it is freed. */
+ r = sd_event_source_set_io_fd_own(source, true);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r, "Failed to pass ownership of fd to event source: %m");
+
+ *ret_event_source = TAKE_PTR(source);
+ return 0;
+}
+
+/* Creates the exec_fd pipe used for Type=exec services: the read end becomes a
+ * manager-side event source (returned in *ret_event_source), the write end is
+ * returned in *ret_exec_fd to be handed to the child. */
+static int service_allocate_exec_fd(
+ Service *s,
+ sd_event_source **ret_event_source,
+ int *ret_exec_fd) {
+
+ _cleanup_close_pair_ int p[] = EBADF_PAIR;
+ int r;
+
+ assert(s);
+ assert(ret_event_source);
+ assert(ret_exec_fd);
+
+ if (pipe2(p, O_CLOEXEC|O_NONBLOCK) < 0)
+ return log_unit_error_errno(UNIT(s), errno, "Failed to allocate exec_fd pipe: %m");
+
+ r = service_allocate_exec_fd_event_source(s, p[0], ret_event_source);
+ if (r < 0)
+ return r;
+
+ /* The event source now owns the read end; disarm our cleanup of it. */
+ TAKE_FD(p[0]);
+ *ret_exec_fd = TAKE_FD(p[1]);
+
+ return 0;
+}
+
+/* Decides whether the process about to be spawned (main vs. control, per
+ * 'flags') should get $NOTIFY_SOCKET, based on NotifyAccess=. */
+static bool service_exec_needs_notify_socket(Service *s, ExecFlags flags) {
+ assert(s);
+
+ /* Notifications are accepted depending on the process and
+ * the access setting of the service:
+ * process: \ access: NONE MAIN EXEC ALL
+ * main no yes yes yes
+ * control no no yes yes
+ * other (forked) no no no yes */
+
+ if (flags & EXEC_IS_CONTROL)
+ /* A control process */
+ return IN_SET(service_get_notify_access(s), NOTIFY_EXEC, NOTIFY_ALL);
+
+ /* We only spawn main processes and control processes, so any
+ * process that is not a control process is a main process */
+ return service_get_notify_access(s) != NOTIFY_NONE;
+}
+
+static Service *service_get_triggering_service(Service *s) {
+ Unit *candidate = NULL, *other;
+
+ assert(s);
+
+ /* Return the service which triggered service 's', this means dependency
+ * types which include the UNIT_ATOM_ON_{FAILURE,SUCCESS}_OF atoms.
+ *
+ * N.B. if there are multiple services which could trigger 's' via OnFailure=
+ * or OnSuccess= then we return NULL. This is since we don't know from which
+ * one to propagate the exit status. */
+
+ UNIT_FOREACH_DEPENDENCY(other, UNIT(s), UNIT_ATOM_ON_FAILURE_OF) {
+ if (candidate)
+ goto have_other;
+ candidate = other;
+ }
+
+ UNIT_FOREACH_DEPENDENCY(other, UNIT(s), UNIT_ATOM_ON_SUCCESS_OF) {
+ if (candidate)
+ goto have_other;
+ candidate = other;
+ }
+
+ /* Zero or exactly one trigger source found (NULL maps to NULL). */
+ return SERVICE(candidate);
+
+ have_other:
+ log_unit_warning(UNIT(s), "multiple trigger source candidates for exit status propagation (%s, %s), skipping.",
+ candidate->id, other->id);
+ return NULL;
+}
+
+static int service_spawn_internal(
+ const char *caller,
+ Service *s,
+ ExecCommand *c,
+ usec_t timeout,
+ ExecFlags flags,
+ PidRef *ret_pid) {
+
+ /* Spawns a main or control process for this service: realizes the cgroup (via
+ * unit_prepare_exec()), collects the fds to pass, assembles the extra environment
+ * ($NOTIFY_SOCKET, $MAINPID, $REMOTE_ADDR, monitor result variables, ...), arms the
+ * state timeout and finally calls exec_spawn(). On success the watched PidRef of the
+ * new child is returned in *ret_pid. */
+
+ _cleanup_(exec_params_shallow_clear) ExecParameters exec_params = EXEC_PARAMETERS_INIT(flags);
+ _cleanup_(sd_event_source_unrefp) sd_event_source *exec_fd_source = NULL;
+ _cleanup_strv_free_ char **final_env = NULL, **our_env = NULL;
+ _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+ size_t n_env = 0;
+ pid_t pid;
+ int r;
+
+ assert(caller);
+ assert(s);
+ assert(c);
+ assert(ret_pid);
+
+ log_unit_debug(UNIT(s), "Will spawn child (%s): %s", caller, c->path);
+
+ r = unit_prepare_exec(UNIT(s)); /* This realizes the cgroup, among other things */
+ if (r < 0)
+ return r;
+
+ assert(!s->exec_fd_event_source);
+
+ if (flags & EXEC_IS_CONTROL) {
+ /* If this is a control process, mask the permissions/chroot application if this is requested. */
+ if (s->permissions_start_only)
+ exec_params.flags &= ~EXEC_APPLY_SANDBOXING;
+ if (s->root_directory_start_only)
+ exec_params.flags &= ~EXEC_APPLY_CHROOT;
+ }
+
+ if ((flags & EXEC_PASS_FDS) ||
+ s->exec_context.std_input == EXEC_INPUT_SOCKET ||
+ s->exec_context.std_output == EXEC_OUTPUT_SOCKET ||
+ s->exec_context.std_error == EXEC_OUTPUT_SOCKET) {
+
+ r = service_collect_fds(s,
+ &exec_params.fds,
+ &exec_params.fd_names,
+ &exec_params.n_socket_fds,
+ &exec_params.n_storage_fds);
+ if (r < 0)
+ return r;
+
+ exec_params.open_files = s->open_files;
+
+ log_unit_debug(UNIT(s), "Passing %zu fds to service", exec_params.n_socket_fds + exec_params.n_storage_fds);
+ }
+
+ /* exec_fd is only used for main processes of Type=exec services. */
+ if (!FLAGS_SET(flags, EXEC_IS_CONTROL) && s->type == SERVICE_EXEC) {
+ r = service_allocate_exec_fd(s, &exec_fd_source, &exec_params.exec_fd);
+ if (r < 0)
+ return r;
+ }
+
+ r = service_arm_timer(s, /* relative= */ true, timeout);
+ if (r < 0)
+ return r;
+
+ /* NB: sized for the maximum number of entries appended below plus the NULL terminator —
+ * keep in sync when adding entries! (activation details use strv appends instead.) */
+ our_env = new0(char*, 13);
+ if (!our_env)
+ return -ENOMEM;
+
+ if (service_exec_needs_notify_socket(s, flags)) {
+ if (asprintf(our_env + n_env++, "NOTIFY_SOCKET=%s", UNIT(s)->manager->notify_socket) < 0)
+ return -ENOMEM;
+
+ exec_params.notify_socket = UNIT(s)->manager->notify_socket;
+
+ if (s->n_fd_store_max > 0)
+ if (asprintf(our_env + n_env++, "FDSTORE=%u", s->n_fd_store_max) < 0)
+ return -ENOMEM;
+ }
+
+ if (pidref_is_set(&s->main_pid))
+ if (asprintf(our_env + n_env++, "MAINPID="PID_FMT, s->main_pid.pid) < 0)
+ return -ENOMEM;
+
+ if (MANAGER_IS_USER(UNIT(s)->manager))
+ if (asprintf(our_env + n_env++, "MANAGERPID="PID_FMT, getpid_cached()) < 0)
+ return -ENOMEM;
+
+ if (s->pid_file)
+ if (asprintf(our_env + n_env++, "PIDFILE=%s", s->pid_file) < 0)
+ return -ENOMEM;
+
+ if (s->socket_fd >= 0) {
+ union sockaddr_union sa;
+ socklen_t salen = sizeof(sa);
+
+ /* If this is a per-connection service instance, let's set $REMOTE_ADDR and $REMOTE_PORT to something
+ * useful. Note that we do this only when we are still connected at this point in time, which we might
+ * very well not be. Hence we ignore all errors when retrieving peer information (as that might result
+ * in ENOTCONN), and just use what we can use. */
+
+ if (getpeername(s->socket_fd, &sa.sa, &salen) >= 0 &&
+ IN_SET(sa.sa.sa_family, AF_INET, AF_INET6, AF_VSOCK)) {
+ _cleanup_free_ char *addr = NULL;
+ char *t;
+ unsigned port;
+
+ r = sockaddr_pretty(&sa.sa, salen, true, false, &addr);
+ if (r < 0)
+ return r;
+
+ t = strjoin("REMOTE_ADDR=", addr);
+ if (!t)
+ return -ENOMEM;
+ our_env[n_env++] = t;
+
+ r = sockaddr_port(&sa.sa, &port);
+ if (r < 0)
+ return r;
+
+ if (asprintf(&t, "REMOTE_PORT=%u", port) < 0)
+ return -ENOMEM;
+ our_env[n_env++] = t;
+ }
+ }
+
+ /* Pick whose result/exit status is exported: our own (EXEC_SETENV_RESULT, e.g. for
+ * ExecStop=/ExecStopPost=) or, prefixed with MONITOR_, that of the service which
+ * triggered us via OnFailure=/OnSuccess= (EXEC_SETENV_MONITOR_RESULT). */
+ Service *env_source = NULL;
+ const char *monitor_prefix;
+ if (flags & EXEC_SETENV_RESULT) {
+ env_source = s;
+ monitor_prefix = "";
+ } else if (flags & EXEC_SETENV_MONITOR_RESULT) {
+ env_source = service_get_triggering_service(s);
+ monitor_prefix = "MONITOR_";
+ }
+
+ if (env_source) {
+ if (asprintf(our_env + n_env++, "%sSERVICE_RESULT=%s", monitor_prefix, service_result_to_string(env_source->result)) < 0)
+ return -ENOMEM;
+
+ if (env_source->main_exec_status.pid > 0 &&
+ dual_timestamp_is_set(&env_source->main_exec_status.exit_timestamp)) {
+ if (asprintf(our_env + n_env++, "%sEXIT_CODE=%s", monitor_prefix, sigchld_code_to_string(env_source->main_exec_status.code)) < 0)
+ return -ENOMEM;
+
+ /* EXIT_STATUS is either a numeric exit status or a signal name, depending on how the process died. */
+ if (env_source->main_exec_status.code == CLD_EXITED)
+ r = asprintf(our_env + n_env++, "%sEXIT_STATUS=%i", monitor_prefix, env_source->main_exec_status.status);
+ else
+ r = asprintf(our_env + n_env++, "%sEXIT_STATUS=%s", monitor_prefix, signal_to_string(env_source->main_exec_status.status));
+
+ if (r < 0)
+ return -ENOMEM;
+ }
+
+ if (env_source != s) {
+ if (!sd_id128_is_null(UNIT(env_source)->invocation_id)) {
+ r = asprintf(our_env + n_env++, "%sINVOCATION_ID=" SD_ID128_FORMAT_STR,
+ monitor_prefix, SD_ID128_FORMAT_VAL(UNIT(env_source)->invocation_id));
+ if (r < 0)
+ return -ENOMEM;
+ }
+
+ if (asprintf(our_env + n_env++, "%sUNIT=%s", monitor_prefix, UNIT(env_source)->id) < 0)
+ return -ENOMEM;
+ }
+ }
+
+ if (UNIT(s)->activation_details) {
+ r = activation_details_append_env(UNIT(s)->activation_details, &our_env);
+ if (r < 0)
+ return r;
+ /* The number of env vars added here can vary, rather than keeping the allocation block in
+ * sync manually, these functions simply use the strv methods to append to it, so we need
+ * to update n_env when we are done in case of future usage. */
+ n_env += r;
+ }
+
+ r = unit_set_exec_params(UNIT(s), &exec_params);
+ if (r < 0)
+ return r;
+
+ final_env = strv_env_merge(exec_params.environment, our_env);
+ if (!final_env)
+ return -ENOMEM;
+
+ /* System D-Bus needs nss-systemd disabled, so that we don't deadlock */
+ SET_FLAG(exec_params.flags, EXEC_NSS_DYNAMIC_BYPASS,
+ MANAGER_IS_SYSTEM(UNIT(s)->manager) && unit_has_name(UNIT(s), SPECIAL_DBUS_SERVICE));
+
+ strv_free_and_replace(exec_params.environment, final_env);
+ exec_params.watchdog_usec = service_get_watchdog_usec(s);
+ exec_params.selinux_context_net = s->socket_fd_selinux_context_net;
+ if (s->type == SERVICE_IDLE)
+ exec_params.idle_pipe = UNIT(s)->manager->idle_pipe;
+ exec_params.stdin_fd = s->stdin_fd;
+ exec_params.stdout_fd = s->stdout_fd;
+ exec_params.stderr_fd = s->stderr_fd;
+
+ r = exec_spawn(UNIT(s),
+ c,
+ &s->exec_context,
+ &exec_params,
+ s->exec_runtime,
+ &s->cgroup_context,
+ &pid);
+ if (r < 0)
+ return r;
+
+ /* Only commit the exec_fd event source to the service once the spawn succeeded. */
+ s->exec_fd_event_source = TAKE_PTR(exec_fd_source);
+ s->exec_fd_hot = false;
+
+ r = pidref_set_pid(&pidref, pid);
+ if (r < 0)
+ return r;
+
+ r = unit_watch_pidref(UNIT(s), &pidref, /* exclusive= */ true);
+ if (r < 0)
+ return r;
+
+ *ret_pid = TAKE_PIDREF(pidref);
+ return 0;
+}
+
+static int main_pid_good(Service *s) {
+ assert(s);
+
+ /* Returns 0 if the pid is dead, > 0 if it is good, < 0 if we don't know */
+
+ if (!s->main_pid_known)
+ /* We don't know the pid */
+ return -EAGAIN;
+
+ /* If it's an alien child (i.e. not forked off by us) let's poll whether it is still alive... */
+ if (s->main_pid_alien && pidref_is_set(&s->main_pid))
+ return pidref_is_alive(&s->main_pid);
+
+ /* ...otherwise assume we'll get a SIGCHLD for it, which we really should wait for to collect
+ * exit status and code */
+ return pidref_is_set(&s->main_pid);
+}
+
+static int control_pid_good(Service *s) {
+ assert(s);
+
+ /* Returns 0 if the control PID is dead, > 0 if it is good. We never actually return < 0 here, but in order to
+ * make this function as similar as possible to main_pid_good() and cgroup_good(), we pretend that < 0 also
+ * means: we can't figure it out. */
+
+ /* NB: unlike for the main PID no aliveness poll is needed here — control processes are
+ * spawned by us (see service_spawn_internal()), so a set pidref means we are still
+ * waiting for its SIGCHLD. */
+ return pidref_is_set(&s->control_pid);
+}
+
+static int cgroup_good(Service *s) {
+ int r;
+
+ assert(s);
+
+ /* Returns 0 if the cgroup is empty or doesn't exist, > 0 if it is exists and is populated, < 0 if we can't
+ * figure it out */
+
+ /* No cgroup was realized for this unit, hence it can't be populated. */
+ if (!UNIT(s)->cgroup_path)
+ return 0;
+
+ r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, UNIT(s)->cgroup_path);
+ if (r < 0)
+ return r;
+
+ /* Invert "empty" into "populated". */
+ return r == 0;
+}
+
+static bool service_shall_restart(Service *s, const char **reason) {
+ assert(s);
+
+ /* Decides whether the service shall be restarted now that it went down, and stores a short
+ * human-readable explanation of the decision in *reason. */
+
+ /* Don't restart after manual stops */
+ if (s->forbid_restart) {
+ *reason = "manual stop";
+ return false;
+ }
+
+ /* Never restart if this is configured as special exception */
+ if (exit_status_set_test(&s->restart_prevent_status, s->main_exec_status.code, s->main_exec_status.status)) {
+ *reason = "prevented by exit status";
+ return false;
+ }
+
+ /* Restart if the exit code/status are configured as restart triggers */
+ if (exit_status_set_test(&s->restart_force_status, s->main_exec_status.code, s->main_exec_status.status)) {
+ *reason = "forced by exit status";
+ return true;
+ }
+
+ /* Otherwise, go by the configured Restart= setting */
+ *reason = "restart setting";
+
+ if (s->restart == SERVICE_RESTART_NO)
+ return false;
+
+ if (s->restart == SERVICE_RESTART_ALWAYS)
+ return s->result != SERVICE_SKIP_CONDITION;
+
+ if (s->restart == SERVICE_RESTART_ON_SUCCESS)
+ return s->result == SERVICE_SUCCESS;
+
+ if (s->restart == SERVICE_RESTART_ON_FAILURE)
+ return !IN_SET(s->result, SERVICE_SUCCESS, SERVICE_SKIP_CONDITION);
+
+ if (s->restart == SERVICE_RESTART_ON_ABNORMAL)
+ return !IN_SET(s->result, SERVICE_SUCCESS, SERVICE_FAILURE_EXIT_CODE, SERVICE_SKIP_CONDITION);
+
+ if (s->restart == SERVICE_RESTART_ON_WATCHDOG)
+ return s->result == SERVICE_FAILURE_WATCHDOG;
+
+ if (s->restart == SERVICE_RESTART_ON_ABORT)
+ return IN_SET(s->result, SERVICE_FAILURE_SIGNAL, SERVICE_FAILURE_CORE_DUMP);
+
+ assert_not_reached();
+}
+
+static bool service_will_restart(Unit *u) {
+ Service *s = SERVICE(u);
+
+ assert(s);
+
+ /* If we are anywhere on the auto-restart path a restart is already in flight,
+ * otherwise defer to the generic logic. */
+ switch (s->state) {
+
+ case SERVICE_DEAD_BEFORE_AUTO_RESTART:
+ case SERVICE_FAILED_BEFORE_AUTO_RESTART:
+ case SERVICE_AUTO_RESTART:
+ case SERVICE_AUTO_RESTART_QUEUED:
+ return true;
+
+ default:
+ return unit_will_restart_default(u);
+ }
+}
+
+static ServiceState service_determine_dead_state(Service *s) {
+ assert(s);
+
+ /* If there's a non-empty fd store that shall survive a full stop, the unit stays in a
+ * distinct "resources pinned" flavour of the dead state. */
+ if (s->fd_store && s->fd_store_preserve_mode == EXEC_PRESERVE_YES)
+ return SERVICE_DEAD_RESOURCES_PINNED;
+
+ return SERVICE_DEAD;
+}
+
+static void service_enter_dead(Service *s, ServiceResult f, bool allow_restart) {
+ ServiceState end_state, restart_state;
+ int r;
+
+ assert(s);
+
+ /* Final stage of the service state machine: record the result, decide whether to schedule an
+ * automatic restart, and release the runtime resources of the unit. */
+
+ /* If there's a stop job queued before we enter the DEAD state, we shouldn't act on Restart=, in order to not
+ * undo what has already been enqueued. */
+ if (unit_stop_pending(UNIT(s)))
+ allow_restart = false;
+
+ /* Only the first failure "sticks": a SUCCESS result may be overridden by 'f', a failure may not. */
+ if (s->result == SERVICE_SUCCESS)
+ s->result = f;
+
+ if (s->result == SERVICE_SUCCESS) {
+ unit_log_success(UNIT(s));
+ end_state = service_determine_dead_state(s);
+ restart_state = SERVICE_DEAD_BEFORE_AUTO_RESTART;
+ } else if (s->result == SERVICE_SKIP_CONDITION) {
+ unit_log_skip(UNIT(s), service_result_to_string(s->result));
+ end_state = service_determine_dead_state(s);
+ restart_state = SERVICE_DEAD_BEFORE_AUTO_RESTART;
+ } else {
+ unit_log_failure(UNIT(s), service_result_to_string(s->result));
+ end_state = SERVICE_FAILED;
+ restart_state = SERVICE_FAILED_BEFORE_AUTO_RESTART;
+ }
+ unit_warn_leftover_processes(UNIT(s), unit_log_leftover_process_stop);
+
+ if (!allow_restart)
+ log_unit_debug(UNIT(s), "Service restart not allowed.");
+ else {
+ const char *reason;
+
+ allow_restart = service_shall_restart(s, &reason);
+ log_unit_debug(UNIT(s), "Service will %srestart (%s)",
+ allow_restart ? "" : "not ",
+ reason);
+ }
+
+ if (allow_restart) {
+ usec_t restart_usec_next;
+
+ /* We make two state changes here: one that maps to the high-level UNIT_INACTIVE/UNIT_FAILED
+ * state (i.e. a state indicating deactivation), and then one that maps to the
+ * high-level UNIT_STARTING state (i.e. a state indicating activation). We do this so that
+ * external software can watch the state changes and see all service failures, even if they
+ * are only transitionary and followed by an automatic restart. We have fine-grained
+ * low-level states for this though so that software can distinguish the permanent UNIT_INACTIVE
+ * state from this transitionary UNIT_INACTIVE state by looking at the low-level states. */
+ if (s->restart_mode != SERVICE_RESTART_MODE_DIRECT)
+ service_set_state(s, restart_state);
+
+ restart_usec_next = service_restart_usec_next(s);
+
+ r = service_arm_timer(s, /* relative= */ true, restart_usec_next);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to install restart timer: %m");
+ /* NB: recurses with allow_restart=false, hence terminates. */
+ service_enter_dead(s, SERVICE_FAILURE_RESOURCES, /* allow_restart= */ false);
+ return;
+ }
+
+ log_unit_debug(UNIT(s), "Next restart interval calculated as: %s", FORMAT_TIMESPAN(restart_usec_next, 0));
+
+ service_set_state(s, SERVICE_AUTO_RESTART);
+ } else {
+ service_set_state(s, end_state);
+
+ /* If we shan't restart, then flush out the restart counter. But don't do that immediately, so that the
+ * user can still introspect the counter. Do so on the next start. */
+ s->flush_n_restarts = true;
+ }
+
+ /* The new state is in effect, let's decrease the fd store ref counter again. Let's also re-add us to the GC
+ * queue, so that the fd store is possibly gc'ed again */
+ unit_add_to_gc_queue(UNIT(s));
+
+ /* The next restart might not be a manual stop, hence reset the flag indicating manual stops */
+ s->forbid_restart = false;
+
+ /* Reset NotifyAccess override */
+ s->notify_access_override = _NOTIFY_ACCESS_INVALID;
+
+ /* We want fresh tmpdirs and ephemeral snapshots in case the service is started again immediately. */
+ s->exec_runtime = exec_runtime_destroy(s->exec_runtime);
+
+ /* Also, remove the runtime directory */
+ unit_destroy_runtime_data(UNIT(s), &s->exec_context);
+
+ /* Also get rid of the fd store, if that's configured. */
+ if (s->fd_store_preserve_mode == EXEC_PRESERVE_NO)
+ service_release_fd_store(s);
+
+ /* Get rid of the IPC bits of the user */
+ unit_unref_uid_gid(UNIT(s), true);
+
+ /* Try to delete the pid file. At this point it will be
+ * out-of-date, and some software might be confused by it, so
+ * let's remove it. */
+ if (s->pid_file)
+ (void) unlink(s->pid_file);
+
+ /* Reset TTY ownership if necessary */
+ exec_context_revert_tty(&s->exec_context);
+}
+
+static void service_enter_stop_post(Service *s, ServiceResult f) {
+ int r;
+ assert(s);
+
+ /* Runs the ExecStopPost= commands if any are configured, otherwise proceeds directly to
+ * sending SIGTERM to any remaining processes. */
+
+ if (s->result == SERVICE_SUCCESS)
+ s->result = f;
+
+ service_unwatch_control_pid(s);
+ (void) unit_enqueue_rewatch_pids(UNIT(s));
+
+ s->control_command = s->exec_command[SERVICE_EXEC_STOP_POST];
+ if (s->control_command) {
+ s->control_command_id = SERVICE_EXEC_STOP_POST;
+ pidref_done(&s->control_pid);
+
+ r = service_spawn(s,
+ s->control_command,
+ s->timeout_stop_usec,
+ EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN|EXEC_IS_CONTROL|EXEC_SETENV_RESULT|EXEC_CONTROL_CGROUP,
+ &s->control_pid);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'stop-post' task: %m");
+ service_enter_signal(s, SERVICE_FINAL_SIGTERM, SERVICE_FAILURE_RESOURCES);
+ return;
+ }
+
+ service_set_state(s, SERVICE_STOP_POST);
+ } else
+ service_enter_signal(s, SERVICE_FINAL_SIGTERM, SERVICE_SUCCESS);
+}
+
+static int state_to_kill_operation(Service *s, ServiceState state) {
+
+ /* Maps a stopping state to the kill operation that shall be applied in it. */
+
+ if (IN_SET(state, SERVICE_STOP_WATCHDOG, SERVICE_FINAL_WATCHDOG))
+ return KILL_WATCHDOG;
+
+ /* A SIGTERM on behalf of a pending restart job gets its own operation... */
+ if (state == SERVICE_STOP_SIGTERM && unit_has_job_type(UNIT(s), JOB_RESTART))
+ return KILL_RESTART;
+
+ /* ...otherwise both SIGTERM states map to plain termination. */
+ if (IN_SET(state, SERVICE_STOP_SIGTERM, SERVICE_FINAL_SIGTERM))
+ return KILL_TERMINATE;
+
+ if (IN_SET(state, SERVICE_STOP_SIGKILL, SERVICE_FINAL_SIGKILL))
+ return KILL_KILL;
+
+ return _KILL_OPERATION_INVALID;
+}
+
+static void service_enter_signal(Service *s, ServiceState state, ServiceResult f) {
+ int kill_operation, r;
+
+ assert(s);
+
+ /* Enters one of the signal-sending states (SIGTERM/SIGKILL/watchdog-abort, stop or final
+ * flavour) and performs the matching kill operation. If nothing was killed we advance
+ * directly to the next stage of the shutdown cascade. */
+
+ if (s->result == SERVICE_SUCCESS)
+ s->result = f;
+
+ /* Before sending any signal, make sure we track all members of this cgroup */
+ (void) unit_watch_all_pids(UNIT(s));
+
+ /* Also, enqueue a job that we recheck all our PIDs a bit later, given that it's likely some processes have
+ * died now */
+ (void) unit_enqueue_rewatch_pids(UNIT(s));
+
+ kill_operation = state_to_kill_operation(s, state);
+ r = unit_kill_context(
+ UNIT(s),
+ &s->kill_context,
+ kill_operation,
+ &s->main_pid,
+ &s->control_pid,
+ s->main_pid_alien);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m");
+ goto fail;
+ }
+
+ if (r > 0) {
+ /* Something was signalled: wait for it to die, with the abort timeout for watchdog
+ * kills and the stop timeout otherwise. */
+ r = service_arm_timer(s, /* relative= */ true,
+ kill_operation == KILL_WATCHDOG ? service_timeout_abort_usec(s) : s->timeout_stop_usec);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to install timer: %m");
+ goto fail;
+ }
+
+ service_set_state(s, state);
+ } else if (IN_SET(state, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM) && s->kill_context.send_sigkill)
+ service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_SUCCESS);
+ else if (IN_SET(state, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL))
+ service_enter_stop_post(s, SERVICE_SUCCESS);
+ else if (IN_SET(state, SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM) && s->kill_context.send_sigkill)
+ service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_SUCCESS);
+ else
+ service_enter_dead(s, SERVICE_SUCCESS, /* allow_restart= */ true);
+
+ return;
+
+fail:
+ /* On failure, skip ahead in the cascade: stop states still run ExecStopPost=, final states go dead. */
+ if (IN_SET(state, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL))
+ service_enter_stop_post(s, SERVICE_FAILURE_RESOURCES);
+ else
+ service_enter_dead(s, SERVICE_FAILURE_RESOURCES, /* allow_restart= */ true);
+}
+
+static void service_enter_stop_by_notify(Service *s) {
+ int r;
+
+ assert(s);
+
+ /* Handles a STOPPING=1 notification from the service itself: arm the stop timeout and treat
+ * the service as if we had sent it SIGTERM already. */
+
+ (void) unit_enqueue_rewatch_pids(UNIT(s));
+
+ r = service_arm_timer(s, /* relative= */ true, s->timeout_stop_usec);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to install timer: %m");
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES);
+ return;
+ }
+
+ /* The service told us it's stopping, so it's as if we SIGTERM'd it. */
+ service_set_state(s, SERVICE_STOP_SIGTERM);
+}
+
+static void service_enter_stop(Service *s, ServiceResult f) {
+ int r;
+
+ assert(s);
+
+ /* Begins shutdown of the service: runs ExecStop= if configured, otherwise proceeds directly
+ * to the SIGTERM stage. */
+
+ if (s->result == SERVICE_SUCCESS)
+ s->result = f;
+
+ service_unwatch_control_pid(s);
+ (void) unit_enqueue_rewatch_pids(UNIT(s));
+
+ s->control_command = s->exec_command[SERVICE_EXEC_STOP];
+ if (s->control_command) {
+ s->control_command_id = SERVICE_EXEC_STOP;
+ pidref_done(&s->control_pid);
+
+ r = service_spawn(s,
+ s->control_command,
+ s->timeout_stop_usec,
+ EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_SETENV_RESULT|EXEC_CONTROL_CGROUP,
+ &s->control_pid);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'stop' task: %m");
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES);
+ return;
+ }
+
+ service_set_state(s, SERVICE_STOP);
+ } else
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_SUCCESS);
+}
+
+static bool service_good(Service *s) {
+ int r;
+
+ assert(s);
+
+ /* Checks whether the service shall still be considered "up": D-Bus services additionally
+ * require their bus name to be taken. */
+
+ if (s->type == SERVICE_DBUS && !s->bus_name_good)
+ return false;
+
+ r = main_pid_good(s);
+ if (r > 0) /* It's alive */
+ return true;
+ if (r == 0 && s->exit_type == SERVICE_EXIT_MAIN) /* It's dead */
+ return false;
+
+ /* OK, we don't know anything about the main PID, maybe because there is none.
+ * Let's check the control group instead. */
+ return cgroup_good(s) != 0;
+}
+
+static void service_enter_running(Service *s, ServiceResult f) {
+ int r;
+
+ assert(s);
+
+ /* Transitions into the RUNNING state (or EXITED/stopping, depending on whether the service
+ * is still "good"), and processes any sd_notify() state that queued up meanwhile. */
+
+ if (s->result == SERVICE_SUCCESS)
+ s->result = f;
+
+ service_unwatch_control_pid(s);
+
+ if (s->result != SERVICE_SUCCESS)
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, f);
+ else if (service_good(s)) {
+
+ /* If there are any queued up sd_notify() notifications, process them now */
+ if (s->notify_state == NOTIFY_RELOADING)
+ service_enter_reload_by_notify(s);
+ else if (s->notify_state == NOTIFY_STOPPING)
+ service_enter_stop_by_notify(s);
+ else {
+ service_set_state(s, SERVICE_RUNNING);
+
+ /* Absolute timer: RuntimeMaxSec= is counted from the activation timestamp. */
+ r = service_arm_timer(s, /* relative= */ false, service_running_timeout(s));
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to install timer: %m");
+ /* NB: recurses with a failure result, hence takes the SIGTERM branch above. */
+ service_enter_running(s, SERVICE_FAILURE_RESOURCES);
+ return;
+ }
+ }
+
+ } else if (s->remain_after_exit)
+ service_set_state(s, SERVICE_EXITED);
+ else
+ service_enter_stop(s, SERVICE_SUCCESS);
+}
+
+static void service_enter_start_post(Service *s) {
+ int r;
+ assert(s);
+
+ /* Runs the ExecStartPost= commands if any are configured, otherwise goes straight to the
+ * running state. Also (re-)arms the watchdog now that startup completed. */
+
+ service_unwatch_control_pid(s);
+ service_reset_watchdog(s);
+
+ s->control_command = s->exec_command[SERVICE_EXEC_START_POST];
+ if (s->control_command) {
+ s->control_command_id = SERVICE_EXEC_START_POST;
+ pidref_done(&s->control_pid);
+
+ r = service_spawn(s,
+ s->control_command,
+ s->timeout_start_usec,
+ EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_CONTROL_CGROUP,
+ &s->control_pid);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'start-post' task: %m");
+ service_enter_stop(s, SERVICE_FAILURE_RESOURCES);
+ return;
+ }
+
+ service_set_state(s, SERVICE_START_POST);
+ } else
+ service_enter_running(s, SERVICE_SUCCESS);
+}
+
+static void service_kill_control_process(Service *s) {
+ int r;
+
+ assert(s);
+
+ /* Forcibly kills the current control process, if there is one. Failure to kill is only
+ * logged at debug level, never propagated. */
+
+ if (!pidref_is_set(&s->control_pid))
+ return;
+
+ r = pidref_kill_and_sigcont(&s->control_pid, SIGKILL);
+ if (r >= 0)
+ return;
+
+ _cleanup_free_ char *comm = NULL;
+ (void) pidref_get_comm(&s->control_pid, &comm);
+
+ log_unit_debug_errno(UNIT(s), r, "Failed to kill control process " PID_FMT " (%s), ignoring: %m",
+ s->control_pid.pid, strna(comm));
+}
+
+static int service_adverse_to_leftover_processes(Service *s) {
+ assert(s);
+
+ /* KillMode=mixed and control group are used to indicate that all process should be killed off.
+ * SendSIGKILL= is used for services that require a clean shutdown. These are typically database
+ * services where a SIGKILLed process would result in a lengthy recovery and whose shutdown or
+ * startup time is quite variable (so Timeout settings aren't of use).
+ *
+ * Here we take these two factors and refuse to start a service if there are existing processes
+ * within a control group. Databases, while generally having some protection against multiple
+ * instances running, let's not stress the rigor of these. Also ExecStartPre= parts of the service
+ * aren't as rigorously written to protect against multiple use. */
+
+ if (unit_warn_leftover_processes(UNIT(s), unit_log_leftover_process_start) > 0 &&
+ IN_SET(s->kill_context.kill_mode, KILL_MIXED, KILL_CONTROL_GROUP) &&
+ !s->kill_context.send_sigkill)
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(EBUSY),
+ "Will not start SendSIGKILL=no service of type KillMode=control-group or mixed while processes exist");
+
+ return 0;
+}
+
+static void service_enter_start(Service *s) {
+ _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+ ExecCommand *c;
+ usec_t timeout;
+ int r;
+
+ assert(s);
+
+ /* Spawns the ExecStart= command. Depending on Type=, the spawned process is tracked either
+ * as the main process (simple/idle/oneshot/dbus/notify/exec) or as a control process
+ * (forking), and the follow-up state differs accordingly. */
+
+ service_unwatch_control_pid(s);
+ service_unwatch_main_pid(s);
+
+ r = service_adverse_to_leftover_processes(s);
+ if (r < 0)
+ goto fail;
+
+ if (s->type == SERVICE_FORKING) {
+ s->control_command_id = SERVICE_EXEC_START;
+ c = s->control_command = s->exec_command[SERVICE_EXEC_START];
+
+ s->main_command = NULL;
+ } else {
+ s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID;
+ s->control_command = NULL;
+
+ c = s->main_command = s->exec_command[SERVICE_EXEC_START];
+ }
+
+ if (!c) {
+ if (s->type != SERVICE_ONESHOT) {
+ /* There's no command line configured for the main command? Hmm, that is strange.
+ * This can only happen if the configuration changes at runtime. In this case,
+ * let's enter a failure state. */
+ r = log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENXIO), "There's no 'start' task anymore we could start.");
+ goto fail;
+ }
+
+ /* We force a fake state transition here. Otherwise, the unit would go directly from
+ * SERVICE_DEAD to SERVICE_DEAD without SERVICE_ACTIVATING or SERVICE_ACTIVE
+ * in between. This way we can later trigger actions that depend on the state
+ * transition, including SuccessAction=. */
+ service_set_state(s, SERVICE_START);
+
+ service_enter_start_post(s);
+ return;
+ }
+
+ if (IN_SET(s->type, SERVICE_SIMPLE, SERVICE_IDLE))
+ /* For simple + idle this is the main process. We don't apply any timeout here, but
+ * service_enter_running() will later apply the .runtime_max_usec timeout. */
+ timeout = USEC_INFINITY;
+ else
+ timeout = s->timeout_start_usec;
+
+ r = service_spawn(s,
+ c,
+ timeout,
+ EXEC_PASS_FDS|EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN|EXEC_SET_WATCHDOG|EXEC_WRITE_CREDENTIALS|EXEC_SETENV_MONITOR_RESULT,
+ &pidref);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'start' task: %m");
+ goto fail;
+ }
+
+ if (IN_SET(s->type, SERVICE_SIMPLE, SERVICE_IDLE)) {
+ /* For simple services we immediately start
+ * the START_POST binaries. */
+
+ (void) service_set_main_pidref(s, &pidref);
+ service_enter_start_post(s);
+
+ } else if (s->type == SERVICE_FORKING) {
+
+ /* For forking services we wait until the start
+ * process exited. */
+
+ pidref_done(&s->control_pid);
+ s->control_pid = TAKE_PIDREF(pidref);
+ service_set_state(s, SERVICE_START);
+
+ } else if (IN_SET(s->type, SERVICE_ONESHOT, SERVICE_DBUS, SERVICE_NOTIFY, SERVICE_NOTIFY_RELOAD, SERVICE_EXEC)) {
+
+ /* For oneshot services we wait until the start process exited, too, but it is our main process. */
+
+ /* For D-Bus services we know the main pid right away, but wait for the bus name to appear on the
+ * bus. 'notify' and 'exec' services are similar. */
+
+ (void) service_set_main_pidref(s, &pidref);
+ service_set_state(s, SERVICE_START);
+ } else
+ assert_not_reached();
+
+ return;
+
+fail:
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES);
+}
+
+static void service_enter_start_pre(Service *s) {
+ int r;
+
+ assert(s);
+
+ /* Runs the ExecStartPre= commands if any are configured, otherwise proceeds to the actual
+ * start. The leftover-process check is only performed when there is something to run. */
+
+ service_unwatch_control_pid(s);
+
+ s->control_command = s->exec_command[SERVICE_EXEC_START_PRE];
+ if (s->control_command) {
+
+ r = service_adverse_to_leftover_processes(s);
+ if (r < 0)
+ goto fail;
+
+ s->control_command_id = SERVICE_EXEC_START_PRE;
+
+ r = service_spawn(s,
+ s->control_command,
+ s->timeout_start_usec,
+ EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_APPLY_TTY_STDIN|EXEC_SETENV_MONITOR_RESULT|EXEC_WRITE_CREDENTIALS,
+ &s->control_pid);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'start-pre' task: %m");
+ goto fail;
+ }
+
+ service_set_state(s, SERVICE_START_PRE);
+ } else
+ service_enter_start(s);
+
+ return;
+
+fail:
+ service_enter_dead(s, SERVICE_FAILURE_RESOURCES, /* allow_restart= */ true);
+}
+
+static void service_enter_condition(Service *s) {
+ int r;
+
+ assert(s);
+
+ /* Runs the ExecCondition= commands if any are configured (entering SERVICE_CONDITION),
+ * otherwise skips straight to ExecStartPre= handling. */
+
+ service_unwatch_control_pid(s);
+
+ s->control_command = s->exec_command[SERVICE_EXEC_CONDITION];
+ if (s->control_command) {
+
+ r = service_adverse_to_leftover_processes(s);
+ if (r < 0)
+ goto fail;
+
+ s->control_command_id = SERVICE_EXEC_CONDITION;
+ pidref_done(&s->control_pid);
+
+ r = service_spawn(s,
+ s->control_command,
+ s->timeout_start_usec,
+ EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_APPLY_TTY_STDIN,
+ &s->control_pid);
+
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'exec-condition' task: %m");
+ goto fail;
+ }
+
+ service_set_state(s, SERVICE_CONDITION);
+ } else
+ service_enter_start_pre(s);
+
+ return;
+
+fail:
+ service_enter_dead(s, SERVICE_FAILURE_RESOURCES, /* allow_restart= */ true);
+}
+
+static void service_enter_restart(Service *s) {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ int r;
+
+ assert(s);
+
+ /* The restart timer fired: enqueue the actual restart job (unless a stop is pending) and
+ * bump the restart counter. */
+
+ if (unit_has_job_type(UNIT(s), JOB_STOP)) {
+ /* Don't restart things if we are going down anyway */
+ log_unit_info(UNIT(s), "Stop job pending for unit, skipping automatic restart.");
+ return;
+ }
+
+ /* Any units that are bound to this service must also be restarted. We use JOB_START for ourselves
+ * but then set JOB_RESTART_DEPENDENCIES which will enqueue JOB_RESTART for those dependency jobs. */
+ r = manager_add_job(UNIT(s)->manager, JOB_START, UNIT(s), JOB_RESTART_DEPENDENCIES, NULL, &error, NULL);
+ if (r < 0) {
+ log_unit_warning(UNIT(s), "Failed to schedule restart job: %s", bus_error_message(&error, r));
+ service_enter_dead(s, SERVICE_FAILURE_RESOURCES, /* allow_restart= */ false);
+ return;
+ }
+
+ /* Count the jobs we enqueue for restarting. This counter is maintained as long as the unit isn't
+ * fully stopped, i.e. as long as it remains up or remains in auto-start states. The user can reset
+ * the counter explicitly however via the usual "systemctl reset-failure" logic. */
+ s->n_restarts ++;
+ s->flush_n_restarts = false;
+
+ /* Reset NotifyAccess override for the new run */
+ s->notify_access_override = _NOTIFY_ACCESS_INVALID;
+
+ log_unit_struct(UNIT(s), LOG_INFO,
+ "MESSAGE_ID=" SD_MESSAGE_UNIT_RESTART_SCHEDULED_STR,
+ LOG_UNIT_INVOCATION_ID(UNIT(s)),
+ LOG_UNIT_MESSAGE(UNIT(s),
+ "Scheduled restart job, restart counter is at %u.", s->n_restarts),
+ "N_RESTARTS=%u", s->n_restarts);
+
+ service_set_state(s, SERVICE_AUTO_RESTART_QUEUED);
+
+ /* Notify clients about changed restart counter */
+ unit_add_to_dbus_queue(UNIT(s));
+}
+
+static void service_enter_reload_by_notify(Service *s) {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ int r;
+
+ assert(s);
+
+ /* Handles a RELOADING=1 notification from the service: arm the start timeout, enter
+ * RELOAD_NOTIFY and propagate the reload to units configured for it. */
+
+ r = service_arm_timer(s, /* relative= */ true, s->timeout_start_usec);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to install timer: %m");
+ s->reload_result = SERVICE_FAILURE_RESOURCES;
+ service_enter_running(s, SERVICE_SUCCESS);
+ return;
+ }
+
+ service_set_state(s, SERVICE_RELOAD_NOTIFY);
+
+ /* service_enter_reload_by_notify is never called during a reload, thus no loops are possible. */
+ r = manager_propagate_reload(UNIT(s)->manager, UNIT(s), JOB_FAIL, &error);
+ if (r < 0)
+ log_unit_warning(UNIT(s), "Failed to schedule propagation of reload, ignoring: %s", bus_error_message(&error, r));
+}
+
+static void service_enter_reload(Service *s) {
+ bool killed = false;
+ int r;
+
+ assert(s);
+
+ /* Initiates a reload: for Type=notify-reload the configured reload signal is sent to the
+ * main process, and/or the ExecReload= command is spawned. If neither applies this is a
+ * no-op and we return to the running state. */
+
+ service_unwatch_control_pid(s);
+ s->reload_result = SERVICE_SUCCESS;
+
+ /* Take the timestamp before signalling, see the comment at the bottom for why. */
+ usec_t ts = now(CLOCK_MONOTONIC);
+
+ if (s->type == SERVICE_NOTIFY_RELOAD && pidref_is_set(&s->main_pid)) {
+ r = pidref_kill_and_sigcont(&s->main_pid, s->reload_signal);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to send reload signal: %m");
+ goto fail;
+ }
+
+ killed = true;
+ }
+
+ s->control_command = s->exec_command[SERVICE_EXEC_RELOAD];
+ if (s->control_command) {
+ s->control_command_id = SERVICE_EXEC_RELOAD;
+ pidref_done(&s->control_pid);
+
+ r = service_spawn(s,
+ s->control_command,
+ s->timeout_start_usec,
+ EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|EXEC_CONTROL_CGROUP,
+ &s->control_pid);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'reload' task: %m");
+ goto fail;
+ }
+
+ service_set_state(s, SERVICE_RELOAD);
+ } else if (killed) {
+ r = service_arm_timer(s, /* relative= */ true, s->timeout_start_usec);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to install timer: %m");
+ goto fail;
+ }
+
+ service_set_state(s, SERVICE_RELOAD_SIGNAL);
+ } else {
+ /* Neither a reload signal nor an ExecReload= command: nothing to do. */
+ service_enter_running(s, SERVICE_SUCCESS);
+ return;
+ }
+
+ /* Store the timestamp when we started reloading: when reloading via SIGHUP we won't leave the reload
+ * state until we received both RELOADING=1 and READY=1 with MONOTONIC_USEC= set to a value above
+ * this. Thus we know for sure the reload cycle was executed *after* we requested it, and is not one
+ * that was already in progress before. */
+ s->reload_begin_usec = ts;
+ return;
+
+fail:
+ s->reload_result = SERVICE_FAILURE_RESOURCES;
+ service_enter_running(s, SERVICE_SUCCESS);
+}
+
+static void service_run_next_control(Service *s) {
+ usec_t timeout;
+ int r;
+
+ assert(s);
+ assert(s->control_command);
+ assert(s->control_command->command_next);
+
+ /* Advances to the next command in the current control command list (e.g. the second of
+ * several ExecStartPre= lines) and spawns it with the exec flags appropriate for the
+ * current state/command type. */
+
+ /* SERVICE_EXEC_START commands are chained via service_run_next_main() instead. */
+ assert(s->control_command_id != SERVICE_EXEC_START);
+
+ s->control_command = s->control_command->command_next;
+ service_unwatch_control_pid(s);
+
+ /* Start-ish states use the start timeout, stop-ish states the stop timeout. */
+ if (IN_SET(s->state, SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD))
+ timeout = s->timeout_start_usec;
+ else
+ timeout = s->timeout_stop_usec;
+
+ pidref_done(&s->control_pid);
+
+ r = service_spawn(s,
+ s->control_command,
+ timeout,
+ EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_IS_CONTROL|
+ (IN_SET(s->state, SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD) ? EXEC_WRITE_CREDENTIALS : 0)|
+ (IN_SET(s->control_command_id, SERVICE_EXEC_CONDITION, SERVICE_EXEC_START_PRE, SERVICE_EXEC_STOP_POST) ? EXEC_APPLY_TTY_STDIN : 0)|
+ (IN_SET(s->control_command_id, SERVICE_EXEC_STOP, SERVICE_EXEC_STOP_POST) ? EXEC_SETENV_RESULT : 0)|
+ (IN_SET(s->control_command_id, SERVICE_EXEC_START_PRE, SERVICE_EXEC_START) ? EXEC_SETENV_MONITOR_RESULT : 0)|
+ (IN_SET(s->control_command_id, SERVICE_EXEC_START_POST, SERVICE_EXEC_RELOAD, SERVICE_EXEC_STOP, SERVICE_EXEC_STOP_POST) ? EXEC_CONTROL_CGROUP : 0),
+ &s->control_pid);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to spawn next control task: %m");
+
+ /* Route the failure to the stage-appropriate fallback of the state machine. */
+ if (IN_SET(s->state, SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START_POST, SERVICE_STOP))
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES);
+ else if (s->state == SERVICE_STOP_POST)
+ service_enter_dead(s, SERVICE_FAILURE_RESOURCES, /* allow_restart= */ true);
+ else if (s->state == SERVICE_RELOAD) {
+ s->reload_result = SERVICE_FAILURE_RESOURCES;
+ service_enter_running(s, SERVICE_SUCCESS);
+ } else
+ service_enter_stop(s, SERVICE_FAILURE_RESOURCES);
+ }
+}
+
+/* Spawns the next ExecStart= command of the main command list. Per the assert this is only
+ * reachable for Type=oneshot services, the only type where multiple main commands are run
+ * in sequence. On spawn failure the service enters the stop path with a resources failure. */
+static void service_run_next_main(Service *s) {
+ _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+ int r;
+
+ assert(s);
+ assert(s->main_command);
+ assert(s->main_command->command_next);
+ assert(s->type == SERVICE_ONESHOT);
+
+ s->main_command = s->main_command->command_next;
+ service_unwatch_main_pid(s);
+
+ r = service_spawn(s,
+ s->main_command,
+ s->timeout_start_usec,
+ EXEC_PASS_FDS|EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN|EXEC_SET_WATCHDOG|EXEC_SETENV_MONITOR_RESULT|EXEC_WRITE_CREDENTIALS,
+ &pidref);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to spawn next main task: %m");
+ service_enter_stop(s, SERVICE_FAILURE_RESOURCES);
+ return;
+ }
+
+ /* Failure to watch the new main PID is tolerated here, hence the (void). */
+ (void) service_set_main_pidref(s, &pidref);
+}
+
+/* Unit vtable start method: validates that starting is currently permissible, resets all
+ * per-invocation runtime state, and kicks off the condition/start-pre/start chain.
+ * Returns 1 when a start was initiated, 0 if already starting, -EAGAIN if the request must
+ * be retried later, or another negative errno on error. */
+static int service_start(Unit *u) {
+ Service *s = SERVICE(u);
+ int r;
+
+ assert(s);
+
+ /* We cannot fulfill this request right now, try again later
+ * please! */
+ if (IN_SET(s->state,
+ SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
+ SERVICE_FINAL_WATCHDOG, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL, SERVICE_CLEANING))
+ return -EAGAIN;
+
+ /* Already on it! */
+ if (IN_SET(s->state, SERVICE_CONDITION, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST))
+ return 0;
+
+ /* A service that will be restarted must be stopped first to trigger BindsTo and/or OnFailure
+ * dependencies. If a user does not want to wait for the holdoff time to elapse, the service should
+ * be manually restarted, not started. We simply return EAGAIN here, so that any start jobs stay
+ * queued, and assume that the auto restart timer will eventually trigger the restart. */
+ if (IN_SET(s->state, SERVICE_AUTO_RESTART, SERVICE_DEAD_BEFORE_AUTO_RESTART, SERVICE_FAILED_BEFORE_AUTO_RESTART))
+ return -EAGAIN;
+
+ assert(IN_SET(s->state, SERVICE_DEAD, SERVICE_FAILED, SERVICE_DEAD_RESOURCES_PINNED, SERVICE_AUTO_RESTART_QUEUED));
+
+ r = unit_acquire_invocation_id(u);
+ if (r < 0)
+ return r;
+
+ /* From here on: reset all per-invocation runtime state so the new start cycle begins
+ * with a clean slate. */
+ s->result = SERVICE_SUCCESS;
+ s->reload_result = SERVICE_SUCCESS;
+ s->main_pid_known = false;
+ s->main_pid_alien = false;
+ s->forbid_restart = false;
+
+ s->status_text = mfree(s->status_text);
+ s->status_errno = 0;
+
+ s->notify_access_override = _NOTIFY_ACCESS_INVALID;
+ s->notify_state = NOTIFY_UNKNOWN;
+
+ s->watchdog_original_usec = s->watchdog_usec;
+ s->watchdog_override_enable = false;
+ s->watchdog_override_usec = USEC_INFINITY;
+
+ exec_command_reset_status_list_array(s->exec_command, _SERVICE_EXEC_COMMAND_MAX);
+ exec_status_reset(&s->main_exec_status);
+
+ /* This is not an automatic restart? Flush the restart counter then */
+ if (s->flush_n_restarts) {
+ s->n_restarts = 0;
+ s->flush_n_restarts = false;
+ }
+
+ u->reset_accounting = true;
+
+ service_enter_condition(s);
+ return 1;
+}
+
+/* Unit vtable stop method: maps the current service state onto the appropriate shutdown
+ * path. Returns 1 when a stop was newly initiated, 0 when a stop is already in progress or
+ * nothing needs doing. Also suppresses Restart= handling for this manual stop. */
+static int service_stop(Unit *u) {
+ Service *s = SERVICE(u);
+
+ assert(s);
+
+ /* Don't create restart jobs from manual stops. */
+ s->forbid_restart = true;
+
+ switch (s->state) {
+
+ case SERVICE_STOP:
+ case SERVICE_STOP_SIGTERM:
+ case SERVICE_STOP_SIGKILL:
+ case SERVICE_STOP_POST:
+ case SERVICE_FINAL_WATCHDOG:
+ case SERVICE_FINAL_SIGTERM:
+ case SERVICE_FINAL_SIGKILL:
+ /* Already on it */
+ return 0;
+
+ case SERVICE_AUTO_RESTART:
+ case SERVICE_AUTO_RESTART_QUEUED:
+ /* Give up on the auto restart */
+ service_set_state(s, service_determine_dead_state(s));
+ return 0;
+
+ case SERVICE_CONDITION:
+ case SERVICE_START_PRE:
+ case SERVICE_START:
+ case SERVICE_START_POST:
+ case SERVICE_RELOAD:
+ case SERVICE_RELOAD_SIGNAL:
+ case SERVICE_RELOAD_NOTIFY:
+ case SERVICE_STOP_WATCHDOG:
+ /* If there's already something running we go directly into kill mode. */
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_SUCCESS);
+ return 0;
+
+ case SERVICE_CLEANING:
+ /* If we are currently cleaning, then abort it, brutally. */
+ service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_SUCCESS);
+ return 0;
+
+ case SERVICE_RUNNING:
+ case SERVICE_EXITED:
+ service_enter_stop(s, SERVICE_SUCCESS);
+ return 1;
+
+ case SERVICE_DEAD_BEFORE_AUTO_RESTART:
+ case SERVICE_FAILED_BEFORE_AUTO_RESTART:
+ case SERVICE_DEAD:
+ case SERVICE_FAILED:
+ case SERVICE_DEAD_RESOURCES_PINNED:
+ default:
+ /* Unknown state, or unit_stop() should already have handled these */
+ assert_not_reached();
+ }
+}
+
+/* Unit vtable reload method: only valid while the service is up (enforced by the assert),
+ * delegates the actual work to service_enter_reload(). Returns 1 to indicate a reload was
+ * initiated. */
+static int service_reload(Unit *u) {
+ Service *s = SERVICE(u);
+
+ assert(s);
+
+ assert(IN_SET(s->state, SERVICE_RUNNING, SERVICE_EXITED));
+
+ service_enter_reload(s);
+ return 1;
+}
+
+/* Unit vtable predicate: a service is reloadable if it either defines an ExecReload=
+ * command or is of Type=notify-reload (reload via signal/notification protocol). */
+static bool service_can_reload(Unit *u) {
+ Service *s = SERVICE(u);
+
+ assert(s);
+
+ if (s->exec_command[SERVICE_EXEC_RELOAD])
+ return true;
+
+ return s->type == SERVICE_NOTIFY_RELOAD;
+}
+
+/* Returns the zero-based position of 'current' within the unit's command list for the
+ * given command id, determined by walking backwards to the head of the list. */
+static unsigned service_exec_command_index(Unit *u, ServiceExecCommand id, const ExecCommand *current) {
+ Service *s = SERVICE(u);
+ unsigned n = 0;
+
+ assert(s);
+ assert(id >= 0);
+ assert(id < _SERVICE_EXEC_COMMAND_MAX);
+
+ /* Step back through command_prev links until we hit the list head, counting hops. */
+ for (const ExecCommand *c = current; c != s->exec_command[id]; c = c->command_prev)
+ n++;
+
+ return n;
+}
+
+/* Serializes the currently executing exec command (either the control or the main command)
+ * under the key "control-command" or "main-command". The record carries the command id, the
+ * command's index within its list ('+' prefix marks the last command of the sequence), the
+ * cescaped path, and the cescaped, quoted argv. Counterpart of
+ * service_deserialize_exec_command(). Returns 0 on success or a negative errno on OOM. */
+static int service_serialize_exec_command(Unit *u, FILE *f, const ExecCommand *command) {
+ _cleanup_free_ char *args = NULL, *p = NULL;
+ Service *s = SERVICE(u);
+ const char *type, *key;
+ ServiceExecCommand id;
+ size_t length = 0;
+ unsigned idx;
+
+ assert(s);
+ assert(f);
+
+ if (!command)
+ return 0;
+
+ if (command == s->control_command) {
+ type = "control";
+ id = s->control_command_id;
+ } else {
+ type = "main";
+ id = SERVICE_EXEC_START;
+ }
+
+ idx = service_exec_command_index(u, id, command);
+
+ /* Build a single space-separated string of the argv, each argument cescaped and wrapped
+ * in double quotes. */
+ STRV_FOREACH(arg, command->argv) {
+ _cleanup_free_ char *e = NULL;
+ size_t n;
+
+ e = cescape(*arg);
+ if (!e)
+ return log_oom();
+
+ n = strlen(e);
+ /* Reserve room for a separating space, the two quotes, and the escaped argument. */
+ if (!GREEDY_REALLOC(args, length + 2 + n + 2))
+ return log_oom();
+
+ if (length > 0)
+ args[length++] = ' ';
+
+ args[length++] = '"';
+ memcpy(args + length, e, n);
+ length += n;
+ args[length++] = '"';
+ }
+
+ if (!GREEDY_REALLOC(args, length + 1))
+ return log_oom();
+
+ args[length++] = 0;
+
+ p = cescape(command->path);
+ if (!p)
+ return log_oom();
+
+ key = strjoina(type, "-command");
+
+ /* We use '+1234' instead of '1234' to mark the last command in a sequence.
+ * This is used in service_deserialize_exec_command(). */
+ (void) serialize_item_format(
+ f, key,
+ "%s %s%u %s %s",
+ service_exec_command_to_string(id),
+ command->command_next ? "" : "+",
+ idx,
+ p, args);
+
+ return 0;
+}
+
+/* Unit vtable serialization callback: writes the service's runtime state (state machine
+ * position, PIDs, fds, exec status, watchdog/reload timestamps, ...) to 'f' so it survives
+ * a daemon reload/re-execution. File descriptors are kept alive by duplicating them into
+ * 'fds'. Failures on optional fields are deliberately ignored (the (void) casts);
+ * counterpart of service_deserialize_item(). */
+static int service_serialize(Unit *u, FILE *f, FDSet *fds) {
+ Service *s = SERVICE(u);
+ int r;
+
+ assert(u);
+ assert(f);
+ assert(fds);
+
+ (void) serialize_item(f, "state", service_state_to_string(s->state));
+ (void) serialize_item(f, "result", service_result_to_string(s->result));
+ (void) serialize_item(f, "reload-result", service_result_to_string(s->reload_result));
+
+ (void) serialize_pidref(f, fds, "control-pid", &s->control_pid);
+ /* Only serialize the main PID if we actually know it, so we don't resurrect a stale one. */
+ if (s->main_pid_known)
+ (void) serialize_pidref(f, fds, "main-pid", &s->main_pid);
+
+ (void) serialize_bool(f, "main-pid-known", s->main_pid_known);
+ (void) serialize_bool(f, "bus-name-good", s->bus_name_good);
+ (void) serialize_bool(f, "bus-name-owner", s->bus_name_owner);
+
+ (void) serialize_item_format(f, "n-restarts", "%u", s->n_restarts);
+ (void) serialize_bool(f, "flush-n-restarts", s->flush_n_restarts);
+
+ r = serialize_item_escaped(f, "status-text", s->status_text);
+ if (r < 0)
+ return r;
+
+ service_serialize_exec_command(u, f, s->control_command);
+ service_serialize_exec_command(u, f, s->main_command);
+
+ r = serialize_fd(f, fds, "stdin-fd", s->stdin_fd);
+ if (r < 0)
+ return r;
+ r = serialize_fd(f, fds, "stdout-fd", s->stdout_fd);
+ if (r < 0)
+ return r;
+ r = serialize_fd(f, fds, "stderr-fd", s->stderr_fd);
+ if (r < 0)
+ return r;
+
+ if (s->exec_fd_event_source) {
+ r = serialize_fd(f, fds, "exec-fd", sd_event_source_get_io_fd(s->exec_fd_event_source));
+ if (r < 0)
+ return r;
+
+ (void) serialize_bool(f, "exec-fd-hot", s->exec_fd_hot);
+ }
+
+ if (UNIT_ISSET(s->accept_socket)) {
+ r = serialize_item(f, "accept-socket", UNIT_DEREF(s->accept_socket)->id);
+ if (r < 0)
+ return r;
+ }
+
+ r = serialize_fd(f, fds, "socket-fd", s->socket_fd);
+ if (r < 0)
+ return r;
+
+ /* Each stored fd goes into the FDSet as a duplicate, recorded together with its escaped
+ * name and poll flag. */
+ LIST_FOREACH(fd_store, fs, s->fd_store) {
+ _cleanup_free_ char *c = NULL;
+ int copy;
+
+ copy = fdset_put_dup(fds, fs->fd);
+ if (copy < 0)
+ return log_error_errno(copy, "Failed to copy file descriptor for serialization: %m");
+
+ c = cescape(fs->fdname);
+ if (!c)
+ return log_oom();
+
+ (void) serialize_item_format(f, "fd-store-fd", "%i \"%s\" %i", copy, c, fs->do_poll);
+ }
+
+ if (s->main_exec_status.pid > 0) {
+ (void) serialize_item_format(f, "main-exec-status-pid", PID_FMT, s->main_exec_status.pid);
+ (void) serialize_dual_timestamp(f, "main-exec-status-start", &s->main_exec_status.start_timestamp);
+ (void) serialize_dual_timestamp(f, "main-exec-status-exit", &s->main_exec_status.exit_timestamp);
+
+ /* Exit code/status are only meaningful once the process actually exited. */
+ if (dual_timestamp_is_set(&s->main_exec_status.exit_timestamp)) {
+ (void) serialize_item_format(f, "main-exec-status-code", "%i", s->main_exec_status.code);
+ (void) serialize_item_format(f, "main-exec-status-status", "%i", s->main_exec_status.status);
+ }
+ }
+
+ if (s->notify_access_override >= 0)
+ (void) serialize_item(f, "notify-access-override", notify_access_to_string(s->notify_access_override));
+
+ (void) serialize_dual_timestamp(f, "watchdog-timestamp", &s->watchdog_timestamp);
+ (void) serialize_bool(f, "forbid-restart", s->forbid_restart);
+
+ if (s->watchdog_override_enable)
+ (void) serialize_item_format(f, "watchdog-override-usec", USEC_FMT, s->watchdog_override_usec);
+
+ if (s->watchdog_original_usec != USEC_INFINITY)
+ (void) serialize_item_format(f, "watchdog-original-usec", USEC_FMT, s->watchdog_original_usec);
+
+ if (s->reload_begin_usec != USEC_INFINITY)
+ (void) serialize_item_format(f, "reload-begin-usec", USEC_FMT, s->reload_begin_usec);
+
+ return 0;
+}
+
+/* Parses one "main-command"/"control-command" entry written by
+ * service_serialize_exec_command() and re-attaches s->control_command (plus its id) or
+ * s->main_command to the matching ExecCommand of the freshly (re)loaded unit. The command
+ * at the serialized index is preferred; if it no longer matches, an exact path+argv match
+ * anywhere in the list is accepted instead. If nothing matches, execution of the command
+ * list is not resumed (only logged at debug level when it was the last command anyway).
+ * Returns 0 on success, negative errno on parse failure. */
+int service_deserialize_exec_command(
+ Unit *u,
+ const char *key,
+ const char *value) {
+
+ Service *s = SERVICE(u);
+ int r;
+ unsigned idx = 0, i;
+ bool control, found = false, last = false;
+ ServiceExecCommand id = _SERVICE_EXEC_COMMAND_INVALID;
+ ExecCommand *command = NULL;
+ _cleanup_free_ char *path = NULL;
+ _cleanup_strv_free_ char **argv = NULL;
+
+ /* Parser position within the serialized record: type, index, path, then args. */
+ enum ExecCommandState {
+ STATE_EXEC_COMMAND_TYPE,
+ STATE_EXEC_COMMAND_INDEX,
+ STATE_EXEC_COMMAND_PATH,
+ STATE_EXEC_COMMAND_ARGS,
+ _STATE_EXEC_COMMAND_MAX,
+ _STATE_EXEC_COMMAND_INVALID = -EINVAL,
+ } state;
+
+ assert(s);
+ assert(key);
+ assert(value);
+
+ control = streq(key, "control-command");
+
+ state = STATE_EXEC_COMMAND_TYPE;
+
+ for (;;) {
+ _cleanup_free_ char *arg = NULL;
+
+ r = extract_first_word(&value, &arg, NULL, EXTRACT_CUNESCAPE | EXTRACT_UNQUOTE);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ switch (state) {
+ case STATE_EXEC_COMMAND_TYPE:
+ id = service_exec_command_from_string(arg);
+ if (id < 0)
+ return id;
+
+ state = STATE_EXEC_COMMAND_INDEX;
+ break;
+ case STATE_EXEC_COMMAND_INDEX:
+ /* PID 1234 is serialized as either '1234' or '+1234'. The second form is used to
+ * mark the last command in a sequence. We warn if the deserialized command doesn't
+ * match what we have loaded from the unit, but we don't need to warn if that is the
+ * last command. */
+
+ r = safe_atou(arg, &idx);
+ if (r < 0)
+ return r;
+ last = arg[0] == '+';
+
+ state = STATE_EXEC_COMMAND_PATH;
+ break;
+ case STATE_EXEC_COMMAND_PATH:
+ path = TAKE_PTR(arg);
+ state = STATE_EXEC_COMMAND_ARGS;
+ break;
+ case STATE_EXEC_COMMAND_ARGS:
+ r = strv_extend(&argv, arg);
+ if (r < 0)
+ return -ENOMEM;
+ break;
+ default:
+ assert_not_reached();
+ }
+ }
+
+ if (state != STATE_EXEC_COMMAND_ARGS)
+ return -EINVAL;
+ if (strv_isempty(argv))
+ return -EINVAL; /* At least argv[0] must be always present. */
+
+ /* Let's check whether exec command on given offset matches data that we just deserialized */
+ for (command = s->exec_command[id], i = 0; command; command = command->command_next, i++) {
+ if (i != idx)
+ continue;
+
+ found = strv_equal(argv, command->argv) && streq(command->path, path);
+ break;
+ }
+
+ if (!found) {
+ /* Command at the index we serialized is different, let's look for command that exactly
+ * matches but is on different index. If there is no such command we will not resume execution. */
+ for (command = s->exec_command[id]; command; command = command->command_next)
+ if (strv_equal(command->argv, argv) && streq(command->path, path))
+ break;
+ }
+
+ if (command && control) {
+ s->control_command = command;
+ s->control_command_id = id;
+ } else if (command)
+ s->main_command = command;
+ else if (last)
+ log_unit_debug(u, "Current command vanished from the unit file.");
+ else
+ log_unit_warning(u, "Current command vanished from the unit file, execution of the command list won't be resumed.");
+
+ return 0;
+}
+
+/* Unit vtable deserialization callback: restores one key/value pair previously written by
+ * service_serialize(). Values that fail to parse are logged at debug level and skipped, so
+ * one bad entry never aborts the whole deserialization. Always returns 0. */
+static int service_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
+ Service *s = SERVICE(u);
+ int r;
+
+ assert(u);
+ assert(key);
+ assert(value);
+ assert(fds);
+
+ if (streq(key, "state")) {
+ ServiceState state;
+
+ state = service_state_from_string(value);
+ if (state < 0)
+ log_unit_debug(u, "Failed to parse state value: %s", value);
+ else
+ s->deserialized_state = state;
+ } else if (streq(key, "result")) {
+ ServiceResult f;
+
+ f = service_result_from_string(value);
+ if (f < 0)
+ log_unit_debug(u, "Failed to parse result value: %s", value);
+ else if (f != SERVICE_SUCCESS)
+ s->result = f;
+
+ } else if (streq(key, "reload-result")) {
+ ServiceResult f;
+
+ f = service_result_from_string(value);
+ if (f < 0)
+ log_unit_debug(u, "Failed to parse reload result value: %s", value);
+ else if (f != SERVICE_SUCCESS)
+ s->reload_result = f;
+
+ } else if (streq(key, "control-pid")) {
+ pidref_done(&s->control_pid);
+
+ (void) deserialize_pidref(fds, value, &s->control_pid);
+
+ } else if (streq(key, "main-pid")) {
+ _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+
+ if (deserialize_pidref(fds, value, &pidref) >= 0)
+ (void) service_set_main_pidref(s, &pidref);
+
+ } else if (streq(key, "main-pid-known")) {
+ int b;
+
+ b = parse_boolean(value);
+ if (b < 0)
+ log_unit_debug(u, "Failed to parse main-pid-known value: %s", value);
+ else
+ s->main_pid_known = b;
+ } else if (streq(key, "bus-name-good")) {
+ int b;
+
+ b = parse_boolean(value);
+ if (b < 0)
+ log_unit_debug(u, "Failed to parse bus-name-good value: %s", value);
+ else
+ s->bus_name_good = b;
+ } else if (streq(key, "bus-name-owner")) {
+ r = free_and_strdup(&s->bus_name_owner, value);
+ if (r < 0)
+ log_unit_error_errno(u, r, "Unable to deserialize current bus owner %s: %m", value);
+ } else if (streq(key, "status-text")) {
+ char *t;
+ ssize_t l;
+
+ l = cunescape(value, 0, &t);
+ if (l < 0)
+ log_unit_debug_errno(u, l, "Failed to unescape status text '%s': %m", value);
+ else
+ free_and_replace(s->status_text, t);
+
+ } else if (streq(key, "accept-socket")) {
+ Unit *socket;
+
+ /* Only per-connection (Accept=yes) service instances carry an accept socket. */
+ if (u->type != UNIT_SOCKET) {
+ log_unit_debug(u, "Failed to deserialize accept-socket: unit is not a socket");
+ return 0;
+ }
+
+ r = manager_load_unit(u->manager, value, NULL, NULL, &socket);
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to load accept-socket unit '%s': %m", value);
+ else {
+ unit_ref_set(&s->accept_socket, u, socket);
+ SOCKET(socket)->n_connections++;
+ }
+
+ } else if (streq(key, "socket-fd")) {
+ asynchronous_close(s->socket_fd);
+ s->socket_fd = deserialize_fd(fds, value);
+
+ } else if (streq(key, "fd-store-fd")) {
+ _cleanup_free_ char *fdv = NULL, *fdn = NULL, *fdp = NULL;
+ _cleanup_close_ int fd = -EBADF;
+ int do_poll;
+
+ /* Format (see service_serialize()): <fd-index> "<escaped-name>" <do-poll>;
+ * the trailing do-poll field is optional for compatibility. */
+ r = extract_first_word(&value, &fdv, NULL, 0);
+ if (r <= 0) {
+ log_unit_debug(u, "Failed to parse fd-store-fd value, ignoring: %s", value);
+ return 0;
+ }
+
+ fd = deserialize_fd(fds, fdv);
+ if (fd < 0)
+ return 0;
+
+ r = extract_first_word(&value, &fdn, NULL, EXTRACT_CUNESCAPE | EXTRACT_UNQUOTE);
+ if (r <= 0) {
+ log_unit_debug(u, "Failed to parse fd-store-fd value, ignoring: %s", value);
+ return 0;
+ }
+
+ r = extract_first_word(&value, &fdp, NULL, 0);
+ if (r == 0) {
+ /* If the value is not present, we assume the default */
+ do_poll = 1;
+ } else if (r < 0 || (r = safe_atoi(fdp, &do_poll)) < 0) {
+ log_unit_debug_errno(u, r, "Failed to parse fd-store-fd value \"%s\", ignoring: %m", value);
+ return 0;
+ }
+
+ r = service_add_fd_store(s, fd, fdn, do_poll);
+ if (r < 0) {
+ log_unit_debug_errno(u, r, "Failed to store deserialized fd %i, ignoring: %m", fd);
+ return 0;
+ }
+
+ /* Ownership of the fd passed to the fd store; disarm the cleanup handler. */
+ TAKE_FD(fd);
+ } else if (streq(key, "main-exec-status-pid")) {
+ pid_t pid;
+
+ if (parse_pid(value, &pid) < 0)
+ log_unit_debug(u, "Failed to parse main-exec-status-pid value: %s", value);
+ else
+ s->main_exec_status.pid = pid;
+ } else if (streq(key, "main-exec-status-code")) {
+ int i;
+
+ if (safe_atoi(value, &i) < 0)
+ log_unit_debug(u, "Failed to parse main-exec-status-code value: %s", value);
+ else
+ s->main_exec_status.code = i;
+ } else if (streq(key, "main-exec-status-status")) {
+ int i;
+
+ if (safe_atoi(value, &i) < 0)
+ log_unit_debug(u, "Failed to parse main-exec-status-status value: %s", value);
+ else
+ s->main_exec_status.status = i;
+ } else if (streq(key, "main-exec-status-start"))
+ deserialize_dual_timestamp(value, &s->main_exec_status.start_timestamp);
+ else if (streq(key, "main-exec-status-exit"))
+ deserialize_dual_timestamp(value, &s->main_exec_status.exit_timestamp);
+ else if (streq(key, "notify-access-override")) {
+ NotifyAccess notify_access;
+
+ notify_access = notify_access_from_string(value);
+ if (notify_access < 0)
+ log_unit_debug(u, "Failed to parse notify-access-override value: %s", value);
+ else
+ s->notify_access_override = notify_access;
+ } else if (streq(key, "watchdog-timestamp"))
+ deserialize_dual_timestamp(value, &s->watchdog_timestamp);
+ else if (streq(key, "forbid-restart")) {
+ int b;
+
+ b = parse_boolean(value);
+ if (b < 0)
+ log_unit_debug(u, "Failed to parse forbid-restart value: %s", value);
+ else
+ s->forbid_restart = b;
+ } else if (streq(key, "stdin-fd")) {
+
+ asynchronous_close(s->stdin_fd);
+ s->stdin_fd = deserialize_fd(fds, value);
+ if (s->stdin_fd >= 0)
+ s->exec_context.stdio_as_fds = true;
+
+ } else if (streq(key, "stdout-fd")) {
+
+ asynchronous_close(s->stdout_fd);
+ s->stdout_fd = deserialize_fd(fds, value);
+ if (s->stdout_fd >= 0)
+ s->exec_context.stdio_as_fds = true;
+
+ } else if (streq(key, "stderr-fd")) {
+
+ asynchronous_close(s->stderr_fd);
+ s->stderr_fd = deserialize_fd(fds, value);
+ if (s->stderr_fd >= 0)
+ s->exec_context.stdio_as_fds = true;
+
+ } else if (streq(key, "exec-fd")) {
+ _cleanup_close_ int fd = -EBADF;
+
+ fd = deserialize_fd(fds, value);
+ if (fd >= 0) {
+ s->exec_fd_event_source = sd_event_source_disable_unref(s->exec_fd_event_source);
+
+ /* The event source takes ownership of the fd on success. */
+ if (service_allocate_exec_fd_event_source(s, fd, &s->exec_fd_event_source) >= 0)
+ TAKE_FD(fd);
+ }
+
+ } else if (streq(key, "watchdog-override-usec")) {
+ if (deserialize_usec(value, &s->watchdog_override_usec) < 0)
+ log_unit_debug(u, "Failed to parse watchdog_override_usec value: %s", value);
+ else
+ s->watchdog_override_enable = true;
+
+ } else if (streq(key, "watchdog-original-usec")) {
+ if (deserialize_usec(value, &s->watchdog_original_usec) < 0)
+ log_unit_debug(u, "Failed to parse watchdog_original_usec value: %s", value);
+
+ } else if (STR_IN_SET(key, "main-command", "control-command")) {
+ r = service_deserialize_exec_command(u, key, value);
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to parse serialized command \"%s\": %m", value);
+
+ } else if (streq(key, "n-restarts")) {
+ r = safe_atou(value, &s->n_restarts);
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to parse serialized restart counter '%s': %m", value);
+
+ } else if (streq(key, "flush-n-restarts")) {
+ r = parse_boolean(value);
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to parse serialized flush restart counter setting '%s': %m", value);
+ else
+ s->flush_n_restarts = r;
+ } else if (streq(key, "reload-begin-usec")) {
+ r = deserialize_usec(value, &s->reload_begin_usec);
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to parse serialized reload begin timestamp '%s', ignoring: %m", value);
+ } else
+ log_unit_debug(u, "Unknown serialization key: %s", key);
+
+ return 0;
+}
+
+/* Unit vtable callback: maps the internal service state to the generic unit active state.
+ * Type=idle services use a dedicated translation table, all others the regular one. */
+static UnitActiveState service_active_state(Unit *u) {
+ assert(u);
+
+ if (SERVICE(u)->type == SERVICE_IDLE)
+ return state_translation_table_idle[SERVICE(u)->state];
+
+ return state_translation_table[SERVICE(u)->state];
+}
+
+/* Unit vtable callback: the fine-grained service state string doubles as the unit's
+ * sub-state. */
+static const char *service_sub_state_to_string(Unit *u) {
+ Service *s;
+
+ assert(u);
+
+ s = SERVICE(u);
+ return service_state_to_string(s->state);
+}
+
+/* Unit vtable callback deciding whether this unit may be garbage collected. Never clean up
+ * services that still have a process around, even if the service is formally dead — note
+ * that unit_may_gc() already checked our cgroup for us, we only check our two additional
+ * PIDs here, in case they moved outside of the cgroup. */
+static bool service_may_gc(Unit *u) {
+ Service *s = SERVICE(u);
+
+ assert(s);
+
+ if (main_pid_good(s) > 0)
+ return false;
+ if (control_pid_good(s) > 0)
+ return false;
+
+ /* Only allow collection of actually dead services, i.e. not those in the transitionary
+ * SERVICE_DEAD_BEFORE_AUTO_RESTART/SERVICE_FAILED_BEFORE_AUTO_RESTART states. */
+ return IN_SET(s->state, SERVICE_DEAD, SERVICE_FAILED, SERVICE_DEAD_RESOURCES_PINNED);
+}
+
+/* Attempts (again) to read the PIDFile= during start-up. On success, stops watching the
+ * PID file and advances the service into the running state. Returns 0 on success or a
+ * negative errno if the PID file could not (yet) be read. */
+static int service_retry_pid_file(Service *s) {
+ int r;
+
+ assert(s->pid_file);
+ assert(IN_SET(s->state, SERVICE_START, SERVICE_START_POST));
+
+ r = service_load_pid_file(s, false);
+ if (r < 0)
+ return r;
+
+ service_unwatch_pid_file(s);
+
+ service_enter_running(s, SERVICE_SUCCESS);
+ return 0;
+}
+
+/* Installs an inotify watch on the PID file path and immediately retries reading it, since
+ * the file might have appeared in the window before the watch was set up. Returns 0 on
+ * success (even if the file is still missing), negative errno if the watch could not be
+ * installed. */
+static int service_watch_pid_file(Service *s) {
+ int r;
+
+ log_unit_debug(UNIT(s), "Setting watch for PID file %s", s->pid_file_pathspec->path);
+
+ r = path_spec_watch(s->pid_file_pathspec, service_dispatch_inotify_io);
+ if (r < 0) {
+ log_unit_error_errno(UNIT(s), r, "Failed to set a watch for PID file %s: %m", s->pid_file_pathspec->path);
+ service_unwatch_pid_file(s);
+ return r;
+ }
+
+ /* the pidfile might have appeared just before we set the watch */
+ log_unit_debug(UNIT(s), "Trying to read PID file %s in case it changed", s->pid_file_pathspec->path);
+ service_retry_pid_file(s);
+
+ return 0;
+}
+
+/* Allocates the PathSpec used to wait for the PIDFile= to be written and starts watching
+ * it. Returns 0 on success, -ENOMEM on allocation failure, or the error from
+ * service_watch_pid_file(). */
+static int service_demand_pid_file(Service *s) {
+ _cleanup_free_ PathSpec *ps = NULL;
+
+ assert(s->pid_file);
+ assert(!s->pid_file_pathspec);
+
+ ps = new(PathSpec, 1);
+ if (!ps)
+ return -ENOMEM;
+
+ *ps = (PathSpec) {
+ .unit = UNIT(s),
+ .path = strdup(s->pid_file),
+ /* PATH_CHANGED would not be enough. There are daemons (sendmail) that keep their PID file
+ * open all the time. */
+ .type = PATH_MODIFIED,
+ .inotify_fd = -EBADF,
+ };
+
+ /* strdup() in the initializer above may have failed; check before using the path. */
+ if (!ps->path)
+ return -ENOMEM;
+
+ path_simplify(ps->path);
+
+ s->pid_file_pathspec = TAKE_PTR(ps);
+
+ return service_watch_pid_file(s);
+}
+
+/* sd-event I/O handler for the PID-file inotify watch: on each event, retries reading the
+ * PID file; if it is still missing, re-arms the watch. Any error tears down the watch and
+ * terminates the service with a resources failure. Always returns 0 (event loop
+ * convention). */
+static int service_dispatch_inotify_io(sd_event_source *source, int fd, uint32_t events, void *userdata) {
+ PathSpec *p = ASSERT_PTR(userdata);
+ Service *s;
+
+ s = SERVICE(p->unit);
+
+ assert(s);
+ assert(fd >= 0);
+ assert(IN_SET(s->state, SERVICE_START, SERVICE_START_POST));
+ assert(s->pid_file_pathspec);
+ assert(path_spec_owns_inotify_fd(s->pid_file_pathspec, fd));
+
+ log_unit_debug(UNIT(s), "inotify event");
+
+ if (path_spec_fd_event(p, events) < 0)
+ goto fail;
+
+ if (service_retry_pid_file(s) == 0)
+ return 0;
+
+ /* PID file not readable yet — keep (re)watching it. */
+ if (service_watch_pid_file(s) < 0)
+ goto fail;
+
+ return 0;
+
+fail:
+ service_unwatch_pid_file(s);
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES);
+ return 0;
+}
+
+/* sd-event I/O handler for the Type=exec readiness pipe. Always returns 0 (event loop
+ * convention). */
+static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t events, void *userdata) {
+ Service *s = SERVICE(userdata);
+
+ assert(s);
+
+ log_unit_debug(UNIT(s), "got exec-fd event");
+
+ /* If Type=exec is set, we'll consider a service started successfully the instant we invoked execve()
+ * successfully for it. We implement this through a pipe() towards the child, which the kernel automatically
+ * closes for us due to O_CLOEXEC on execve() in the child, which then triggers EOF on the pipe in the
+ * parent. We need to be careful however, as there are other reasons that we might cause the child's side of
+ * the pipe to be closed (for example, a simple exit()). To deal with that we'll ignore EOFs on the pipe unless
+ * the child signalled us first that it is about to call the execve(). It does so by sending us a simple
+ * non-zero byte via the pipe. We also provide the child with a way to inform us in case execve() failed: if it
+ * sends a zero byte we'll ignore POLLHUP on the fd again. */
+
+ /* Drain the pipe: each byte toggles the "hot" flag, EOF is the actual event we wait for. */
+ for (;;) {
+ uint8_t x;
+ ssize_t n;
+
+ n = read(fd, &x, sizeof(x));
+ if (n < 0) {
+ if (errno == EAGAIN) /* O_NONBLOCK in effect → everything queued has now been processed. */
+ return 0;
+
+ return log_unit_error_errno(UNIT(s), errno, "Failed to read from exec_fd: %m");
+ }
+ if (n == 0) { /* EOF → the event we are waiting for */
+
+ s->exec_fd_event_source = sd_event_source_disable_unref(s->exec_fd_event_source);
+
+ if (s->exec_fd_hot) { /* Did the child tell us to expect EOF now? */
+ log_unit_debug(UNIT(s), "Got EOF on exec-fd");
+
+ s->exec_fd_hot = false;
+
+ /* Nice! This is what we have been waiting for. Transition to next state. */
+ if (s->type == SERVICE_EXEC && s->state == SERVICE_START)
+ service_enter_start_post(s);
+ } else
+ log_unit_debug(UNIT(s), "Got EOF on exec-fd while it was disabled, ignoring.");
+
+ return 0;
+ }
+
+ /* A byte was read → this turns on/off the exec fd logic */
+ assert(n == sizeof(x));
+ s->exec_fd_hot = x;
+ }
+
+ return 0;
+}
+
+/* Unit vtable callback: invoked when the service's control group ran empty. Depending on
+ * the current state this either advances or terminates the service state machine. */
+static void service_notify_cgroup_empty_event(Unit *u) {
+ Service *s = SERVICE(u);
+
+ assert(u);
+
+ log_unit_debug(u, "Control group is empty.");
+
+ switch (s->state) {
+
+ /* Waiting for SIGCHLD is usually more interesting, because it includes return
+ * codes/signals. Which is why we ignore the cgroup events for most cases, except when we
+ * don't know pid which to expect the SIGCHLD for. */
+
+ case SERVICE_START:
+ if (IN_SET(s->type, SERVICE_NOTIFY, SERVICE_NOTIFY_RELOAD) &&
+ main_pid_good(s) == 0 &&
+ control_pid_good(s) == 0) {
+ /* No chance of getting a ready notification anymore */
+ service_enter_stop_post(s, SERVICE_FAILURE_PROTOCOL);
+ break;
+ }
+
+ if (s->exit_type == SERVICE_EXIT_CGROUP && main_pid_good(s) <= 0)
+ service_enter_start_post(s);
+
+ /* Deliberate fall-through: the PID-file handling below applies to SERVICE_START too. */
+ _fallthrough_;
+ case SERVICE_START_POST:
+ if (s->pid_file_pathspec &&
+ main_pid_good(s) == 0 &&
+ control_pid_good(s) == 0) {
+
+ /* Give up hoping for the daemon to write its PID file */
+ log_unit_warning(u, "Daemon never wrote its PID file. Failing.");
+
+ service_unwatch_pid_file(s);
+ if (s->state == SERVICE_START)
+ service_enter_stop_post(s, SERVICE_FAILURE_PROTOCOL);
+ else
+ service_enter_stop(s, SERVICE_FAILURE_PROTOCOL);
+ }
+ break;
+
+ case SERVICE_RUNNING:
+ /* service_enter_running() will figure out what to do */
+ service_enter_running(s, SERVICE_SUCCESS);
+ break;
+
+ case SERVICE_STOP_WATCHDOG:
+ case SERVICE_STOP_SIGTERM:
+ case SERVICE_STOP_SIGKILL:
+
+ if (main_pid_good(s) <= 0 && control_pid_good(s) <= 0)
+ service_enter_stop_post(s, SERVICE_SUCCESS);
+
+ break;
+
+ case SERVICE_STOP_POST:
+ case SERVICE_FINAL_WATCHDOG:
+ case SERVICE_FINAL_SIGTERM:
+ case SERVICE_FINAL_SIGKILL:
+ if (main_pid_good(s) <= 0 && control_pid_good(s) <= 0)
+ service_enter_dead(s, SERVICE_SUCCESS, true);
+
+ break;
+
+ /* If the cgroup empty notification comes when the unit is not active, we must have failed to clean
+ * up the cgroup earlier and should do it now. */
+ case SERVICE_AUTO_RESTART:
+ case SERVICE_AUTO_RESTART_QUEUED:
+ unit_prune_cgroup(u);
+ break;
+
+ default:
+ ;
+ }
+}
+
+/* Unit vtable callback: invoked when processes of the service's cgroup were OOM-killed,
+ * either by the kernel OOM killer or (managed_oom) by systemd-oomd. Applies the configured
+ * OOMPolicy= to the current state; OOM_CONTINUE means no action at all. */
+static void service_notify_cgroup_oom_event(Unit *u, bool managed_oom) {
+ Service *s = SERVICE(u);
+
+ if (managed_oom)
+ log_unit_debug(u, "Process(es) of control group were killed by systemd-oomd.");
+ else
+ log_unit_debug(u, "Process of control group was killed by the OOM killer.");
+
+ if (s->oom_policy == OOM_CONTINUE)
+ return;
+
+ switch (s->state) {
+
+ case SERVICE_CONDITION:
+ case SERVICE_START_PRE:
+ case SERVICE_START:
+ case SERVICE_START_POST:
+ case SERVICE_STOP:
+ if (s->oom_policy == OOM_STOP)
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_OOM_KILL);
+ else if (s->oom_policy == OOM_KILL)
+ service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_OOM_KILL);
+
+ break;
+
+ case SERVICE_EXITED:
+ case SERVICE_RUNNING:
+ if (s->oom_policy == OOM_STOP)
+ service_enter_stop(s, SERVICE_FAILURE_OOM_KILL);
+ else if (s->oom_policy == OOM_KILL)
+ service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_OOM_KILL);
+
+ break;
+
+ case SERVICE_STOP_WATCHDOG:
+ case SERVICE_STOP_SIGTERM:
+ /* Already stopping gently — escalate straight to SIGKILL. */
+ service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_OOM_KILL);
+ break;
+
+ case SERVICE_STOP_SIGKILL:
+ case SERVICE_FINAL_SIGKILL:
+ /* Already killing; just make sure the OOM kill shows up as the result. */
+ if (s->result == SERVICE_SUCCESS)
+ s->result = SERVICE_FAILURE_OOM_KILL;
+ break;
+
+ case SERVICE_STOP_POST:
+ case SERVICE_FINAL_SIGTERM:
+ service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_FAILURE_OOM_KILL);
+ break;
+
+ default:
+ ;
+ }
+}
+
+/* Dispatched on SIGCHLD for a PID belonging to this service: classifies the
+ * wait status (code/status) into a ServiceResult and advances the service
+ * state machine, distinguishing the main process from control processes. */
+static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
+ bool notify_dbus = true;
+ Service *s = SERVICE(u);
+ ServiceResult f;
+ ExitClean clean_mode;
+
+ assert(s);
+ assert(pid >= 0);
+
+ /* Oneshot services and non-SERVICE_EXEC_START commands should not be
+ * considered daemons as they are typically not long running. */
+ if (s->type == SERVICE_ONESHOT || (s->control_pid.pid == pid && s->control_command_id != SERVICE_EXEC_START))
+ clean_mode = EXIT_CLEAN_COMMAND;
+ else
+ clean_mode = EXIT_CLEAN_DAEMON;
+
+ /* Translate the raw wait status into a ServiceResult. */
+ if (is_clean_exit(code, status, clean_mode, &s->success_status))
+ f = SERVICE_SUCCESS;
+ else if (code == CLD_EXITED)
+ f = SERVICE_FAILURE_EXIT_CODE;
+ else if (code == CLD_KILLED)
+ f = SERVICE_FAILURE_SIGNAL;
+ else if (code == CLD_DUMPED)
+ f = SERVICE_FAILURE_CORE_DUMP;
+ else
+ assert_not_reached();
+
+ /* Case 1: the main process of the service exited. */
+ if (s->main_pid.pid == pid) {
+ /* Clean up the exec_fd event source. We want to do this here, not later in
+ * service_set_state(), because service_enter_stop_post() calls service_spawn().
+ * The source owns its end of the pipe, so this will close that too. */
+ s->exec_fd_event_source = sd_event_source_disable_unref(s->exec_fd_event_source);
+
+ /* Forking services may occasionally move to a new PID.
+ * As long as they update the PID file before exiting the old
+ * PID, they're fine. */
+ if (service_load_pid_file(s, false) > 0)
+ return;
+
+ pidref_done(&s->main_pid);
+ exec_status_exit(&s->main_exec_status, &s->exec_context, pid, code, status);
+
+ if (s->main_command) {
+ /* If this is not a forking service then the
+ * main process got started and hence we copy
+ * the exit status so that it is recorded both
+ * as main and as control process exit
+ * status */
+
+ s->main_command->exec_status = s->main_exec_status;
+
+ if (s->main_command->flags & EXEC_COMMAND_IGNORE_FAILURE)
+ f = SERVICE_SUCCESS;
+ } else if (s->exec_command[SERVICE_EXEC_START]) {
+
+ /* If this is a forked process, then we should
+ * ignore the return value if this was
+ * configured for the starter process */
+
+ if (s->exec_command[SERVICE_EXEC_START]->flags & EXEC_COMMAND_IGNORE_FAILURE)
+ f = SERVICE_SUCCESS;
+ }
+
+ unit_log_process_exit(
+ u,
+ "Main process",
+ service_exec_command_to_string(SERVICE_EXEC_START),
+ f == SERVICE_SUCCESS,
+ code, status);
+
+ if (s->result == SERVICE_SUCCESS)
+ s->result = f;
+
+ if (s->main_command &&
+ s->main_command->command_next &&
+ s->type == SERVICE_ONESHOT &&
+ f == SERVICE_SUCCESS) {
+
+ /* There is another command to execute, so let's do that. */
+
+ log_unit_debug(u, "Running next main command for state %s.", service_state_to_string(s->state));
+ service_run_next_main(s);
+
+ } else {
+ s->main_command = NULL;
+
+ /* Services with ExitType=cgroup do not act on main PID exiting, unless the cgroup is
+ * already empty */
+ if (s->exit_type == SERVICE_EXIT_MAIN || cgroup_good(s) <= 0) {
+ /* The service exited, so the service is officially gone. */
+ switch (s->state) {
+
+ case SERVICE_START_POST:
+ case SERVICE_RELOAD:
+ case SERVICE_RELOAD_SIGNAL:
+ case SERVICE_RELOAD_NOTIFY:
+ /* If neither main nor control processes are running then the current
+ * state can never exit cleanly, hence immediately terminate the
+ * service. */
+ if (control_pid_good(s) <= 0)
+ service_enter_stop(s, f);
+
+ /* Otherwise need to wait until the operation is done. */
+ break;
+
+ case SERVICE_STOP:
+ /* Need to wait until the operation is done. */
+ break;
+
+ case SERVICE_START:
+ if (s->type == SERVICE_ONESHOT) {
+ /* This was our main goal, so let's go on */
+ if (f == SERVICE_SUCCESS)
+ service_enter_start_post(s);
+ else
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, f);
+ break;
+ } else if (IN_SET(s->type, SERVICE_NOTIFY, SERVICE_NOTIFY_RELOAD)) {
+ /* Only enter running through a notification, so that the
+ * SERVICE_START state signifies that no ready notification
+ * has been received */
+ if (f != SERVICE_SUCCESS)
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, f);
+ else if (!s->remain_after_exit || service_get_notify_access(s) == NOTIFY_MAIN)
+ /* The service has never been and will never be active */
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_PROTOCOL);
+ break;
+ }
+
+ _fallthrough_;
+ case SERVICE_RUNNING:
+ service_enter_running(s, f);
+ break;
+
+ case SERVICE_STOP_WATCHDOG:
+ case SERVICE_STOP_SIGTERM:
+ case SERVICE_STOP_SIGKILL:
+
+ if (control_pid_good(s) <= 0)
+ service_enter_stop_post(s, f);
+
+ /* If there is still a control process, wait for that first */
+ break;
+
+ case SERVICE_STOP_POST:
+
+ if (control_pid_good(s) <= 0)
+ service_enter_signal(s, SERVICE_FINAL_SIGTERM, f);
+
+ break;
+
+ case SERVICE_FINAL_WATCHDOG:
+ case SERVICE_FINAL_SIGTERM:
+ case SERVICE_FINAL_SIGKILL:
+
+ if (control_pid_good(s) <= 0)
+ service_enter_dead(s, f, true);
+ break;
+
+ default:
+ assert_not_reached();
+ }
+ } else if (s->exit_type == SERVICE_EXIT_CGROUP && s->state == SERVICE_START)
+ /* If a main process exits very quickly, this function might be executed
+ * before service_dispatch_exec_io(). Since this function disabled IO events
+ * to monitor the main process above, we need to update the state here too.
+ * Let's consider the process is successfully launched and exited. */
+ service_enter_start_post(s);
+ }
+
+ } else if (s->control_pid.pid == pid) {
+ /* Case 2: a control (or condition check) process exited. */
+ const char *kind;
+ bool success;
+
+ pidref_done(&s->control_pid);
+
+ if (s->control_command) {
+ exec_status_exit(&s->control_command->exec_status, &s->exec_context, pid, code, status);
+
+ if (s->control_command->flags & EXEC_COMMAND_IGNORE_FAILURE)
+ f = SERVICE_SUCCESS;
+ }
+
+ /* ExecCondition= calls that exit with (0, 254] should invoke skip-like behavior instead of failing */
+ if (s->state == SERVICE_CONDITION) {
+ if (f == SERVICE_FAILURE_EXIT_CODE && status < 255) {
+ UNIT(s)->condition_result = false;
+ f = SERVICE_SKIP_CONDITION;
+ success = true;
+ } else if (f == SERVICE_SUCCESS) {
+ UNIT(s)->condition_result = true;
+ success = true;
+ } else
+ success = false;
+
+ kind = "Condition check process";
+ } else {
+ kind = "Control process";
+ success = f == SERVICE_SUCCESS;
+ }
+
+ unit_log_process_exit(
+ u,
+ kind,
+ service_exec_command_to_string(s->control_command_id),
+ success,
+ code, status);
+
+ if (s->state != SERVICE_RELOAD && s->result == SERVICE_SUCCESS)
+ s->result = f;
+
+ if (s->control_command &&
+ s->control_command->command_next &&
+ f == SERVICE_SUCCESS) {
+
+ /* There is another command to execute, so let's do that. */
+
+ log_unit_debug(u, "Running next control command for state %s.", service_state_to_string(s->state));
+ service_run_next_control(s);
+
+ } else {
+ /* No further commands for this step, so let's figure out what to do next */
+
+ s->control_command = NULL;
+ s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID;
+
+ log_unit_debug(u, "Got final SIGCHLD for state %s.", service_state_to_string(s->state));
+
+ switch (s->state) {
+
+ case SERVICE_CONDITION:
+ if (f == SERVICE_SUCCESS)
+ service_enter_start_pre(s);
+ else
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, f);
+ break;
+
+ case SERVICE_START_PRE:
+ if (f == SERVICE_SUCCESS)
+ service_enter_start(s);
+ else
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, f);
+ break;
+
+ case SERVICE_START:
+ if (s->type != SERVICE_FORKING)
+ /* Maybe spurious event due to a reload that changed the type? */
+ break;
+
+ if (f != SERVICE_SUCCESS) {
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, f);
+ break;
+ }
+
+ if (s->pid_file) {
+ bool has_start_post;
+ int r;
+
+ /* Let's try to load the pid file here if we can.
+ * The PID file might actually be created by a START_POST
+ * script. In that case don't worry if the loading fails. */
+
+ has_start_post = s->exec_command[SERVICE_EXEC_START_POST];
+ r = service_load_pid_file(s, !has_start_post);
+ if (!has_start_post && r < 0) {
+ r = service_demand_pid_file(s);
+ if (r < 0 || cgroup_good(s) == 0)
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_PROTOCOL);
+ break;
+ }
+ } else
+ service_search_main_pid(s);
+
+ service_enter_start_post(s);
+ break;
+
+ case SERVICE_START_POST:
+ if (f != SERVICE_SUCCESS) {
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, f);
+ break;
+ }
+
+ if (s->pid_file) {
+ int r;
+
+ r = service_load_pid_file(s, true);
+ if (r < 0) {
+ r = service_demand_pid_file(s);
+ if (r < 0 || cgroup_good(s) == 0)
+ service_enter_stop(s, SERVICE_FAILURE_PROTOCOL);
+ break;
+ }
+ } else
+ service_search_main_pid(s);
+
+ service_enter_running(s, SERVICE_SUCCESS);
+ break;
+
+ case SERVICE_RELOAD:
+ case SERVICE_RELOAD_SIGNAL:
+ case SERVICE_RELOAD_NOTIFY:
+ if (f == SERVICE_SUCCESS)
+ if (service_load_pid_file(s, true) < 0)
+ service_search_main_pid(s);
+
+ s->reload_result = f;
+
+ /* If the last notification we received from the service process indicates
+ * we are still reloading, then don't leave reloading state just yet, just
+ * transition into SERVICE_RELOAD_NOTIFY, to wait for the READY=1 coming,
+ * too. */
+ if (s->notify_state == NOTIFY_RELOADING)
+ service_set_state(s, SERVICE_RELOAD_NOTIFY);
+ else
+ service_enter_running(s, SERVICE_SUCCESS);
+ break;
+
+ case SERVICE_STOP:
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, f);
+ break;
+
+ case SERVICE_STOP_WATCHDOG:
+ case SERVICE_STOP_SIGTERM:
+ case SERVICE_STOP_SIGKILL:
+ if (main_pid_good(s) <= 0)
+ service_enter_stop_post(s, f);
+
+ /* If there is still a service process around, wait until
+ * that one quit, too */
+ break;
+
+ case SERVICE_STOP_POST:
+ if (main_pid_good(s) <= 0)
+ service_enter_signal(s, SERVICE_FINAL_SIGTERM, f);
+ break;
+
+ case SERVICE_FINAL_WATCHDOG:
+ case SERVICE_FINAL_SIGTERM:
+ case SERVICE_FINAL_SIGKILL:
+ if (main_pid_good(s) <= 0)
+ service_enter_dead(s, f, true);
+ break;
+
+ case SERVICE_CLEANING:
+
+ if (s->clean_result == SERVICE_SUCCESS)
+ s->clean_result = f;
+
+ service_enter_dead(s, SERVICE_SUCCESS, false);
+ break;
+
+ default:
+ assert_not_reached();
+ }
+ }
+ } else /* Neither control nor main PID? If so, don't notify about anything */
+ notify_dbus = false;
+
+ /* Notify clients about changed exit status */
+ if (notify_dbus)
+ unit_add_to_dbus_queue(u);
+
+ /* We watch the main/control process otherwise we can't retrieve the unit they
+ * belong to with cgroupv1. But if they are not our direct child, we won't get a
+ * SIGCHLD for them. Therefore we need to look for others to watch so we can
+ * detect when the cgroup becomes empty. Note that the control process is always
+ * our child so it's pointless to watch all other processes. */
+ if (!control_pid_good(s))
+ if (!s->main_pid_known || s->main_pid_alien)
+ (void) unit_enqueue_rewatch_pids(u);
+}
+
+/* Per-state timeout handler for the unit's timer event source: depending on
+ * the current state and the configured failure mode (terminate/abort/kill),
+ * escalates to the next, more forceful state. Always returns 0 (handled).
+ *
+ * Fix: the SERVICE_CLEANING log message used lowercase "killing.", unlike
+ * every sibling message here ("Terminating.", "Aborting.", "Killing."). */
+static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) {
+ Service *s = SERVICE(userdata);
+
+ assert(s);
+ assert(source == s->timer_event_source);
+
+ switch (s->state) {
+
+ case SERVICE_CONDITION:
+ case SERVICE_START_PRE:
+ case SERVICE_START:
+ case SERVICE_START_POST:
+ /* Start-phase timeout: escalation depends on TimeoutStartFailureMode= */
+ switch (s->timeout_start_failure_mode) {
+
+ case SERVICE_TIMEOUT_TERMINATE:
+ log_unit_warning(UNIT(s), "%s operation timed out. Terminating.", service_state_to_string(s->state));
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_TIMEOUT);
+ break;
+
+ case SERVICE_TIMEOUT_ABORT:
+ log_unit_warning(UNIT(s), "%s operation timed out. Aborting.", service_state_to_string(s->state));
+ service_enter_signal(s, SERVICE_STOP_WATCHDOG, SERVICE_FAILURE_TIMEOUT);
+ break;
+
+ case SERVICE_TIMEOUT_KILL:
+ if (s->kill_context.send_sigkill) {
+ log_unit_warning(UNIT(s), "%s operation timed out. Killing.", service_state_to_string(s->state));
+ service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_TIMEOUT);
+ } else {
+ log_unit_warning(UNIT(s), "%s operation timed out. Skipping SIGKILL.", service_state_to_string(s->state));
+ service_enter_stop_post(s, SERVICE_FAILURE_TIMEOUT);
+ }
+ break;
+
+ default:
+ assert_not_reached();
+ }
+ break;
+
+ case SERVICE_RUNNING:
+ /* RuntimeMaxSec= elapsed */
+ log_unit_warning(UNIT(s), "Service reached runtime time limit. Stopping.");
+ service_enter_stop(s, SERVICE_FAILURE_TIMEOUT);
+ break;
+
+ case SERVICE_RELOAD:
+ case SERVICE_RELOAD_SIGNAL:
+ case SERVICE_RELOAD_NOTIFY:
+ /* A reload that overstays only fails the reload, not the service */
+ log_unit_warning(UNIT(s), "Reload operation timed out. Killing reload process.");
+ service_kill_control_process(s);
+ s->reload_result = SERVICE_FAILURE_TIMEOUT;
+ service_enter_running(s, SERVICE_SUCCESS);
+ break;
+
+ case SERVICE_STOP:
+ switch (s->timeout_stop_failure_mode) {
+
+ case SERVICE_TIMEOUT_TERMINATE:
+ log_unit_warning(UNIT(s), "Stopping timed out. Terminating.");
+ service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_TIMEOUT);
+ break;
+
+ case SERVICE_TIMEOUT_ABORT:
+ log_unit_warning(UNIT(s), "Stopping timed out. Aborting.");
+ service_enter_signal(s, SERVICE_STOP_WATCHDOG, SERVICE_FAILURE_TIMEOUT);
+ break;
+
+ case SERVICE_TIMEOUT_KILL:
+ if (s->kill_context.send_sigkill) {
+ log_unit_warning(UNIT(s), "Stopping timed out. Killing.");
+ service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_TIMEOUT);
+ } else {
+ log_unit_warning(UNIT(s), "Stopping timed out. Skipping SIGKILL.");
+ service_enter_stop_post(s, SERVICE_FAILURE_TIMEOUT);
+ }
+ break;
+
+ default:
+ assert_not_reached();
+ }
+ break;
+
+ case SERVICE_STOP_WATCHDOG:
+ if (s->kill_context.send_sigkill) {
+ log_unit_warning(UNIT(s), "State 'stop-watchdog' timed out. Killing.");
+ service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_TIMEOUT);
+ } else {
+ log_unit_warning(UNIT(s), "State 'stop-watchdog' timed out. Skipping SIGKILL.");
+ service_enter_stop_post(s, SERVICE_FAILURE_TIMEOUT);
+ }
+ break;
+
+ case SERVICE_STOP_SIGTERM:
+ if (s->timeout_stop_failure_mode == SERVICE_TIMEOUT_ABORT) {
+ log_unit_warning(UNIT(s), "State 'stop-sigterm' timed out. Aborting.");
+ service_enter_signal(s, SERVICE_STOP_WATCHDOG, SERVICE_FAILURE_TIMEOUT);
+ } else if (s->kill_context.send_sigkill) {
+ log_unit_warning(UNIT(s), "State 'stop-sigterm' timed out. Killing.");
+ service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_TIMEOUT);
+ } else {
+ log_unit_warning(UNIT(s), "State 'stop-sigterm' timed out. Skipping SIGKILL.");
+ service_enter_stop_post(s, SERVICE_FAILURE_TIMEOUT);
+ }
+
+ break;
+
+ case SERVICE_STOP_SIGKILL:
+ /* Uh, we sent a SIGKILL and it is still not gone?
+ * Must be something we cannot kill, so let's just be
+ * weirded out and continue */
+
+ log_unit_warning(UNIT(s), "Processes still around after SIGKILL. Ignoring.");
+ service_enter_stop_post(s, SERVICE_FAILURE_TIMEOUT);
+ break;
+
+ case SERVICE_STOP_POST:
+ switch (s->timeout_stop_failure_mode) {
+
+ case SERVICE_TIMEOUT_TERMINATE:
+ log_unit_warning(UNIT(s), "State 'stop-post' timed out. Terminating.");
+ service_enter_signal(s, SERVICE_FINAL_SIGTERM, SERVICE_FAILURE_TIMEOUT);
+ break;
+
+ case SERVICE_TIMEOUT_ABORT:
+ log_unit_warning(UNIT(s), "State 'stop-post' timed out. Aborting.");
+ service_enter_signal(s, SERVICE_FINAL_WATCHDOG, SERVICE_FAILURE_TIMEOUT);
+ break;
+
+ case SERVICE_TIMEOUT_KILL:
+ if (s->kill_context.send_sigkill) {
+ log_unit_warning(UNIT(s), "State 'stop-post' timed out. Killing.");
+ service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_FAILURE_TIMEOUT);
+ } else {
+ log_unit_warning(UNIT(s), "State 'stop-post' timed out. Skipping SIGKILL. Entering failed mode.");
+ service_enter_dead(s, SERVICE_FAILURE_TIMEOUT, false);
+ }
+ break;
+
+ default:
+ assert_not_reached();
+ }
+ break;
+
+ case SERVICE_FINAL_WATCHDOG:
+ if (s->kill_context.send_sigkill) {
+ log_unit_warning(UNIT(s), "State 'final-watchdog' timed out. Killing.");
+ service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_FAILURE_TIMEOUT);
+ } else {
+ log_unit_warning(UNIT(s), "State 'final-watchdog' timed out. Skipping SIGKILL. Entering failed mode.");
+ service_enter_dead(s, SERVICE_FAILURE_TIMEOUT, false);
+ }
+ break;
+
+ case SERVICE_FINAL_SIGTERM:
+ if (s->timeout_stop_failure_mode == SERVICE_TIMEOUT_ABORT) {
+ log_unit_warning(UNIT(s), "State 'final-sigterm' timed out. Aborting.");
+ service_enter_signal(s, SERVICE_FINAL_WATCHDOG, SERVICE_FAILURE_TIMEOUT);
+ } else if (s->kill_context.send_sigkill) {
+ log_unit_warning(UNIT(s), "State 'final-sigterm' timed out. Killing.");
+ service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_FAILURE_TIMEOUT);
+ } else {
+ log_unit_warning(UNIT(s), "State 'final-sigterm' timed out. Skipping SIGKILL. Entering failed mode.");
+ service_enter_dead(s, SERVICE_FAILURE_TIMEOUT, false);
+ }
+
+ break;
+
+ case SERVICE_FINAL_SIGKILL:
+ log_unit_warning(UNIT(s), "Processes still around after final SIGKILL. Entering failed mode.");
+ service_enter_dead(s, SERVICE_FAILURE_TIMEOUT, true);
+ break;
+
+ case SERVICE_AUTO_RESTART:
+ if (s->restart_usec > 0)
+ log_unit_debug(UNIT(s),
+ "Service restart interval %s expired, scheduling restart.",
+ FORMAT_TIMESPAN(service_restart_usec_next(s), USEC_PER_SEC));
+ else
+ log_unit_debug(UNIT(s),
+ "Service has no hold-off time (RestartSec=0), scheduling restart.");
+
+ service_enter_restart(s);
+ break;
+
+ case SERVICE_CLEANING:
+ log_unit_warning(UNIT(s), "Cleaning timed out. Killing.");
+
+ if (s->clean_result == SERVICE_SUCCESS)
+ s->clean_result = SERVICE_FAILURE_TIMEOUT;
+
+ service_enter_signal(s, SERVICE_FINAL_SIGKILL, 0);
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ return 0;
+}
+
+/* Fires when the service failed to send WATCHDOG=1 within its watchdog
+ * timeout. Escalates to the stop-watchdog state unless watchdog handling is
+ * globally disabled on the manager. Always returns 0 (event handled). */
+static int service_dispatch_watchdog(sd_event_source *source, usec_t usec, void *userdata) {
+ Service *s = SERVICE(userdata);
+ usec_t limit;
+
+ assert(s);
+ assert(source == s->watchdog_event_source);
+
+ limit = service_get_watchdog_usec(s);
+
+ /* Guard clause: watchdog handling turned off globally, only complain. */
+ if (!UNIT(s)->manager->service_watchdogs) {
+ log_unit_warning(UNIT(s), "Watchdog disabled! Ignoring watchdog timeout (limit %s)!",
+ FORMAT_TIMESPAN(limit, 1));
+ return 0;
+ }
+
+ log_unit_error(UNIT(s), "Watchdog timeout (limit %s)!",
+ FORMAT_TIMESPAN(limit, 1));
+ service_enter_signal(s, SERVICE_STOP_WATCHDOG, SERVICE_FAILURE_WATCHDOG);
+
+ return 0;
+}
+
+/* Decides whether a notification datagram from 'pid' may be acted upon,
+ * according to the effective NotifyAccess= setting. Logs a warning and
+ * returns false when the sender is not permitted. 'fds' is accepted for
+ * signature compatibility but not inspected here. */
+static bool service_notify_message_authorized(Service *s, pid_t pid, FDSet *fds) {
+ assert(s);
+
+ switch (service_get_notify_access(s)) {
+
+ case NOTIFY_NONE:
+ log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception is disabled.", pid);
+ return false;
+
+ case NOTIFY_MAIN:
+ if (pid == s->main_pid.pid)
+ return true;
+
+ if (pidref_is_set(&s->main_pid))
+ log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID "PID_FMT, pid, s->main_pid.pid);
+ else
+ log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID which is currently not known", pid);
+
+ return false;
+
+ case NOTIFY_EXEC:
+ if (pid == s->main_pid.pid || pid == s->control_pid.pid)
+ return true;
+
+ if (pidref_is_set(&s->main_pid) && pidref_is_set(&s->control_pid))
+ log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID "PID_FMT" and control PID "PID_FMT,
+ pid, s->main_pid.pid, s->control_pid.pid);
+ else if (pidref_is_set(&s->main_pid))
+ log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID "PID_FMT, pid, s->main_pid.pid);
+ else if (pidref_is_set(&s->control_pid))
+ log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for control PID "PID_FMT, pid, s->control_pid.pid);
+ else
+ log_unit_warning(UNIT(s), "Got notification message from PID "PID_FMT", but reception only permitted for main PID and control PID which are currently not known", pid);
+
+ return false;
+
+ default:
+ /* NOTIFY_ALL (and anything else): everyone in the cgroup may notify */
+ return true;
+ }
+}
+
+/* Handles a WATCHDOG=trigger notification from the service: behaves as if the
+ * watchdog timeout had expired, unless watchdog handling is globally disabled.
+ * Fix: added the assert(s) every sibling helper in this file carries. */
+static void service_force_watchdog(Service *s) {
+ assert(s);
+
+ if (!UNIT(s)->manager->service_watchdogs)
+ return;
+
+ log_unit_error(UNIT(s), "Watchdog request (last status: %s)!",
+ s->status_text ?: "<unset>");
+
+ service_enter_signal(s, SERVICE_STOP_WATCHDOG, SERVICE_FAILURE_WATCHDOG);
+}
+
+/* Processes an sd_notify() datagram: authorizes the sender, then interprets
+ * the individual tags (MAINPID=, READY=, RELOADING=, STOPPING=, STATUS=,
+ * NOTIFYACCESS=, ERRNO=, EXTEND_TIMEOUT_USEC=, WATCHDOG*, FDSTORE*).
+ *
+ * Fix: on an unparsable NOTIFYACCESS= value, notify_access is negative
+ * (invalid), but the old code still fell through into the override branch and
+ * called service_override_notify_access() with that invalid value. The
+ * override must only happen when parsing succeeded, hence the added "else". */
+static void service_notify_message(
+ Unit *u,
+ const struct ucred *ucred,
+ char * const *tags,
+ FDSet *fds) {
+
+ Service *s = SERVICE(u);
+ bool notify_dbus = false;
+ usec_t monotonic_usec = USEC_INFINITY;
+ const char *e;
+ int r;
+
+ assert(u);
+ assert(ucred);
+
+ if (!service_notify_message_authorized(s, ucred->pid, fds))
+ return;
+
+ if (DEBUG_LOGGING) {
+ _cleanup_free_ char *cc = NULL;
+
+ cc = strv_join(tags, ", ");
+ log_unit_debug(u, "Got notification message from PID "PID_FMT" (%s)", ucred->pid, empty_to_na(cc));
+ }
+
+ /* Interpret MAINPID= */
+ e = strv_find_startswith(tags, "MAINPID=");
+ if (e && IN_SET(s->state, SERVICE_START, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY)) {
+ _cleanup_(pidref_done) PidRef new_main_pid = PIDREF_NULL;
+
+ r = pidref_set_pidstr(&new_main_pid, e);
+ if (r < 0)
+ log_unit_warning_errno(u, r, "Failed to parse MAINPID=%s field in notification message, ignoring: %m", e);
+ else if (!s->main_pid_known || !pidref_equal(&new_main_pid, &s->main_pid)) {
+
+ r = service_is_suitable_main_pid(s, &new_main_pid, LOG_WARNING);
+ if (r == 0) {
+ /* The new main PID is a bit suspicious, which is OK if the sender is privileged. */
+
+ if (ucred->uid == 0) {
+ log_unit_debug(u, "New main PID "PID_FMT" does not belong to service, but we'll accept it as the request to change it came from a privileged process.", new_main_pid.pid);
+ r = 1;
+ } else
+ log_unit_debug(u, "New main PID "PID_FMT" does not belong to service, refusing.", new_main_pid.pid);
+ }
+ if (r > 0) {
+ (void) service_set_main_pidref(s, &new_main_pid);
+
+ r = unit_watch_pidref(UNIT(s), &s->main_pid, /* exclusive= */ false);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "Failed to watch new main PID "PID_FMT" for service: %m", s->main_pid.pid);
+
+ notify_dbus = true;
+ }
+ }
+ }
+
+ /* Parse MONOTONIC_USEC= */
+ e = strv_find_startswith(tags, "MONOTONIC_USEC=");
+ if (e) {
+ r = safe_atou64(e, &monotonic_usec);
+ if (r < 0)
+ log_unit_warning_errno(u, r, "Failed to parse MONOTONIC_USEC= field in notification message, ignoring: %s", e);
+ }
+
+ /* Interpret READY=/STOPPING=/RELOADING=. STOPPING= wins over the others, and READY= over RELOADING= */
+ if (strv_contains(tags, "STOPPING=1")) {
+ s->notify_state = NOTIFY_STOPPING;
+
+ if (IN_SET(s->state, SERVICE_RUNNING, SERVICE_RELOAD_SIGNAL, SERVICE_RELOAD_NOTIFY))
+ service_enter_stop_by_notify(s);
+
+ notify_dbus = true;
+
+ } else if (strv_contains(tags, "READY=1")) {
+
+ s->notify_state = NOTIFY_READY;
+
+ /* Type=notify services inform us about completed initialization with READY=1 */
+ if (IN_SET(s->type, SERVICE_NOTIFY, SERVICE_NOTIFY_RELOAD) &&
+ s->state == SERVICE_START)
+ service_enter_start_post(s);
+
+ /* Sending READY=1 while we are reloading informs us that the reloading is complete. */
+ if (s->state == SERVICE_RELOAD_NOTIFY)
+ service_enter_running(s, SERVICE_SUCCESS);
+
+ /* Combined RELOADING=1 and READY=1? Then this is indication that the service started and
+ * immediately finished reloading. */
+ if (s->state == SERVICE_RELOAD_SIGNAL &&
+ strv_contains(tags, "RELOADING=1") &&
+ monotonic_usec != USEC_INFINITY &&
+ monotonic_usec >= s->reload_begin_usec) {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+
+ /* Propagate a reload explicitly */
+ r = manager_propagate_reload(UNIT(s)->manager, UNIT(s), JOB_FAIL, &error);
+ if (r < 0)
+ log_unit_warning(UNIT(s), "Failed to schedule propagation of reload, ignoring: %s", bus_error_message(&error, r));
+
+ service_enter_running(s, SERVICE_SUCCESS);
+ }
+
+ notify_dbus = true;
+
+ } else if (strv_contains(tags, "RELOADING=1")) {
+
+ s->notify_state = NOTIFY_RELOADING;
+
+ /* Sending RELOADING=1 after we send SIGHUP to request a reload will transition
+ * things to "reload-notify" state, where we'll wait for READY=1 to let us know the
+ * reload is done. Note that we insist on a timestamp being sent along here, so that
+ * we know for sure this is a reload cycle initiated *after* we sent the signal */
+ if (s->state == SERVICE_RELOAD_SIGNAL &&
+ monotonic_usec != USEC_INFINITY &&
+ monotonic_usec >= s->reload_begin_usec)
+ /* Note, we don't call service_enter_reload_by_notify() here, because we
+ * don't need reload propagation nor do we want to restart the time-out. */
+ service_set_state(s, SERVICE_RELOAD_NOTIFY);
+
+ if (s->state == SERVICE_RUNNING)
+ service_enter_reload_by_notify(s);
+
+ notify_dbus = true;
+ }
+
+ /* Interpret STATUS= */
+ e = strv_find_startswith(tags, "STATUS=");
+ if (e) {
+ _cleanup_free_ char *t = NULL;
+
+ if (!isempty(e)) {
+ /* Note that this size limit check is mostly paranoia: since the datagram size we are willing
+ * to process is already limited to NOTIFY_BUFFER_MAX, this limit here should never be hit. */
+ if (strlen(e) > STATUS_TEXT_MAX)
+ log_unit_warning(u, "Status message overly long (%zu > %u), ignoring.", strlen(e), STATUS_TEXT_MAX);
+ else if (!utf8_is_valid(e))
+ log_unit_warning(u, "Status message in notification message is not UTF-8 clean, ignoring.");
+ else {
+ t = strdup(e);
+ if (!t)
+ log_oom();
+ }
+ }
+
+ if (!streq_ptr(s->status_text, t)) {
+ free_and_replace(s->status_text, t);
+ notify_dbus = true;
+ }
+ }
+
+ /* Interpret NOTIFYACCESS= */
+ e = strv_find_startswith(tags, "NOTIFYACCESS=");
+ if (e) {
+ NotifyAccess notify_access;
+
+ notify_access = notify_access_from_string(e);
+ if (notify_access < 0)
+ log_unit_warning_errno(u, notify_access,
+ "Failed to parse NOTIFYACCESS= field value '%s' in notification message, ignoring: %m", e);
+ /* We don't need to check whether the new access mode is more strict than what is
+ * already in use, since only the privileged process is allowed to change it
+ * in the first place. */
+ else if (service_get_notify_access(s) != notify_access) {
+ service_override_notify_access(s, notify_access);
+ notify_dbus = true;
+ }
+ }
+
+ /* Interpret ERRNO= */
+ e = strv_find_startswith(tags, "ERRNO=");
+ if (e) {
+ int status_errno;
+
+ status_errno = parse_errno(e);
+ if (status_errno < 0)
+ log_unit_warning_errno(u, status_errno,
+ "Failed to parse ERRNO= field value '%s' in notification message: %m", e);
+ else if (s->status_errno != status_errno) {
+ s->status_errno = status_errno;
+ notify_dbus = true;
+ }
+ }
+
+ /* Interpret EXTEND_TIMEOUT= */
+ e = strv_find_startswith(tags, "EXTEND_TIMEOUT_USEC=");
+ if (e) {
+ usec_t extend_timeout_usec;
+ if (safe_atou64(e, &extend_timeout_usec) < 0)
+ log_unit_warning(u, "Failed to parse EXTEND_TIMEOUT_USEC=%s", e);
+ else
+ service_extend_timeout(s, extend_timeout_usec);
+ }
+
+ /* Interpret WATCHDOG= */
+ e = strv_find_startswith(tags, "WATCHDOG=");
+ if (e) {
+ if (streq(e, "1"))
+ service_reset_watchdog(s);
+ else if (streq(e, "trigger"))
+ service_force_watchdog(s);
+ else
+ log_unit_warning(u, "Passed WATCHDOG= field is invalid, ignoring.");
+ }
+
+ e = strv_find_startswith(tags, "WATCHDOG_USEC=");
+ if (e) {
+ usec_t watchdog_override_usec;
+ if (safe_atou64(e, &watchdog_override_usec) < 0)
+ log_unit_warning(u, "Failed to parse WATCHDOG_USEC=%s", e);
+ else
+ service_override_watchdog_timeout(s, watchdog_override_usec);
+ }
+
+ /* Process FD store messages. Either FDSTOREREMOVE=1 for removal, or FDSTORE=1 for addition. In both cases,
+ * process FDNAME= for picking the file descriptor name to use. Note that FDNAME= is required when removing
+ * fds, but optional when pushing in new fds, for compatibility reasons. */
+ if (strv_contains(tags, "FDSTOREREMOVE=1")) {
+ const char *name;
+
+ name = strv_find_startswith(tags, "FDNAME=");
+ if (!name || !fdname_is_valid(name))
+ log_unit_warning(u, "FDSTOREREMOVE=1 requested, but no valid file descriptor name passed, ignoring.");
+ else
+ service_remove_fd_store(s, name);
+
+ } else if (strv_contains(tags, "FDSTORE=1")) {
+ const char *name;
+
+ name = strv_find_startswith(tags, "FDNAME=");
+ if (name && !fdname_is_valid(name)) {
+ log_unit_warning(u, "Passed FDNAME= name is invalid, ignoring.");
+ name = NULL;
+ }
+
+ (void) service_add_fd_store_set(s, fds, name, !strv_contains(tags, "FDPOLL=0"));
+ }
+
+ /* Notify clients about changed status or main pid */
+ if (notify_dbus)
+ unit_add_to_dbus_queue(u);
+}
+
+/* Reports the absolute expiry time of the unit's pending state timeout.
+ * Returns 1 and fills *timeout when a finite timer is armed, 0 when no timer
+ * (or an infinite one) is pending, negative errno on query failure. */
+static int service_get_timeout(Unit *u, usec_t *timeout) {
+ Service *s = SERVICE(u);
+ uint64_t next;
+ int r;
+
+ if (!s->timer_event_source)
+ return 0;
+
+ r = sd_event_source_get_time(s->timer_event_source, &next);
+ if (r < 0)
+ return r;
+
+ if (next == USEC_INFINITY)
+ return 0;
+
+ *timeout = next;
+ return 1;
+}
+
+/* Accessor for the configured start timeout of this service unit. */
+static usec_t service_get_timeout_start_usec(Unit *u) {
+ return SERVICE(ASSERT_PTR(u))->timeout_start_usec;
+}
+
+/* Returns true when we currently lack a main PID and the service is in a
+ * state where learning it from the owner of the D-Bus name makes sense. */
+static bool pick_up_pid_from_bus_name(Service *s) {
+ assert(s);
+
+ /* A known main PID means there is nothing to pick up. */
+ if (pidref_is_set(&s->main_pid))
+ return false;
+
+ return IN_SET(s->state,
+ SERVICE_START,
+ SERVICE_START_POST,
+ SERVICE_RUNNING,
+ SERVICE_RELOAD,
+ SERVICE_RELOAD_SIGNAL,
+ SERVICE_RELOAD_NOTIFY);
+}
+
+/* Reply handler for the async GetConnectionUnixProcessID() call issued in
+ * service_bus_name_owner_change(): validates the reply and, if the service
+ * still wants a main PID, adopts and watches the returned one. Always
+ * returns 1. */
+static int bus_name_pid_lookup_callback(sd_bus_message *reply, void *userdata, sd_bus_error *ret_error) {
+ _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+ const sd_bus_error *e;
+ Unit *u = ASSERT_PTR(userdata);
+ uint32_t pid;
+ Service *s;
+ int r;
+
+ assert(reply);
+
+ s = SERVICE(u);
+ /* Drop our reference to the in-flight call first, in all exit paths. */
+ s->bus_name_pid_lookup_slot = sd_bus_slot_unref(s->bus_name_pid_lookup_slot);
+
+ /* Re-check: the unit state may have changed while the call was in flight,
+ * so the answer might no longer be wanted. */
+ if (!s->bus_name || !pick_up_pid_from_bus_name(s))
+ return 1;
+
+ e = sd_bus_message_get_error(reply);
+ if (e) {
+ r = sd_bus_error_get_errno(e);
+ log_warning_errno(r, "GetConnectionUnixProcessID() failed: %s", bus_error_message(e, r));
+ return 1;
+ }
+
+ r = sd_bus_message_read(reply, "u", &pid);
+ if (r < 0) {
+ bus_log_parse_error(r);
+ return 1;
+ }
+
+ r = pidref_set_pid(&pidref, pid);
+ if (r < 0) {
+ log_debug_errno(r, "GetConnectionUnixProcessID() returned invalid PID: %m");
+ return 1;
+ }
+
+ log_unit_debug(u, "D-Bus name %s is now owned by process " PID_FMT, s->bus_name, pidref.pid);
+
+ /* Best-effort: failures here are non-fatal, hence the (void) casts. */
+ (void) service_set_main_pidref(s, &pidref);
+ (void) unit_watch_pidref(UNIT(s), &s->main_pid, /* exclusive= */ false);
+ return 1;
+}
+
+/* Invoked when ownership of the unit's configured BusName= changes on the
+ * bus; new_owner is NULL when the name was released without a new owner. */
+static void service_bus_name_owner_change(Unit *u, const char *new_owner) {
+
+ Service *s = SERVICE(u);
+ int r;
+
+ assert(s);
+
+ if (new_owner)
+ log_unit_debug(u, "D-Bus name %s now owned by %s", s->bus_name, new_owner);
+ else
+ log_unit_debug(u, "D-Bus name %s now not owned by anyone.", s->bus_name);
+
+ /* Implicit pointer->bool conversion: the name is "good" iff it currently
+ * has some owner. */
+ s->bus_name_good = new_owner;
+
+ /* Track the current owner, so we can reconstruct changes after a daemon reload */
+ r = free_and_strdup(&s->bus_name_owner, new_owner);
+ if (r < 0) {
+ log_unit_error_errno(u, r, "Unable to set new bus name owner %s: %m", new_owner);
+ return;
+ }
+
+ if (s->type == SERVICE_DBUS) {
+
+ /* service_enter_running() will figure out what to
+ * do */
+ if (s->state == SERVICE_RUNNING)
+ service_enter_running(s, SERVICE_SUCCESS);
+ else if (s->state == SERVICE_START && new_owner)
+ service_enter_start_post(s);
+
+ } else if (new_owner && pick_up_pid_from_bus_name(s)) {
+
+ /* Try to acquire PID from bus service */
+
+ /* Cancel any earlier lookup still in flight before issuing a new one. */
+ s->bus_name_pid_lookup_slot = sd_bus_slot_unref(s->bus_name_pid_lookup_slot);
+
+ r = sd_bus_call_method_async(
+ u->manager->api_bus,
+ &s->bus_name_pid_lookup_slot,
+ "org.freedesktop.DBus",
+ "/org/freedesktop/DBus",
+ "org.freedesktop.DBus",
+ "GetConnectionUnixProcessID",
+ bus_name_pid_lookup_callback,
+ s,
+ "s",
+ s->bus_name);
+ if (r < 0)
+ log_debug_errno(r, "Failed to request owner PID of service name, ignoring: %m");
+ }
+}
+
+/* Hands a connection fd from an Accept=yes socket unit to this per-connection
+ * service instance. Returns 0 on success (fd ownership transferred), -EINVAL
+ * if the unit is not loaded, -EBUSY if an fd is already set, -EAGAIN if the
+ * service is not in a dead state yet, or another negative errno. */
+int service_set_socket_fd(
+ Service *s,
+ int fd,
+ Socket *sock,
+ SocketPeer *peer,
+ bool selinux_context_net) {
+
+ _cleanup_free_ char *peer_text = NULL;
+ int r;
+
+ assert(s);
+ assert(fd >= 0);
+
+ /* This is called by the socket code when instantiating a new service for a stream socket and the socket needs
+ * to be configured. We take ownership of the passed fd on success. */
+
+ if (UNIT(s)->load_state != UNIT_LOADED)
+ return -EINVAL;
+
+ if (s->socket_fd >= 0)
+ return -EBUSY;
+
+ assert(!s->socket_peer);
+
+ if (!IN_SET(s->state, SERVICE_DEAD, SERVICE_DEAD_RESOURCES_PINNED))
+ return -EAGAIN;
+
+ /* Best-effort: append the peer address to the unit description, so "FooBar
+ * (1.2.3.4:5678)" shows up in status output. Failure to resolve the peer
+ * is silently ignored. */
+ if (getpeername_pretty(fd, true, &peer_text) >= 0) {
+
+ if (UNIT(s)->description) {
+ _cleanup_free_ char *a = NULL;
+
+ a = strjoin(UNIT(s)->description, " (", peer_text, ")");
+ if (!a)
+ return -ENOMEM;
+
+ r = unit_set_description(UNIT(s), a);
+ } else
+ r = unit_set_description(UNIT(s), peer_text);
+ if (r < 0)
+ return r;
+ }
+
+ r = unit_add_two_dependencies(UNIT(sock), UNIT_BEFORE, UNIT_TRIGGERS, UNIT(s), false, UNIT_DEPENDENCY_IMPLICIT);
+ if (r < 0)
+ return r;
+
+ /* Point of no return: from here on the fd belongs to the service. */
+ s->socket_fd = fd;
+ s->socket_peer = socket_peer_ref(peer);
+ s->socket_fd_selinux_context_net = selinux_context_net;
+
+ unit_ref_set(&s->accept_socket, UNIT(s), UNIT(sock));
+ return 0;
+}
+
+/* Implements "reset-failed" for service units: leaves the failed state if we
+ * are in it, and clears every recorded result plus the restart counter. */
+static void service_reset_failed(Unit *u) {
+ Service *s = SERVICE(u);
+
+ assert(s);
+
+ if (s->state == SERVICE_FAILED)
+ service_set_state(s, service_determine_dead_state(s));
+
+ s->result = s->reload_result = s->clean_result = SERVICE_SUCCESS;
+ s->n_restarts = 0;
+ s->flush_n_restarts = false;
+}
+
+/* Vtable accessor: pointer to this service's main PID reference. */
+static PidRef* service_main_pid(Unit *u) {
+ Service *s = ASSERT_PTR(SERVICE(u));
+
+ return &s->main_pid;
+}
+
+static PidRef* service_control_pid(Unit *u) {
+ return &ASSERT_PTR(SERVICE(u))->control_pid;
+}
+
+/* UnitVTable.needs_console: true while the service may still touch the
+ * console, i.e. in any state where an Exec* process can be running. */
+static bool service_needs_console(Unit *u) {
+ Service *s = SERVICE(u);
+
+ assert(s);
+
+ /* We provide our own implementation of this here, instead of relying of the generic implementation
+ * unit_needs_console() provides, since we want to return false if we are in SERVICE_EXITED state. */
+
+ if (!exec_context_may_touch_console(&s->exec_context))
+ return false;
+
+ return IN_SET(s->state,
+ SERVICE_CONDITION,
+ SERVICE_START_PRE,
+ SERVICE_START,
+ SERVICE_START_POST,
+ SERVICE_RUNNING,
+ SERVICE_RELOAD,
+ SERVICE_RELOAD_SIGNAL,
+ SERVICE_RELOAD_NOTIFY,
+ SERVICE_STOP,
+ SERVICE_STOP_WATCHDOG,
+ SERVICE_STOP_SIGTERM,
+ SERVICE_STOP_SIGKILL,
+ SERVICE_STOP_POST,
+ SERVICE_FINAL_WATCHDOG,
+ SERVICE_FINAL_SIGTERM,
+ SERVICE_FINAL_SIGKILL);
+}
+
+/* UnitVTable.exit_status: return the main process' exit code, or -ENODATA
+ * if it has not exited yet, or -EBADE if it died abnormally (signal/coredump
+ * rather than a plain exit). */
+static int service_exit_status(Unit *u) {
+ Service *s = SERVICE(u);
+
+ /* NOTE(review): sibling callbacks assert(s) after the SERVICE() cast;
+ * this one asserts the raw unit pointer instead — confirm intended. */
+ assert(u);
+
+ if (s->main_exec_status.pid <= 0 ||
+ !dual_timestamp_is_set(&s->main_exec_status.exit_timestamp))
+ return -ENODATA;
+
+ if (s->main_exec_status.code != CLD_EXITED)
+ return -EBADE;
+
+ return s->main_exec_status.status;
+}
+
+/* UnitVTable.status_text: the most recent STATUS= string received via
+ * sd_notify(), or NULL if none was ever set. */
+static const char* service_status_text(Unit *u) {
+ Service *s = SERVICE(u);
+
+ assert(s);
+
+ return s->status_text;
+}
+
+/* UnitVTable.clean: remove the resources selected by 'mask' (runtime/state/
+ * cache/logs directories and/or the fd store). The fd store is released
+ * synchronously; directory removal is forked off and tracked as a control
+ * process in SERVICE_CLEANING state. Returns -EBUSY if the unit is not dead,
+ * -EUNATCH if the mask matches nothing cleanable. */
+static int service_clean(Unit *u, ExecCleanMask mask) {
+ _cleanup_strv_free_ char **l = NULL;
+ bool may_clean_fdstore = false;
+ Service *s = SERVICE(u);
+ int r;
+
+ assert(s);
+ assert(mask != 0);
+
+ if (!IN_SET(s->state, SERVICE_DEAD, SERVICE_DEAD_RESOURCES_PINNED))
+ return -EBUSY;
+
+ /* Determine if there's anything we could potentially clean */
+ r = exec_context_get_clean_directories(&s->exec_context, u->manager->prefix, mask, &l);
+ if (r < 0)
+ return r;
+
+ if (mask & EXEC_CLEAN_FDSTORE)
+ may_clean_fdstore = s->n_fd_store > 0 || s->n_fd_store_max > 0;
+
+ if (strv_isempty(l) && !may_clean_fdstore)
+ return -EUNATCH; /* Nothing to potentially clean */
+
+ /* Let's clean the stuff we can clean quickly */
+ if (may_clean_fdstore)
+ service_release_fd_store(s);
+
+ /* If we are done, leave quickly */
+ if (strv_isempty(l)) {
+ /* Dropping the fd store may have unpinned the unit. */
+ if (s->state == SERVICE_DEAD_RESOURCES_PINNED && !s->fd_store)
+ service_set_state(s, SERVICE_DEAD);
+ return 0;
+ }
+
+ /* We need to clean disk stuff. This is slow, hence do it out of process, and change state */
+ service_unwatch_control_pid(s);
+ s->clean_result = SERVICE_SUCCESS;
+ s->control_command = NULL;
+ s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID;
+
+ /* Bound the cleaning job by TimeoutCleanSec=. */
+ r = service_arm_timer(s, /* relative= */ true, s->exec_context.timeout_clean_usec);
+ if (r < 0) {
+ log_unit_warning_errno(u, r, "Failed to install timer: %m");
+ goto fail;
+ }
+
+ r = unit_fork_and_watch_rm_rf(u, l, &s->control_pid);
+ if (r < 0) {
+ log_unit_warning_errno(u, r, "Failed to spawn cleaning task: %m");
+ goto fail;
+ }
+
+ service_set_state(s, SERVICE_CLEANING);
+ return 0;
+
+fail:
+ s->clean_result = SERVICE_FAILURE_RESOURCES;
+ s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+ return r;
+}
+
+/* UnitVTable.can_clean: report which resource classes could be cleaned for
+ * this unit; the fd store counts as cleanable whenever FileDescriptorStoreMax=
+ * is non-zero, even if it is currently empty. */
+static int service_can_clean(Unit *u, ExecCleanMask *ret) {
+ Service *s = SERVICE(u);
+ ExecCleanMask mask = 0;
+ int r;
+
+ assert(s);
+ assert(ret);
+
+ r = exec_context_get_clean_mask(&s->exec_context, &mask);
+ if (r < 0)
+ return r;
+
+ if (s->n_fd_store_max > 0)
+ mask |= EXEC_CLEAN_FDSTORE;
+
+ *ret = mask;
+ return 0;
+}
+
+/* Status-line format override: oneshot services print "Finished" rather than
+ * the generic "Started" on a successful start job; NULL selects the default. */
+static const char *service_finished_job(Unit *u, JobType t, JobResult result) {
+ if (t == JOB_START &&
+ result == JOB_DONE &&
+ SERVICE(u)->type == SERVICE_ONESHOT)
+ return "Finished %s.";
+
+ /* Fall back to generic */
+ return NULL;
+}
+
+/* UnitVTable.can_start: enforce the start rate limit; on a hit, the unit is
+ * put straight into the failed/dead path with SERVICE_FAILURE_START_LIMIT_HIT. */
+static int service_can_start(Unit *u) {
+ Service *s = SERVICE(u);
+ int r;
+
+ assert(s);
+
+ /* Make sure we don't enter a busy loop of some kind. */
+ r = unit_test_start_limit(u);
+ if (r < 0) {
+ service_enter_dead(s, SERVICE_FAILURE_START_LIMIT_HIT, false);
+ return r;
+ }
+
+ return 1;
+}
+
+/* UnitVTable.release_resources: drop fds we hold for a dead unit (connection
+ * socket, stdio fds, and — unless pinned by FileDescriptorStorePreserve=yes —
+ * the fd store). */
+static void service_release_resources(Unit *u) {
+ Service *s = SERVICE(ASSERT_PTR(u));
+
+ /* Invoked by the unit state engine, whenever it realizes that unit is dead and there's no job
+ * anymore for it, and it hence is a good idea to release resources */
+
+ /* Don't release resources if this is a transitionary failed/dead state
+ * (i.e. SERVICE_DEAD_BEFORE_AUTO_RESTART/SERVICE_FAILED_BEFORE_AUTO_RESTART), insist on a permanent
+ * failure state. */
+ if (!IN_SET(s->state, SERVICE_DEAD, SERVICE_FAILED, SERVICE_DEAD_RESOURCES_PINNED))
+ return;
+
+ log_unit_debug(u, "Releasing resources...");
+
+ service_release_socket_fd(s);
+ service_release_stdio_fd(s);
+
+ if (s->fd_store_preserve_mode != EXEC_PRESERVE_YES)
+ service_release_fd_store(s);
+
+ /* If the fd store is gone now, the unit is no longer pinned. */
+ if (s->state == SERVICE_DEAD_RESOURCES_PINNED && !s->fd_store)
+ service_set_state(s, SERVICE_DEAD);
+}
+
+/* Enum <-> string tables for the [Service] configuration vocabulary; each
+ * DEFINE_STRING_TABLE_LOOKUP generates the service_*_to_string()/
+ * service_*_from_string() pair declared in service.h. */
+static const char* const service_restart_table[_SERVICE_RESTART_MAX] = {
+ [SERVICE_RESTART_NO] = "no",
+ [SERVICE_RESTART_ON_SUCCESS] = "on-success",
+ [SERVICE_RESTART_ON_FAILURE] = "on-failure",
+ [SERVICE_RESTART_ON_ABNORMAL] = "on-abnormal",
+ [SERVICE_RESTART_ON_WATCHDOG] = "on-watchdog",
+ [SERVICE_RESTART_ON_ABORT] = "on-abort",
+ [SERVICE_RESTART_ALWAYS] = "always",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(service_restart, ServiceRestart);
+
+static const char* const service_restart_mode_table[_SERVICE_RESTART_MODE_MAX] = {
+ [SERVICE_RESTART_MODE_NORMAL] = "normal",
+ [SERVICE_RESTART_MODE_DIRECT] = "direct",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(service_restart_mode, ServiceRestartMode);
+
+static const char* const service_type_table[_SERVICE_TYPE_MAX] = {
+ [SERVICE_SIMPLE] = "simple",
+ [SERVICE_FORKING] = "forking",
+ [SERVICE_ONESHOT] = "oneshot",
+ [SERVICE_DBUS] = "dbus",
+ [SERVICE_NOTIFY] = "notify",
+ [SERVICE_NOTIFY_RELOAD] = "notify-reload",
+ [SERVICE_IDLE] = "idle",
+ [SERVICE_EXEC] = "exec",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(service_type, ServiceType);
+
+static const char* const service_exit_type_table[_SERVICE_EXIT_TYPE_MAX] = {
+ [SERVICE_EXIT_MAIN] = "main",
+ [SERVICE_EXIT_CGROUP] = "cgroup",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(service_exit_type, ServiceExitType);
+
+static const char* const service_exec_command_table[_SERVICE_EXEC_COMMAND_MAX] = {
+ [SERVICE_EXEC_CONDITION] = "ExecCondition",
+ [SERVICE_EXEC_START_PRE] = "ExecStartPre",
+ [SERVICE_EXEC_START] = "ExecStart",
+ [SERVICE_EXEC_START_POST] = "ExecStartPost",
+ [SERVICE_EXEC_RELOAD] = "ExecReload",
+ [SERVICE_EXEC_STOP] = "ExecStop",
+ [SERVICE_EXEC_STOP_POST] = "ExecStopPost",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(service_exec_command, ServiceExecCommand);
+
+static const char* const service_exec_ex_command_table[_SERVICE_EXEC_COMMAND_MAX] = {
+ [SERVICE_EXEC_CONDITION] = "ExecConditionEx",
+ [SERVICE_EXEC_START_PRE] = "ExecStartPreEx",
+ [SERVICE_EXEC_START] = "ExecStartEx",
+ [SERVICE_EXEC_START_POST] = "ExecStartPostEx",
+ [SERVICE_EXEC_RELOAD] = "ExecReloadEx",
+ [SERVICE_EXEC_STOP] = "ExecStopEx",
+ [SERVICE_EXEC_STOP_POST] = "ExecStopPostEx",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(service_exec_ex_command, ServiceExecCommand);
+
+static const char* const notify_state_table[_NOTIFY_STATE_MAX] = {
+ [NOTIFY_UNKNOWN] = "unknown",
+ [NOTIFY_READY] = "ready",
+ [NOTIFY_RELOADING] = "reloading",
+ [NOTIFY_STOPPING] = "stopping",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(notify_state, NotifyState);
+
+static const char* const service_result_table[_SERVICE_RESULT_MAX] = {
+ [SERVICE_SUCCESS] = "success",
+ [SERVICE_FAILURE_RESOURCES] = "resources",
+ [SERVICE_FAILURE_PROTOCOL] = "protocol",
+ [SERVICE_FAILURE_TIMEOUT] = "timeout",
+ [SERVICE_FAILURE_EXIT_CODE] = "exit-code",
+ [SERVICE_FAILURE_SIGNAL] = "signal",
+ [SERVICE_FAILURE_CORE_DUMP] = "core-dump",
+ [SERVICE_FAILURE_WATCHDOG] = "watchdog",
+ [SERVICE_FAILURE_START_LIMIT_HIT] = "start-limit-hit",
+ [SERVICE_FAILURE_OOM_KILL] = "oom-kill",
+ [SERVICE_SKIP_CONDITION] = "exec-condition",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(service_result, ServiceResult);
+
+static const char* const service_timeout_failure_mode_table[_SERVICE_TIMEOUT_FAILURE_MODE_MAX] = {
+ [SERVICE_TIMEOUT_TERMINATE] = "terminate",
+ [SERVICE_TIMEOUT_ABORT] = "abort",
+ [SERVICE_TIMEOUT_KILL] = "kill",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(service_timeout_failure_mode, ServiceTimeoutFailureMode);
+
+/* Dispatch table wiring the Service unit type into the generic unit engine;
+ * unset callbacks fall back to the generic unit_* implementations. */
+const UnitVTable service_vtable = {
+ .object_size = sizeof(Service),
+ .exec_context_offset = offsetof(Service, exec_context),
+ .cgroup_context_offset = offsetof(Service, cgroup_context),
+ .kill_context_offset = offsetof(Service, kill_context),
+ .exec_runtime_offset = offsetof(Service, exec_runtime),
+
+ .sections =
+ "Unit\0"
+ "Service\0"
+ "Install\0",
+ .private_section = "Service",
+
+ .can_transient = true,
+ .can_delegate = true,
+ .can_fail = true,
+ .can_set_managed_oom = true,
+
+ .init = service_init,
+ .done = service_done,
+ .load = service_load,
+ .release_resources = service_release_resources,
+
+ .coldplug = service_coldplug,
+
+ .dump = service_dump,
+
+ .start = service_start,
+ .stop = service_stop,
+ .reload = service_reload,
+
+ .can_reload = service_can_reload,
+
+ .clean = service_clean,
+ .can_clean = service_can_clean,
+
+ .freeze = unit_freeze_vtable_common,
+ .thaw = unit_thaw_vtable_common,
+
+ .serialize = service_serialize,
+ .deserialize_item = service_deserialize_item,
+
+ .active_state = service_active_state,
+ .sub_state_to_string = service_sub_state_to_string,
+
+ .will_restart = service_will_restart,
+
+ .may_gc = service_may_gc,
+
+ .sigchld_event = service_sigchld_event,
+
+ .reset_failed = service_reset_failed,
+
+ .notify_cgroup_empty = service_notify_cgroup_empty_event,
+ .notify_cgroup_oom = service_notify_cgroup_oom_event,
+ .notify_message = service_notify_message,
+
+ .main_pid = service_main_pid,
+ .control_pid = service_control_pid,
+
+ .bus_name_owner_change = service_bus_name_owner_change,
+
+ .bus_set_property = bus_service_set_property,
+ .bus_commit_properties = bus_service_commit_properties,
+
+ .get_timeout = service_get_timeout,
+ .get_timeout_start_usec = service_get_timeout_start_usec,
+ .needs_console = service_needs_console,
+ .exit_status = service_exit_status,
+ .status_text = service_status_text,
+
+ .status_message_formats = {
+ .finished_start_job = {
+ [JOB_FAILED] = "Failed to start %s.",
+ },
+ .finished_stop_job = {
+ [JOB_DONE] = "Stopped %s.",
+ [JOB_FAILED] = "Stopped (with error) %s.",
+ },
+ .finished_job = service_finished_job,
+ },
+
+ .can_start = service_can_start,
+
+ .notify_plymouth = true,
+
+ .audit_start_message_type = AUDIT_SERVICE_START,
+ .audit_stop_message_type = AUDIT_SERVICE_STOP,
+};
diff --git a/src/core/service.h b/src/core/service.h
new file mode 100644
index 0000000..e85302e
--- /dev/null
+++ b/src/core/service.h
@@ -0,0 +1,290 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct Service Service;
+typedef struct ServiceFDStore ServiceFDStore;
+
+#include "exit-status.h"
+#include "kill.h"
+#include "open-file.h"
+#include "path.h"
+#include "pidref.h"
+#include "ratelimit.h"
+#include "socket.h"
+#include "unit.h"
+
+typedef enum ServiceRestart {
+ SERVICE_RESTART_NO,
+ SERVICE_RESTART_ON_SUCCESS,
+ SERVICE_RESTART_ON_FAILURE,
+ SERVICE_RESTART_ON_ABNORMAL,
+ SERVICE_RESTART_ON_WATCHDOG,
+ SERVICE_RESTART_ON_ABORT,
+ SERVICE_RESTART_ALWAYS,
+ _SERVICE_RESTART_MAX,
+ _SERVICE_RESTART_INVALID = -EINVAL,
+} ServiceRestart;
+
+typedef enum ServiceType {
+ SERVICE_SIMPLE, /* we fork and go on right-away (i.e. modern socket activated daemons) */
+ SERVICE_FORKING, /* forks by itself (i.e. traditional daemons) */
+ SERVICE_ONESHOT, /* we fork and wait until the program finishes (i.e. programs like fsck which run and need to finish before we continue) */
+ SERVICE_DBUS, /* we fork and wait until a specific D-Bus name appears on the bus */
+ SERVICE_NOTIFY, /* we fork and wait until a daemon sends us a ready message with sd_notify() */
+ SERVICE_NOTIFY_RELOAD, /* just like SERVICE_NOTIFY, but also implements a reload protocol via SIGHUP */
+ SERVICE_IDLE, /* much like simple, but delay exec() until all jobs are dispatched. */
+ SERVICE_EXEC, /* we fork and wait until we execute exec() (this means our own setup is waited for) */
+ _SERVICE_TYPE_MAX,
+ _SERVICE_TYPE_INVALID = -EINVAL,
+} ServiceType;
+
+typedef enum ServiceExitType {
+ SERVICE_EXIT_MAIN, /* we consider the main PID when deciding if the service exited */
+ SERVICE_EXIT_CGROUP, /* we wait for the last process in the cgroup to exit */
+ _SERVICE_EXIT_TYPE_MAX,
+ _SERVICE_EXIT_TYPE_INVALID = -EINVAL,
+} ServiceExitType;
+
+typedef enum ServiceExecCommand {
+ SERVICE_EXEC_CONDITION,
+ SERVICE_EXEC_START_PRE,
+ SERVICE_EXEC_START,
+ SERVICE_EXEC_START_POST,
+ SERVICE_EXEC_RELOAD,
+ SERVICE_EXEC_STOP,
+ SERVICE_EXEC_STOP_POST,
+ _SERVICE_EXEC_COMMAND_MAX,
+ _SERVICE_EXEC_COMMAND_INVALID = -EINVAL,
+} ServiceExecCommand;
+
+typedef enum NotifyState {
+ NOTIFY_UNKNOWN,
+ NOTIFY_READY,
+ NOTIFY_RELOADING,
+ NOTIFY_STOPPING,
+ _NOTIFY_STATE_MAX,
+ _NOTIFY_STATE_INVALID = -EINVAL,
+} NotifyState;
+
+/* The values of this enum are referenced in man/systemd.exec.xml and src/shared/bus-unit-util.c.
+ * Update those sources for each change to this enum. */
+typedef enum ServiceResult {
+ SERVICE_SUCCESS,
+ SERVICE_FAILURE_RESOURCES, /* a bit of a misnomer, just our catch-all error for errnos we didn't expect */
+ SERVICE_FAILURE_PROTOCOL,
+ SERVICE_FAILURE_TIMEOUT,
+ SERVICE_FAILURE_EXIT_CODE,
+ SERVICE_FAILURE_SIGNAL,
+ SERVICE_FAILURE_CORE_DUMP,
+ SERVICE_FAILURE_WATCHDOG,
+ SERVICE_FAILURE_START_LIMIT_HIT,
+ SERVICE_FAILURE_OOM_KILL, /* OOM Kill by the Kernel or systemd-oomd */
+ SERVICE_SKIP_CONDITION,
+ _SERVICE_RESULT_MAX,
+ _SERVICE_RESULT_INVALID = -EINVAL,
+} ServiceResult;
+
+typedef enum ServiceTimeoutFailureMode {
+ SERVICE_TIMEOUT_TERMINATE,
+ SERVICE_TIMEOUT_ABORT,
+ SERVICE_TIMEOUT_KILL,
+ _SERVICE_TIMEOUT_FAILURE_MODE_MAX,
+ _SERVICE_TIMEOUT_FAILURE_MODE_INVALID = -EINVAL,
+} ServiceTimeoutFailureMode;
+
+typedef enum ServiceRestartMode {
+ SERVICE_RESTART_MODE_NORMAL,
+ SERVICE_RESTART_MODE_DIRECT,
+ _SERVICE_RESTART_MODE_MAX,
+ _SERVICE_RESTART_MODE_INVALID = -EINVAL,
+} ServiceRestartMode;
+
+struct ServiceFDStore {
+ Service *service;
+
+ int fd;
+ char *fdname;
+ sd_event_source *event_source;
+ bool do_poll;
+
+ LIST_FIELDS(ServiceFDStore, fd_store);
+};
+
+struct Service {
+ Unit meta;
+
+ ServiceType type;
+ ServiceExitType exit_type;
+ ServiceRestart restart;
+ ServiceRestartMode restart_mode;
+ ExitStatusSet restart_prevent_status;
+ ExitStatusSet restart_force_status;
+ ExitStatusSet success_status;
+
+ /* If set we'll read the main daemon PID from this file */
+ char *pid_file;
+
+ usec_t restart_usec;
+ unsigned restart_steps;
+ usec_t restart_max_delay_usec;
+ usec_t timeout_start_usec;
+ usec_t timeout_stop_usec;
+ usec_t timeout_abort_usec;
+ bool timeout_abort_set;
+ usec_t runtime_max_usec;
+ usec_t runtime_rand_extra_usec;
+ ServiceTimeoutFailureMode timeout_start_failure_mode;
+ ServiceTimeoutFailureMode timeout_stop_failure_mode;
+
+ dual_timestamp watchdog_timestamp;
+ usec_t watchdog_usec; /* the requested watchdog timeout in the unit file */
+ usec_t watchdog_original_usec; /* the watchdog timeout that was in effect when the unit was started, i.e. the timeout the forked off processes currently see */
+ usec_t watchdog_override_usec; /* the watchdog timeout requested by the service itself through sd_notify() */
+ bool watchdog_override_enable;
+ sd_event_source *watchdog_event_source;
+
+ ExecCommand* exec_command[_SERVICE_EXEC_COMMAND_MAX];
+
+ ExecContext exec_context;
+ KillContext kill_context;
+ CGroupContext cgroup_context;
+
+ ServiceState state, deserialized_state;
+
+ /* The exit status of the real main process */
+ ExecStatus main_exec_status;
+
+ /* The currently executed control process */
+ ExecCommand *control_command;
+
+ /* The currently executed main process, which may be NULL if
+ * the main process got started via forking mode and not by
+ * us */
+ ExecCommand *main_command;
+
+ /* The ID of the control command currently being executed */
+ ServiceExecCommand control_command_id;
+
+ /* Runtime data of the execution context */
+ ExecRuntime *exec_runtime;
+
+ PidRef main_pid, control_pid;
+
+ /* if we are a socket activated service instance, store information of the connection/peer/socket */
+ int socket_fd;
+ SocketPeer *socket_peer;
+ UnitRef accept_socket;
+ bool socket_fd_selinux_context_net;
+
+ bool permissions_start_only;
+ bool root_directory_start_only;
+ bool remain_after_exit;
+ bool guess_main_pid;
+
+ /* If we shut down, remember why */
+ ServiceResult result;
+ ServiceResult reload_result;
+ ServiceResult clean_result;
+
+ bool main_pid_known:1;
+ bool main_pid_alien:1;
+ bool bus_name_good:1;
+ bool forbid_restart:1;
+ bool start_timeout_defined:1;
+ bool exec_fd_hot:1;
+
+ char *bus_name;
+ char *bus_name_owner; /* unique name of the current owner */
+
+ char *status_text;
+ int status_errno;
+
+ sd_event_source *timer_event_source;
+ PathSpec *pid_file_pathspec;
+
+ NotifyAccess notify_access;
+ NotifyAccess notify_access_override;
+ NotifyState notify_state;
+
+ sd_bus_slot *bus_name_pid_lookup_slot;
+
+ sd_event_source *exec_fd_event_source;
+
+ ServiceFDStore *fd_store;
+ size_t n_fd_store;
+ unsigned n_fd_store_max;
+ ExecPreserveMode fd_store_preserve_mode;
+
+ char *usb_function_descriptors;
+ char *usb_function_strings;
+
+ int stdin_fd;
+ int stdout_fd;
+ int stderr_fd;
+
+ unsigned n_restarts;
+ bool flush_n_restarts;
+
+ OOMPolicy oom_policy;
+
+ LIST_HEAD(OpenFile, open_files);
+
+ int reload_signal;
+ usec_t reload_begin_usec;
+};
+
+/* Effective TimeoutAbortSec=: falls back to TimeoutStopSec= if unset. */
+static inline usec_t service_timeout_abort_usec(Service *s) {
+ assert(s);
+ return s->timeout_abort_set ? s->timeout_abort_usec : s->timeout_stop_usec;
+}
+
+/* Effective NotifyAccess=: a non-negative runtime override wins over the
+ * configured value. */
+static inline NotifyAccess service_get_notify_access(Service *s) {
+ assert(s);
+ return s->notify_access_override < 0 ? s->notify_access : s->notify_access_override;
+}
+
+/* Effective watchdog timeout: the sd_notify() WATCHDOG_USEC= override, if
+ * enabled, wins over the timeout the unit was started with. */
+static inline usec_t service_get_watchdog_usec(Service *s) {
+ assert(s);
+ return s->watchdog_override_enable ? s->watchdog_override_usec : s->watchdog_original_usec;
+}
+
+extern const UnitVTable service_vtable;
+
+int service_set_socket_fd(Service *s, int fd, struct Socket *socket, struct SocketPeer *peer, bool selinux_context_net);
+void service_release_socket_fd(Service *s);
+
+usec_t service_restart_usec_next(Service *s);
+
+const char* service_restart_to_string(ServiceRestart i) _const_;
+ServiceRestart service_restart_from_string(const char *s) _pure_;
+
+const char* service_restart_mode_to_string(ServiceRestartMode i) _const_;
+ServiceRestartMode service_restart_mode_from_string(const char *s) _pure_;
+
+const char* service_type_to_string(ServiceType i) _const_;
+ServiceType service_type_from_string(const char *s) _pure_;
+
+const char* service_exit_type_to_string(ServiceExitType i) _const_;
+ServiceExitType service_exit_type_from_string(const char *s) _pure_;
+
+const char* service_exec_command_to_string(ServiceExecCommand i) _const_;
+ServiceExecCommand service_exec_command_from_string(const char *s) _pure_;
+
+const char* service_exec_ex_command_to_string(ServiceExecCommand i) _const_;
+ServiceExecCommand service_exec_ex_command_from_string(const char *s) _pure_;
+
+const char* notify_state_to_string(NotifyState i) _const_;
+NotifyState notify_state_from_string(const char *s) _pure_;
+
+const char* service_result_to_string(ServiceResult i) _const_;
+ServiceResult service_result_from_string(const char *s) _pure_;
+
+const char* service_timeout_failure_mode_to_string(ServiceTimeoutFailureMode i) _const_;
+ServiceTimeoutFailureMode service_timeout_failure_mode_from_string(const char *s) _pure_;
+
+DEFINE_CAST(SERVICE, Service);
+
+#define STATUS_TEXT_MAX (16U*1024U)
+
+/* Only exported for unit tests */
+int service_deserialize_exec_command(Unit *u, const char *key, const char *value);
diff --git a/src/core/show-status.c b/src/core/show-status.c
new file mode 100644
index 0000000..606237e
--- /dev/null
+++ b/src/core/show-status.c
@@ -0,0 +1,128 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "alloc-util.h"
+#include "fd-util.h"
+#include "iovec-util.h"
+#include "parse-util.h"
+#include "show-status.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "terminal-util.h"
+
+static const char* const show_status_table[_SHOW_STATUS_MAX] = {
+ [SHOW_STATUS_NO] = "no",
+ [SHOW_STATUS_ERROR] = "error",
+ [SHOW_STATUS_AUTO] = "auto",
+ [SHOW_STATUS_TEMPORARY] = "temporary",
+ [SHOW_STATUS_YES] = "yes",
+};
+
+DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(show_status, ShowStatus, SHOW_STATUS_YES);
+
+/* Parse a ShowStatus= configuration value. SHOW_STATUS_TEMPORARY is an
+ * internal runtime state and is rejected alongside unknown strings (-EINVAL). */
+int parse_show_status(const char *v, ShowStatus *ret) {
+ ShowStatus s;
+
+ assert(ret);
+
+ s = show_status_from_string(v);
+ if (s < 0 || s == SHOW_STATUS_TEMPORARY)
+ return -EINVAL;
+
+ *ret = s;
+ return 0;
+}
+
+/* Write one formatted status line to /dev/console, optionally prefixed with
+ * "[STATUS] " and ellipsized to the console width. SHOW_STATUS_EPHEMERAL
+ * lines are overwritten in place by the next call (tracked via the static
+ * 'prev_ephemeral' flag, so this is not thread-safe — PID 1 is single-
+ * threaded). Returns 0 on success, negative errno on console/write failure. */
+int status_vprintf(const char *status, ShowStatusFlags flags, const char *format, va_list ap) {
+ static const char status_indent[] = " "; /* "[" STATUS "] " */
+ _cleanup_free_ char *s = NULL;
+ _cleanup_close_ int fd = -EBADF;
+ struct iovec iovec[7] = {}; /* worst case: erase + "[" + status + "] " + text + CRNL + erase */
+ int n = 0;
+ static bool prev_ephemeral;
+
+ assert(format);
+
+ /* This is independent of logging, as status messages are
+ * optional and go exclusively to the console. */
+
+ if (vasprintf(&s, format, ap) < 0)
+ return log_oom();
+
+ /* Before you ask: yes, on purpose we open/close the console for each status line we write individually. This
+ * is a good strategy to avoid PID 1 getting killed by the kernel's SAK concept (it doesn't fix this entirely,
+ * but minimizes the time window the kernel might end up killing PID 1 due to SAK). It also makes things easier
+ * for us so that we don't have to recover from hangups and suchlike triggered on the console. */
+
+ fd = open_terminal("/dev/console", O_WRONLY|O_NOCTTY|O_CLOEXEC);
+ if (fd < 0)
+ return fd;
+
+ if (FLAGS_SET(flags, SHOW_STATUS_ELLIPSIZE)) {
+ char *e;
+ size_t emax, sl;
+ int c;
+
+ c = fd_columns(fd);
+ if (c <= 0)
+ c = 80;
+
+ sl = status ? sizeof(status_indent)-1 : 0;
+
+ /* NOTE(review): if the console is narrower than the indent,
+ * 'c - sl - 1' wraps around in size_t and the 'emax < 3' clamp
+ * below never triggers — confirm c > sl always holds here. */
+ emax = c - sl - 1;
+ if (emax < 3)
+ emax = 3;
+
+ /* Ellipsization failure (OOM) is non-fatal; keep the full string. */
+ e = ellipsize(s, emax, 50);
+ if (e)
+ free_and_replace(s, e);
+ }
+
+ /* Rewind over the previous line if it was marked ephemeral. */
+ if (prev_ephemeral)
+ iovec[n++] = IOVEC_MAKE_STRING(ANSI_REVERSE_LINEFEED "\r" ANSI_ERASE_TO_END_OF_LINE);
+
+ if (status) {
+ if (!isempty(status)) {
+ iovec[n++] = IOVEC_MAKE_STRING("[");
+ iovec[n++] = IOVEC_MAKE_STRING(status);
+ iovec[n++] = IOVEC_MAKE_STRING("] ");
+ } else
+ iovec[n++] = IOVEC_MAKE_STRING(status_indent);
+ }
+
+ iovec[n++] = IOVEC_MAKE_STRING(s);
+ iovec[n++] = IOVEC_MAKE_STRING("\r\n"); /* use CRNL instead of just NL, to be robust towards TTYs in raw mode */
+
+ if (prev_ephemeral && !FLAGS_SET(flags, SHOW_STATUS_EPHEMERAL))
+ iovec[n++] = IOVEC_MAKE_STRING(ANSI_ERASE_TO_END_OF_LINE);
+ prev_ephemeral = FLAGS_SET(flags, SHOW_STATUS_EPHEMERAL);
+
+ if (writev(fd, iovec, n) < 0)
+ return -errno;
+
+ return 0;
+}
+
+/* Varargs convenience wrapper around status_vprintf(); same semantics. */
+int status_printf(const char *status, ShowStatusFlags flags, const char *format, ...) {
+ va_list ap;
+ int r;
+
+ assert(format);
+
+ va_start(ap, format);
+ r = status_vprintf(status, flags, format, ap);
+ va_end(ap);
+
+ return r;
+}
+
+static const char* const status_unit_format_table[_STATUS_UNIT_FORMAT_MAX] = {
+ [STATUS_UNIT_FORMAT_NAME] = "name",
+ [STATUS_UNIT_FORMAT_DESCRIPTION] = "description",
+ [STATUS_UNIT_FORMAT_COMBINED] = "combined",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(status_unit_format, StatusUnitFormat);
diff --git a/src/core/show-status.h b/src/core/show-status.h
new file mode 100644
index 0000000..f441223
--- /dev/null
+++ b/src/core/show-status.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "macro.h"
+
+/* Manager status */
+
+typedef enum ShowStatus {
+ SHOW_STATUS_NO, /* printing of status is disabled */
+ SHOW_STATUS_ERROR, /* only print errors */
+ SHOW_STATUS_AUTO, /* disabled but may flip to _TEMPORARY */
+ SHOW_STATUS_TEMPORARY, /* enabled temporarily, may flip back to _AUTO */
+ SHOW_STATUS_YES, /* printing of status is enabled */
+ _SHOW_STATUS_MAX,
+ _SHOW_STATUS_INVALID = -EINVAL,
+} ShowStatus;
+
+typedef enum ShowStatusFlags {
+ SHOW_STATUS_ELLIPSIZE = 1 << 0,
+ SHOW_STATUS_EPHEMERAL = 1 << 1,
+} ShowStatusFlags;
+
+typedef enum StatusUnitFormat {
+ STATUS_UNIT_FORMAT_NAME,
+ STATUS_UNIT_FORMAT_DESCRIPTION,
+ STATUS_UNIT_FORMAT_COMBINED,
+ _STATUS_UNIT_FORMAT_MAX,
+ _STATUS_UNIT_FORMAT_INVALID = -EINVAL,
+} StatusUnitFormat;
+
+static inline bool show_status_on(ShowStatus s) {
+ return IN_SET(s, SHOW_STATUS_TEMPORARY, SHOW_STATUS_YES);
+}
+ShowStatus show_status_from_string(const char *v) _const_;
+const char* show_status_to_string(ShowStatus s) _pure_;
+int parse_show_status(const char *v, ShowStatus *ret);
+
+StatusUnitFormat status_unit_format_from_string(const char *v) _const_;
+const char* status_unit_format_to_string(StatusUnitFormat s) _pure_;
+
+int status_vprintf(const char *status, ShowStatusFlags flags, const char *format, va_list ap) _printf_(3,0);
+int status_printf(const char *status, ShowStatusFlags flags, const char *format, ...) _printf_(3,4);
diff --git a/src/core/slice.c b/src/core/slice.c
new file mode 100644
index 0000000..fb4f23c
--- /dev/null
+++ b/src/core/slice.c
@@ -0,0 +1,462 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+
+#include "alloc-util.h"
+#include "dbus-slice.h"
+#include "dbus-unit.h"
+#include "fd-util.h"
+#include "log.h"
+#include "serialize.h"
+#include "slice.h"
+#include "special.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit-name.h"
+#include "unit.h"
+
+static const UnitActiveState state_translation_table[_SLICE_STATE_MAX] = {
+ [SLICE_DEAD] = UNIT_INACTIVE,
+ [SLICE_ACTIVE] = UNIT_ACTIVE
+};
+
+/* UnitVTable.init: pre-load defaults for a freshly allocated slice unit.
+ * Slices survive "systemctl isolate" by default. */
+static void slice_init(Unit *u) {
+ assert(u);
+ assert(u->load_state == UNIT_STUB);
+
+ u->ignore_on_isolate = true;
+}
+
+/* Transition the slice state machine and notify the generic unit engine
+ * (which handles job completion, cgroup teardown, D-Bus signals, ...). */
+static void slice_set_state(Slice *t, SliceState state) {
+ SliceState old_state;
+ assert(t);
+
+ if (t->state != state)
+ bus_unit_send_pending_change_signal(UNIT(t), false);
+
+ old_state = t->state;
+ t->state = state;
+
+ if (state != old_state)
+ log_debug("%s changed %s -> %s",
+ UNIT(t)->id,
+ slice_state_to_string(old_state),
+ slice_state_to_string(state));
+
+ unit_notify(UNIT(t), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true);
+}
+
+/* Derive the parent slice from the unit name (e.g. "a-b.slice" -> "a.slice")
+ * and add an implicit In-Slice dependency on it; 0 from
+ * slice_build_parent_slice() means this is the root slice. */
+static int slice_add_parent_slice(Slice *s) {
+ Unit *u = UNIT(s);
+ _cleanup_free_ char *a = NULL;
+ int r;
+
+ assert(s);
+
+ if (UNIT_GET_SLICE(u))
+ return 0;
+
+ r = slice_build_parent_slice(u->id, &a);
+ if (r <= 0) /* 0 means root slice */
+ return r;
+
+ return unit_add_dependency_by_name(u, UNIT_IN_SLICE, a, true, UNIT_DEPENDENCY_IMPLICIT);
+}
+
+/* DefaultDependencies=yes: conflict with (and order before) shutdown.target. */
+static int slice_add_default_dependencies(Slice *s) {
+ int r;
+
+ assert(s);
+
+ if (!UNIT(s)->default_dependencies)
+ return 0;
+
+ /* Make sure slices are unloaded on shutdown */
+ r = unit_add_two_dependencies_by_name(
+ UNIT(s),
+ UNIT_BEFORE, UNIT_CONFLICTS,
+ SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+/* Post-load validation: the unit name must be a valid slice name and the
+ * unit's actual parent must match the parent encoded in the name. */
+static int slice_verify(Slice *s) {
+ _cleanup_free_ char *parent = NULL;
+ int r;
+
+ assert(s);
+ assert(UNIT(s)->load_state == UNIT_LOADED);
+
+ if (!slice_name_is_valid(UNIT(s)->id))
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Slice name %s is not valid. Refusing.", UNIT(s)->id);
+
+ r = slice_build_parent_slice(UNIT(s)->id, &parent);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r, "Failed to determine parent slice: %m");
+
+ /* If recursive errors are to be ignored, the parent slice should not be verified */
+ if (UNIT(s)->manager && FLAGS_SET(UNIT(s)->manager->test_run_flags, MANAGER_TEST_RUN_IGNORE_DEPENDENCIES))
+ return 0;
+
+ if (parent ? !unit_has_name(UNIT_GET_SLICE(UNIT(s)), parent) : !!UNIT_GET_SLICE(UNIT(s)))
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Located outside of parent slice. Refusing.");
+
+ return 0;
+}
+
+/* Synthesize -.slice in memory; returns 1 if handled, 0 if 'u' is some other
+ * unit. The root slice is perpetual and takes no default dependencies. */
+static int slice_load_root_slice(Unit *u) {
+ assert(u);
+
+ if (!unit_has_name(u, SPECIAL_ROOT_SLICE))
+ return 0;
+
+ u->perpetual = true;
+
+ /* The root slice is a bit special. For example it is always running and cannot be terminated. Because of its
+ * special semantics we synthesize it here, instead of relying on the unit file on disk. */
+
+ u->default_dependencies = false;
+
+ /* Allocation failures here are tolerated; these fields stay NULL. */
+ if (!u->description)
+ u->description = strdup("Root Slice");
+ if (!u->documentation)
+ u->documentation = strv_new("man:systemd.special(7)");
+
+ return 1;
+}
+
+/* Likewise synthesize system.slice, but only for the system manager. */
+static int slice_load_system_slice(Unit *u) {
+ assert(u);
+
+ if (!MANAGER_IS_SYSTEM(u->manager))
+ return 0;
+ if (!unit_has_name(u, SPECIAL_SYSTEM_SLICE))
+ return 0;
+
+ u->perpetual = true;
+
+ /* The system slice is a bit special. For example it is always running and cannot be terminated. Because of its
+ * special semantics we synthesize it here, instead of relying on the unit file on disk. */
+
+ u->default_dependencies = false;
+
+ if (!u->description)
+ u->description = strdup("System Slice");
+ if (!u->documentation)
+ u->documentation = strv_new("man:systemd.special(7)");
+
+ return 1;
+}
+
+/* UnitVTable.load: synthesize the special slices, read fragment/drop-ins,
+ * and wire up implicit/default dependencies and a fallback description. */
+static int slice_load(Unit *u) {
+ Slice *s = SLICE(u);
+ int r;
+
+ assert(s);
+ assert(u->load_state == UNIT_STUB);
+
+ r = slice_load_root_slice(u);
+ if (r < 0)
+ return r;
+ r = slice_load_system_slice(u);
+ if (r < 0)
+ return r;
+
+ r = unit_load_fragment_and_dropin(u, false);
+ if (r < 0)
+ return r;
+
+ if (u->load_state != UNIT_LOADED)
+ return 0;
+
+ /* This is a new unit? Then let's add in some extras */
+ r = unit_patch_contexts(u);
+ if (r < 0)
+ return r;
+
+ r = slice_add_parent_slice(s);
+ if (r < 0)
+ return r;
+
+ r = slice_add_default_dependencies(s);
+ if (r < 0)
+ return r;
+
+ if (!u->description) {
+ _cleanup_free_ char *tmp = NULL;
+
+ r = unit_name_to_path(u->id, &tmp);
+ if (r >= 0) /* Failure is ignored… */
+ u->description = strjoin("Slice ", tmp);
+ }
+
+ return slice_verify(s);
+}
+
+/* UnitVTable.coldplug: after deserialization (daemon-reload/reexec) restore
+ * the state recorded by slice_serialize(). */
+static int slice_coldplug(Unit *u) {
+ Slice *t = SLICE(u);
+
+ assert(t);
+ assert(t->state == SLICE_DEAD);
+
+ if (t->deserialized_state != t->state)
+ slice_set_state(t, t->deserialized_state);
+
+ return 0;
+}
+
+/* UnitVTable.dump: human-readable state for "systemd-analyze dump". */
+static void slice_dump(Unit *u, FILE *f, const char *prefix) {
+ Slice *t = SLICE(u);
+
+ assert(t);
+ assert(f);
+
+ fprintf(f,
+ "%sSlice State: %s\n",
+ prefix, slice_state_to_string(t->state));
+
+ cgroup_context_dump(UNIT(t), f, prefix);
+}
+
+/* UnitVTable.start: "starting" a slice just realizes its cgroup and resets
+ * accounting; there is no process to spawn. Returns 1 (done immediately). */
+static int slice_start(Unit *u) {
+ Slice *t = SLICE(u);
+ int r;
+
+ assert(t);
+ assert(t->state == SLICE_DEAD);
+
+ r = unit_acquire_invocation_id(u);
+ if (r < 0)
+ return r;
+
+ (void) unit_realize_cgroup(u);
+ (void) unit_reset_accounting(u);
+
+ slice_set_state(t, SLICE_ACTIVE);
+ return 1;
+}
+
+/* UnitVTable.stop: immediate state flip; cgroup removal happens generically. */
+static int slice_stop(Unit *u) {
+ Slice *t = SLICE(u);
+
+ assert(t);
+ assert(t->state == SLICE_ACTIVE);
+
+ /* We do not need to destroy the cgroup explicitly,
+ * unit_notify() will do that for us anyway. */
+
+ slice_set_state(t, SLICE_DEAD);
+ return 1;
+}
+
+/* UnitVTable.serialize: persist runtime state across daemon-reload/reexec. */
+static int slice_serialize(Unit *u, FILE *f, FDSet *fds) {
+ Slice *s = SLICE(u);
+
+ assert(s);
+ assert(f);
+ assert(fds);
+
+ (void) serialize_item(f, "state", slice_state_to_string(s->state));
+
+ return 0;
+}
+
+/* UnitVTable.deserialize_item: counterpart of slice_serialize(); unknown or
+ * malformed entries are logged and skipped, never fatal. */
+static int slice_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
+ Slice *s = SLICE(u);
+
+ assert(u);
+ assert(key);
+ assert(value);
+ assert(fds);
+
+ if (streq(key, "state")) {
+ SliceState state;
+
+ state = slice_state_from_string(value);
+ if (state < 0)
+ log_debug("Failed to parse state value %s", value);
+ else
+ s->deserialized_state = state;
+
+ } else
+ log_debug("Unknown serialization key '%s'", key);
+
+ return 0;
+}
+
+/* Map the slice-specific state to the generic active/inactive state. */
+static UnitActiveState slice_active_state(Unit *u) {
+ assert(u);
+
+ return state_translation_table[SLICE(u)->state];
+}
+
+/* Sub-state string shown e.g. in "systemctl status" ("active"/"dead"). */
+static const char *slice_sub_state_to_string(Unit *u) {
+ assert(u);
+
+ return slice_state_to_string(SLICE(u)->state);
+}
+
+/* Create (or mark) the named slice as perpetual and queue it for loading;
+ * used for -.slice and system.slice which exist without unit files. */
+static int slice_make_perpetual(Manager *m, const char *name, Unit **ret) {
+ Unit *u;
+ int r;
+
+ assert(m);
+ assert(name);
+
+ u = manager_get_unit(m, name);
+ if (!u) {
+ r = unit_new_for_name(m, sizeof(Slice), name, &u);
+ if (r < 0)
+ return log_error_errno(r, "Failed to allocate the special %s unit: %m", name);
+ }
+
+ u->perpetual = true;
+ SLICE(u)->deserialized_state = SLICE_ACTIVE;
+
+ unit_add_to_load_queue(u);
+ unit_add_to_dbus_queue(u);
+
+ if (ret)
+ *ret = u;
+
+ return 0;
+}
+
+/* UnitVTable.enumerate_perpetual: instantiate the always-present slices at
+ * manager startup. */
+static void slice_enumerate_perpetual(Manager *m) {
+ Unit *u;
+ int r;
+
+ assert(m);
+
+ r = slice_make_perpetual(m, SPECIAL_ROOT_SLICE, &u);
+ if (r >= 0 && manager_owns_host_root_cgroup(m)) {
+ Slice *s = SLICE(u);
+
+ /* If we are managing the root cgroup then this means our root slice covers the whole system, which
+ * means the kernel will track CPU/tasks/memory for us anyway, and it is all available in /proc. Let's
+ * hence turn accounting on here, so that our APIs to query this data are available. */
+
+ s->cgroup_context.cpu_accounting = true;
+ s->cgroup_context.tasks_accounting = true;
+ s->cgroup_context.memory_accounting = true;
+ }
+
+ if (MANAGER_IS_SYSTEM(m))
+ (void) slice_make_perpetual(m, SPECIAL_SYSTEM_SLICE, NULL);
+}
+
+/* Recursively check that every unit contained in the slice (and in nested
+ * slices) implements the freeze callback; freezing must be all-or-nothing. */
+static bool slice_freezer_action_supported_by_children(Unit *s) {
+ Unit *member;
+
+ assert(s);
+
+ UNIT_FOREACH_DEPENDENCY(member, s, UNIT_ATOM_SLICE_OF) {
+
+ if (member->type == UNIT_SLICE &&
+ !slice_freezer_action_supported_by_children(member))
+ return false;
+
+ if (!UNIT_VTABLE(member)->freeze)
+ return false;
+ }
+
+ return true;
+}
+
+/* Propagate freeze/thaw to every realized member unit, then apply the action
+ * to the slice's own cgroup. A freeze request on a slice with any unfreezable
+ * child is refused up front (returns 0, logged as a warning). */
+static int slice_freezer_action(Unit *s, FreezerAction action) {
+ Unit *member;
+ int r;
+
+ assert(s);
+ assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW));
+
+ if (action == FREEZER_FREEZE && !slice_freezer_action_supported_by_children(s)) {
+ log_unit_warning(s, "Requested freezer operation is not supported by all children of the slice");
+ return 0;
+ }
+
+ UNIT_FOREACH_DEPENDENCY(member, s, UNIT_ATOM_SLICE_OF) {
+ /* Units without a realized cgroup have nothing to freeze. */
+ if (!member->cgroup_realized)
+ continue;
+
+ if (action == FREEZER_FREEZE)
+ r = UNIT_VTABLE(member)->freeze(member);
+ else if (UNIT_VTABLE(member)->thaw)
+ r = UNIT_VTABLE(member)->thaw(member);
+ else
+ /* Thawing is requested but no corresponding method is available, ignore. */
+ r = 0;
+ if (r < 0)
+ return r;
+ }
+
+ return unit_cgroup_freezer_action(s, action);
+}
+
+/* UnitVTable.freeze for slices. */
+static int slice_freeze(Unit *s) {
+ assert(s);
+
+ return slice_freezer_action(s, FREEZER_FREEZE);
+}
+
+/* UnitVTable.thaw for slices. */
+static int slice_thaw(Unit *s) {
+ assert(s);
+
+ return slice_freezer_action(s, FREEZER_THAW);
+}
+
+/* UnitVTable.can_freeze: a slice is freezable iff all its children are. */
+static bool slice_can_freeze(Unit *s) {
+ assert(s);
+
+ return slice_freezer_action_supported_by_children(s);
+}
+
+/* Dispatch table wiring the Slice unit type into the generic unit engine.
+ * Slices carry only a cgroup context — no exec/kill contexts, no processes. */
+const UnitVTable slice_vtable = {
+ .object_size = sizeof(Slice),
+ .cgroup_context_offset = offsetof(Slice, cgroup_context),
+
+ .sections =
+ "Unit\0"
+ "Slice\0"
+ "Install\0",
+ .private_section = "Slice",
+
+ .can_transient = true,
+ .can_set_managed_oom = true,
+
+ .init = slice_init,
+ .load = slice_load,
+
+ .coldplug = slice_coldplug,
+
+ .dump = slice_dump,
+
+ .start = slice_start,
+ .stop = slice_stop,
+
+ .freeze = slice_freeze,
+ .thaw = slice_thaw,
+ .can_freeze = slice_can_freeze,
+
+ .serialize = slice_serialize,
+ .deserialize_item = slice_deserialize_item,
+
+ .active_state = slice_active_state,
+ .sub_state_to_string = slice_sub_state_to_string,
+
+ .bus_set_property = bus_slice_set_property,
+ .bus_commit_properties = bus_slice_commit_properties,
+
+ .enumerate_perpetual = slice_enumerate_perpetual,
+
+ .status_message_formats = {
+ .finished_start_job = {
+ [JOB_DONE] = "Created slice %s.",
+ },
+ .finished_stop_job = {
+ [JOB_DONE] = "Removed slice %s.",
+ },
+ },
+};
diff --git a/src/core/slice.h b/src/core/slice.h
new file mode 100644
index 0000000..e2f9274
--- /dev/null
+++ b/src/core/slice.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "unit.h"
+
+typedef struct Slice Slice;
+
+struct Slice {
+        /* Base Unit object; must be the first member so that a Slice* can be used
+         * where a Unit* is expected (see DEFINE_CAST below). */
+        Unit meta;
+
+        /* Current state, and the state read back by deserialization (restored on coldplug). */
+        SliceState state, deserialized_state;
+
+        CGroupContext cgroup_context;
+};
+
+extern const UnitVTable slice_vtable;
+
+DEFINE_CAST(SLICE, Slice);
diff --git a/src/core/smack-setup.c b/src/core/smack-setup.c
new file mode 100644
index 0000000..7ea902b
--- /dev/null
+++ b/src/core/smack-setup.c
@@ -0,0 +1,393 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+/***
+ Copyright © 2013 Intel Corporation
+ Authors:
+ Nathaniel Chen <nathaniel.chen@intel.com>
+***/
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "sd-messages.h"
+
+#include "alloc-util.h"
+#include "dirent-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "log.h"
+#include "macro.h"
+#include "smack-setup.h"
+#include "string-util.h"
+
+#if ENABLE_SMACK
+
+/* Opens "dir/name" relative to dfd as a read-only, unlocked FILE* in *ret_file.
+ * On failure, logs, records the first error in *status (only while *status is
+ * still 0), and returns a negative errno. */
+static int fdopen_unlocked_at(int dfd, const char *dir, const char *name, int *status, FILE **ret_file) {
+        int fd, r;
+        FILE *f;
+
+        fd = openat(dfd, name, O_RDONLY|O_CLOEXEC);
+        if (fd < 0) {
+                if (*status == 0)
+                        *status = -errno;
+
+                return log_warning_errno(errno, "Failed to open \"%s/%s\": %m", dir, name);
+        }
+
+        r = fdopen_unlocked(fd, "r", &f);
+        if (r < 0) {
+                if (*status == 0)
+                        *status = r;
+
+                /* fdopen_unlocked() failed, so the raw fd is still ours to close. */
+                safe_close(fd);
+                return log_error_errno(r, "Failed to open \"%s/%s\": %m", dir, name);
+        }
+
+        *ret_file = f;
+        return 0;
+}
+
+/* Loads Smack access rules from every regular file in srcdir: 3-token rules go
+ * to /sys/fs/smackfs/load2, 4-token rules to /sys/fs/smackfs/change-rule.
+ * Returns 0 on success, a negative errno when smackfs is unavailable, a
+ * positive errno when srcdir cannot be opened, otherwise the first per-rule
+ * error recorded while iterating. */
+static int write_access2_rules(const char *srcdir) {
+        _cleanup_close_ int load2_fd = -EBADF, change_fd = -EBADF;
+        _cleanup_closedir_ DIR *dir = NULL;
+        int dfd = -EBADF, r = 0;
+
+        load2_fd = open("/sys/fs/smackfs/load2", O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
+        if (load2_fd < 0)  {
+                if (errno != ENOENT)
+                        log_warning_errno(errno, "Failed to open '/sys/fs/smackfs/load2': %m");
+                return -errno; /* negative error */
+        }
+
+        change_fd = open("/sys/fs/smackfs/change-rule", O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
+        if (change_fd < 0)  {
+                if (errno != ENOENT)
+                        log_warning_errno(errno, "Failed to open '/sys/fs/smackfs/change-rule': %m");
+                return -errno; /* negative error */
+        }
+
+        /* write rules to load2 or change-rule from every file in the directory */
+        dir = opendir(srcdir);
+        if (!dir) {
+                if (errno != ENOENT)
+                        log_warning_errno(errno, "Failed to opendir '%s': %m", srcdir);
+                return errno; /* positive on purpose */
+        }
+
+        dfd = dirfd(dir);
+        assert(dfd >= 0);
+
+        FOREACH_DIRENT(entry, dir, return 0) {
+                _cleanup_fclose_ FILE *policy = NULL;
+
+                if (!dirent_is_file(entry))
+                        continue;
+
+                /* fdopen_unlocked_at() records the first failure in r for us. */
+                if (fdopen_unlocked_at(dfd, srcdir, entry->d_name, &r, &policy) < 0)
+                        continue;
+
+                /* load2 write rules in the kernel require a line buffered stream */
+                for (;;) {
+                        _cleanup_free_ char *buf = NULL, *sbj = NULL, *obj = NULL, *acc1 = NULL, *acc2 = NULL;
+                        int q;
+
+                        q = read_line(policy, NAME_MAX, &buf);
+                        if (q < 0)
+                                return log_error_errno(q, "Failed to read line from '%s': %m", entry->d_name);
+                        if (q == 0)
+                                break;
+
+                        if (isempty(buf) || strchr(COMMENTS, buf[0]))
+                                continue;
+
+                        /* if 3 args -> load rule   : subject object access1 */
+                        /* if 4 args -> change rule : subject object access1 access2 */
+                        /* NOTE(review): sscanf() does not set errno on a mere matching
+                         * failure, so the %m below may print a stale error — confirm. */
+                        if (sscanf(buf, "%ms %ms %ms %ms", &sbj, &obj, &acc1, &acc2) < 3) {
+                                log_error_errno(errno, "Failed to parse rule '%s' in '%s', ignoring.", buf, entry->d_name);
+                                continue;
+                        }
+
+                        /* NOTE(review): a short write() would go undetected here —
+                         * presumably smackfs consumes a rule atomically; verify. */
+                        if (write(isempty(acc2) ? load2_fd : change_fd, buf, strlen(buf)) < 0) {
+                                if (r == 0)
+                                        r = -errno;
+                                log_error_errno(errno, "Failed to write '%s' to '%s' in '%s': %m",
+                                                buf, isempty(acc2) ? "/sys/fs/smackfs/load2" : "/sys/fs/smackfs/change-rule", entry->d_name);
+                        }
+                }
+        }
+
+        return r;
+}
+
+/* Loads Smack/CIPSO rules from every regular file in srcdir into
+ * /sys/fs/smackfs/cipso2, one line at a time. Returns 0 on success, a negative
+ * errno when smackfs is unavailable, a positive errno when srcdir cannot be
+ * opened, otherwise the first per-line error recorded while iterating. */
+static int write_cipso2_rules(const char *srcdir) {
+        _cleanup_close_ int cipso2_fd = -EBADF;
+        _cleanup_closedir_ DIR *dir = NULL;
+        int dfd = -EBADF, r = 0;
+
+        cipso2_fd = open("/sys/fs/smackfs/cipso2", O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
+        if (cipso2_fd < 0)  {
+                if (errno != ENOENT)
+                        log_warning_errno(errno, "Failed to open '/sys/fs/smackfs/cipso2': %m");
+                return -errno; /* negative error */
+        }
+
+        /* write rules to cipso2 from every file in the directory */
+        dir = opendir(srcdir);
+        if (!dir) {
+                if (errno != ENOENT)
+                        log_warning_errno(errno, "Failed to opendir '%s': %m", srcdir);
+                return errno; /* positive on purpose */
+        }
+
+        dfd = dirfd(dir);
+        assert(dfd >= 0);
+
+        FOREACH_DIRENT(entry, dir, return 0) {
+                _cleanup_fclose_ FILE *policy = NULL;
+
+                if (!dirent_is_file(entry))
+                        continue;
+
+                /* fdopen_unlocked_at() records the first failure in r for us. */
+                if (fdopen_unlocked_at(dfd, srcdir, entry->d_name, &r, &policy) < 0)
+                        continue;
+
+                /* cipso2 write rules in the kernel require a line buffered stream */
+                for (;;) {
+                        _cleanup_free_ char *buf = NULL;
+                        int q;
+
+                        q = read_line(policy, NAME_MAX, &buf);
+                        if (q < 0)
+                                return log_error_errno(q, "Failed to read line from '%s': %m", entry->d_name);
+                        if (q == 0)
+                                break;
+
+                        if (isempty(buf) || strchr(COMMENTS, buf[0]))
+                                continue;
+
+                        if (write(cipso2_fd, buf, strlen(buf)) < 0) {
+                                if (r == 0)
+                                        r = -errno;
+                                log_error_errno(errno, "Failed to write '%s' to '/sys/fs/smackfs/cipso2' in '%s': %m",
+                                                buf, entry->d_name);
+                                break;
+                        }
+                }
+        }
+
+        return r;
+}
+
+/* Loads Smack network-host (netlabel) rules from every file in srcdir into
+ * /sys/fs/smackfs/netlabel, flushing after each line. Returns 0 on success, a
+ * negative errno when smackfs is unavailable, a positive errno when srcdir
+ * cannot be opened, otherwise the first per-line error recorded while
+ * iterating. */
+static int write_netlabel_rules(const char *srcdir) {
+        _cleanup_fclose_ FILE *dst = NULL;
+        _cleanup_closedir_ DIR *dir = NULL;
+        int dfd = -EBADF, r = 0;
+
+        dst = fopen("/sys/fs/smackfs/netlabel", "we");
+        if (!dst)  {
+                if (errno != ENOENT)
+                        log_warning_errno(errno, "Failed to open /sys/fs/smackfs/netlabel: %m");
+                return -errno; /* negative error */
+        }
+
+        /* write rules to dst from every file in the directory */
+        dir = opendir(srcdir);
+        if (!dir) {
+                if (errno != ENOENT)
+                        log_warning_errno(errno, "Failed to opendir %s: %m", srcdir);
+                return errno; /* positive on purpose */
+        }
+
+        dfd = dirfd(dir);
+        assert(dfd >= 0);
+
+        FOREACH_DIRENT(entry, dir, return 0) {
+                _cleanup_fclose_ FILE *policy = NULL;
+
+                /* NOTE(review): unlike write_access2_rules()/write_cipso2_rules() there
+                 * is no dirent_is_file() filter here — confirm whether non-regular
+                 * directory entries should be skipped as well. */
+                if (fdopen_unlocked_at(dfd, srcdir, entry->d_name, &r, &policy) < 0)
+                        continue;
+
+                /* load2 write rules in the kernel require a line buffered stream */
+                for (;;) {
+                        _cleanup_free_ char *buf = NULL;
+                        int q;
+
+                        q = read_line(policy, NAME_MAX, &buf);
+                        if (q < 0)
+                                return log_error_errno(q, "Failed to read line from %s: %m", entry->d_name);
+                        if (q == 0)
+                                break;
+
+                        /* fputs() returns EOF — a nonzero value — on failure, hence the
+                         * previous "!fputs(...)" test could never detect an error.
+                         * Compare against EOF explicitly. */
+                        if (fputs(buf, dst) == EOF) {
+                                if (r == 0)
+                                        r = -EINVAL;
+                                log_error_errno(errno, "Failed to write line to /sys/fs/smackfs/netlabel: %m");
+                                break;
+                        }
+                        q = fflush_and_check(dst);
+                        if (q < 0) {
+                                if (r == 0)
+                                        r = q;
+                                log_error_errno(q, "Failed to flush writes to /sys/fs/smackfs/netlabel: %m");
+                                break;
+                        }
+                }
+        }
+
+        return r;
+}
+
+/* Reads label names from /etc/smack/onlycap and writes them, space-separated,
+ * to /sys/fs/smackfs/onlycap. Returns 0 on success (or if the list turns out
+ * empty), positive ENOENT if /etc/smack/onlycap is missing, and a negative
+ * errno on other failures. */
+static int write_onlycap_list(void) {
+        _cleanup_close_ int onlycap_fd = -EBADF;
+        _cleanup_free_ char *list = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        size_t len = 0;
+        int r;
+
+        f = fopen("/etc/smack/onlycap", "re");
+        if (!f) {
+                if (errno != ENOENT)
+                        log_warning_errno(errno, "Failed to read '/etc/smack/onlycap': %m");
+
+                /* Positive ENOENT: the config file is simply absent (non-fatal);
+                 * any other error is returned as a negative errno. */
+                return errno == ENOENT ? ENOENT : -errno;
+        }
+
+        for (;;) {
+                _cleanup_free_ char *buf = NULL;
+                size_t l;
+
+                r = read_line(f, LONG_LINE_MAX, &buf);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to read line from /etc/smack/onlycap: %m");
+                if (r == 0)
+                        break;
+
+                if (isempty(buf) || strchr(COMMENTS, *buf))
+                        continue;
+
+                l = strlen(buf);
+                if (!GREEDY_REALLOC(list, len + l + 1))
+                        return log_oom();
+
+                /* Append the label, then overwrite its trailing NUL with a separator space. */
+                stpcpy(list + len, buf)[0] = ' ';
+                len += l + 1;
+        }
+
+        if (len == 0)
+                return 0;
+
+        /* Turn the final separator into the string terminator. */
+        list[len - 1] = 0;
+
+        onlycap_fd = open("/sys/fs/smackfs/onlycap", O_WRONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
+        if (onlycap_fd < 0) {
+                if (errno != ENOENT)
+                        log_warning_errno(errno, "Failed to open '/sys/fs/smackfs/onlycap': %m");
+                return -errno; /* negative error */
+        }
+
+        /* NOTE(review): write() returns ssize_t and a short write is treated as
+         * success here — presumably fine for a short label list; confirm. */
+        r = write(onlycap_fd, list, len);
+        if (r < 0)
+                return log_error_errno(errno, "Failed to write onlycap list(%s) to '/sys/fs/smackfs/onlycap': %m", list);
+
+        return 0;
+}
+
+#endif
+
+/* Loads Smack policy (access, CIPSO, netlabel, onlycap) from /etc/smack into
+ * the kernel. The helpers return a negative ENOENT when smackfs is not mounted
+ * (Smack disabled in the kernel) and a positive ENOENT when the corresponding
+ * configuration is simply absent; both are tolerated. *loaded_policy is set to
+ * true only after the access rules were loaded successfully. Compiled to a
+ * no-op when ENABLE_SMACK is off. */
+int mac_smack_setup(bool *loaded_policy) {
+
+#if ENABLE_SMACK
+
+        int r;
+
+        assert(loaded_policy);
+
+        r = write_access2_rules("/etc/smack/accesses.d/");
+        switch (r) {
+        case -ENOENT:
+                log_debug("Smack is not enabled in the kernel.");
+                return 0;
+        case ENOENT:
+                log_debug("Smack access rules directory '/etc/smack/accesses.d/' not found");
+                return 0;
+        case 0:
+                log_info("Successfully loaded Smack policies.");
+                break;
+        default:
+                log_warning_errno(r, "Failed to load Smack access rules, ignoring: %m");
+                return 0;
+        }
+
+#if HAVE_SMACK_RUN_LABEL
+        /* Label PID 1 itself and set up default network labeling rules. */
+        r = write_string_file("/proc/self/attr/current", SMACK_RUN_LABEL, WRITE_STRING_FILE_DISABLE_BUFFER);
+        if (r < 0)
+                log_warning_errno(r, "Failed to set SMACK label \"" SMACK_RUN_LABEL "\" on self: %m");
+        r = write_string_file("/sys/fs/smackfs/ambient", SMACK_RUN_LABEL, WRITE_STRING_FILE_DISABLE_BUFFER);
+        if (r < 0)
+                log_warning_errno(r, "Failed to set SMACK ambient label \"" SMACK_RUN_LABEL "\": %m");
+        r = write_string_file("/sys/fs/smackfs/netlabel",
+                              "0.0.0.0/0 " SMACK_RUN_LABEL, WRITE_STRING_FILE_DISABLE_BUFFER);
+        if (r < 0)
+                log_warning_errno(r, "Failed to set SMACK netlabel rule \"0.0.0.0/0 " SMACK_RUN_LABEL "\": %m");
+        r = write_string_file("/sys/fs/smackfs/netlabel", "127.0.0.1 -CIPSO", WRITE_STRING_FILE_DISABLE_BUFFER);
+        if (r < 0)
+                log_warning_errno(r, "Failed to set SMACK netlabel rule \"127.0.0.1 -CIPSO\": %m");
+#endif
+
+        r = write_cipso2_rules("/etc/smack/cipso.d/");
+        switch (r) {
+        case -ENOENT:
+                /* NOTE(review): returning here leaves *loaded_policy unset even though
+                 * the access rules above were already loaded — confirm intended. */
+                log_debug("Smack/CIPSO is not enabled in the kernel.");
+                return 0;
+        case ENOENT:
+                log_debug("Smack/CIPSO access rules directory '/etc/smack/cipso.d/' not found");
+                break;
+        case 0:
+                log_info("Successfully loaded Smack/CIPSO policies.");
+                break;
+        default:
+                log_warning_errno(r, "Failed to load Smack/CIPSO access rules, ignoring: %m");
+                break;
+        }
+
+        r = write_netlabel_rules("/etc/smack/netlabel.d/");
+        switch (r) {
+        case -ENOENT:
+                log_debug("Smack/CIPSO is not enabled in the kernel.");
+                return 0;
+        case ENOENT:
+                log_debug("Smack network host rules directory '/etc/smack/netlabel.d/' not found");
+                break;
+        case 0:
+                log_info("Successfully loaded Smack network host rules.");
+                break;
+        default:
+                log_warning_errno(r, "Failed to load Smack network host rules: %m, ignoring.");
+                break;
+        }
+
+        r = write_onlycap_list();
+        switch (r) {
+        case -ENOENT:
+                log_debug("Smack is not enabled in the kernel.");
+                break;
+        case ENOENT:
+                log_debug("Smack onlycap list file '/etc/smack/onlycap' not found");
+                break;
+        case 0:
+                log_info("Successfully wrote Smack onlycap list.");
+                break;
+        default:
+                /* The only error condition that is propagated to the caller. */
+                return log_struct_errno(LOG_EMERG, r,
+                                        LOG_MESSAGE("Failed to write Smack onlycap list: %m"),
+                                        "MESSAGE_ID=" SD_MESSAGE_SMACK_FAILED_WRITE_STR);
+        }
+
+        *loaded_policy = true;
+
+#endif
+
+        return 0;
+}
diff --git a/src/core/smack-setup.h b/src/core/smack-setup.h
new file mode 100644
index 0000000..d29370d
--- /dev/null
+++ b/src/core/smack-setup.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+/***
+  Copyright © 2013 Intel Corporation
+  Authors:
+        Nathaniel Chen <nathaniel.chen@intel.com>
+***/
+
+/* Loads Smack policy from /etc/smack into the kernel at early boot. Sets
+ * *loaded_policy to true once access rules were loaded successfully. Most
+ * failures are logged and tolerated (returning 0); only a failure to write
+ * the onlycap list is propagated as a negative error. */
+int mac_smack_setup(bool *loaded_policy);
diff --git a/src/core/socket.c b/src/core/socket.c
new file mode 100644
index 0000000..388be62
--- /dev/null
+++ b/src/core/socket.c
@@ -0,0 +1,3617 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <mqueue.h>
+#include <netinet/tcp.h>
+#include <sys/epoll.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <linux/sctp.h>
+
+#include "alloc-util.h"
+#include "bpf-firewall.h"
+#include "bus-error.h"
+#include "bus-util.h"
+#include "chase.h"
+#include "constants.h"
+#include "copy.h"
+#include "dbus-socket.h"
+#include "dbus-unit.h"
+#include "errno-list.h"
+#include "exit-status.h"
+#include "fd-util.h"
+#include "format-util.h"
+#include "in-addr-util.h"
+#include "io-util.h"
+#include "ip-protocol-list.h"
+#include "label-util.h"
+#include "log.h"
+#include "mkdir-label.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "selinux-util.h"
+#include "serialize.h"
+#include "service.h"
+#include "signal-util.h"
+#include "smack-util.h"
+#include "socket.h"
+#include "socket-netlink.h"
+#include "special.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit-name.h"
+#include "unit.h"
+#include "user-util.h"
+
+/* Reference-counted record of one remote peer address, used to track connections
+ * per source (see socket_acquire_peer() and Socket.peers_by_address). */
+struct SocketPeer {
+        unsigned n_ref;                 /* reference count */
+
+        Socket *socket;                 /* owning socket; cleared when the socket goes away */
+        union sockaddr_union peer;      /* remote address as returned by getpeername() */
+        socklen_t peer_salen;           /* valid length of .peer */
+};
+
+/* Maps socket-specific states to the generic unit active states. */
+static const UnitActiveState state_translation_table[_SOCKET_STATE_MAX] = {
+        [SOCKET_DEAD] = UNIT_INACTIVE,
+        [SOCKET_START_PRE] = UNIT_ACTIVATING,
+        [SOCKET_START_CHOWN] = UNIT_ACTIVATING,
+        [SOCKET_START_POST] = UNIT_ACTIVATING,
+        [SOCKET_LISTENING] = UNIT_ACTIVE,
+        [SOCKET_RUNNING] = UNIT_ACTIVE,
+        [SOCKET_STOP_PRE] = UNIT_DEACTIVATING,
+        [SOCKET_STOP_PRE_SIGTERM] = UNIT_DEACTIVATING,
+        [SOCKET_STOP_PRE_SIGKILL] = UNIT_DEACTIVATING,
+        [SOCKET_STOP_POST] = UNIT_DEACTIVATING,
+        [SOCKET_FINAL_SIGTERM] = UNIT_DEACTIVATING,
+        [SOCKET_FINAL_SIGKILL] = UNIT_DEACTIVATING,
+        [SOCKET_FAILED] = UNIT_FAILED,
+        [SOCKET_CLEANING] = UNIT_MAINTENANCE,
+};
+
+static int socket_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static int socket_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata);
+static void flush_ports(Socket *s);
+
+/* UnitVTable.init hook: sets Socket field defaults before the unit file is parsed. */
+static void socket_init(Unit *u) {
+        Socket *s = SOCKET(u);
+
+        assert(u);
+        assert(u->load_state == UNIT_STUB);
+
+        s->backlog = SOMAXCONN_DELUXE;
+        s->timeout_usec = u->manager->defaults.timeout_start_usec;
+        s->directory_mode = 0755;
+        s->socket_mode = 0666;
+
+        s->max_connections = 64;
+
+        /* Negative values mean "not configured" for these socket options
+         * (see the ">= 0" checks in socket_dump()). */
+        s->priority = -1;
+        s->ip_tos = -1;
+        s->ip_ttl = -1;
+        s->mark = -1;
+
+        s->exec_context.std_output = u->manager->defaults.std_output;
+        s->exec_context.std_error = u->manager->defaults.std_error;
+
+        s->control_pid = PIDREF_NULL;
+        s->control_command_id = _SOCKET_EXEC_COMMAND_INVALID;
+
+        s->trigger_limit = RATELIMIT_OFF;
+
+        /* Placeholder markers; replaced with real defaults in socket_add_extras(). */
+        s->poll_limit_interval = USEC_INFINITY;
+        s->poll_limit_burst = UINT_MAX;
+}
+
+/* Stops watching the control process (if any) and invalidates the stored PID reference. */
+static void socket_unwatch_control_pid(Socket *s) {
+        assert(s);
+
+        if (!pidref_is_set(&s->control_pid))
+                return;
+
+        unit_unwatch_pidref(UNIT(s), &s->control_pid);
+        pidref_done(&s->control_pid);
+}
+
+/* Closes and frees the auxiliary file descriptors attached to a port. */
+static void socket_cleanup_fd_list(SocketPort *p) {
+        assert(p);
+
+        close_many(p->auxiliary_fds, p->n_auxiliary_fds);
+        p->auxiliary_fds = mfree(p->auxiliary_fds);
+        p->n_auxiliary_fds = 0;
+}
+
+/* Frees one SocketPort, disabling its event source and closing its fds.
+ * Returns NULL, for convenient "p = socket_port_free(p);" assignment. */
+SocketPort *socket_port_free(SocketPort *p) {
+        if (!p)
+                return NULL;
+
+        sd_event_source_unref(p->event_source);
+
+        socket_cleanup_fd_list(p);
+        safe_close(p->fd);
+        free(p->path);
+
+        return mfree(p);
+}
+
+/* Frees all configured ports of a socket unit. */
+void socket_free_ports(Socket *s) {
+        assert(s);
+
+        LIST_CLEAR(port, s->ports, socket_port_free);
+}
+
+/* UnitVTable.done hook: releases all resources owned by the Socket object. */
+static void socket_done(Unit *u) {
+        Socket *s = SOCKET(u);
+        SocketPeer *p;
+
+        assert(s);
+
+        socket_free_ports(s);
+
+        /* Detach any outstanding peer records so socket_peer_free() won't touch
+         * this socket after it is gone. */
+        while ((p = set_steal_first(s->peers_by_address)))
+                p->socket = NULL;
+
+        s->peers_by_address = set_free(s->peers_by_address);
+
+        s->exec_runtime = exec_runtime_free(s->exec_runtime);
+        exec_command_free_array(s->exec_command, _SOCKET_EXEC_COMMAND_MAX);
+        s->control_command = NULL;
+
+        socket_unwatch_control_pid(s);
+
+        unit_ref_unset(&s->service);
+
+        s->tcp_congestion = mfree(s->tcp_congestion);
+        s->bind_to_device = mfree(s->bind_to_device);
+
+        s->smack = mfree(s->smack);
+        s->smack_ip_in = mfree(s->smack_ip_in);
+        s->smack_ip_out = mfree(s->smack_ip_out);
+
+        strv_free(s->symlinks);
+
+        s->user = mfree(s->user);
+        s->group = mfree(s->group);
+
+        s->fdname = mfree(s->fdname);
+
+        s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+}
+
+/* (Re-)arms the state-transition timeout; usec is interpreted as relative when
+ * 'relative' is set. */
+static int socket_arm_timer(Socket *s, bool relative, usec_t usec) {
+        assert(s);
+
+        return unit_arm_timer(UNIT(s), &s->timer_event_source, relative, usec, socket_dispatch_timer);
+}
+
+/* Returns true if this unit has at least one listening entry that is not a
+ * per-connection accepting socket — trivially so if Accept=no. */
+static bool have_non_accept_socket(Socket *s) {
+        assert(s);
+
+        if (!s->accept)
+                return true;
+
+        LIST_FOREACH(port, p, s->ports) {
+
+                if (p->type != SOCKET_SOCKET)
+                        return true;
+
+                if (!socket_address_can_accept(&p->address))
+                        return true;
+        }
+
+        return false;
+}
+
+/* Adds requires-mounts-for dependencies for every file system path this socket
+ * listens on (AF_UNIX paths, FIFOs, special files, USB function paths). */
+static int socket_add_mount_dependencies(Socket *s) {
+        int r;
+
+        assert(s);
+
+        LIST_FOREACH(port, p, s->ports) {
+                const char *path = NULL;
+
+                if (p->type == SOCKET_SOCKET)
+                        path = socket_address_get_path(&p->address);
+                else if (IN_SET(p->type, SOCKET_FIFO, SOCKET_SPECIAL, SOCKET_USB_FUNCTION))
+                        path = p->path;
+
+                if (!path)
+                        continue;
+
+                r = unit_require_mounts_for(UNIT(s), path, UNIT_DEPENDENCY_FILE);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+/* Adds a BindsTo= dependency on the network device configured via
+ * BindToDevice=, except for the always-present loopback device. */
+static int socket_add_device_dependencies(Socket *s) {
+        char *t;
+
+        assert(s);
+
+        if (!s->bind_to_device || streq(s->bind_to_device, "lo"))
+                return 0;
+
+        /* strjoina() allocates on the stack, hence no free() of t. */
+        t = strjoina("/sys/subsystem/net/devices/", s->bind_to_device);
+        return unit_add_node_dependency(UNIT(s), t, UNIT_BINDS_TO, UNIT_DEPENDENCY_FILE);
+}
+
+/* Adds the standard dependencies (ordering before sockets.target, after
+ * sysinit.target on the system instance, conflicting with shutdown.target),
+ * unless DefaultDependencies=no. */
+static int socket_add_default_dependencies(Socket *s) {
+        int r;
+        assert(s);
+
+        if (!UNIT(s)->default_dependencies)
+                return 0;
+
+        r = unit_add_dependency_by_name(UNIT(s), UNIT_BEFORE, SPECIAL_SOCKETS_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+        if (r < 0)
+                return r;
+
+        if (MANAGER_IS_SYSTEM(UNIT(s)->manager)) {
+                r = unit_add_two_dependencies_by_name(UNIT(s), UNIT_AFTER, UNIT_REQUIRES, SPECIAL_SYSINIT_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+                if (r < 0)
+                        return r;
+        }
+
+        return unit_add_two_dependencies_by_name(UNIT(s), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+}
+
+/* Returns true if any Exec*= command is configured for this socket unit. */
+static bool socket_has_exec(Socket *s) {
+        unsigned i;
+        assert(s);
+
+        for (i = 0; i < _SOCKET_EXEC_COMMAND_MAX; i++)
+                if (s->exec_command[i])
+                        return true;
+
+        return false;
+}
+
+/* Fills in derived defaults (trigger/poll rate limits) and implicit
+ * dependencies (triggered service, mounts, devices, exec deps, slice, default
+ * deps) after the unit file was parsed. */
+static int socket_add_extras(Socket *s) {
+        Unit *u = UNIT(s);
+        int r;
+
+        assert(s);
+
+        /* Pick defaults for the trigger limit, if nothing was explicitly configured. We pick a relatively high limit
+         * in Accept=yes mode, and a lower limit for Accept=no. Reason: in Accept=yes mode we are invoking accept()
+         * ourselves before the trigger limit can hit, thus incoming connections are taken off the socket queue quickly
+         * and reliably. This is different for Accept=no, where the spawned service has to take the incoming traffic
+         * off the queues, which it might not necessarily do. Moreover, while Accept=no services are supposed to
+         * process whatever is queued in one go, and thus should normally never have to be started frequently. This is
+         * different for Accept=yes where each connection is processed by a new service instance, and thus frequent
+         * service starts are typical.
+         *
+         * For the poll limit we follow a similar rule, but use 3/4th of the trigger limit parameters, to
+         * trigger this earlier. */
+
+        if (s->trigger_limit.interval == USEC_INFINITY)
+                s->trigger_limit.interval = 2 * USEC_PER_SEC;
+        if (s->trigger_limit.burst == UINT_MAX)
+                s->trigger_limit.burst = s->accept ? 200 : 20;
+
+        if (s->poll_limit_interval == USEC_INFINITY)
+                s->poll_limit_interval = 2 * USEC_PER_SEC;
+        if (s->poll_limit_burst == UINT_MAX)
+                s->poll_limit_burst = s->accept ? 150 : 15;
+
+        if (have_non_accept_socket(s)) {
+
+                /* Non-accepting sockets are tied to a single service unit; load it
+                 * by name if no explicit Service= was configured. */
+                if (!UNIT_DEREF(s->service)) {
+                        Unit *x;
+
+                        r = unit_load_related_unit(u, ".service", &x);
+                        if (r < 0)
+                                return r;
+
+                        unit_ref_set(&s->service, u, x);
+                }
+
+                r = unit_add_two_dependencies(u, UNIT_BEFORE, UNIT_TRIGGERS, UNIT_DEREF(s->service), true, UNIT_DEPENDENCY_IMPLICIT);
+                if (r < 0)
+                        return r;
+        }
+
+        r = socket_add_mount_dependencies(s);
+        if (r < 0)
+                return r;
+
+        r = socket_add_device_dependencies(s);
+        if (r < 0)
+                return r;
+
+        r = unit_patch_contexts(u);
+        if (r < 0)
+                return r;
+
+        if (socket_has_exec(s)) {
+                r = unit_add_exec_dependencies(u, &s->exec_context);
+                if (r < 0)
+                        return r;
+        }
+
+        r = unit_set_default_slice(u);
+        if (r < 0)
+                return r;
+
+        r = socket_add_default_dependencies(s);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+/* Returns the single file system path this socket listens on (a FIFO path or a
+ * path-based socket address), or NULL if there is none or more than one —
+ * Symlinks= needs an unambiguous target. */
+static const char *socket_find_symlink_target(Socket *s) {
+        const char *found = NULL;
+
+        LIST_FOREACH(port, p, s->ports) {
+                const char *f = NULL;
+
+                switch (p->type) {
+
+                case SOCKET_FIFO:
+                        f = p->path;
+                        break;
+
+                case SOCKET_SOCKET:
+                        f = socket_address_get_path(&p->address);
+                        break;
+
+                default:
+                        break;
+                }
+
+                if (f) {
+                        /* A second path makes the symlink target ambiguous. */
+                        if (found)
+                                return NULL;
+
+                        found = f;
+                }
+        }
+
+        return found;
+}
+
+/* Validates the fully-loaded configuration; any ENOEXEC-style error returned
+ * here marks the unit as unloadable. */
+static int socket_verify(Socket *s) {
+        assert(s);
+        assert(UNIT(s)->load_state == UNIT_LOADED);
+
+        if (!s->ports)
+                return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Unit has no Listen setting (ListenStream=, ListenDatagram=, ListenFIFO=, ...). Refusing.");
+
+        if (s->accept && have_non_accept_socket(s))
+                return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Unit configured for accepting sockets, but sockets are non-accepting. Refusing.");
+
+        if (s->accept && s->max_connections <= 0)
+                return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "MaxConnection= setting too small. Refusing.");
+
+        if (s->accept && UNIT_DEREF(s->service))
+                return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Explicit service configuration for accepting socket units not supported. Refusing.");
+
+        if (s->exec_context.pam_name && s->kill_context.kill_mode != KILL_CONTROL_GROUP)
+                return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Unit has PAM enabled. Kill mode must be set to 'control-group'. Refusing.");
+
+        if (!strv_isempty(s->symlinks) && !socket_find_symlink_target(s))
+                return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Unit has symlinks set but none or more than one node in the file system. Refusing.");
+
+        return 0;
+}
+
+/* Hashes only the peer's address — not the port — so that all connections from
+ * the same source address map to the same SocketPeer entry. */
+static void peer_address_hash_func(const SocketPeer *s, struct siphash *state) {
+        assert(s);
+
+        if (s->peer.sa.sa_family == AF_INET)
+                siphash24_compress(&s->peer.in.sin_addr, sizeof(s->peer.in.sin_addr), state);
+        else if (s->peer.sa.sa_family == AF_INET6)
+                siphash24_compress(&s->peer.in6.sin6_addr, sizeof(s->peer.in6.sin6_addr), state);
+        else if (s->peer.sa.sa_family == AF_VSOCK)
+                siphash24_compress(&s->peer.vm.svm_cid, sizeof(s->peer.vm.svm_cid), state);
+        else
+                assert_not_reached();
+}
+
+/* Orders peers by family first, then by address only (ports ignored), matching
+ * peer_address_hash_func() above. */
+static int peer_address_compare_func(const SocketPeer *x, const SocketPeer *y) {
+        int r;
+
+        r = CMP(x->peer.sa.sa_family, y->peer.sa.sa_family);
+        if (r != 0)
+                return r;
+
+        switch (x->peer.sa.sa_family) {
+        case AF_INET:
+                return memcmp(&x->peer.in.sin_addr, &y->peer.in.sin_addr, sizeof(x->peer.in.sin_addr));
+        case AF_INET6:
+                return memcmp(&x->peer.in6.sin6_addr, &y->peer.in6.sin6_addr, sizeof(x->peer.in6.sin6_addr));
+        case AF_VSOCK:
+                return CMP(x->peer.vm.svm_cid, y->peer.vm.svm_cid);
+        }
+        assert_not_reached();
+}
+
+DEFINE_PRIVATE_HASH_OPS(peer_address_hash_ops, SocketPeer, peer_address_hash_func, peer_address_compare_func);
+
+/* UnitVTable.load hook: parses the unit fragment and drop-ins, then adds the
+ * implicit extras and verifies the resulting configuration. */
+static int socket_load(Unit *u) {
+        Socket *s = SOCKET(u);
+        int r;
+
+        assert(u);
+        assert(u->load_state == UNIT_STUB);
+
+        r = unit_load_fragment_and_dropin(u, true);
+        if (r < 0)
+                return r;
+
+        if (u->load_state != UNIT_LOADED)
+                return 0;
+
+        /* This is a new unit? Then let's add in some extras */
+        r = socket_add_extras(s);
+        if (r < 0)
+                return r;
+
+        return socket_verify(s);
+}
+
+/* Allocates a SocketPeer holding one reference; returns NULL on OOM. */
+static SocketPeer *socket_peer_new(void) {
+        SocketPeer *p;
+
+        p = new(SocketPeer, 1);
+        if (!p)
+                return NULL;
+
+        *p = (SocketPeer) {
+                .n_ref = 1,
+        };
+        return p;
+}
+
+/* Destroys a SocketPeer, detaching it from its socket's per-address set first.
+ * p->socket may already be NULL if the socket was freed earlier (see socket_done()). */
+static SocketPeer *socket_peer_free(SocketPeer *p) {
+        assert(p);
+
+        if (p->socket)
+                set_remove(p->socket->peers_by_address, p);
+
+        return mfree(p);
+}
+
+DEFINE_TRIVIAL_REF_UNREF_FUNC(SocketPeer, socket_peer, socket_peer_free);
+
+/* Looks up — or creates — the SocketPeer entry tracking the remote peer of fd.
+ * Returns 1 with a new reference in *ret for INET/INET6/VSOCK peers, 0 with
+ * *ret set to NULL for address families that are not tracked, negative errno
+ * on failure. */
+int socket_acquire_peer(Socket *s, int fd, SocketPeer **ret) {
+        _cleanup_(socket_peer_unrefp) SocketPeer *remote = NULL;
+        SocketPeer sa = {
+                .peer_salen = sizeof(union sockaddr_union),
+        }, *i;
+        int r;
+
+        assert(fd >= 0);
+        assert(s);
+        assert(ret);
+
+        if (getpeername(fd, &sa.peer.sa, &sa.peer_salen) < 0)
+                return log_unit_error_errno(UNIT(s), errno, "getpeername() failed: %m");
+
+        if (!IN_SET(sa.peer.sa.sa_family, AF_INET, AF_INET6, AF_VSOCK)) {
+                *ret = NULL;
+                return 0;
+        }
+
+        /* An existing entry for this source address? Just take a reference. */
+        i = set_get(s->peers_by_address, &sa);
+        if (i) {
+                *ret = socket_peer_ref(i);
+                return 1;
+        }
+
+        remote = socket_peer_new();
+        if (!remote)
+                return log_oom();
+
+        remote->peer = sa.peer;
+        remote->peer_salen = sa.peer_salen;
+
+        r = set_ensure_put(&s->peers_by_address, &peer_address_hash_ops, remote);
+        if (r < 0)
+                return log_unit_error_errno(UNIT(s), r, "Failed to insert peer info into hash table: %m");
+
+        remote->socket = s;
+
+        *ret = TAKE_PTR(remote);
+        return 1;
+}
+
+/* Returns the configuration directive name (e.g. "ListenStream") matching a
+ * socket family/type pair, for use in dump output. */
+static const char* listen_lookup(int family, int type) {
+
+        if (family == AF_NETLINK)
+                return "ListenNetlink";
+
+        if (type == SOCK_STREAM)
+                return "ListenStream";
+        else if (type == SOCK_DGRAM)
+                return "ListenDatagram";
+        else if (type == SOCK_SEQPACKET)
+                return "ListenSequentialPacket";
+
+        /* assert_not_reached() does not return, hence no dead "return NULL" after
+         * it — consistent with peer_address_compare_func() above. */
+        assert_not_reached();
+}
+
+/* UnitVTable.dump hook: writes the socket's parsed configuration and runtime
+ * state to f for debugging output, one "Name: value" line per setting, each
+ * prefixed with 'prefix'. Optional settings are printed only when configured. */
+static void socket_dump(Unit *u, FILE *f, const char *prefix) {
+        Socket *s = SOCKET(u);
+        const char *prefix2, *str;
+
+        assert(s);
+        assert(f);
+
+        prefix = strempty(prefix);
+        prefix2 = strjoina(prefix, "\t");
+
+        fprintf(f,
+                "%sSocket State: %s\n"
+                "%sResult: %s\n"
+                "%sClean Result: %s\n"
+                "%sBindIPv6Only: %s\n"
+                "%sBacklog: %u\n"
+                "%sSocketMode: %04o\n"
+                "%sDirectoryMode: %04o\n"
+                "%sKeepAlive: %s\n"
+                "%sNoDelay: %s\n"
+                "%sFreeBind: %s\n"
+                "%sTransparent: %s\n"
+                "%sBroadcast: %s\n"
+                "%sPassCredentials: %s\n"
+                "%sPassSecurity: %s\n"
+                "%sPassPacketInfo: %s\n"
+                "%sTCPCongestion: %s\n"
+                "%sRemoveOnStop: %s\n"
+                "%sWritable: %s\n"
+                "%sFileDescriptorName: %s\n"
+                "%sSELinuxContextFromNet: %s\n",
+                prefix, socket_state_to_string(s->state),
+                prefix, socket_result_to_string(s->result),
+                prefix, socket_result_to_string(s->clean_result),
+                prefix, socket_address_bind_ipv6_only_to_string(s->bind_ipv6_only),
+                prefix, s->backlog,
+                prefix, s->socket_mode,
+                prefix, s->directory_mode,
+                prefix, yes_no(s->keep_alive),
+                prefix, yes_no(s->no_delay),
+                prefix, yes_no(s->free_bind),
+                prefix, yes_no(s->transparent),
+                prefix, yes_no(s->broadcast),
+                prefix, yes_no(s->pass_cred),
+                prefix, yes_no(s->pass_sec),
+                prefix, yes_no(s->pass_pktinfo),
+                prefix, strna(s->tcp_congestion),
+                prefix, yes_no(s->remove_on_stop),
+                prefix, yes_no(s->writable),
+                prefix, socket_fdname(s),
+                prefix, yes_no(s->selinux_context_from_net));
+
+        if (s->timestamping != SOCKET_TIMESTAMPING_OFF)
+                fprintf(f,
+                        "%sTimestamping: %s\n",
+                        prefix, socket_timestamping_to_string(s->timestamping));
+
+        if (pidref_is_set(&s->control_pid))
+                fprintf(f,
+                        "%sControl PID: "PID_FMT"\n",
+                        prefix, s->control_pid.pid);
+
+        if (s->bind_to_device)
+                fprintf(f,
+                        "%sBindToDevice: %s\n",
+                        prefix, s->bind_to_device);
+
+        if (s->accept)
+                fprintf(f,
+                        "%sAccepted: %u\n"
+                        "%sNConnections: %u\n"
+                        "%sMaxConnections: %u\n"
+                        "%sMaxConnectionsPerSource: %u\n",
+                        prefix, s->n_accepted,
+                        prefix, s->n_connections,
+                        prefix, s->max_connections,
+                        prefix, s->max_connections_per_source);
+        else
+                fprintf(f,
+                        "%sFlushPending: %s\n",
+                        prefix, yes_no(s->flush_pending));
+
+
+        if (s->priority >= 0)
+                fprintf(f,
+                        "%sPriority: %i\n",
+                        prefix, s->priority);
+
+        if (s->receive_buffer > 0)
+                fprintf(f,
+                        "%sReceiveBuffer: %zu\n",
+                        prefix, s->receive_buffer);
+
+        if (s->send_buffer > 0)
+                fprintf(f,
+                        "%sSendBuffer: %zu\n",
+                        prefix, s->send_buffer);
+
+        if (s->ip_tos >= 0)
+                fprintf(f,
+                        "%sIPTOS: %i\n",
+                        prefix, s->ip_tos);
+
+        if (s->ip_ttl >= 0)
+                fprintf(f,
+                        "%sIPTTL: %i\n",
+                        prefix, s->ip_ttl);
+
+        if (s->pipe_size > 0)
+                fprintf(f,
+                        "%sPipeSize: %zu\n",
+                        prefix, s->pipe_size);
+
+        if (s->mark >= 0)
+                fprintf(f,
+                        "%sMark: %i\n",
+                        prefix, s->mark);
+
+        if (s->mq_maxmsg > 0)
+                fprintf(f,
+                        "%sMessageQueueMaxMessages: %li\n",
+                        prefix, s->mq_maxmsg);
+
+        if (s->mq_msgsize > 0)
+                fprintf(f,
+                        "%sMessageQueueMessageSize: %li\n",
+                        prefix, s->mq_msgsize);
+
+        if (s->reuse_port)
+                fprintf(f,
+                        "%sReusePort: %s\n",
+                        prefix, yes_no(s->reuse_port));
+
+        if (s->smack)
+                fprintf(f,
+                        "%sSmackLabel: %s\n",
+                        prefix, s->smack);
+
+        if (s->smack_ip_in)
+                fprintf(f,
+                        "%sSmackLabelIPIn: %s\n",
+                        prefix, s->smack_ip_in);
+
+        if (s->smack_ip_out)
+                fprintf(f,
+                        "%sSmackLabelIPOut: %s\n",
+                        prefix, s->smack_ip_out);
+
+        if (!isempty(s->user) || !isempty(s->group))
+                fprintf(f,
+                        "%sSocketUser: %s\n"
+                        "%sSocketGroup: %s\n",
+                        prefix, strna(s->user),
+                        prefix, strna(s->group));
+
+        if (timestamp_is_set(s->keep_alive_time))
+                fprintf(f,
+                        "%sKeepAliveTimeSec: %s\n",
+                        prefix, FORMAT_TIMESPAN(s->keep_alive_time, USEC_PER_SEC));
+
+        if (s->keep_alive_interval > 0)
+                fprintf(f,
+                        "%sKeepAliveIntervalSec: %s\n",
+                        prefix, FORMAT_TIMESPAN(s->keep_alive_interval, USEC_PER_SEC));
+
+        if (s->keep_alive_cnt > 0)
+                fprintf(f,
+                        "%sKeepAliveProbes: %u\n",
+                        prefix, s->keep_alive_cnt);
+
+        if (s->defer_accept > 0)
+                fprintf(f,
+                        "%sDeferAcceptSec: %s\n",
+                        prefix, FORMAT_TIMESPAN(s->defer_accept, USEC_PER_SEC));
+
+        /* One line per configured listen entry, labelled by directive name. */
+        LIST_FOREACH(port, p, s->ports) {
+
+                switch (p->type) {
+                case SOCKET_SOCKET: {
+                        _cleanup_free_ char *k = NULL;
+                        int r;
+
+                        r = socket_address_print(&p->address, &k);
+                        if (r < 0) {
+                                errno = -r;
+                                fprintf(f, "%s%s: %m\n", prefix, listen_lookup(socket_address_family(&p->address), p->address.type));
+                        } else
+                                fprintf(f, "%s%s: %s\n", prefix, listen_lookup(socket_address_family(&p->address), p->address.type), k);
+                        break;
+                }
+                case SOCKET_SPECIAL:
+                        fprintf(f, "%sListenSpecial: %s\n", prefix, p->path);
+                        break;
+                case SOCKET_USB_FUNCTION:
+                        fprintf(f, "%sListenUSBFunction: %s\n", prefix, p->path);
+                        break;
+                case SOCKET_MQUEUE:
+                        fprintf(f, "%sListenMessageQueue: %s\n", prefix, p->path);
+                        break;
+                default:
+                        fprintf(f, "%sListenFIFO: %s\n", prefix, p->path);
+                }
+        }
+
+        fprintf(f,
+                "%sTriggerLimitIntervalSec: %s\n"
+                "%sTriggerLimitBurst: %u\n"
+                "%sPollLimitIntervalSec: %s\n"
+                "%sPollLimitBurst: %u\n",
+                prefix, FORMAT_TIMESPAN(s->trigger_limit.interval, USEC_PER_SEC),
+                prefix, s->trigger_limit.burst,
+                prefix, FORMAT_TIMESPAN(s->poll_limit_interval, USEC_PER_SEC),
+                prefix, s->poll_limit_burst);
+
+        str = ip_protocol_to_name(s->socket_protocol);
+        if (str)
+                fprintf(f, "%sSocketProtocol: %s\n", prefix, str);
+
+        if (!strv_isempty(s->symlinks)) {
+                fprintf(f, "%sSymlinks:", prefix);
+                STRV_FOREACH(q, s->symlinks)
+                        fprintf(f, " %s", *q);
+
+                fprintf(f, "\n");
+        }
+
+        fprintf(f,
+                "%sTimeoutSec: %s\n",
+                prefix, FORMAT_TIMESPAN(s->timeout_usec, USEC_PER_SEC));
+
+        exec_context_dump(&s->exec_context, f, prefix);
+        kill_context_dump(&s->kill_context, f, prefix);
+
+        for (SocketExecCommand c = 0; c < _SOCKET_EXEC_COMMAND_MAX; c++) {
+                if (!s->exec_command[c])
+                        continue;
+
+                fprintf(f, "%s-> %s:\n",
+                        prefix, socket_exec_command_to_string(c));
+
+                exec_command_dump_list(s->exec_command[c], f, prefix2);
+        }
+
+        cgroup_context_dump(UNIT(s), f, prefix);
+}
+
+/* Derives a unique unit instance name for an accepted connection from its
+ * local and remote endpoints, prefixed with the connection counter nr. The
+ * allocated string is returned in *instance (caller owns it). Returns 0 on
+ * success, negative errno on failure. */
+static int instance_from_socket(int fd, unsigned nr, char **instance) {
+        socklen_t l;
+        char *r;
+        union sockaddr_union local, remote;
+
+        assert(fd >= 0);
+        assert(instance);
+
+        l = sizeof(local);
+        if (getsockname(fd, &local.sa, &l) < 0)
+                return -errno;
+
+        l = sizeof(remote);
+        if (getpeername(fd, &remote.sa, &l) < 0)
+                return -errno;
+
+        switch (local.sa.sa_family) {
+
+        case AF_INET: {
+                /* "nr-local:port-remote:port" in dotted-quad notation. */
+                uint32_t
+                        a = be32toh(local.in.sin_addr.s_addr),
+                        b = be32toh(remote.in.sin_addr.s_addr);
+
+                if (asprintf(&r,
+                             "%u-%u.%u.%u.%u:%u-%u.%u.%u.%u:%u",
+                             nr,
+                             a >> 24, (a >> 16) & 0xFF, (a >> 8) & 0xFF, a & 0xFF,
+                             be16toh(local.in.sin_port),
+                             b >> 24, (b >> 16) & 0xFF, (b >> 8) & 0xFF, b & 0xFF,
+                             be16toh(remote.in.sin_port)) < 0)
+                        return -ENOMEM;
+
+                break;
+        }
+
+        case AF_INET6: {
+                /* IPv4-mapped IPv6 addresses are rendered as plain IPv4. */
+                static const unsigned char ipv4_prefix[] = {
+                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF
+                };
+
+                if (memcmp(&local.in6.sin6_addr, ipv4_prefix, sizeof(ipv4_prefix)) == 0 &&
+                    memcmp(&remote.in6.sin6_addr, ipv4_prefix, sizeof(ipv4_prefix)) == 0) {
+                        const uint8_t
+                                *a = local.in6.sin6_addr.s6_addr+12,
+                                *b = remote.in6.sin6_addr.s6_addr+12;
+
+                        if (asprintf(&r,
+                                     "%u-%u.%u.%u.%u:%u-%u.%u.%u.%u:%u",
+                                     nr,
+                                     a[0], a[1], a[2], a[3],
+                                     be16toh(local.in6.sin6_port),
+                                     b[0], b[1], b[2], b[3],
+                                     be16toh(remote.in6.sin6_port)) < 0)
+                                return -ENOMEM;
+                } else {
+                        if (asprintf(&r,
+                                     "%u-%s:%u-%s:%u",
+                                     nr,
+                                     IN6_ADDR_TO_STRING(&local.in6.sin6_addr),
+                                     be16toh(local.in6.sin6_port),
+                                     IN6_ADDR_TO_STRING(&remote.in6.sin6_addr),
+                                     be16toh(remote.in6.sin6_port)) < 0)
+                                return -ENOMEM;
+                }
+
+                break;
+        }
+
+        case AF_UNIX: {
+                /* AF_UNIX carries no address; identify the peer by its credentials. */
+                struct ucred ucred;
+                int k;
+
+                k = getpeercred(fd, &ucred);
+                if (k >= 0) {
+                        if (asprintf(&r,
+                                     "%u-"PID_FMT"-"UID_FMT,
+                                     nr, ucred.pid, ucred.uid) < 0)
+                                return -ENOMEM;
+                } else if (k == -ENODATA) {
+                        /* This handles the case where somebody is
+                         * connecting from another pid/uid namespace
+                         * (e.g. from outside of our container). */
+                        if (asprintf(&r,
+                                     "%u-unknown",
+                                     nr) < 0)
+                                return -ENOMEM;
+                } else
+                        return k;
+
+                break;
+        }
+
+        case AF_VSOCK:
+                if (asprintf(&r,
+                             "%u-%u:%u-%u:%u",
+                             nr,
+                             local.vm.svm_cid, local.vm.svm_port,
+                             remote.vm.svm_cid, remote.vm.svm_port) < 0)
+                        return -ENOMEM;
+
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        *instance = r;
+        return 0;
+}
+
+/* Close all listening fds of the socket and release their event sources. If RemoveOnStop= is set,
+ * additionally remove the file-system objects (FIFOs, message queues, socket files) backing ports
+ * that were actually open, as well as all configured symlinks. */
+static void socket_close_fds(Socket *s) {
+ assert(s);
+
+ LIST_FOREACH(port, p, s->ports) {
+ bool was_open;
+
+ was_open = p->fd >= 0;
+
+ p->event_source = sd_event_source_disable_unref(p->event_source);
+ p->fd = safe_close(p->fd);
+ socket_cleanup_fd_list(p);
+
+ /* One little note: we should normally not delete any sockets in the file system here! After all some
+ * other process we spawned might still have a reference of this fd and wants to continue to use
+ * it. Therefore we normally delete sockets in the file system before we create a new one, not after we
+ * stopped using one! That all said, if the user explicitly requested this, we'll delete them here
+ * anyway, but only then. */
+
+ if (!was_open || !s->remove_on_stop)
+ continue;
+
+ switch (p->type) {
+
+ case SOCKET_FIFO:
+ (void) unlink(p->path);
+ break;
+
+ case SOCKET_MQUEUE:
+ (void) mq_unlink(p->path);
+ break;
+
+ case SOCKET_SOCKET:
+ (void) socket_address_unlink(&p->address);
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ /* Symlinks are only cleaned up when RemoveOnStop= is enabled, matching the port handling above. */
+ if (s->remove_on_stop)
+ STRV_FOREACH(i, s->symlinks)
+ (void) unlink(*i);
+
+ /* Note that we don't return NULL here, since s has not been freed. */
+}
+
+/* Apply all configured per-socket options (keep-alive, buffer sizes, TOS/TTL, SMACK labels, …) to
+ * the freshly created listening fd. Failures are logged but deliberately non-fatal: a socket that
+ * cannot take some tuning option is still usable for listening. */
+static void socket_apply_socket_options(Socket *s, SocketPort *p, int fd) {
+ int r;
+
+ assert(s);
+ assert(p);
+ assert(fd >= 0);
+
+ if (s->keep_alive) {
+ r = setsockopt_int(fd, SOL_SOCKET, SO_KEEPALIVE, true);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "SO_KEEPALIVE failed: %m");
+ }
+
+ /* The keep-alive times are stored in microseconds; the TCP options take seconds. */
+ if (timestamp_is_set(s->keep_alive_time)) {
+ r = setsockopt_int(fd, SOL_TCP, TCP_KEEPIDLE, s->keep_alive_time / USEC_PER_SEC);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "TCP_KEEPIDLE failed: %m");
+ }
+
+ if (s->keep_alive_interval > 0) {
+ r = setsockopt_int(fd, SOL_TCP, TCP_KEEPINTVL, s->keep_alive_interval / USEC_PER_SEC);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "TCP_KEEPINTVL failed: %m");
+ }
+
+ if (s->keep_alive_cnt > 0) {
+ r = setsockopt_int(fd, SOL_TCP, TCP_KEEPCNT, s->keep_alive_cnt);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "TCP_KEEPCNT failed: %m");
+ }
+
+ if (s->defer_accept > 0) {
+ r = setsockopt_int(fd, SOL_TCP, TCP_DEFER_ACCEPT, s->defer_accept / USEC_PER_SEC);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "TCP_DEFER_ACCEPT failed: %m");
+ }
+
+ /* NoDelay= maps to the protocol-appropriate option: SCTP_NODELAY for SCTP, TCP_NODELAY otherwise. */
+ if (s->no_delay) {
+ if (s->socket_protocol == IPPROTO_SCTP) {
+ r = setsockopt_int(fd, SOL_SCTP, SCTP_NODELAY, true);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "SCTP_NODELAY failed: %m");
+ } else {
+ r = setsockopt_int(fd, SOL_TCP, TCP_NODELAY, true);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "TCP_NODELAY failed: %m");
+ }
+ }
+
+ if (s->broadcast) {
+ r = setsockopt_int(fd, SOL_SOCKET, SO_BROADCAST, true);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "SO_BROADCAST failed: %m");
+ }
+
+ if (s->pass_cred) {
+ r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "SO_PASSCRED failed: %m");
+ }
+
+ if (s->pass_sec) {
+ r = setsockopt_int(fd, SOL_SOCKET, SO_PASSSEC, true);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "SO_PASSSEC failed: %m");
+ }
+
+ if (s->pass_pktinfo) {
+ r = socket_set_recvpktinfo(fd, socket_address_family(&p->address), true);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "Failed to enable packet info socket option: %m");
+ }
+
+ if (s->timestamping != SOCKET_TIMESTAMPING_OFF) {
+ r = setsockopt_int(fd, SOL_SOCKET,
+ s->timestamping == SOCKET_TIMESTAMPING_NS ? SO_TIMESTAMPNS : SO_TIMESTAMP,
+ true);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "Failed to enable timestamping socket option, ignoring: %m");
+ }
+
+ /* priority/mark/ip_tos use < 0 as "not configured", hence the >= 0 checks. */
+ if (s->priority >= 0) {
+ r = setsockopt_int(fd, SOL_SOCKET, SO_PRIORITY, s->priority);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "SO_PRIORITY failed: %m");
+ }
+
+ if (s->receive_buffer > 0) {
+ r = fd_set_rcvbuf(fd, s->receive_buffer, false);
+ if (r < 0)
+ log_unit_full_errno(UNIT(s), ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
+ "SO_RCVBUF/SO_RCVBUFFORCE failed: %m");
+ }
+
+ if (s->send_buffer > 0) {
+ r = fd_set_sndbuf(fd, s->send_buffer, false);
+ if (r < 0)
+ log_unit_full_errno(UNIT(s), ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
+ "SO_SNDBUF/SO_SNDBUFFORCE failed: %m");
+ }
+
+ if (s->mark >= 0) {
+ r = setsockopt_int(fd, SOL_SOCKET, SO_MARK, s->mark);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "SO_MARK failed: %m");
+ }
+
+ if (s->ip_tos >= 0) {
+ r = setsockopt_int(fd, IPPROTO_IP, IP_TOS, s->ip_tos);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "IP_TOS failed: %m");
+ }
+
+ if (s->ip_ttl >= 0) {
+ r = socket_set_ttl(fd, socket_address_family(&p->address), s->ip_ttl);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "IP_TTL/IPV6_UNICAST_HOPS failed: %m");
+ }
+
+ /* TCP_CONGESTION takes the algorithm name as a string, including the NUL terminator. */
+ if (s->tcp_congestion)
+ if (setsockopt(fd, SOL_TCP, TCP_CONGESTION, s->tcp_congestion, strlen(s->tcp_congestion)+1) < 0)
+ log_unit_warning_errno(UNIT(s), errno, "TCP_CONGESTION failed: %m");
+
+ /* SMACK label failures are logged at error level but still do not abort. */
+ if (s->smack_ip_in) {
+ r = mac_smack_apply_fd(fd, SMACK_ATTR_IPIN, s->smack_ip_in);
+ if (r < 0)
+ log_unit_error_errno(UNIT(s), r, "mac_smack_apply_ip_in_fd: %m");
+ }
+
+ if (s->smack_ip_out) {
+ r = mac_smack_apply_fd(fd, SMACK_ATTR_IPOUT, s->smack_ip_out);
+ if (r < 0)
+ log_unit_error_errno(UNIT(s), r, "mac_smack_apply_ip_out_fd: %m");
+ }
+}
+
+/* Apply configured options to a FIFO fd: pipe buffer size (PipeSize=) and SMACK label (Smack=).
+ * As with socket options, failures are logged but non-fatal. */
+static void socket_apply_fifo_options(Socket *s, int fd) {
+ int r;
+
+ assert(s);
+ assert(fd >= 0);
+
+ if (s->pipe_size > 0)
+ if (fcntl(fd, F_SETPIPE_SZ, s->pipe_size) < 0)
+ log_unit_warning_errno(UNIT(s), errno, "Setting pipe size failed, ignoring: %m");
+
+ if (s->smack) {
+ r = mac_smack_apply_fd(fd, SMACK_ATTR_ACCESS, s->smack);
+ if (r < 0)
+ log_unit_error_errno(UNIT(s), r, "SMACK relabelling failed, ignoring: %m");
+ }
+}
+
+/* Create a FIFO at 'path' (creating parent directories with 'directory_mode') and open it
+ * read-write/non-blocking. The access mode is enforced via a temporary umask so that no window
+ * with a looser mode exists. After opening, fstat() is used to verify the object really is a
+ * FIFO with the expected mode and our own uid/gid — if something else sits there (e.g. planted
+ * by another user), -EEXIST is returned. An SELinux file-creation label is set up for the
+ * mkfifo() call and cleared again on all exit paths. Returns the fd on success, negative errno
+ * on failure. */
+static int fifo_address_create(
+ const char *path,
+ mode_t directory_mode,
+ mode_t socket_mode) {
+
+ _cleanup_close_ int fd = -EBADF;
+ mode_t old_mask;
+ struct stat st;
+ int r;
+
+ assert(path);
+
+ (void) mkdir_parents_label(path, directory_mode);
+
+ r = mac_selinux_create_file_prepare(path, S_IFIFO);
+ if (r < 0)
+ return r;
+
+ /* Enforce the right access mode for the fifo */
+ old_mask = umask(~socket_mode);
+
+ /* Include the original umask in our mask */
+ (void) umask(~socket_mode | old_mask);
+
+ r = mkfifo(path, socket_mode);
+ (void) umask(old_mask);
+
+ if (r < 0 && errno != EEXIST) {
+ r = -errno;
+ goto fail;
+ }
+
+ fd = open(path, O_RDWR | O_CLOEXEC | O_NOCTTY | O_NONBLOCK | O_NOFOLLOW);
+ if (fd < 0) {
+ r = -errno;
+ goto fail;
+ }
+
+ mac_selinux_create_file_clear();
+
+ if (fstat(fd, &st) < 0) {
+ r = -errno;
+ goto fail;
+ }
+
+ /* mkfifo() above ran under the combined mask, so the effective mode to compare against is
+ * socket_mode with the original umask bits removed. */
+ if (!S_ISFIFO(st.st_mode) ||
+ (st.st_mode & 0777) != (socket_mode & ~old_mask) ||
+ st.st_uid != getuid() ||
+ st.st_gid != getgid()) {
+ r = -EEXIST;
+ goto fail;
+ }
+
+ return TAKE_FD(fd);
+
+fail:
+ mac_selinux_create_file_clear();
+ return r;
+}
+
+/* Open a "special" file (e.g. under /proc, /sys or /dev) for use as a listening fd.
+ * Only regular files and character devices are accepted; anything else yields -EEXIST.
+ * Returns the non-blocking fd on success, negative errno on failure. */
+static int special_address_create(const char *path, bool writable) {
+ _cleanup_close_ int fd = -EBADF;
+ struct stat stbuf;
+ int flags;
+
+ assert(path);
+
+ /* Pick the access mode as requested, never follow symlinks, never become controlling tty. */
+ flags = O_CLOEXEC|O_NOCTTY|O_NONBLOCK|O_NOFOLLOW;
+ flags |= writable ? O_RDWR : O_RDONLY;
+
+ fd = open(path, flags);
+ if (fd < 0)
+ return -errno;
+
+ if (fstat(fd, &stbuf) < 0)
+ return -errno;
+
+ /* Check whether this is a /proc, /sys or /dev file or char device */
+ if (!(S_ISREG(stbuf.st_mode) || S_ISCHR(stbuf.st_mode)))
+ return -EEXIST;
+
+ return TAKE_FD(fd);
+}
+
+/* Open a FunctionFS endpoint file read-write/non-blocking. Only regular files are
+ * accepted (ffs endpoints present as regular files); anything else yields -EEXIST.
+ * Returns the fd on success, negative errno on failure. */
+static int usbffs_address_create(const char *path) {
+ _cleanup_close_ int ep_fd = -EBADF;
+ struct stat stbuf;
+
+ assert(path);
+
+ ep_fd = open(path, O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK|O_NOFOLLOW);
+ if (ep_fd < 0)
+ return -errno;
+
+ if (fstat(ep_fd, &stbuf) < 0)
+ return -errno;
+
+ /* Check whether this is a regular file (ffs endpoint) */
+ if (!S_ISREG(stbuf.st_mode))
+ return -EEXIST;
+
+ return TAKE_FD(ep_fd);
+}
+
+/* Open (creating if necessary) the POSIX message queue 'path' read-only/non-blocking, enforcing
+ * 'mq_mode' through a temporary umask the same way fifo_address_create() does. The queue
+ * attributes are only passed to mq_open() when both maxmsg and msgsize are configured (> 0);
+ * otherwise kernel defaults apply. After opening, mode and ownership are verified and -EEXIST
+ * is returned on mismatch. Returns the mq descriptor on success, negative errno on failure.
+ * NOTE(review): the descriptor is closed via _cleanup_close_, i.e. with close() — presumably
+ * relying on mqd_t being a regular fd on Linux; confirm against the project's mq handling. */
+static int mq_address_create(
+ const char *path,
+ mode_t mq_mode,
+ long maxmsg,
+ long msgsize) {
+
+ _cleanup_close_ int fd = -EBADF;
+ struct stat st;
+ mode_t old_mask;
+ struct mq_attr _attr, *attr = NULL;
+
+ assert(path);
+
+ if (maxmsg > 0 && msgsize > 0) {
+ _attr = (struct mq_attr) {
+ .mq_flags = O_NONBLOCK,
+ .mq_maxmsg = maxmsg,
+ .mq_msgsize = msgsize,
+ };
+ attr = &_attr;
+ }
+
+ /* Enforce the right access mode for the mq */
+ old_mask = umask(~mq_mode);
+
+ /* Include the original umask in our mask */
+ (void) umask(~mq_mode | old_mask);
+ fd = mq_open(path, O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_CREAT, mq_mode, attr);
+ (void) umask(old_mask);
+
+ if (fd < 0)
+ return -errno;
+
+ if (fstat(fd, &st) < 0)
+ return -errno;
+
+ /* Verify the queue carries the mode we asked for (minus the original umask bits) and is
+ * owned by us; otherwise somebody else created it and we refuse to use it. */
+ if ((st.st_mode & 0777) != (mq_mode & ~old_mask) ||
+ st.st_uid != getuid() ||
+ st.st_gid != getgid())
+ return -EEXIST;
+
+ return TAKE_FD(fd);
+}
+
+static int socket_symlink(Socket *s) {
+ const char *p;
+ int r;
+
+ assert(s);
+
+ p = socket_find_symlink_target(s);
+ if (!p)
+ return 0;
+
+ STRV_FOREACH(i, s->symlinks) {
+ (void) mkdir_parents_label(*i, s->directory_mode);
+
+ r = symlink_idempotent(p, *i, false);
+
+ if (r == -EEXIST && s->remove_on_stop) {
+ /* If there's already something where we want to create the symlink, and the destructive
+ * RemoveOnStop= mode is set, then we might as well try to remove what already exists and try
+ * again. */
+
+ if (unlink(*i) >= 0)
+ r = symlink_idempotent(p, *i, false);
+ }
+
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "Failed to create symlink %s %s %s, ignoring: %m",
+ p, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), *i);
+ }
+
+ return 0;
+}
+
+/* Copy the service's USB function descriptor and string files into the FunctionFS ep0 fd.
+ * Both files must be configured; otherwise -EINVAL is returned. Returns 0 on success or the
+ * first negative error from copying. */
+static int usbffs_write_descs(int fd, Service *s) {
+ int ret;
+
+ if (!(s->usb_function_descriptors && s->usb_function_strings))
+ return -EINVAL;
+
+ /* Descriptors first, then strings — the order the two writes happen in is kept as-is. */
+ ret = copy_file_fd(s->usb_function_descriptors, fd, 0);
+ if (ret < 0)
+ return ret;
+
+ return copy_file_fd(s->usb_function_strings, fd, 0);
+}
+
+/* scandir() filter for FunctionFS endpoint directories: skip hidden entries and the
+ * control endpoint "ep0", keep everything else. */
+static int usbffs_select_ep(const struct dirent *d) {
+ if (d->d_name[0] == '.')
+ return 0;
+
+ return !streq(d->d_name, "ep0");
+}
+
+/* Enumerate all endpoint files (other than ep0) in the FunctionFS directory of 'p', open each of
+ * them and store the resulting fds in p->auxiliary_fds. On failure, any fds opened so far are
+ * closed and the auxiliary fd array is reset. The scandir() result list is freed on every path.
+ * Returns 0 on success, negative errno on failure. */
+static int usbffs_dispatch_eps(SocketPort *p) {
+ _cleanup_free_ struct dirent **ent = NULL;
+ size_t n, k;
+ int r;
+
+ r = scandir(p->path, &ent, usbffs_select_ep, alphasort);
+ if (r < 0)
+ return -errno;
+
+ n = (size_t) r;
+ p->auxiliary_fds = new(int, n);
+ if (!p->auxiliary_fds) {
+ r = -ENOMEM;
+ goto clear;
+ }
+
+ p->n_auxiliary_fds = n;
+
+ /* k counts how many fds were actually opened, so the failure path below only closes those. */
+ k = 0;
+ for (size_t i = 0; i < n; ++i) {
+ _cleanup_free_ char *ep = NULL;
+
+ ep = path_make_absolute(ent[i]->d_name, p->path);
+ if (!ep) {
+ r = -ENOMEM;
+ goto fail;
+ }
+
+ path_simplify(ep);
+
+ r = usbffs_address_create(ep);
+ if (r < 0)
+ goto fail;
+
+ p->auxiliary_fds[k++] = r;
+ }
+
+ r = 0;
+ goto clear;
+
+fail:
+ close_many(p->auxiliary_fds, k);
+ p->auxiliary_fds = mfree(p->auxiliary_fds);
+ p->n_auxiliary_fds = 0;
+
+clear:
+ /* Free the individual dirent entries; 'ent' itself is released by _cleanup_free_. */
+ free_many((void**) ent, n);
+ return r;
+}
+
+/* Resolve the service unit that shall handle connections of this socket and return it in *ret.
+ * For non-template sockets that is the statically configured service; for Accept=yes sockets a
+ * per-connection instance is built (from the peer identity when cfd >= 0, "internal" otherwise)
+ * and loaded. Returns 0 on success, -ENODATA if no service is configured and the socket does not
+ * accept, or another negative errno on failure. */
+int socket_load_service_unit(Socket *s, int cfd, Unit **ret) {
+ /* Figure out what the unit that will be used to handle the connections on the socket looks like.
+ *
+ * If cfd < 0, then we don't have a connection yet. In case of Accept=yes sockets, use a fake
+ * instance name.
+ */
+
+ if (UNIT_ISSET(s->service)) {
+ *ret = UNIT_DEREF(s->service);
+ return 0;
+ }
+
+ if (!s->accept)
+ return -ENODATA;
+
+ /* Build the instance name and load the unit */
+ _cleanup_free_ char *prefix = NULL, *instance = NULL, *name = NULL;
+ int r;
+
+ r = unit_name_to_prefix(UNIT(s)->id, &prefix);
+ if (r < 0)
+ return r;
+
+ if (cfd >= 0) {
+ r = instance_from_socket(cfd, s->n_accepted, &instance);
+ if (r < 0) {
+ if (ERRNO_IS_DISCONNECT(r))
+ /* ENOTCONN is legitimate if TCP RST was received. Other socket families might return
+ * different errors. This connection is over, but the socket unit lives on. */
+ return log_unit_debug_errno(UNIT(s), r,
+ "Got %s on incoming socket, assuming aborted connection attempt, ignoring.",
+ errno_to_name(r));
+ return r;
+ }
+ }
+
+ /* For accepting sockets, we don't know how the instance will be called until we get a connection and
+ * can figure out what the peer name is. So let's use "internal" as the instance to make it clear
+ * that this is not an actual peer name. We use "unknown" when we cannot figure out the peer. */
+ r = unit_name_build(prefix, instance ?: "internal", ".service", &name);
+ if (r < 0)
+ return r;
+
+ return manager_load_unit(UNIT(s)->manager, name, NULL, NULL, ret);
+}
+
+/* Determine the SELinux label to create listening sockets with, in *ret. The label is taken from
+ * the handling service's SELinuxContext= if set, otherwise derived from its ExecStart= binary.
+ * When no service, no ExecStart=, or the label cannot/need not be determined (-EPERM/-EOPNOTSUPP
+ * from the SELinux call), *ret is set to NULL and 0 is returned — i.e. "no label" is not an error. */
+static int socket_determine_selinux_label(Socket *s, char **ret) {
+ int r;
+
+ assert(s);
+ assert(ret);
+
+ Unit *service;
+ ExecCommand *c;
+ const char *exec_context;
+ _cleanup_free_ char *path = NULL;
+
+ /* cfd == -1: no connection yet, we only want the handling service unit here. */
+ r = socket_load_service_unit(s, -1, &service);
+ if (r == -ENODATA)
+ goto no_label;
+ if (r < 0)
+ return r;
+
+ exec_context = SERVICE(service)->exec_context.selinux_context;
+ if (exec_context) {
+ char *con;
+
+ con = strdup(exec_context);
+ if (!con)
+ return -ENOMEM;
+
+ *ret = TAKE_PTR(con);
+ return 0;
+ }
+
+ c = SERVICE(service)->exec_command[SERVICE_EXEC_START];
+ if (!c)
+ goto no_label;
+
+ /* Resolve the executable path inside the service's RootDirectory=, if any. */
+ r = chase(c->path, SERVICE(service)->exec_context.root_directory, CHASE_PREFIX_ROOT, &path, NULL);
+ if (r < 0)
+ goto no_label;
+
+ r = mac_selinux_get_create_label_from_exe(path, ret);
+ if (IN_SET(r, -EPERM, -EOPNOTSUPP))
+ goto no_label;
+ return r;
+
+no_label:
+ *ret = NULL;
+ return 0;
+}
+
+/* Thin wrapper around socket_address_listen() that forwards all the relevant per-unit socket
+ * settings (backlog, bind options, modes, label). Returns the listening fd or a negative errno. */
+static int socket_address_listen_do(
+ Socket *s,
+ const SocketAddress *address,
+ const char *label) {
+
+ assert(s);
+ assert(address);
+
+ return socket_address_listen(
+ address,
+ SOCK_CLOEXEC|SOCK_NONBLOCK,
+ s->backlog,
+ s->bind_ipv6_only,
+ s->bind_to_device,
+ s->reuse_port,
+ s->free_bind,
+ s->transparent,
+ s->directory_mode,
+ s->socket_mode,
+ label);
+}
+
+/* Log an error for 'address', formatting the address into the single %s of 'fmt'.
+ * Evaluates to the (negative) error value, so callers can 'return log_address_error_errno(...)'. */
+#define log_address_error_errno(u, address, error, fmt) \
+ ({ \
+ _cleanup_free_ char *_t = NULL; \
+ \
+ (void) socket_address_print(address, &_t); \
+ log_unit_error_errno(u, error, fmt, strna(_t)); \
+ })
+
+/* Decide whether creating the listening socket requires forking a helper into the unit's cgroup
+ * and/or network namespace. Returns > 0 (true) if so, 0 if the socket can be created directly in
+ * the manager, or a negative errno on failure. */
+static int fork_needed(const SocketAddress *address, Socket *s) {
+ int r;
+
+ assert(address);
+ assert(s);
+
+ /* Check if we need to do the cgroup or netns stuff. If not we can do things much simpler. */
+
+ /* If there are any NFTSet= directives with cgroup source, we need the cgroup */
+ Unit *u = UNIT(s);
+ CGroupContext *c = unit_get_cgroup_context(u);
+ if (c)
+ FOREACH_ARRAY(nft_set, c->nft_set_context.sets, c->nft_set_context.n_sets)
+ if (nft_set->source == NFT_SET_SOURCE_CGROUP)
+ return true;
+
+ /* For IP sockets, BPF firewalling (IPAddressAllow=/Deny= etc.) only takes effect if the
+ * socket is created from within the unit's cgroup, hence fork whenever it is available. */
+ if (IN_SET(address->sockaddr.sa.sa_family, AF_INET, AF_INET6)) {
+ r = bpf_firewall_supported();
+ if (r < 0)
+ return r;
+ if (r != BPF_FIREWALL_UNSUPPORTED) /* If BPF firewalling isn't supported anyway — there's no point in this forking complexity */
+ return true;
+ }
+
+ /* Finally, a configured network namespace also requires creating the socket elsewhere. */
+ return exec_needs_network_namespace(&s->exec_context);
+}
+
+/* Create the listening socket for 'address', either directly (when no cgroup/netns involvement is
+ * needed) or by forking a short-lived "(sd-listen)" helper into the unit's cgroup and network
+ * namespace and passing the resulting fd back over a socketpair. Returns the fd on success, a
+ * negative errno on failure. */
+static int socket_address_listen_in_cgroup(
+ Socket *s,
+ const SocketAddress *address,
+ const char *label) {
+
+ _cleanup_(pidref_done) PidRef pid = PIDREF_NULL;
+ _cleanup_close_pair_ int pair[2] = EBADF_PAIR;
+ int fd, r;
+
+ assert(s);
+ assert(address);
+
+ /* This is a wrapper around socket_address_listen(), that forks off a helper process inside the
+ * socket's cgroup and network namespace in which the socket is actually created. This way we ensure
+ * the socket is actually properly attached to the unit's cgroup for the purpose of BPF filtering and
+ * such. */
+
+ r = fork_needed(address, s);
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ /* Shortcut things... */
+ fd = socket_address_listen_do(s, address, label);
+ if (fd < 0)
+ return log_address_error_errno(UNIT(s), address, fd, "Failed to create listening socket (%s): %m");
+
+ return fd;
+ }
+
+ r = unit_setup_exec_runtime(UNIT(s));
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r, "Failed acquire runtime: %m");
+
+ /* Make sure the shared network/IPC namespaces are open before the child tries to join them. */
+ if (s->exec_context.network_namespace_path &&
+ s->exec_runtime &&
+ s->exec_runtime->shared &&
+ s->exec_runtime->shared->netns_storage_socket[0] >= 0) {
+ r = open_shareable_ns_path(s->exec_runtime->shared->netns_storage_socket, s->exec_context.network_namespace_path, CLONE_NEWNET);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r, "Failed to open network namespace path %s: %m", s->exec_context.network_namespace_path);
+ }
+
+ if (s->exec_context.ipc_namespace_path &&
+ s->exec_runtime &&
+ s->exec_runtime->shared &&
+ s->exec_runtime->shared->ipcns_storage_socket[0] >= 0) {
+ r = open_shareable_ns_path(s->exec_runtime->shared->ipcns_storage_socket, s->exec_context.ipc_namespace_path, CLONE_NEWIPC);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r, "Failed to open IPC namespace path %s: %m", s->exec_context.ipc_namespace_path);
+ }
+
+ /* SOCK_SEQPACKET pair used purely to pass the listening fd from the child back to us. */
+ if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pair) < 0)
+ return log_unit_error_errno(UNIT(s), errno, "Failed to create communication channel: %m");
+
+ r = unit_fork_helper_process(UNIT(s), "(sd-listen)", &pid);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r, "Failed to fork off listener stub process: %m");
+ if (r == 0) {
+ /* Child */
+
+ pair[0] = safe_close(pair[0]);
+
+ if (exec_needs_network_namespace(&s->exec_context) &&
+ s->exec_runtime &&
+ s->exec_runtime->shared &&
+ s->exec_runtime->shared->netns_storage_socket[0] >= 0) {
+
+ if (ns_type_supported(NAMESPACE_NET)) {
+ r = setup_shareable_ns(s->exec_runtime->shared->netns_storage_socket, CLONE_NEWNET);
+ if (r < 0) {
+ log_unit_error_errno(UNIT(s), r, "Failed to join network namespace: %m");
+ _exit(EXIT_NETWORK);
+ }
+ } else if (s->exec_context.network_namespace_path) {
+ /* An explicit namespace path cannot be silently ignored, unlike plain PrivateNetwork=. */
+ log_unit_error(UNIT(s), "Network namespace path configured but network namespaces not supported.");
+ _exit(EXIT_NETWORK);
+ } else
+ log_unit_warning(UNIT(s), "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
+ }
+
+ fd = socket_address_listen_do(s, address, label);
+ if (fd < 0) {
+ log_address_error_errno(UNIT(s), address, fd, "Failed to create listening socket (%s): %m");
+ _exit(EXIT_FAILURE);
+ }
+
+ r = send_one_fd(pair[1], fd, 0);
+ if (r < 0) {
+ log_address_error_errno(UNIT(s), address, r, "Failed to send listening socket (%s) to parent: %m");
+ _exit(EXIT_FAILURE);
+ }
+
+ _exit(EXIT_SUCCESS);
+ }
+
+ pair[1] = safe_close(pair[1]);
+ fd = receive_one_fd(pair[0], 0);
+
+ /* We synchronously wait for the helper, as it shouldn't be slow */
+ r = wait_for_terminate_and_check("(sd-listen)", pid.pid, WAIT_LOG_ABNORMAL);
+ if (r < 0) {
+ safe_close(fd);
+ return r;
+ }
+
+ if (fd < 0)
+ return log_address_error_errno(UNIT(s), address, fd, "Failed to receive listening socket (%s): %m");
+
+ return fd;
+}
+
+/* Defines socket_close_fdsp() so socket_close_fds() can be used with _cleanup_ (NULL disarms it). */
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(Socket *, socket_close_fds, NULL);
+
+/* Open the fds for all not-yet-open ports of the socket, dispatching on the port type (socket,
+ * special file, FIFO, message queue, FunctionFS). On any failure, the _cleanup_ guard closes
+ * everything again via socket_close_fds(); on success the guard is disarmed (s = NULL) and the
+ * already-open fds are kept. Returns 0 on success, negative errno on failure. */
+static int socket_open_fds(Socket *orig_s) {
+ _cleanup_(socket_close_fdsp) Socket *s = orig_s;
+ _cleanup_(mac_selinux_freep) char *label = NULL;
+ bool know_label = false;
+ int r;
+
+ assert(s);
+
+ LIST_FOREACH(port, p, s->ports) {
+
+ if (p->fd >= 0)
+ continue;
+
+ switch (p->type) {
+
+ case SOCKET_SOCKET:
+
+ if (!know_label) {
+ /* Figure out the label, if we don't it know yet. We do it once for the first
+ * socket where we need this and remember it for the rest. */
+
+ r = socket_determine_selinux_label(s, &label);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r, "Failed to determine SELinux label: %m");
+
+ know_label = true;
+ }
+
+ /* Apply the socket protocol */
+ switch (p->address.type) {
+
+ /* SocketProtocol= only overrides the default where it makes sense:
+ * SCTP for stream/seqpacket, UDP-Lite for datagram. */
+ case SOCK_STREAM:
+ case SOCK_SEQPACKET:
+ if (s->socket_protocol == IPPROTO_SCTP)
+ p->address.protocol = s->socket_protocol;
+ break;
+
+ case SOCK_DGRAM:
+ if (s->socket_protocol == IPPROTO_UDPLITE)
+ p->address.protocol = s->socket_protocol;
+ break;
+ }
+
+ p->fd = socket_address_listen_in_cgroup(s, &p->address, label);
+ if (p->fd < 0)
+ return p->fd;
+
+ socket_apply_socket_options(s, p, p->fd);
+ socket_symlink(s);
+ break;
+
+ case SOCKET_SPECIAL:
+
+ p->fd = special_address_create(p->path, s->writable);
+ if (p->fd < 0)
+ return log_unit_error_errno(UNIT(s), p->fd, "Failed to open special file %s: %m", p->path);
+ break;
+
+ case SOCKET_FIFO:
+
+ p->fd = fifo_address_create(
+ p->path,
+ s->directory_mode,
+ s->socket_mode);
+ if (p->fd < 0)
+ return log_unit_error_errno(UNIT(s), p->fd, "Failed to open FIFO %s: %m", p->path);
+
+ socket_apply_fifo_options(s, p->fd);
+ socket_symlink(s);
+ break;
+
+ case SOCKET_MQUEUE:
+
+ p->fd = mq_address_create(
+ p->path,
+ s->socket_mode,
+ s->mq_maxmsg,
+ s->mq_msgsize);
+ if (p->fd < 0)
+ return log_unit_error_errno(UNIT(s), p->fd, "Failed to open message queue %s: %m", p->path);
+ break;
+
+ case SOCKET_USB_FUNCTION: {
+ _cleanup_free_ char *ep = NULL;
+
+ /* ep0 is the FunctionFS control endpoint: descriptors are written to it first,
+ * then the remaining endpoints are opened as auxiliary fds. */
+ ep = path_make_absolute("ep0", p->path);
+ if (!ep)
+ return -ENOMEM;
+
+ p->fd = usbffs_address_create(ep);
+ if (p->fd < 0)
+ return p->fd;
+
+ r = usbffs_write_descs(p->fd, SERVICE(UNIT_DEREF(s->service)));
+ if (r < 0)
+ return r;
+
+ r = usbffs_dispatch_eps(p);
+ if (r < 0)
+ return r;
+
+ break;
+ }
+ default:
+ assert_not_reached();
+ }
+ }
+
+ /* Disarm the cleanup guard: all ports opened successfully, keep the fds. */
+ s = NULL;
+ return 0;
+}
+
+/* Disable (but keep around) the I/O event sources of all open ports, so that incoming
+ * traffic no longer wakes us up while we are not in the listening state. */
+static void socket_unwatch_fds(Socket *s) {
+ int r;
+
+ assert(s);
+
+ LIST_FOREACH(port, p, s->ports) {
+ /* Nothing to disable for closed ports or ports that were never watched. */
+ if (p->fd < 0 || !p->event_source)
+ continue;
+
+ r = sd_event_source_set_enabled(p->event_source, SD_EVENT_OFF);
+ if (r < 0)
+ log_unit_debug_errno(UNIT(s), r, "Failed to disable event source: %m");
+ }
+}
+
+/* Arm I/O event sources for all open ports (creating them on first use, re-enabling them
+ * otherwise) and apply the configured poll rate limit. On failure, everything armed so far is
+ * disabled again via socket_unwatch_fds(). Returns 0 on success, negative errno on failure. */
+static int socket_watch_fds(Socket *s) {
+ int r;
+
+ assert(s);
+
+ LIST_FOREACH(port, p, s->ports) {
+ if (p->fd < 0)
+ continue;
+
+ if (p->event_source) {
+ r = sd_event_source_set_enabled(p->event_source, SD_EVENT_ON);
+ if (r < 0)
+ goto fail;
+ } else {
+ r = sd_event_add_io(UNIT(s)->manager->event, &p->event_source, p->fd, EPOLLIN, socket_dispatch_io, p);
+ if (r < 0)
+ goto fail;
+
+ (void) sd_event_source_set_description(p->event_source, "socket-port-io");
+ }
+
+ /* PollLimitIntervalSec=/PollLimitBurst=; failure to set it is not fatal. */
+ r = sd_event_source_set_ratelimit(p->event_source, s->poll_limit_interval, s->poll_limit_burst);
+ if (r < 0)
+ log_unit_debug_errno(UNIT(s), r, "Failed to set poll limit on I/O event source, ignoring: %m");
+ }
+
+ return 0;
+
+fail:
+ log_unit_warning_errno(UNIT(s), r, "Failed to watch listening fds: %m");
+ socket_unwatch_fds(s);
+ return r;
+}
+
+/* Return values of socket_check_open(): none, only some, or all of the ports have open fds. */
+enum {
+ SOCKET_OPEN_NONE,
+ SOCKET_OPEN_SOME,
+ SOCKET_OPEN_ALL,
+};
+
+/* Report how many of the socket's ports currently have open fds: SOCKET_OPEN_ALL when every
+ * port is open, SOCKET_OPEN_NONE when none is (including the no-ports case), SOCKET_OPEN_SOME
+ * for a mix. */
+static int socket_check_open(Socket *s) {
+ size_t n_open = 0, n_closed = 0;
+
+ assert(s);
+
+ LIST_FOREACH(port, p, s->ports)
+ if (p->fd >= 0)
+ n_open++;
+ else
+ n_closed++;
+
+ if (n_open == 0)
+ return SOCKET_OPEN_NONE;
+ if (n_closed == 0)
+ return SOCKET_OPEN_ALL;
+
+ return SOCKET_OPEN_SOME;
+}
+
+/* Transition the socket unit to 'state': tear down resources that the new state does not need
+ * (timer and control process outside of exec states, fd watches outside of LISTENING, the fds
+ * themselves outside of states that keep them), log the transition and notify the unit core. */
+static void socket_set_state(Socket *s, SocketState state) {
+ SocketState old_state;
+ assert(s);
+
+ if (s->state != state)
+ bus_unit_send_pending_change_signal(UNIT(s), false);
+
+ old_state = s->state;
+ s->state = state;
+
+ /* States not in this set run no control process, so drop the timer and control bookkeeping. */
+ if (!IN_SET(state,
+ SOCKET_START_PRE,
+ SOCKET_START_CHOWN,
+ SOCKET_START_POST,
+ SOCKET_STOP_PRE,
+ SOCKET_STOP_PRE_SIGTERM,
+ SOCKET_STOP_PRE_SIGKILL,
+ SOCKET_STOP_POST,
+ SOCKET_FINAL_SIGTERM,
+ SOCKET_FINAL_SIGKILL,
+ SOCKET_CLEANING)) {
+
+ s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+ socket_unwatch_control_pid(s);
+ s->control_command = NULL;
+ s->control_command_id = _SOCKET_EXEC_COMMAND_INVALID;
+ }
+
+ /* Only the LISTENING state wants to be woken up on incoming traffic. */
+ if (state != SOCKET_LISTENING)
+ socket_unwatch_fds(s);
+
+ /* States not in this set have no use for the listening fds anymore; close them. */
+ if (!IN_SET(state,
+ SOCKET_START_CHOWN,
+ SOCKET_START_POST,
+ SOCKET_LISTENING,
+ SOCKET_RUNNING,
+ SOCKET_STOP_PRE,
+ SOCKET_STOP_PRE_SIGTERM,
+ SOCKET_STOP_PRE_SIGKILL,
+ SOCKET_CLEANING))
+ socket_close_fds(s);
+
+ if (state != old_state)
+ log_unit_debug(UNIT(s), "Changed %s -> %s", socket_state_to_string(old_state), socket_state_to_string(state));
+
+ unit_notify(UNIT(s), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true);
+}
+
+/* Re-establish runtime state after deserialization (manager reload/reexec): re-watch a still-alive
+ * control process, re-arm the state timeout, sanity-check that the deserialized fds still cover the
+ * configured ports, re-watch fds when listening, and finally enter the deserialized state. */
+static int socket_coldplug(Unit *u) {
+ Socket *s = SOCKET(u);
+ int r;
+
+ assert(s);
+ assert(s->state == SOCKET_DEAD);
+
+ if (s->deserialized_state == s->state)
+ return 0;
+
+ /* Only re-watch the control pid if it is still around and the deserialized state actually
+ * runs a control process. */
+ if (pidref_is_set(&s->control_pid) &&
+ pidref_is_unwaited(&s->control_pid) > 0 &&
+ IN_SET(s->deserialized_state,
+ SOCKET_START_PRE,
+ SOCKET_START_CHOWN,
+ SOCKET_START_POST,
+ SOCKET_STOP_PRE,
+ SOCKET_STOP_PRE_SIGTERM,
+ SOCKET_STOP_PRE_SIGKILL,
+ SOCKET_STOP_POST,
+ SOCKET_FINAL_SIGTERM,
+ SOCKET_FINAL_SIGKILL,
+ SOCKET_CLEANING)) {
+
+ r = unit_watch_pidref(UNIT(s), &s->control_pid, /* exclusive= */ false);
+ if (r < 0)
+ return r;
+
+ /* The timeout is absolute: anchored at the original state-change timestamp. */
+ r = socket_arm_timer(s, /* relative= */ false, usec_add(u->state_change_timestamp.monotonic, s->timeout_usec));
+ if (r < 0)
+ return r;
+ }
+
+ if (IN_SET(s->deserialized_state,
+ SOCKET_START_CHOWN,
+ SOCKET_START_POST,
+ SOCKET_LISTENING,
+ SOCKET_RUNNING)) {
+
+ /* Originally, we used to simply reopen all sockets here that we didn't have file descriptors
+ * for. However, this is problematic, as we won't traverse through the SOCKET_START_CHOWN state for
+ * them, and thus the UID/GID wouldn't be right. Hence, instead simply check if we have all fds open,
+ * and if there's a mismatch, warn loudly. */
+
+ r = socket_check_open(s);
+ if (r == SOCKET_OPEN_NONE)
+ log_unit_warning(UNIT(s),
+ "Socket unit configuration has changed while unit has been running, "
+ "no open socket file descriptor left. "
+ "The socket unit is not functional until restarted.");
+ else if (r == SOCKET_OPEN_SOME)
+ log_unit_warning(UNIT(s),
+ "Socket unit configuration has changed while unit has been running, "
+ "and some socket file descriptors have not been opened yet. "
+ "The socket unit is not fully functional until restarted.");
+ }
+
+ if (s->deserialized_state == SOCKET_LISTENING) {
+ r = socket_watch_fds(s);
+ if (r < 0)
+ return r;
+ }
+
+ if (!IN_SET(s->deserialized_state, SOCKET_DEAD, SOCKET_FAILED, SOCKET_CLEANING))
+ (void) unit_setup_exec_runtime(u);
+
+ socket_set_state(s, s->deserialized_state);
+ return 0;
+}
+
+/* Spawn the control command 'c' (ExecStartPre=, ExecStopPost=, …) for the socket: arm the state
+ * timeout, set up exec parameters, fork/exec via exec_spawn(), and start watching the resulting
+ * PID. On success returns 0 and hands out the pid reference in *ret_pid. */
+static int socket_spawn(Socket *s, ExecCommand *c, PidRef *ret_pid) {
+
+ _cleanup_(exec_params_shallow_clear) ExecParameters exec_params = EXEC_PARAMETERS_INIT(
+ EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN);
+ _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+ pid_t pid;
+ int r;
+
+ assert(s);
+ assert(c);
+ assert(ret_pid);
+
+ r = unit_prepare_exec(UNIT(s));
+ if (r < 0)
+ return r;
+
+ r = socket_arm_timer(s, /* relative= */ true, s->timeout_usec);
+ if (r < 0)
+ return r;
+
+ r = unit_set_exec_params(UNIT(s), &exec_params);
+ if (r < 0)
+ return r;
+
+ r = exec_spawn(UNIT(s),
+ c,
+ &s->exec_context,
+ &exec_params,
+ s->exec_runtime,
+ &s->cgroup_context,
+ &pid);
+ if (r < 0)
+ return r;
+
+ r = pidref_set_pid(&pidref, pid);
+ if (r < 0)
+ return r;
+
+ r = unit_watch_pidref(UNIT(s), &pidref, /* exclusive= */ true);
+ if (r < 0)
+ return r;
+
+ /* Ownership of the pid reference moves to the caller. */
+ *ret_pid = TAKE_PIDREF(pidref);
+ return 0;
+}
+
+/* Apply SocketUser=/SocketGroup= ownership to the file-system nodes of all AF_UNIX socket and
+ * FIFO ports. User/group resolution may involve NSS, so it is done in a forked helper process
+ * ("(sd-chown)") whose pid is returned in *ret_pid for the caller to track; the child signals
+ * problems via its exit code (EXIT_USER/EXIT_GROUP/EXIT_CHOWN). */
+static int socket_chown(Socket *s, PidRef *ret_pid) {
+ _cleanup_(pidref_done) PidRef pid = PIDREF_NULL;
+ int r;
+
+ assert(s);
+
+ r = socket_arm_timer(s, /* relative= */ true, s->timeout_usec);
+ if (r < 0)
+ return r;
+
+ /* We have to resolve the user names out-of-process, hence
+ * let's fork here. It's messy, but well, what can we do? */
+
+ r = unit_fork_helper_process(UNIT(s), "(sd-chown)", &pid);
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ uid_t uid = UID_INVALID;
+ gid_t gid = GID_INVALID;
+
+ /* Child */
+
+ if (!isempty(s->user)) {
+ const char *user = s->user;
+
+ /* Resolving the user also yields its primary group as the default gid. */
+ r = get_user_creds(&user, &uid, &gid, NULL, NULL, 0);
+ if (r < 0) {
+ log_unit_error_errno(UNIT(s), r, "Failed to resolve user %s: %m", user);
+ _exit(EXIT_USER);
+ }
+ }
+
+ if (!isempty(s->group)) {
+ const char *group = s->group;
+
+ r = get_group_creds(&group, &gid, 0);
+ if (r < 0) {
+ log_unit_error_errno(UNIT(s), r, "Failed to resolve group %s: %m", group);
+ _exit(EXIT_GROUP);
+ }
+ }
+
+ /* Only ports that have a path in the file system can be chown()ed. */
+ LIST_FOREACH(port, p, s->ports) {
+ const char *path = NULL;
+
+ if (p->type == SOCKET_SOCKET)
+ path = socket_address_get_path(&p->address);
+ else if (p->type == SOCKET_FIFO)
+ path = p->path;
+
+ if (!path)
+ continue;
+
+ if (chown(path, uid, gid) < 0) {
+ log_unit_error_errno(UNIT(s), errno, "Failed to chown(): %m");
+ _exit(EXIT_CHOWN);
+ }
+ }
+
+ _exit(EXIT_SUCCESS);
+ }
+
+ r = unit_watch_pidref(UNIT(s), &pid, /* exclusive= */ true);
+ if (r < 0)
+ return r;
+
+ *ret_pid = TAKE_PIDREF(pid);
+ return 0;
+}
+
+/* Final state transition: record the result 'f' (unless a failure was already recorded), log
+ * success/failure, enter SOCKET_DEAD or SOCKET_FAILED accordingly, and release the exec runtime
+ * and per-unit runtime data. */
+static void socket_enter_dead(Socket *s, SocketResult f) {
+ assert(s);
+
+ /* The first recorded failure wins; later results do not overwrite it. */
+ if (s->result == SOCKET_SUCCESS)
+ s->result = f;
+
+ if (s->result == SOCKET_SUCCESS)
+ unit_log_success(UNIT(s));
+ else
+ unit_log_failure(UNIT(s), socket_result_to_string(s->result));
+
+ unit_warn_leftover_processes(UNIT(s), unit_log_leftover_process_stop);
+
+ socket_set_state(s, s->result != SOCKET_SUCCESS ? SOCKET_FAILED : SOCKET_DEAD);
+
+ s->exec_runtime = exec_runtime_destroy(s->exec_runtime);
+
+ unit_destroy_runtime_data(UNIT(s), &s->exec_context);
+
+ unit_unref_uid_gid(UNIT(s), true);
+}
+
+static void socket_enter_signal(Socket *s, SocketState state, SocketResult f);
+
+/* Enter the STOP_POST phase: run ExecStopPost= if configured, otherwise proceed straight to
+ * sending SIGTERM to any remaining processes. A spawn failure records SOCKET_FAILURE_RESOURCES
+ * and also falls through to the final kill phase. */
+static void socket_enter_stop_post(Socket *s, SocketResult f) {
+ int r;
+ assert(s);
+
+ if (s->result == SOCKET_SUCCESS)
+ s->result = f;
+
+ socket_unwatch_control_pid(s);
+ s->control_command_id = SOCKET_EXEC_STOP_POST;
+ s->control_command = s->exec_command[SOCKET_EXEC_STOP_POST];
+
+ if (s->control_command) {
+ pidref_done(&s->control_pid);
+
+ r = socket_spawn(s, s->control_command, &s->control_pid);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'stop-post' task: %m");
+ socket_enter_signal(s, SOCKET_FINAL_SIGTERM, SOCKET_FAILURE_RESOURCES);
+ return;
+ }
+
+ socket_set_state(s, SOCKET_STOP_POST);
+ } else
+ socket_enter_signal(s, SOCKET_FINAL_SIGTERM, SOCKET_SUCCESS);
+}
+
+/* Map a signal-sending socket state to the kill operation to use: KILL_RESTART when SIGTERM is
+ * sent during a restart job, KILL_TERMINATE for the final SIGTERM, KILL_KILL for everything else
+ * (i.e. the SIGKILL states and a plain stop-side SIGTERM). */
+static int state_to_kill_operation(Socket *s, SocketState state) {
+ switch (state) {
+
+ case SOCKET_STOP_PRE_SIGTERM:
+ if (unit_has_job_type(UNIT(s), JOB_RESTART))
+ return KILL_RESTART;
+ return KILL_KILL;
+
+ case SOCKET_FINAL_SIGTERM:
+ return KILL_TERMINATE;
+
+ default:
+ return KILL_KILL;
+ }
+}
+
+/* Enter one of the signal-sending states (STOP_PRE_SIGTERM/SIGKILL, FINAL_SIGTERM/SIGKILL):
+ * kill the control process per the kill context. If processes remain (r > 0) the state is
+ * entered with a timeout armed; if nothing was killed, escalate to the next phase immediately
+ * (SIGTERM -> SIGKILL -> stop-post or dead). On error, skip ahead with SOCKET_FAILURE_RESOURCES. */
+static void socket_enter_signal(Socket *s, SocketState state, SocketResult f) {
+ int r;
+
+ assert(s);
+
+ if (s->result == SOCKET_SUCCESS)
+ s->result = f;
+
+ r = unit_kill_context(
+ UNIT(s),
+ &s->kill_context,
+ state_to_kill_operation(s, state),
+ /* main_pid= */ NULL,
+ &s->control_pid,
+ /* main_pid_alien= */ false);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m");
+ goto fail;
+ }
+
+ /* r > 0: we actually signalled something, so wait for it to die (with a timeout). */
+ if (r > 0) {
+ r = socket_arm_timer(s, /* relative= */ true, s->timeout_usec);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to install timer: %m");
+ goto fail;
+ }
+
+ socket_set_state(s, state);
+ } else if (state == SOCKET_STOP_PRE_SIGTERM)
+ socket_enter_signal(s, SOCKET_STOP_PRE_SIGKILL, SOCKET_SUCCESS);
+ else if (state == SOCKET_STOP_PRE_SIGKILL)
+ socket_enter_stop_post(s, SOCKET_SUCCESS);
+ else if (state == SOCKET_FINAL_SIGTERM)
+ socket_enter_signal(s, SOCKET_FINAL_SIGKILL, SOCKET_SUCCESS);
+ else
+ socket_enter_dead(s, SOCKET_SUCCESS);
+
+ return;
+
+fail:
+ /* On the stop-pre side there is still a stop-post phase to run; otherwise go straight to dead. */
+ if (IN_SET(state, SOCKET_STOP_PRE_SIGTERM, SOCKET_STOP_PRE_SIGKILL))
+ socket_enter_stop_post(s, SOCKET_FAILURE_RESOURCES);
+ else
+ socket_enter_dead(s, SOCKET_FAILURE_RESOURCES);
+}
+
+/* Begin shutdown: run ExecStopPre= if configured, otherwise fall through to stop-post. */
+static void socket_enter_stop_pre(Socket *s, SocketResult f) {
+ int r;
+ assert(s);
+
+ /* Keep the first recorded failure. */
+ if (s->result == SOCKET_SUCCESS)
+ s->result = f;
+
+ socket_unwatch_control_pid(s);
+ s->control_command_id = SOCKET_EXEC_STOP_PRE;
+ s->control_command = s->exec_command[SOCKET_EXEC_STOP_PRE];
+
+ if (s->control_command) {
+ pidref_done(&s->control_pid);
+
+ r = socket_spawn(s, s->control_command, &s->control_pid);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'stop-pre' task: %m");
+ socket_enter_stop_post(s, SOCKET_FAILURE_RESOURCES);
+ return;
+ }
+
+ socket_set_state(s, SOCKET_STOP_PRE);
+ } else
+ socket_enter_stop_post(s, SOCKET_SUCCESS);
+}
+
+/* Start watching all listening fds and move to SOCKET_LISTENING. With
+ * Accept=no and FlushPending=yes, drain queued traffic first so a previously
+ * busy socket does not immediately re-trigger. */
+static void socket_enter_listening(Socket *s) {
+ int r;
+ assert(s);
+
+ if (!s->accept && s->flush_pending) {
+ log_unit_debug(UNIT(s), "Flushing socket before listening.");
+ flush_ports(s);
+ }
+
+ r = socket_watch_fds(s);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to watch sockets: %m");
+ socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES);
+ return;
+ }
+
+ socket_set_state(s, SOCKET_LISTENING);
+}
+
+/* Run ExecStartPost= if configured, otherwise go straight to listening. */
+static void socket_enter_start_post(Socket *s) {
+ int r;
+ assert(s);
+
+ socket_unwatch_control_pid(s);
+ s->control_command_id = SOCKET_EXEC_START_POST;
+ s->control_command = s->exec_command[SOCKET_EXEC_START_POST];
+
+ if (s->control_command) {
+ pidref_done(&s->control_pid);
+
+ r = socket_spawn(s, s->control_command, &s->control_pid);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'start-post' task: %m");
+ socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES);
+ return;
+ }
+
+ socket_set_state(s, SOCKET_START_POST);
+ } else
+ socket_enter_listening(s);
+}
+
+/* Open all listening fds, then — if SocketUser=/SocketGroup= is set — fork the
+ * internal chown helper before proceeding to start-post. */
+static void socket_enter_start_chown(Socket *s) {
+ int r;
+
+ assert(s);
+
+ r = socket_open_fds(s);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to listen on sockets: %m");
+ goto fail;
+ }
+
+ if (!isempty(s->user) || !isempty(s->group)) {
+
+ socket_unwatch_control_pid(s);
+ s->control_command_id = SOCKET_EXEC_START_CHOWN;
+ /* The chown step is an internal helper, not a user-configured ExecCommand. */
+ s->control_command = NULL;
+
+ r = socket_chown(s, &s->control_pid);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'start-chown' task: %m");
+ goto fail;
+ }
+
+ socket_set_state(s, SOCKET_START_CHOWN);
+ } else
+ socket_enter_start_post(s);
+
+ return;
+
+fail:
+ socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES);
+}
+
+/* First step of startup: run ExecStartPre= if configured, otherwise move on to
+ * opening/chowning the fds. Also warns about stray processes left in the cgroup. */
+static void socket_enter_start_pre(Socket *s) {
+ int r;
+ assert(s);
+
+ socket_unwatch_control_pid(s);
+
+ unit_warn_leftover_processes(UNIT(s), unit_log_leftover_process_start);
+
+ s->control_command_id = SOCKET_EXEC_START_PRE;
+ s->control_command = s->exec_command[SOCKET_EXEC_START_PRE];
+
+ if (s->control_command) {
+ pidref_done(&s->control_pid);
+
+ r = socket_spawn(s, s->control_command, &s->control_pid);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'start-pre' task: %m");
+ socket_enter_dead(s, SOCKET_FAILURE_RESOURCES);
+ return;
+ }
+
+ socket_set_state(s, SOCKET_START_PRE);
+ } else
+ socket_enter_start_chown(s);
+}
+
+static void flush_ports(Socket *s) {
+ assert(s);
+
+ /* Flush all incoming traffic, regardless if actual bytes or new connections, so that this socket isn't busy
+ * anymore */
+
+ LIST_FOREACH(port, p, s->ports) {
+ if (p->fd < 0)
+ continue;
+
+ /* Best effort: drain both pending connections and pending bytes. */
+ (void) flush_accept(p->fd);
+ (void) flush_fd(p->fd);
+ }
+}
+
+/* Activate the triggered service. cfd_in < 0 is the Accept=no path (activate
+ * the single configured service); cfd_in >= 0 is the Accept=yes path (spawn a
+ * per-connection service instance). Ownership of cfd_in is taken in all cases. */
+static void socket_enter_running(Socket *s, int cfd_in) {
+ /* Note that this call takes possession of the connection fd passed. It either has to assign it
+ * somewhere or close it. */
+ _cleanup_close_ int cfd = cfd_in;
+
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ int r;
+
+ assert(s);
+
+ /* We don't take connections anymore if we are supposed to shut down anyway */
+ if (unit_stop_pending(UNIT(s))) {
+
+ log_unit_debug(UNIT(s), "Suppressing connection request since unit stop is scheduled.");
+
+ if (cfd >= 0)
+ goto refuse;
+
+ flush_ports(s);
+ return;
+ }
+
+ if (!ratelimit_below(&s->trigger_limit)) {
+ log_unit_warning(UNIT(s), "Trigger limit hit, refusing further activation.");
+ socket_enter_stop_pre(s, SOCKET_FAILURE_TRIGGER_LIMIT_HIT);
+ goto refuse;
+ }
+
+ if (cfd < 0) { /* Accept=no case */
+ bool pending = false;
+ Unit *other;
+
+ /* If there's already a start pending don't bother to do anything */
+ UNIT_FOREACH_DEPENDENCY(other, UNIT(s), UNIT_ATOM_TRIGGERS)
+ if (unit_active_or_pending(other)) {
+ pending = true;
+ break;
+ }
+
+ if (!pending) {
+ if (!UNIT_ISSET(s->service)) {
+ r = log_unit_warning_errno(UNIT(s), SYNTHETIC_ERRNO(ENOENT),
+ "Service to activate vanished, refusing activation.");
+ goto fail;
+ }
+
+ r = manager_add_job(UNIT(s)->manager, JOB_START, UNIT_DEREF(s->service), JOB_REPLACE, NULL, &error, NULL);
+ if (r < 0)
+ goto queue_error;
+ }
+
+ socket_set_state(s, SOCKET_RUNNING);
+ } else { /* Accept=yes case */
+ _cleanup_(socket_peer_unrefp) SocketPeer *p = NULL;
+ Unit *service;
+
+ if (s->n_connections >= s->max_connections) {
+ log_unit_warning(UNIT(s), "Too many incoming connections (%u), dropping connection.",
+ s->n_connections);
+ goto refuse;
+ }
+
+ /* Enforce the per-source connection limit, if configured. */
+ if (s->max_connections_per_source > 0) {
+ r = socket_acquire_peer(s, cfd, &p);
+ if (r < 0) {
+ if (ERRNO_IS_DISCONNECT(r))
+ return;
+ /* We didn't have enough resources to acquire peer information, let's fail. */
+ goto fail;
+ }
+ if (r > 0 && p->n_ref > s->max_connections_per_source) {
+ _cleanup_free_ char *t = NULL;
+
+ (void) sockaddr_pretty(&p->peer.sa, p->peer_salen, true, false, &t);
+
+ log_unit_warning(UNIT(s),
+ "Too many incoming connections (%u) from source %s, dropping connection.",
+ p->n_ref, strnull(t));
+ goto refuse;
+ }
+ }
+
+ r = socket_load_service_unit(s, cfd, &service);
+ if (r < 0) {
+ if (ERRNO_IS_DISCONNECT(r))
+ return;
+
+ log_unit_warning_errno(UNIT(s), r, "Failed to load connection service unit: %m");
+ goto fail;
+ }
+
+ r = unit_add_two_dependencies(UNIT(s), UNIT_BEFORE, UNIT_TRIGGERS, service,
+ false, UNIT_DEPENDENCY_IMPLICIT);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to add Before=/Triggers= dependencies on connection unit: %m");
+ goto fail;
+ }
+
+ s->n_accepted++;
+
+ r = service_set_socket_fd(SERVICE(service), cfd, s, p, s->selinux_context_from_net);
+ if (r < 0) {
+ if (ERRNO_IS_DISCONNECT(r))
+ return;
+
+ log_unit_warning_errno(UNIT(s), r, "Failed to set socket on service: %m");
+ goto fail;
+ }
+
+ TAKE_FD(cfd); /* We passed ownership of the fd to the service now. Forget it here. */
+ s->n_connections++;
+
+ r = manager_add_job(UNIT(s)->manager, JOB_START, service, JOB_REPLACE, NULL, &error, NULL);
+ if (r < 0) {
+ /* We failed to activate the new service, but it still exists. Let's make sure the
+ * service closes and forgets the connection fd again, immediately. */
+ service_release_socket_fd(SERVICE(service));
+ goto queue_error;
+ }
+
+ /* Notify clients about changed counters */
+ unit_add_to_dbus_queue(UNIT(s));
+ }
+
+ return;
+
+refuse:
+ /* cfd (if any) is closed by the _cleanup_close_ attribute on return. */
+ s->n_refused++;
+ return;
+
+queue_error:
+ if (ERRNO_IS_RESOURCE(r))
+ log_unit_warning(UNIT(s), "Failed to queue service startup job: %s",
+ bus_error_message(&error, r));
+ else
+ log_unit_warning(UNIT(s), "Failed to queue service startup job (Maybe the service file is missing or not a %s unit?): %s",
+ cfd >= 0 ? "template" : "non-template",
+ bus_error_message(&error, r));
+
+fail:
+ socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES);
+}
+
+/* Advance to the next ExecCommand in the current command list and spawn it.
+ * On spawn failure, pick the error path matching the current state. */
+static void socket_run_next(Socket *s) {
+ int r;
+
+ assert(s);
+ assert(s->control_command);
+ assert(s->control_command->command_next);
+
+ socket_unwatch_control_pid(s);
+
+ s->control_command = s->control_command->command_next;
+
+ pidref_done(&s->control_pid);
+
+ r = socket_spawn(s, s->control_command, &s->control_pid);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to spawn next task: %m");
+
+ if (s->state == SOCKET_START_POST)
+ socket_enter_stop_pre(s, SOCKET_FAILURE_RESOURCES);
+ else if (s->state == SOCKET_STOP_POST)
+ socket_enter_dead(s, SOCKET_FAILURE_RESOURCES);
+ else
+ socket_enter_signal(s, SOCKET_FINAL_SIGTERM, SOCKET_FAILURE_RESOURCES);
+ }
+}
+
+/* Unit vtable start method. Returns -EAGAIN while tearing down, 0 if a start
+ * is already in progress, 1 when a fresh start was initiated, or an error. */
+static int socket_start(Unit *u) {
+ Socket *s = SOCKET(u);
+ int r;
+
+ assert(s);
+
+ /* We cannot fulfill this request right now, try again later
+ * please! */
+ if (IN_SET(s->state,
+ SOCKET_STOP_PRE,
+ SOCKET_STOP_PRE_SIGKILL,
+ SOCKET_STOP_PRE_SIGTERM,
+ SOCKET_STOP_POST,
+ SOCKET_FINAL_SIGTERM,
+ SOCKET_FINAL_SIGKILL,
+ SOCKET_CLEANING))
+ return -EAGAIN;
+
+ /* Already on it! */
+ if (IN_SET(s->state,
+ SOCKET_START_PRE,
+ SOCKET_START_CHOWN,
+ SOCKET_START_POST))
+ return 0;
+
+ /* Cannot run this without the service being around */
+ if (UNIT_ISSET(s->service)) {
+ Service *service;
+
+ service = SERVICE(UNIT_DEREF(s->service));
+
+ if (UNIT(service)->load_state != UNIT_LOADED)
+ return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOENT),
+ "Socket service %s not loaded, refusing.", UNIT(service)->id);
+
+ /* If the service is already active we cannot start the
+ * socket */
+ if (!IN_SET(service->state,
+ SERVICE_DEAD, SERVICE_DEAD_BEFORE_AUTO_RESTART, SERVICE_FAILED, SERVICE_FAILED_BEFORE_AUTO_RESTART,
+ SERVICE_AUTO_RESTART, SERVICE_AUTO_RESTART_QUEUED))
+ return log_unit_error_errno(u, SYNTHETIC_ERRNO(EBUSY),
+ "Socket service %s already active, refusing.", UNIT(service)->id);
+ }
+
+ assert(IN_SET(s->state, SOCKET_DEAD, SOCKET_FAILED));
+
+ r = unit_acquire_invocation_id(u);
+ if (r < 0)
+ return r;
+
+ /* Reset per-invocation bookkeeping before kicking off start-pre. */
+ s->result = SOCKET_SUCCESS;
+ exec_command_reset_status_list_array(s->exec_command, _SOCKET_EXEC_COMMAND_MAX);
+
+ u->reset_accounting = true;
+
+ socket_enter_start_pre(s);
+ return 1;
+}
+
+/* Unit vtable stop method: pick the appropriate teardown entry point for the
+ * current state (signal running control processes, abort cleaning, or run stop-pre). */
+static int socket_stop(Unit *u) {
+ Socket *s = SOCKET(u);
+
+ assert(s);
+
+ /* Already on it */
+ if (IN_SET(s->state,
+ SOCKET_STOP_PRE,
+ SOCKET_STOP_PRE_SIGTERM,
+ SOCKET_STOP_PRE_SIGKILL,
+ SOCKET_STOP_POST,
+ SOCKET_FINAL_SIGTERM,
+ SOCKET_FINAL_SIGKILL))
+ return 0;
+
+ /* If there's already something running we go directly into
+ * kill mode. */
+ if (IN_SET(s->state,
+ SOCKET_START_PRE,
+ SOCKET_START_CHOWN,
+ SOCKET_START_POST)) {
+ socket_enter_signal(s, SOCKET_STOP_PRE_SIGTERM, SOCKET_SUCCESS);
+ return -EAGAIN;
+ }
+
+ /* If we are currently cleaning, then abort it, brutally. */
+ if (s->state == SOCKET_CLEANING) {
+ socket_enter_signal(s, SOCKET_FINAL_SIGKILL, SOCKET_SUCCESS);
+ return 0;
+ }
+
+ assert(IN_SET(s->state, SOCKET_LISTENING, SOCKET_RUNNING));
+
+ socket_enter_stop_pre(s, SOCKET_SUCCESS);
+ return 1;
+}
+
+/* Serialize runtime state (state, result, counters, control pid, open port fds)
+ * for daemon re-execution. Each open fd is duplicated into the fd set and
+ * written with enough addressing info to match it back on deserialization. */
+static int socket_serialize(Unit *u, FILE *f, FDSet *fds) {
+ Socket *s = SOCKET(u);
+ int r;
+
+ assert(u);
+ assert(f);
+ assert(fds);
+
+ (void) serialize_item(f, "state", socket_state_to_string(s->state));
+ (void) serialize_item(f, "result", socket_result_to_string(s->result));
+ (void) serialize_item_format(f, "n-accepted", "%u", s->n_accepted);
+ (void) serialize_item_format(f, "n-refused", "%u", s->n_refused);
+ (void) serialize_pidref(f, fds, "control-pid", &s->control_pid);
+
+ if (s->control_command_id >= 0)
+ (void) serialize_item(f, "control-command", socket_exec_command_to_string(s->control_command_id));
+
+ LIST_FOREACH(port, p, s->ports) {
+ int copy;
+
+ if (p->fd < 0)
+ continue;
+
+ copy = fdset_put_dup(fds, p->fd);
+ if (copy < 0)
+ return log_unit_warning_errno(u, copy, "Failed to serialize socket fd: %m");
+
+ /* Key choice encodes the port type so deserialization can re-match it. */
+ if (p->type == SOCKET_SOCKET) {
+ _cleanup_free_ char *t = NULL;
+
+ r = socket_address_print(&p->address, &t);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to format socket address: %m");
+
+ if (socket_address_family(&p->address) == AF_NETLINK)
+ (void) serialize_item_format(f, "netlink", "%i %s", copy, t);
+ else
+ (void) serialize_item_format(f, "socket", "%i %i %s", copy, p->address.type, t);
+ } else if (p->type == SOCKET_SPECIAL)
+ (void) serialize_item_format(f, "special", "%i %s", copy, p->path);
+ else if (p->type == SOCKET_MQUEUE)
+ (void) serialize_item_format(f, "mqueue", "%i %s", copy, p->path);
+ else if (p->type == SOCKET_USB_FUNCTION)
+ (void) serialize_item_format(f, "ffs", "%i %s", copy, p->path);
+ else {
+ assert(p->type == SOCKET_FIFO);
+ (void) serialize_item_format(f, "fifo", "%i %s", copy, p->path);
+ }
+ }
+
+ (void) serialize_ratelimit(f, "trigger-ratelimit", &s->trigger_limit);
+
+ return 0;
+}
+
+/* Counterpart to socket_serialize(): parse one key/value pair and restore the
+ * corresponding state. fd-carrying keys ("fifo", "special", "mqueue", "socket",
+ * "netlink", "ffs") are matched back to a still-fd-less port by type and
+ * address/path; parse failures are logged at debug level and otherwise ignored. */
+static int socket_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
+ Socket *s = SOCKET(u);
+ int r;
+
+ assert(u);
+ assert(key);
+ assert(value);
+
+ if (streq(key, "state")) {
+ SocketState state;
+
+ state = socket_state_from_string(value);
+ if (state < 0)
+ log_unit_debug(u, "Failed to parse state value: %s", value);
+ else
+ s->deserialized_state = state;
+ } else if (streq(key, "result")) {
+ SocketResult f;
+
+ f = socket_result_from_string(value);
+ if (f < 0)
+ log_unit_debug(u, "Failed to parse result value: %s", value);
+ else if (f != SOCKET_SUCCESS)
+ s->result = f;
+
+ } else if (streq(key, "n-accepted")) {
+ unsigned k;
+
+ if (safe_atou(value, &k) < 0)
+ log_unit_debug(u, "Failed to parse n-accepted value: %s", value);
+ else
+ s->n_accepted += k;
+ } else if (streq(key, "n-refused")) {
+ unsigned k;
+
+ if (safe_atou(value, &k) < 0)
+ log_unit_debug(u, "Failed to parse n-refused value: %s", value);
+ else
+ s->n_refused += k;
+ } else if (streq(key, "control-pid")) {
+ pidref_done(&s->control_pid);
+ (void) deserialize_pidref(fds, value, &s->control_pid);
+
+ } else if (streq(key, "control-command")) {
+ SocketExecCommand id;
+
+ id = socket_exec_command_from_string(value);
+ if (id < 0)
+ log_unit_debug(u, "Failed to parse exec-command value: %s", value);
+ else {
+ s->control_command_id = id;
+ s->control_command = s->exec_command[id];
+ }
+ } else if (streq(key, "fifo")) {
+ _cleanup_free_ char *fdv = NULL;
+ bool found = false;
+ int fd;
+
+ /* Format: "<fd> <path>"; extract_first_word() advances value past the fd. */
+ r = extract_first_word(&value, &fdv, NULL, 0);
+ if (r <= 0) {
+ log_unit_debug(u, "Failed to parse fifo value: %s", value);
+ return 0;
+ }
+
+ fd = parse_fd(fdv);
+ if (fd < 0 || !fdset_contains(fds, fd)) {
+ log_unit_debug(u, "Invalid fifo value: %s", fdv);
+ return 0;
+ }
+
+ LIST_FOREACH(port, p, s->ports)
+ if (p->fd < 0 &&
+ p->type == SOCKET_FIFO &&
+ path_equal_or_inode_same(p->path, value, 0)) {
+ p->fd = fdset_remove(fds, fd);
+ found = true;
+ break;
+ }
+ if (!found)
+ log_unit_debug(u, "No matching fifo socket found: %s", value);
+
+ } else if (streq(key, "special")) {
+ _cleanup_free_ char *fdv = NULL;
+ bool found = false;
+ int fd;
+
+ r = extract_first_word(&value, &fdv, NULL, 0);
+ if (r <= 0) {
+ log_unit_debug(u, "Failed to parse special value: %s", value);
+ return 0;
+ }
+
+ fd = parse_fd(fdv);
+ if (fd < 0 || !fdset_contains(fds, fd)) {
+ log_unit_debug(u, "Invalid special value: %s", fdv);
+ return 0;
+ }
+
+ LIST_FOREACH(port, p, s->ports)
+ if (p->fd < 0 &&
+ p->type == SOCKET_SPECIAL &&
+ path_equal_or_inode_same(p->path, value, 0)) {
+ p->fd = fdset_remove(fds, fd);
+ found = true;
+ break;
+ }
+ if (!found)
+ log_unit_debug(u, "No matching special socket found: %s", value);
+
+ } else if (streq(key, "mqueue")) {
+ _cleanup_free_ char *fdv = NULL;
+ bool found = false;
+ int fd;
+
+ r = extract_first_word(&value, &fdv, NULL, 0);
+ if (r <= 0) {
+ log_unit_debug(u, "Failed to parse mqueue value: %s", value);
+ return 0;
+ }
+
+ fd = parse_fd(fdv);
+ if (fd < 0 || !fdset_contains(fds, fd)) {
+ log_unit_debug(u, "Invalid mqueue value: %s", fdv);
+ return 0;
+ }
+
+ LIST_FOREACH(port, p, s->ports)
+ if (p->fd < 0 &&
+ p->type == SOCKET_MQUEUE &&
+ streq(p->path, value)) {
+ p->fd = fdset_remove(fds, fd);
+ found = true;
+ break;
+ }
+ if (!found)
+ log_unit_debug(u, "No matching mqueue socket found: %s", value);
+
+ } else if (streq(key, "socket")) {
+ _cleanup_free_ char *fdv = NULL, *typev = NULL;
+ bool found = false;
+ int fd, type;
+
+ /* Format: "<fd> <socktype> <address>". */
+ r = extract_first_word(&value, &fdv, NULL, 0);
+ if (r <= 0) {
+ log_unit_debug(u, "Failed to parse socket fd from value: %s", value);
+ return 0;
+ }
+
+ fd = parse_fd(fdv);
+ if (fd < 0 || !fdset_contains(fds, fd)) {
+ log_unit_debug(u, "Invalid socket fd: %s", fdv);
+ return 0;
+ }
+
+ r = extract_first_word(&value, &typev, NULL, 0);
+ if (r <= 0) {
+ log_unit_debug(u, "Failed to parse socket type from value: %s", value);
+ return 0;
+ }
+
+ if (safe_atoi(typev, &type) < 0 || type < 0) {
+ log_unit_debug(u, "Invalid socket type: %s", typev);
+ return 0;
+ }
+
+ LIST_FOREACH(port, p, s->ports)
+ if (p->fd < 0 &&
+ socket_address_is(&p->address, value, type)) {
+ p->fd = fdset_remove(fds, fd);
+ found = true;
+ break;
+ }
+ if (!found)
+ log_unit_debug(u, "No matching %s socket found: %s",
+ socket_address_type_to_string(type), value);
+
+ } else if (streq(key, "netlink")) {
+ _cleanup_free_ char *fdv = NULL;
+ bool found = false;
+ int fd;
+
+ r = extract_first_word(&value, &fdv, NULL, 0);
+ if (r <= 0) {
+ log_unit_debug(u, "Failed to parse socket value: %s", value);
+ return 0;
+ }
+
+ fd = parse_fd(fdv);
+ if (fd < 0 || !fdset_contains(fds, fd)) {
+ log_unit_debug(u, "Invalid socket value: %s", fdv);
+ return 0;
+ }
+
+ LIST_FOREACH(port, p, s->ports)
+ if (p->fd < 0 &&
+ socket_address_is_netlink(&p->address, value)) {
+ p->fd = fdset_remove(fds, fd);
+ found = true;
+ break;
+ }
+ if (!found)
+ log_unit_debug(u, "No matching netlink socket found: %s", value);
+
+ } else if (streq(key, "ffs")) {
+ _cleanup_free_ char *fdv = NULL;
+ bool found = false;
+ int fd;
+
+ r = extract_first_word(&value, &fdv, NULL, 0);
+ if (r <= 0) {
+ log_unit_debug(u, "Failed to parse ffs value: %s", value);
+ return 0;
+ }
+
+ fd = parse_fd(fdv);
+ if (fd < 0 || !fdset_contains(fds, fd)) {
+ log_unit_debug(u, "Invalid ffs value: %s", fdv);
+ return 0;
+ }
+
+ LIST_FOREACH(port, p, s->ports)
+ if (p->fd < 0 &&
+ p->type == SOCKET_USB_FUNCTION &&
+ path_equal_or_inode_same(p->path, value, 0)) {
+ p->fd = fdset_remove(fds, fd);
+ found = true;
+ break;
+ }
+ if (!found)
+ log_unit_debug(u, "No matching ffs socket found: %s", value);
+
+ } else if (streq(key, "trigger-ratelimit"))
+ deserialize_ratelimit(&s->trigger_limit, key, value);
+
+ else
+ log_unit_debug(UNIT(s), "Unknown serialization key: %s", key);
+
+ return 0;
+}
+
+/* Match leftover fds (e.g. passed in from the service manager) to fd-less
+ * SOCKET_SOCKET ports by address; a successful match forces the deserialized
+ * state to SOCKET_LISTENING. */
+static void socket_distribute_fds(Unit *u, FDSet *fds) {
+ Socket *s = SOCKET(u);
+
+ assert(u);
+
+ LIST_FOREACH(port, p, s->ports) {
+ int fd;
+
+ if (p->type != SOCKET_SOCKET)
+ continue;
+
+ if (p->fd >= 0)
+ continue;
+
+ FDSET_FOREACH(fd, fds) {
+ if (socket_address_matches_fd(&p->address, fd)) {
+ p->fd = fdset_remove(fds, fd);
+ s->deserialized_state = SOCKET_LISTENING;
+ break;
+ }
+ }
+ }
+}
+
+/* Translate the socket-specific state into the generic UnitActiveState. */
+static UnitActiveState socket_active_state(Unit *u) {
+ assert(u);
+
+ return state_translation_table[SOCKET(u)->state];
+}
+
+/* Return the socket-specific sub-state name for status output. */
+static const char *socket_sub_state_to_string(Unit *u) {
+ assert(u);
+
+ return socket_state_to_string(SOCKET(u)->state);
+}
+
+/* Format a port's address for display: printed socket address for
+ * SOCKET_SOCKET ports, otherwise a copy of the filesystem path.
+ * Returns 0 on success, negative errno on failure; *ret is newly allocated. */
+int socket_port_to_address(const SocketPort *p, char **ret) {
+ _cleanup_free_ char *address = NULL;
+ int r;
+
+ assert(p);
+ assert(ret);
+
+ switch (p->type) {
+ case SOCKET_SOCKET: {
+ r = socket_address_print(&p->address, &address);
+ if (r < 0)
+ return r;
+
+ break;
+ }
+
+ case SOCKET_SPECIAL:
+ case SOCKET_MQUEUE:
+ case SOCKET_FIFO:
+ case SOCKET_USB_FUNCTION:
+ address = strdup(p->path);
+ if (!address)
+ return -ENOMEM;
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ *ret = TAKE_PTR(address);
+
+ return 0;
+}
+
+/* Return the D-Bus/Listen* style name for a port's type ("Stream", "Datagram",
+ * "Netlink", "FIFO", ...), or NULL if the type has no name. */
+const char* socket_port_type_to_string(SocketPort *p) {
+
+ assert(p);
+
+ switch (p->type) {
+
+ case SOCKET_SOCKET:
+
+ switch (p->address.type) {
+
+ case SOCK_STREAM:
+ return "Stream";
+
+ case SOCK_DGRAM:
+ return "Datagram";
+
+ case SOCK_SEQPACKET:
+ return "SequentialPacket";
+
+ case SOCK_RAW:
+ /* Raw sockets are only nameable when they are netlink. */
+ if (socket_address_family(&p->address) == AF_NETLINK)
+ return "Netlink";
+
+ _fallthrough_;
+ default:
+ return NULL;
+ }
+
+ case SOCKET_SPECIAL:
+ return "Special";
+
+ case SOCKET_MQUEUE:
+ return "MessageQueue";
+
+ case SOCKET_FIFO:
+ return "FIFO";
+
+ case SOCKET_USB_FUNCTION:
+ return "USBFunction";
+
+ default:
+ return NULL;
+ }
+}
+
+/* Inverse of socket_port_type_to_string(): map a type name back to the
+ * SocketType enum (all four socket kind names collapse to SOCKET_SOCKET). */
+SocketType socket_port_type_from_string(const char *s) {
+ assert(s);
+
+ if (STR_IN_SET(s, "Stream", "Datagram", "SequentialPacket", "Netlink"))
+ return SOCKET_SOCKET;
+ else if (streq(s, "Special"))
+ return SOCKET_SPECIAL;
+ else if (streq(s, "MessageQueue"))
+ return SOCKET_MQUEUE;
+ else if (streq(s, "FIFO"))
+ return SOCKET_FIFO;
+ else if (streq(s, "USBFunction"))
+ return SOCKET_USB_FUNCTION;
+ else
+ return _SOCKET_TYPE_INVALID;
+}
+
+/* A socket unit may be garbage-collected only once no accepted connection
+ * services reference it anymore. */
+static bool socket_may_gc(Unit *u) {
+ Socket *s = SOCKET(u);
+
+ assert(u);
+
+ return s->n_connections == 0;
+}
+
+/* accept4() wrapper: returns the new connection fd (O_NONBLOCK|O_CLOEXEC), or
+ * a negative errno, with transient network errors normalized to -EAGAIN. */
+static int socket_accept_do(Socket *s, int fd) {
+ int cfd;
+
+ assert(s);
+ assert(fd >= 0);
+
+ cfd = accept4(fd, NULL, NULL, SOCK_NONBLOCK|SOCK_CLOEXEC);
+ if (cfd < 0)
+ /* Convert transient network errors into clean and well-defined EAGAIN */
+ return ERRNO_IS_ACCEPT_AGAIN(errno) ? -EAGAIN : -errno;
+
+ return cfd;
+}
+
+/* Accept a connection from inside the unit's cgroup when BPF firewalling is in
+ * use, so the connection socket inherits the cgroup association. Forks a short
+ * lived "(sd-accept)" helper and receives the fd back over a socketpair; falls
+ * back to a plain accept when not needed. Returns the connection fd, -EAGAIN
+ * for a spurious accept, or a negative errno. */
+static int socket_accept_in_cgroup(Socket *s, SocketPort *p, int fd) {
+ _cleanup_(pidref_done) PidRef pid = PIDREF_NULL;
+ _cleanup_close_pair_ int pair[2] = EBADF_PAIR;
+ int cfd, r;
+
+ assert(s);
+ assert(p);
+ assert(fd >= 0);
+
+ /* Similar to socket_address_listen_in_cgroup(), but for accept() rather than socket(): make sure that any
+ * connection socket is also properly associated with the cgroup. */
+
+ if (!IN_SET(p->address.sockaddr.sa.sa_family, AF_INET, AF_INET6))
+ goto shortcut;
+
+ r = bpf_firewall_supported();
+ if (r < 0)
+ return r;
+ if (r == BPF_FIREWALL_UNSUPPORTED)
+ goto shortcut;
+
+ if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pair) < 0)
+ return log_unit_error_errno(UNIT(s), errno, "Failed to create communication channel: %m");
+
+ r = unit_fork_helper_process(UNIT(s), "(sd-accept)", &pid);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r, "Failed to fork off accept stub process: %m");
+ if (r == 0) {
+ /* Child */
+
+ pair[0] = safe_close(pair[0]);
+
+ cfd = socket_accept_do(s, fd);
+ if (cfd == -EAGAIN) /* spurious accept() */
+ _exit(EXIT_SUCCESS);
+ if (cfd < 0) {
+ log_unit_error_errno(UNIT(s), cfd, "Failed to accept connection socket: %m");
+ _exit(EXIT_FAILURE);
+ }
+
+ r = send_one_fd(pair[1], cfd, 0);
+ if (r < 0) {
+ log_unit_error_errno(UNIT(s), r, "Failed to send connection socket to parent: %m");
+ _exit(EXIT_FAILURE);
+ }
+
+ _exit(EXIT_SUCCESS);
+ }
+
+ pair[1] = safe_close(pair[1]);
+ cfd = receive_one_fd(pair[0], 0);
+
+ /* We synchronously wait for the helper, as it shouldn't be slow */
+ r = wait_for_terminate_and_check("(sd-accept)", pid.pid, WAIT_LOG_ABNORMAL);
+ if (r < 0) {
+ safe_close(cfd);
+ return r;
+ }
+
+ /* If we received no fd, we got EIO here. If this happens with a process exit code of EXIT_SUCCESS
+ * this is a spurious accept(), let's convert that back to EAGAIN here. */
+ if (cfd == -EIO)
+ return -EAGAIN;
+ if (cfd < 0)
+ return log_unit_error_errno(UNIT(s), cfd, "Failed to receive connection socket: %m");
+
+ return cfd;
+
+shortcut:
+ cfd = socket_accept_do(s, fd);
+ if (cfd == -EAGAIN) /* spurious accept(), skip it silently */
+ return -EAGAIN;
+ if (cfd < 0)
+ return log_unit_error_errno(UNIT(s), cfd, "Failed to accept connection socket: %m");
+
+ return cfd;
+}
+
+/* sd-event I/O callback for a listening port: accept (in Accept=yes mode) and
+ * hand the traffic over to socket_enter_running(). Errors drive the socket
+ * into stop-pre with SOCKET_FAILURE_RESOURCES. */
+static int socket_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+ SocketPort *p = ASSERT_PTR(userdata);
+ int cfd = -EBADF;
+
+ assert(fd >= 0);
+
+ /* Ignore events that race with a state change away from listening. */
+ if (p->socket->state != SOCKET_LISTENING)
+ return 0;
+
+ log_unit_debug(UNIT(p->socket), "Incoming traffic");
+
+ if (revents != EPOLLIN) {
+ if (revents & EPOLLHUP)
+ log_unit_error(UNIT(p->socket), "Got POLLHUP on a listening socket. The service probably invoked shutdown() on it, and should better not do that.");
+ else
+ log_unit_error(UNIT(p->socket), "Got unexpected poll event (0x%x) on socket.", revents);
+ goto fail;
+ }
+
+ if (p->socket->accept &&
+ p->type == SOCKET_SOCKET &&
+ socket_address_can_accept(&p->address)) {
+
+ cfd = socket_accept_in_cgroup(p->socket, p, fd);
+ if (cfd == -EAGAIN) /* Spurious accept() */
+ return 0;
+ if (cfd < 0)
+ goto fail;
+
+ socket_apply_socket_options(p->socket, p, cfd);
+ }
+
+ /* socket_enter_running() takes ownership of cfd (may still be -EBADF here). */
+ socket_enter_running(p->socket, cfd);
+ return 0;
+
+fail:
+ socket_enter_stop_pre(p->socket, SOCKET_FAILURE_RESOURCES);
+ return 0;
+}
+
+/* SIGCHLD handler for the control process: classify the exit, run the next
+ * queued ExecCommand if one remains and this one succeeded, otherwise advance
+ * the state machine according to the phase that just finished. */
+static void socket_sigchld_event(Unit *u, pid_t pid, int code, int status) {
+ Socket *s = SOCKET(u);
+ SocketResult f;
+
+ assert(s);
+ assert(pid >= 0);
+
+ if (pid != s->control_pid.pid)
+ return;
+
+ pidref_done(&s->control_pid);
+
+ /* Map the wait status onto a SocketResult. */
+ if (is_clean_exit(code, status, EXIT_CLEAN_COMMAND, NULL))
+ f = SOCKET_SUCCESS;
+ else if (code == CLD_EXITED)
+ f = SOCKET_FAILURE_EXIT_CODE;
+ else if (code == CLD_KILLED)
+ f = SOCKET_FAILURE_SIGNAL;
+ else if (code == CLD_DUMPED)
+ f = SOCKET_FAILURE_CORE_DUMP;
+ else
+ assert_not_reached();
+
+ if (s->control_command) {
+ exec_status_exit(&s->control_command->exec_status, &s->exec_context, pid, code, status);
+
+ /* Honour the "-" prefix on the ExecCommand. */
+ if (s->control_command->flags & EXEC_COMMAND_IGNORE_FAILURE)
+ f = SOCKET_SUCCESS;
+ }
+
+ unit_log_process_exit(
+ u,
+ "Control process",
+ socket_exec_command_to_string(s->control_command_id),
+ f == SOCKET_SUCCESS,
+ code, status);
+
+ if (s->result == SOCKET_SUCCESS)
+ s->result = f;
+
+ if (s->control_command &&
+ s->control_command->command_next &&
+ f == SOCKET_SUCCESS) {
+
+ log_unit_debug(u, "Running next command for state %s", socket_state_to_string(s->state));
+ socket_run_next(s);
+ } else {
+ s->control_command = NULL;
+ s->control_command_id = _SOCKET_EXEC_COMMAND_INVALID;
+
+ /* No further commands for this step, so let's figure
+ * out what to do next */
+
+ log_unit_debug(u, "Got final SIGCHLD for state %s", socket_state_to_string(s->state));
+
+ switch (s->state) {
+
+ case SOCKET_START_PRE:
+ if (f == SOCKET_SUCCESS)
+ socket_enter_start_chown(s);
+ else
+ socket_enter_signal(s, SOCKET_FINAL_SIGTERM, f);
+ break;
+
+ case SOCKET_START_CHOWN:
+ if (f == SOCKET_SUCCESS)
+ socket_enter_start_post(s);
+ else
+ socket_enter_stop_pre(s, f);
+ break;
+
+ case SOCKET_START_POST:
+ if (f == SOCKET_SUCCESS)
+ socket_enter_listening(s);
+ else
+ socket_enter_stop_pre(s, f);
+ break;
+
+ case SOCKET_STOP_PRE:
+ case SOCKET_STOP_PRE_SIGTERM:
+ case SOCKET_STOP_PRE_SIGKILL:
+ socket_enter_stop_post(s, f);
+ break;
+
+ case SOCKET_STOP_POST:
+ case SOCKET_FINAL_SIGTERM:
+ case SOCKET_FINAL_SIGKILL:
+ socket_enter_dead(s, f);
+ break;
+
+ case SOCKET_CLEANING:
+
+ /* The cleaning outcome is tracked separately from the unit result. */
+ if (s->clean_result == SOCKET_SUCCESS)
+ s->clean_result = f;
+
+ socket_enter_dead(s, SOCKET_SUCCESS);
+ break;
+
+ default:
+ assert_not_reached();
+ }
+ }
+
+ /* Notify clients about changed exit status */
+ unit_add_to_dbus_queue(u);
+}
+
+/* Timeout callback: escalate the state machine when the current phase takes
+ * too long — SIGTERM, then (if SendSIGKILL=yes) SIGKILL, then give up. */
+static int socket_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) {
+ Socket *s = SOCKET(userdata);
+
+ assert(s);
+ assert(s->timer_event_source == source);
+
+ switch (s->state) {
+
+ case SOCKET_START_PRE:
+ log_unit_warning(UNIT(s), "Starting timed out. Terminating.");
+ socket_enter_signal(s, SOCKET_FINAL_SIGTERM, SOCKET_FAILURE_TIMEOUT);
+ break;
+
+ case SOCKET_START_CHOWN:
+ case SOCKET_START_POST:
+ log_unit_warning(UNIT(s), "Starting timed out. Stopping.");
+ socket_enter_stop_pre(s, SOCKET_FAILURE_TIMEOUT);
+ break;
+
+ case SOCKET_STOP_PRE:
+ log_unit_warning(UNIT(s), "Stopping timed out. Terminating.");
+ socket_enter_signal(s, SOCKET_STOP_PRE_SIGTERM, SOCKET_FAILURE_TIMEOUT);
+ break;
+
+ case SOCKET_STOP_PRE_SIGTERM:
+ if (s->kill_context.send_sigkill) {
+ log_unit_warning(UNIT(s), "Stopping timed out. Killing.");
+ socket_enter_signal(s, SOCKET_STOP_PRE_SIGKILL, SOCKET_FAILURE_TIMEOUT);
+ } else {
+ log_unit_warning(UNIT(s), "Stopping timed out. Skipping SIGKILL. Ignoring.");
+ socket_enter_stop_post(s, SOCKET_FAILURE_TIMEOUT);
+ }
+ break;
+
+ case SOCKET_STOP_PRE_SIGKILL:
+ log_unit_warning(UNIT(s), "Processes still around after SIGKILL. Ignoring.");
+ socket_enter_stop_post(s, SOCKET_FAILURE_TIMEOUT);
+ break;
+
+ case SOCKET_STOP_POST:
+ log_unit_warning(UNIT(s), "Stopping timed out (2). Terminating.");
+ socket_enter_signal(s, SOCKET_FINAL_SIGTERM, SOCKET_FAILURE_TIMEOUT);
+ break;
+
+ case SOCKET_FINAL_SIGTERM:
+ if (s->kill_context.send_sigkill) {
+ log_unit_warning(UNIT(s), "Stopping timed out (2). Killing.");
+ socket_enter_signal(s, SOCKET_FINAL_SIGKILL, SOCKET_FAILURE_TIMEOUT);
+ } else {
+ log_unit_warning(UNIT(s), "Stopping timed out (2). Skipping SIGKILL. Ignoring.");
+ socket_enter_dead(s, SOCKET_FAILURE_TIMEOUT);
+ }
+ break;
+
+ case SOCKET_FINAL_SIGKILL:
+ log_unit_warning(UNIT(s), "Still around after SIGKILL (2). Entering failed mode.");
+ socket_enter_dead(s, SOCKET_FAILURE_TIMEOUT);
+ break;
+
+ case SOCKET_CLEANING:
+ log_unit_warning(UNIT(s), "Cleaning timed out. killing.");
+
+ if (s->clean_result == SOCKET_SUCCESS)
+ s->clean_result = SOCKET_FAILURE_TIMEOUT;
+
+ /* Pass an explicit SocketResult enumerator instead of a bare 0, matching
+ * every other socket_enter_signal() call site in this file. The timeout
+ * failure is already recorded in clean_result above. */
+ socket_enter_signal(s, SOCKET_FINAL_SIGKILL, SOCKET_SUCCESS);
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ return 0;
+}
+
+/* Collect all open listening and auxiliary fds into a newly allocated array
+ * for passing to the activated service. Returns the number of fds (with *fds
+ * set to NULL when zero), or -ENOMEM. Caller frees *fds. */
+int socket_collect_fds(Socket *s, int **fds) {
+ size_t k = 0, n = 0;
+ int *rfds;
+
+ assert(s);
+ assert(fds);
+
+ /* Called from the service code for requesting our fds */
+
+ /* First pass: count, so the result array can be sized exactly. */
+ LIST_FOREACH(port, p, s->ports) {
+ if (p->fd >= 0)
+ n++;
+ n += p->n_auxiliary_fds;
+ }
+
+ if (n <= 0) {
+ *fds = NULL;
+ return 0;
+ }
+
+ rfds = new(int, n);
+ if (!rfds)
+ return -ENOMEM;
+
+ LIST_FOREACH(port, p, s->ports) {
+ if (p->fd >= 0)
+ rfds[k++] = p->fd;
+ for (size_t i = 0; i < p->n_auxiliary_fds; ++i)
+ rfds[k++] = p->auxiliary_fds[i];
+ }
+
+ assert(k == n);
+
+ *fds = rfds;
+ return (int) n;
+}
+
+/* Unit vtable reset-failed method: clear failure state and both result fields. */
+static void socket_reset_failed(Unit *u) {
+ Socket *s = SOCKET(u);
+
+ assert(s);
+
+ if (s->state == SOCKET_FAILED)
+ socket_set_state(s, SOCKET_DEAD);
+
+ s->result = SOCKET_SUCCESS;
+ s->clean_result = SOCKET_SUCCESS;
+}
+
+/* Drop one connection reference; called when a per-connection service dies. */
+void socket_connection_unref(Socket *s) {
+ assert(s);
+
+ /* The service is dead. Yay!
+ *
+ * This is strictly for one-instance-per-connection
+ * services. */
+
+ assert(s->n_connections > 0);
+ s->n_connections--;
+
+ log_unit_debug(UNIT(s), "One connection closed, %u left.", s->n_connections);
+}
+
+/* Called when the triggered service changes state (Accept=no only): follow the
+ * service back to listening when it goes away, mark running while it runs, and
+ * propagate its start-limit failures. */
+static void socket_trigger_notify(Unit *u, Unit *other) {
+ Socket *s = SOCKET(u);
+
+ assert(u);
+ assert(other);
+
+ /* Filter out invocations with bogus state */
+ assert(UNIT_IS_LOAD_COMPLETE(other->load_state));
+ assert(other->type == UNIT_SERVICE);
+
+ /* Don't propagate state changes from the service if we are already down */
+ if (!IN_SET(s->state, SOCKET_RUNNING, SOCKET_LISTENING))
+ return;
+
+ /* We don't care for the service state if we are in Accept=yes mode */
+ if (s->accept)
+ return;
+
+ /* Propagate start limit hit state */
+ if (other->start_limit_hit) {
+ socket_enter_stop_pre(s, SOCKET_FAILURE_SERVICE_START_LIMIT_HIT);
+ return;
+ }
+
+ /* Don't propagate anything if there's still a job queued */
+ if (other->job)
+ return;
+
+ if (IN_SET(SERVICE(other)->state,
+ SERVICE_DEAD, SERVICE_DEAD_BEFORE_AUTO_RESTART, SERVICE_FAILED, SERVICE_FAILED_BEFORE_AUTO_RESTART,
+ SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL,
+ SERVICE_AUTO_RESTART, SERVICE_AUTO_RESTART_QUEUED))
+ socket_enter_listening(s);
+
+ if (SERVICE(other)->state == SERVICE_RUNNING)
+ socket_set_state(s, SOCKET_RUNNING);
+}
+
+/* Report the absolute expiry of the active timeout timer, if any.
+ * Returns 1 and sets *timeout when a finite timer is armed, 0 otherwise. */
+static int socket_get_timeout(Unit *u, usec_t *timeout) {
+ Socket *s = SOCKET(u);
+ usec_t t;
+ int r;
+
+ if (!s->timer_event_source)
+ return 0;
+
+ r = sd_event_source_get_time(s->timer_event_source, &t);
+ if (r < 0)
+ return r;
+ if (t == USEC_INFINITY)
+ return 0;
+
+ *timeout = t;
+ return 1;
+}
+
+char *socket_fdname(Socket *s) {
+ assert(s);
+
+ /* Returns the name to use for $LISTEN_NAMES. If the user
+ * didn't specify anything specifically, use the socket unit's
+ * name as fallback. */
+
+ return s->fdname ?: UNIT(s)->id;
+}
+
+/* Unit vtable accessor for the current control process, if any. */
+static PidRef *socket_control_pid(Unit *u) {
+ return &ASSERT_PTR(SOCKET(u))->control_pid;
+}
+
+/* Unit vtable clean method: remove the runtime/state/... directories selected
+ * by mask via a forked rm -rf helper, tracked as the control process under
+ * SOCKET_CLEANING. Only allowed while the socket is dead; returns -EBUSY
+ * otherwise, -EUNATCH when mask selects nothing to clean. */
+static int socket_clean(Unit *u, ExecCleanMask mask) {
+ _cleanup_strv_free_ char **l = NULL;
+ Socket *s = SOCKET(u);
+ int r;
+
+ assert(s);
+ assert(mask != 0);
+
+ if (s->state != SOCKET_DEAD)
+ return -EBUSY;
+
+ r = exec_context_get_clean_directories(&s->exec_context, u->manager->prefix, mask, &l);
+ if (r < 0)
+ return r;
+
+ if (strv_isempty(l))
+ return -EUNATCH;
+
+ socket_unwatch_control_pid(s);
+ s->clean_result = SOCKET_SUCCESS;
+ s->control_command = NULL;
+ s->control_command_id = _SOCKET_EXEC_COMMAND_INVALID;
+
+ r = socket_arm_timer(s, /* relative= */ true, s->exec_context.timeout_clean_usec);
+ if (r < 0) {
+ log_unit_warning_errno(u, r, "Failed to install timer: %m");
+ goto fail;
+ }
+
+ r = unit_fork_and_watch_rm_rf(u, l, &s->control_pid);
+ if (r < 0) {
+ log_unit_warning_errno(u, r, "Failed to spawn cleaning task: %m");
+ goto fail;
+ }
+
+ socket_set_state(s, SOCKET_CLEANING);
+ return 0;
+
+fail:
+ s->clean_result = SOCKET_FAILURE_RESOURCES;
+ s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+ return r;
+}
+
+/* Unit vtable callback: computes which ExecClean directory classes this unit
+ * could clean, based on its exec context. */
+static int socket_can_clean(Unit *u, ExecCleanMask *ret) {
+ Socket *s = SOCKET(u);
+
+ assert(s);
+
+ return exec_context_get_clean_mask(&s->exec_context, ret);
+}
+
+/* Unit vtable callback: checks the unit's start rate limit before a start job
+ * is dispatched; on a hit the socket is put into failed state immediately. */
+static int socket_can_start(Unit *u) {
+ Socket *s = SOCKET(u);
+ int r;
+
+ assert(s);
+
+ r = unit_test_start_limit(u);
+ if (r < 0) {
+ socket_enter_dead(s, SOCKET_FAILURE_START_LIMIT_HIT);
+ return r;
+ }
+
+ return 1;
+}
+
+/* Human-readable names for the Exec*= command slots, as used in unit files
+ * and serialization. */
+static const char* const socket_exec_command_table[_SOCKET_EXEC_COMMAND_MAX] = {
+ [SOCKET_EXEC_START_PRE] = "ExecStartPre",
+ [SOCKET_EXEC_START_CHOWN] = "ExecStartChown",
+ [SOCKET_EXEC_START_POST] = "ExecStartPost",
+ [SOCKET_EXEC_STOP_PRE] = "ExecStopPre",
+ [SOCKET_EXEC_STOP_POST] = "ExecStopPost"
+};
+
+DEFINE_STRING_TABLE_LOOKUP(socket_exec_command, SocketExecCommand);
+
+/* Names for the unit's final result, e.g. as shown in "systemctl status". */
+static const char* const socket_result_table[_SOCKET_RESULT_MAX] = {
+ [SOCKET_SUCCESS] = "success",
+ [SOCKET_FAILURE_RESOURCES] = "resources",
+ [SOCKET_FAILURE_TIMEOUT] = "timeout",
+ [SOCKET_FAILURE_EXIT_CODE] = "exit-code",
+ [SOCKET_FAILURE_SIGNAL] = "signal",
+ [SOCKET_FAILURE_CORE_DUMP] = "core-dump",
+ [SOCKET_FAILURE_START_LIMIT_HIT] = "start-limit-hit",
+ [SOCKET_FAILURE_TRIGGER_LIMIT_HIT] = "trigger-limit-hit",
+ [SOCKET_FAILURE_SERVICE_START_LIMIT_HIT] = "service-start-limit-hit"
+};
+
+DEFINE_STRING_TABLE_LOOKUP(socket_result, SocketResult);
+
+/* Names for the Timestamping= setting values. */
+static const char* const socket_timestamping_table[_SOCKET_TIMESTAMPING_MAX] = {
+ [SOCKET_TIMESTAMPING_OFF] = "off",
+ [SOCKET_TIMESTAMPING_US] = "us",
+ [SOCKET_TIMESTAMPING_NS] = "ns",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(socket_timestamping, SocketTimestamping);
+
+/* Lenient parser for the Timestamping= setting: accepts the canonical table
+ * names, the parse_time()-style aliases for ns/µs, and booleans (where "yes"
+ * maps to ns accuracy). Returns _SOCKET_TIMESTAMPING_INVALID on failure. */
+SocketTimestamping socket_timestamping_from_string_harder(const char *p) {
+ SocketTimestamping t;
+ int r;
+
+ if (!p)
+ return _SOCKET_TIMESTAMPING_INVALID;
+
+ t = socket_timestamping_from_string(p);
+ if (t >= 0)
+ return t;
+
+ /* Let's alternatively support the various other aliases parse_time() accepts for ns and µs here,
+ * too. */
+ if (streq(p, "nsec"))
+ return SOCKET_TIMESTAMPING_NS;
+ if (STR_IN_SET(p, "usec", "µs", "μs")) /* Accept both small greek letter mu + micro sign unicode codepoints */
+ return SOCKET_TIMESTAMPING_US;
+
+ r = parse_boolean(p);
+ if (r < 0)
+ return _SOCKET_TIMESTAMPING_INVALID;
+
+ return r ? SOCKET_TIMESTAMPING_NS : SOCKET_TIMESTAMPING_OFF; /* If boolean yes, default to ns accuracy */
+}
+
+/* The vtable wiring Socket units into the generic unit machinery: sizes and
+ * context offsets for the core, the config sections to parse, capability
+ * flags, and the callback implementations defined above. */
+const UnitVTable socket_vtable = {
+ .object_size = sizeof(Socket),
+ .exec_context_offset = offsetof(Socket, exec_context),
+ .cgroup_context_offset = offsetof(Socket, cgroup_context),
+ .kill_context_offset = offsetof(Socket, kill_context),
+ .exec_runtime_offset = offsetof(Socket, exec_runtime),
+
+ .sections =
+ "Unit\0"
+ "Socket\0"
+ "Install\0",
+ .private_section = "Socket",
+
+ .can_transient = true,
+ .can_trigger = true,
+ .can_fail = true,
+
+ .init = socket_init,
+ .done = socket_done,
+ .load = socket_load,
+
+ .coldplug = socket_coldplug,
+
+ .dump = socket_dump,
+
+ .start = socket_start,
+ .stop = socket_stop,
+
+ .clean = socket_clean,
+ .can_clean = socket_can_clean,
+
+ .get_timeout = socket_get_timeout,
+
+ .serialize = socket_serialize,
+ .deserialize_item = socket_deserialize_item,
+ .distribute_fds = socket_distribute_fds,
+
+ .active_state = socket_active_state,
+ .sub_state_to_string = socket_sub_state_to_string,
+
+ .will_restart = unit_will_restart_default,
+
+ .may_gc = socket_may_gc,
+
+ .sigchld_event = socket_sigchld_event,
+
+ .trigger_notify = socket_trigger_notify,
+
+ .reset_failed = socket_reset_failed,
+
+ .control_pid = socket_control_pid,
+
+ .bus_set_property = bus_socket_set_property,
+ .bus_commit_properties = bus_socket_commit_properties,
+
+ /* Status lines shown on the console / in logs when jobs finish. */
+ .status_message_formats = {
+ .finished_start_job = {
+ [JOB_DONE] = "Listening on %s.",
+ [JOB_FAILED] = "Failed to listen on %s.",
+ [JOB_TIMEOUT] = "Timed out starting %s.",
+ },
+ .finished_stop_job = {
+ [JOB_DONE] = "Closed %s.",
+ [JOB_FAILED] = "Failed stopping %s.",
+ [JOB_TIMEOUT] = "Timed out stopping %s.",
+ },
+ },
+
+ .can_start = socket_can_start,
+};
diff --git a/src/core/socket.h b/src/core/socket.h
new file mode 100644
index 0000000..0983e8c
--- /dev/null
+++ b/src/core/socket.h
@@ -0,0 +1,204 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct Socket Socket;
+typedef struct SocketPeer SocketPeer;
+
+#include "mount.h"
+#include "pidref.h"
+#include "socket-util.h"
+#include "unit.h"
+
+/* The Exec*= command slots a socket unit may configure. */
+typedef enum SocketExecCommand {
+ SOCKET_EXEC_START_PRE,
+ SOCKET_EXEC_START_CHOWN,
+ SOCKET_EXEC_START_POST,
+ SOCKET_EXEC_STOP_PRE,
+ SOCKET_EXEC_STOP_POST,
+ _SOCKET_EXEC_COMMAND_MAX,
+ _SOCKET_EXEC_COMMAND_INVALID = -EINVAL,
+} SocketExecCommand;
+
+/* What kind of object a single SocketPort listens on. */
+typedef enum SocketType {
+ SOCKET_SOCKET,
+ SOCKET_FIFO,
+ SOCKET_SPECIAL,
+ SOCKET_MQUEUE,
+ SOCKET_USB_FUNCTION,
+ _SOCKET_TYPE_MAX,
+ _SOCKET_TYPE_INVALID = -EINVAL,
+} SocketType;
+
+/* Why a socket unit ended up in its final (dead/failed) state. */
+typedef enum SocketResult {
+ SOCKET_SUCCESS,
+ SOCKET_FAILURE_RESOURCES,
+ SOCKET_FAILURE_TIMEOUT,
+ SOCKET_FAILURE_EXIT_CODE,
+ SOCKET_FAILURE_SIGNAL,
+ SOCKET_FAILURE_CORE_DUMP,
+ SOCKET_FAILURE_START_LIMIT_HIT,
+ SOCKET_FAILURE_TRIGGER_LIMIT_HIT,
+ SOCKET_FAILURE_SERVICE_START_LIMIT_HIT,
+ _SOCKET_RESULT_MAX,
+ _SOCKET_RESULT_INVALID = -EINVAL,
+} SocketResult;
+
+/* One listening endpoint of a socket unit; a unit may carry several,
+ * linked together via the 'port' list fields. */
+typedef struct SocketPort {
+ Socket *socket;
+
+ SocketType type;
+ int fd;
+ int *auxiliary_fds;
+ size_t n_auxiliary_fds;
+
+ SocketAddress address;
+ char *path;
+ sd_event_source *event_source;
+
+ LIST_FIELDS(struct SocketPort, port);
+} SocketPort;
+
+/* Granularity of packet timestamping requested via Timestamping=. */
+typedef enum SocketTimestamping {
+ SOCKET_TIMESTAMPING_OFF,
+ SOCKET_TIMESTAMPING_US, /* SO_TIMESTAMP */
+ SOCKET_TIMESTAMPING_NS, /* SO_TIMESTAMPNS */
+ _SOCKET_TIMESTAMPING_MAX,
+ _SOCKET_TIMESTAMPING_INVALID = -EINVAL,
+} SocketTimestamping;
+
+/* Per-unit state and configuration for .socket units. */
+struct Socket {
+ Unit meta;
+
+ /* All configured listening endpoints. */
+ LIST_HEAD(SocketPort, ports);
+
+ Set *peers_by_address;
+
+ /* Connection accounting for Accept=yes mode. */
+ unsigned n_accepted;
+ unsigned n_connections;
+ unsigned n_refused;
+ unsigned max_connections;
+ unsigned max_connections_per_source;
+
+ unsigned backlog;
+ unsigned keep_alive_cnt;
+ usec_t timeout_usec;
+ usec_t keep_alive_time;
+ usec_t keep_alive_interval;
+ usec_t defer_accept;
+
+ /* Exec/kill/cgroup machinery shared with other unit types. */
+ ExecCommand* exec_command[_SOCKET_EXEC_COMMAND_MAX];
+ ExecContext exec_context;
+ KillContext kill_context;
+ CGroupContext cgroup_context;
+
+ ExecRuntime *exec_runtime;
+
+ /* For Accept=no sockets refers to the one service we'll
+ * activate. For Accept=yes sockets is either NULL, or filled
+ * to refer to the next service we spawn. */
+ UnitRef service;
+
+ SocketState state, deserialized_state;
+
+ sd_event_source *timer_event_source;
+
+ /* Currently running control process (Exec*= command or cleaner). */
+ ExecCommand* control_command;
+ SocketExecCommand control_command_id;
+ PidRef control_pid;
+
+ mode_t directory_mode;
+ mode_t socket_mode;
+
+ SocketResult result;
+ SocketResult clean_result;
+
+ char **symlinks;
+
+ bool accept;
+ bool remove_on_stop;
+ bool writable;
+ bool flush_pending;
+
+ int socket_protocol;
+
+ /* Socket options */
+ bool keep_alive;
+ bool no_delay;
+ bool free_bind;
+ bool transparent;
+ bool broadcast;
+ bool pass_cred;
+ bool pass_sec;
+ bool pass_pktinfo;
+ SocketTimestamping timestamping;
+
+ /* Only for INET6 sockets: issue IPV6_V6ONLY sockopt */
+ SocketAddressBindIPv6Only bind_ipv6_only;
+
+ int priority;
+ int mark;
+ size_t receive_buffer;
+ size_t send_buffer;
+ int ip_tos;
+ int ip_ttl;
+ size_t pipe_size;
+ char *bind_to_device;
+ char *tcp_congestion;
+ bool reuse_port;
+ long mq_maxmsg;
+ long mq_msgsize;
+
+ /* SMACK security labels. */
+ char *smack;
+ char *smack_ip_in;
+ char *smack_ip_out;
+
+ bool selinux_context_from_net;
+
+ char *user, *group;
+
+ /* Custom name for $LISTEN_FDNAMES; see socket_fdname(). */
+ char *fdname;
+
+ /* Rate limits on activation and on polling the listening fds. */
+ RateLimit trigger_limit;
+ usec_t poll_limit_interval;
+ unsigned poll_limit_burst;
+};
+
+/* Refcounting and lookup of per-source-address peer tracking objects. */
+SocketPeer *socket_peer_ref(SocketPeer *p);
+SocketPeer *socket_peer_unref(SocketPeer *p);
+int socket_acquire_peer(Socket *s, int fd, SocketPeer **p);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(SocketPeer*, socket_peer_unref);
+
+/* Called from the service code when collecting fds */
+int socket_collect_fds(Socket *s, int **fds);
+
+/* Called from the service code when a per-connection service ended */
+void socket_connection_unref(Socket *s);
+
+SocketPort *socket_port_free(SocketPort *p);
+DEFINE_TRIVIAL_CLEANUP_FUNC(SocketPort*, socket_port_free);
+
+void socket_free_ports(Socket *s);
+
+/* Formats the port's address (or path) as a human-readable string. */
+int socket_port_to_address(const SocketPort *s, char **ret);
+
+/* Resolves/creates the service unit to activate for this socket (per-connection
+ * for Accept=yes when cfd >= 0). */
+int socket_load_service_unit(Socket *s, int cfd, Unit **ret);
+
+char *socket_fdname(Socket *s);
+
+extern const UnitVTable socket_vtable;
+
+const char* socket_exec_command_to_string(SocketExecCommand i) _const_;
+SocketExecCommand socket_exec_command_from_string(const char *s) _pure_;
+
+const char* socket_result_to_string(SocketResult i) _const_;
+SocketResult socket_result_from_string(const char *s) _pure_;
+
+const char* socket_port_type_to_string(SocketPort *p) _pure_;
+SocketType socket_port_type_from_string(const char *p) _pure_;
+
+const char* socket_timestamping_to_string(SocketTimestamping p) _const_;
+SocketTimestamping socket_timestamping_from_string(const char *p) _pure_;
+SocketTimestamping socket_timestamping_from_string_harder(const char *p) _pure_;
+
+DEFINE_CAST(SOCKET, Socket);
diff --git a/src/core/swap.c b/src/core/swap.c
new file mode 100644
index 0000000..488b171
--- /dev/null
+++ b/src/core/swap.c
@@ -0,0 +1,1680 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <sys/epoll.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "sd-device.h"
+
+#include "alloc-util.h"
+#include "dbus-swap.h"
+#include "dbus-unit.h"
+#include "device-util.h"
+#include "device.h"
+#include "escape.h"
+#include "exit-status.h"
+#include "fd-util.h"
+#include "format-util.h"
+#include "fstab-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "serialize.h"
+#include "special.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "swap.h"
+#include "unit-name.h"
+#include "unit.h"
+#include "virt.h"
+
+/* Maps swap-specific sub-states onto the generic unit active states. */
+static const UnitActiveState state_translation_table[_SWAP_STATE_MAX] = {
+ [SWAP_DEAD] = UNIT_INACTIVE,
+ [SWAP_ACTIVATING] = UNIT_ACTIVATING,
+ [SWAP_ACTIVATING_DONE] = UNIT_ACTIVE,
+ [SWAP_ACTIVE] = UNIT_ACTIVE,
+ [SWAP_DEACTIVATING] = UNIT_DEACTIVATING,
+ [SWAP_DEACTIVATING_SIGTERM] = UNIT_DEACTIVATING,
+ [SWAP_DEACTIVATING_SIGKILL] = UNIT_DEACTIVATING,
+ [SWAP_FAILED] = UNIT_FAILED,
+ [SWAP_CLEANING] = UNIT_MAINTENANCE,
+};
+
+static int swap_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata);
+static int swap_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static int swap_process_proc_swaps(Manager *m);
+
+/* Returns true for states during which a control process (swapon/swapoff/
+ * cleaner) may be running and must be watched. */
+static bool SWAP_STATE_WITH_PROCESS(SwapState state) {
+ return IN_SET(state,
+ SWAP_ACTIVATING,
+ SWAP_ACTIVATING_DONE,
+ SWAP_DEACTIVATING,
+ SWAP_DEACTIVATING_SIGTERM,
+ SWAP_DEACTIVATING_SIGKILL,
+ SWAP_CLEANING);
+}
+
+/* Unit vtable callback: generic active state derived from the swap sub-state. */
+static UnitActiveState swap_active_state(Unit *u) {
+ assert(u);
+
+ return state_translation_table[SWAP(u)->state];
+}
+
+/* Unit vtable callback: the swap sub-state as a string. */
+static const char *swap_sub_state_to_string(Unit *u) {
+ assert(u);
+
+ return swap_state_to_string(SWAP(u)->state);
+}
+
+/* Unit vtable callback: a swap backed by an active /proc/swaps entry must be
+ * kept around; anything else may be garbage-collected. */
+static bool swap_may_gc(Unit *u) {
+ Swap *s = SWAP(u);
+
+ assert(s);
+
+ if (s->from_proc_swaps)
+ return false;
+
+ return true;
+}
+
+/* Unit vtable callback: swap devices are not managed by user managers,
+ * hence are extrinsic there. */
+static bool swap_is_extrinsic(Unit *u) {
+ assert(SWAP(u));
+
+ return MANAGER_IS_USER(u->manager);
+}
+
+/* Drops the data learned from /proc/swaps, if any. */
+static void swap_unset_proc_swaps(Swap *s) {
+ assert(s);
+
+ if (!s->from_proc_swaps)
+ return;
+
+ s->parameters_proc_swaps.what = mfree(s->parameters_proc_swaps.what);
+ s->from_proc_swaps = false;
+}
+
+/* (Re)registers this swap in the manager's devnode→swap-list hashmap, which
+ * chains all swap units referring to the same device node via the
+ * 'same_devnode' list. Passing NULL only deregisters. */
+static int swap_set_devnode(Swap *s, const char *devnode) {
+ Hashmap *swaps;
+ Swap *first;
+ int r;
+
+ assert(s);
+
+ r = hashmap_ensure_allocated(&UNIT(s)->manager->swaps_by_devnode, &path_hash_ops);
+ if (r < 0)
+ return r;
+
+ swaps = UNIT(s)->manager->swaps_by_devnode;
+
+ /* First detach from the list keyed by the old devnode, if set. */
+ if (s->devnode) {
+ first = hashmap_get(swaps, s->devnode);
+
+ LIST_REMOVE(same_devnode, first, s);
+ if (first)
+ hashmap_replace(swaps, first->devnode, first)
;
+ else
+ hashmap_remove(swaps, s->devnode);
+
+ s->devnode = mfree(s->devnode);
+ }
+
+ /* Then prepend to the list for the new devnode. */
+ if (devnode) {
+ s->devnode = strdup(devnode);
+ if (!s->devnode)
+ return -ENOMEM;
+
+ first = hashmap_get(swaps, s->devnode);
+ LIST_PREPEND(same_devnode, first, s);
+
+ return hashmap_replace(swaps, first->devnode, first);
+ }
+
+ return 0;
+}
+
+/* Unit vtable callback: initializes a freshly allocated swap unit with
+ * manager defaults, before any configuration is loaded. */
+static void swap_init(Unit *u) {
+ Swap *s = SWAP(u);
+
+ assert(s);
+ assert(UNIT(s)->load_state == UNIT_STUB);
+
+ s->timeout_usec = u->manager->defaults.timeout_start_usec;
+
+ s->exec_context.std_output = u->manager->defaults.std_output;
+ s->exec_context.std_error = u->manager->defaults.std_error;
+
+ s->control_pid = PIDREF_NULL;
+ s->control_command_id = _SWAP_EXEC_COMMAND_INVALID;
+
+ /* Swaps stay up across "systemctl isolate". */
+ u->ignore_on_isolate = true;
+}
+
+/* Stops watching the control process and clears the pidref. */
+static void swap_unwatch_control_pid(Swap *s) {
+ assert(s);
+
+ if (!pidref_is_set(&s->control_pid))
+ return;
+
+ unit_unwatch_pidref(UNIT(s), &s->control_pid);
+ pidref_done(&s->control_pid);
+}
+
+/* Unit vtable callback: releases everything the swap unit owns. */
+static void swap_done(Unit *u) {
+ Swap *s = SWAP(u);
+
+ assert(s);
+
+ swap_unset_proc_swaps(s);
+ swap_set_devnode(s, NULL);
+
+ s->what = mfree(s->what);
+ s->parameters_fragment.what = mfree(s->parameters_fragment.what);
+ s->parameters_fragment.options = mfree(s->parameters_fragment.options);
+
+ s->exec_runtime = exec_runtime_free(s->exec_runtime);
+ exec_command_done_array(s->exec_command, _SWAP_EXEC_COMMAND_MAX);
+ s->control_command = NULL;
+
+ swap_unwatch_control_pid(s);
+
+ s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+}
+
+/* Arms (or re-arms) the unit's timeout timer; 'relative' selects whether
+ * 'usec' is an offset from now or an absolute timestamp. */
+static int swap_arm_timer(Swap *s, bool relative, usec_t usec) {
+ assert(s);
+
+ return unit_arm_timer(UNIT(s), &s->timer_event_source, relative, usec, swap_dispatch_timer);
+}
+
+/* Returns the parameter set currently in effect: /proc/swaps data wins over
+ * fragment configuration; NULL if neither source applies. */
+static SwapParameters* swap_get_parameters(Swap *s) {
+ assert(s);
+
+ if (s->from_proc_swaps)
+ return &s->parameters_proc_swaps;
+
+ if (s->from_fragment)
+ return &s->parameters_fragment;
+
+ return NULL;
+}
+
+/* Adds the dependencies implied by the backing device or file of this swap. */
+static int swap_add_device_dependencies(Swap *s) {
+ UnitDependencyMask mask;
+ SwapParameters *p;
+ int r;
+
+ assert(s);
+
+ if (!s->what)
+ return 0;
+
+ p = swap_get_parameters(s);
+ if (!p || !p->what)
+ return 0;
+
+ mask = s->from_proc_swaps ? UNIT_DEPENDENCY_PROC_SWAP : UNIT_DEPENDENCY_FILE;
+
+ /* Device-backed swap: require the corresponding .device unit. */
+ if (is_device_path(p->what)) {
+ r = unit_add_node_dependency(UNIT(s), p->what, UNIT_REQUIRES, mask);
+ if (r < 0)
+ return r;
+
+ return unit_add_blockdev_dependency(UNIT(s), p->what, mask);
+ }
+
+ /* File based swap devices need to be ordered after systemd-remount-fs.service, since they might need
+ * a writable file system. */
+ return unit_add_dependency_by_name(UNIT(s), UNIT_AFTER, SPECIAL_REMOUNT_FS_SERVICE, true, mask);
+}
+
+/* Adds DefaultDependencies=yes ordering against swap.target and umount.target;
+ * skipped for user managers and inside containers. */
+static int swap_add_default_dependencies(Swap *s) {
+ int r;
+
+ assert(s);
+
+ if (!UNIT(s)->default_dependencies)
+ return 0;
+
+ if (!MANAGER_IS_SYSTEM(UNIT(s)->manager))
+ return 0;
+
+ if (detect_container() > 0)
+ return 0;
+
+ /* swap units generated for the swap dev links are missing the
+ * ordering dep against the swap target. */
+ r = unit_add_dependency_by_name(UNIT(s), UNIT_BEFORE, SPECIAL_SWAP_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+ if (r < 0)
+ return r;
+
+ return unit_add_two_dependencies_by_name(UNIT(s), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_UMOUNT_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+}
+
+/* Validates a fully loaded swap unit: its name must match What=, and PAM
+ * requires control-group kill mode. Returns negative errno to refuse loading. */
+static int swap_verify(Swap *s) {
+ _cleanup_free_ char *e = NULL;
+ int r;
+
+ assert(UNIT(s)->load_state == UNIT_LOADED);
+
+ r = unit_name_from_path(s->what, ".swap", &e);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(s), r, "Failed to generate unit name from path: %m");
+
+ if (!unit_has_name(UNIT(s), e))
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Value of What= and unit name do not match, not loading.");
+
+ if (s->exec_context.pam_name && s->kill_context.kill_mode != KILL_CONTROL_GROUP)
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Unit has PAM enabled. Kill mode must be set to 'control-group'. Refusing to load.");
+
+ return 0;
+}
+
+/* If What= refers to a block device, resolves its device node name and
+ * registers it via swap_set_devnode(). Non-block or unresolvable paths are
+ * silently tolerated. */
+static int swap_load_devnode(Swap *s) {
+ _cleanup_free_ char *p = NULL;
+ struct stat st;
+ int r;
+
+ assert(s);
+
+ if (stat(s->what, &st) < 0 || !S_ISBLK(st.st_mode))
+ return 0;
+
+ r = devname_from_stat_rdev(&st, &p);
+ if (r < 0) {
+ log_unit_full_errno(UNIT(s), r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to get device node for swap %s: %m", s->what);
+ return 0;
+ }
+
+ return swap_set_devnode(s, p);
+}
+
+/* Fills in everything beyond the raw fragment/proc data: derives What= if
+ * unset, sets the description, and adds implicit dependencies, slice and
+ * exec-related bits. */
+static int swap_add_extras(Swap *s) {
+ int r;
+
+ assert(s);
+
+ if (UNIT(s)->fragment_path)
+ s->from_fragment = true;
+
+ /* Derive What= from the fragment, /proc/swaps, or the unit name itself. */
+ if (!s->what) {
+ if (s->parameters_fragment.what)
+ s->what = strdup(s->parameters_fragment.what);
+ else if (s->parameters_proc_swaps.what)
+ s->what = strdup(s->parameters_proc_swaps.what);
+ else {
+ r = unit_name_to_path(UNIT(s)->id, &s->what);
+ if (r < 0)
+ return r;
+ }
+
+ if (!s->what)
+ return -ENOMEM;
+ }
+
+ path_simplify(s->what);
+
+ if (!UNIT(s)->description) {
+ r = unit_set_description(UNIT(s), s->what);
+ if (r < 0)
+ return r;
+ }
+
+ r = unit_require_mounts_for(UNIT(s), s->what, UNIT_DEPENDENCY_IMPLICIT);
+ if (r < 0)
+ return r;
+
+ r = swap_add_device_dependencies(s);
+ if (r < 0)
+ return r;
+
+ r = swap_load_devnode(s);
+ if (r < 0)
+ return r;
+
+ r = unit_patch_contexts(UNIT(s));
+ if (r < 0)
+ return r;
+
+ r = unit_add_exec_dependencies(UNIT(s), &s->exec_context);
+ if (r < 0)
+ return r;
+
+ r = unit_set_default_slice(UNIT(s));
+ if (r < 0)
+ return r;
+
+ r = swap_add_default_dependencies(s);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+/* Unit vtable callback: loads the unit's configuration. The fragment is
+ * optional if the swap is already known from /proc/swaps. */
+static int swap_load(Unit *u) {
+ Swap *s = SWAP(u);
+ int r, q = 0;
+
+ assert(s);
+ assert(u->load_state == UNIT_STUB);
+
+ /* Load a .swap file */
+ bool fragment_optional = s->from_proc_swaps;
+ r = unit_load_fragment_and_dropin(u, !fragment_optional);
+
+ /* Add in some extras, and do so either when we successfully loaded something or when /proc/swaps is
+ * already active. */
+ if (u->load_state == UNIT_LOADED || s->from_proc_swaps)
+ q = swap_add_extras(s);
+
+ if (r < 0)
+ return r;
+ if (q < 0)
+ return q;
+ if (u->load_state != UNIT_LOADED)
+ return 0;
+
+ return swap_verify(s);
+}
+
+/* Creates or updates the swap unit corresponding to a /proc/swaps entry.
+ * 'what' is the path the unit is named after, 'what_proc_swaps' the path as
+ * listed in /proc/swaps; with set_flags the is_active/just_activated markers
+ * are refreshed for the current parsing round. */
+static int swap_setup_unit(
+ Manager *m,
+ const char *what,
+ const char *what_proc_swaps,
+ int priority,
+ bool set_flags) {
+
+ _cleanup_free_ char *e = NULL;
+ bool delete = false;
+ Unit *u = NULL;
+ int r;
+ SwapParameters *p;
+
+ assert(m);
+ assert(what);
+ assert(what_proc_swaps);
+
+ r = unit_name_from_path(what, ".swap", &e);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to generate unit name from path: %m");
+
+ /* Refuse to merge conflicting /proc/swaps sources for the same unit name. */
+ u = manager_get_unit(m, e);
+ if (u &&
+ SWAP(u)->from_proc_swaps &&
+ !path_equal(SWAP(u)->parameters_proc_swaps.what, what_proc_swaps))
+ return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
+ "Swap %s appeared twice with different device paths %s and %s",
+ e, SWAP(u)->parameters_proc_swaps.what, what_proc_swaps);
+
+ if (!u) {
+ /* Freshly created here; free it again on failure below. */
+ delete = true;
+
+ r = unit_new_for_name(m, sizeof(Swap), e, &u);
+ if (r < 0) {
+ log_unit_warning_errno(u, r, "Failed to load swap unit: %m");
+ goto fail;
+ }
+
+ SWAP(u)->what = strdup(what);
+ if (!SWAP(u)->what) {
+ r = log_oom();
+ goto fail;
+ }
+
+ unit_add_to_load_queue(u);
+ } else
+ delete = false;
+
+ p = &SWAP(u)->parameters_proc_swaps;
+
+ if (!p->what) {
+ p->what = strdup(what_proc_swaps);
+ if (!p->what) {
+ r = log_oom();
+ goto fail;
+ }
+ }
+
+ /* The unit is definitely around now, mark it as loaded if it was previously referenced but could not be
+ * loaded. After all we can load it now, from the data in /proc/swaps. */
+ if (IN_SET(u->load_state, UNIT_NOT_FOUND, UNIT_BAD_SETTING, UNIT_ERROR)) {
+ u->load_state = UNIT_LOADED;
+ u->load_error = 0;
+ }
+
+ if (set_flags) {
+ SWAP(u)->is_active = true;
+ SWAP(u)->just_activated = !SWAP(u)->from_proc_swaps;
+ }
+
+ SWAP(u)->from_proc_swaps = true;
+
+ p->priority = priority;
+ p->priority_set = true;
+
+ unit_add_to_dbus_queue(u);
+ return 0;
+
+fail:
+ if (delete)
+ unit_free(u);
+
+ return r;
+}
+
+/* Handles one /proc/swaps entry: sets up the main unit for 'device', and for
+ * block devices additionally one unit per alternative device name/symlink. */
+static void swap_process_new(Manager *m, const char *device, int prio, bool set_flags) {
+ _cleanup_(sd_device_unrefp) sd_device *d = NULL;
+ const char *dn;
+ struct stat st, st_link;
+ int r;
+
+ assert(m);
+
+ if (swap_setup_unit(m, device, device, prio, set_flags) < 0)
+ return;
+
+ /* If this is a block device, then let's add duplicates for
+ * all other names of this block device */
+ if (stat(device, &st) < 0 || !S_ISBLK(st.st_mode))
+ return;
+
+ r = sd_device_new_from_stat_rdev(&d, &st);
+ if (r < 0)
+ return (void) log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to allocate device for swap %s: %m", device);
+
+ /* Add the main device node */
+ if (sd_device_get_devname(d, &dn) >= 0 && !streq(dn, device))
+ (void) swap_setup_unit(m, dn, device, prio, set_flags);
+
+ /* Add additional units for all symlinks */
+ FOREACH_DEVICE_DEVLINK(d, devlink) {
+
+ /* Don't bother with the /dev/block links */
+ if (streq(devlink, device))
+ continue;
+
+ if (path_startswith(devlink, "/dev/block/"))
+ continue;
+
+ /* Skip dangling or stale symlinks that no longer point at this device. */
+ if (stat(devlink, &st_link) >= 0 &&
+ (!S_ISBLK(st_link.st_mode) ||
+ st_link.st_rdev != st.st_rdev))
+ continue;
+
+ (void) swap_setup_unit(m, devlink, device, prio, set_flags);
+ }
+}
+
+/* Transitions the swap unit into 'state', tearing down per-process resources
+ * when leaving a with-process state, and notifying the core of the change. */
+static void swap_set_state(Swap *s, SwapState state) {
+ SwapState old_state;
+
+ assert(s);
+
+ if (s->state != state)
+ bus_unit_send_pending_change_signal(UNIT(s), false);
+
+ old_state = s->state;
+ s->state = state;
+
+ /* No control process expected anymore: drop timer and pid bookkeeping. */
+ if (!SWAP_STATE_WITH_PROCESS(state)) {
+ s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+ swap_unwatch_control_pid(s);
+ s->control_command = NULL;
+ s->control_command_id = _SWAP_EXEC_COMMAND_INVALID;
+ }
+
+ if (state != old_state)
+ log_unit_debug(UNIT(s), "Changed %s -> %s", swap_state_to_string(old_state), swap_state_to_string(state));
+
+ unit_notify(UNIT(s), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true);
+
+ /* If there other units for the same device node have a job
+ queued it might be worth checking again if it is runnable
+ now. This is necessary, since swap_start() refuses
+ operation with EAGAIN if there's already another job for
+ the same device node queued. */
+ LIST_FOREACH_OTHERS(same_devnode, other, s)
+ if (UNIT(other)->job)
+ job_add_to_run_queue(UNIT(other)->job);
+}
+
+/* Unit vtable callback: after deserialization (daemon reload/reexec), restores
+ * the previous state, re-watches a surviving control process, and re-arms the
+ * timeout relative to the recorded state-change timestamp. */
+static int swap_coldplug(Unit *u) {
+ Swap *s = SWAP(u);
+ SwapState new_state = SWAP_DEAD;
+ int r;
+
+ assert(s);
+ assert(s->state == SWAP_DEAD);
+
+ if (s->deserialized_state != s->state)
+ new_state = s->deserialized_state;
+ else if (s->from_proc_swaps)
+ new_state = SWAP_ACTIVE;
+
+ if (new_state == s->state)
+ return 0;
+
+ if (pidref_is_set(&s->control_pid) &&
+ pidref_is_unwaited(&s->control_pid) > 0 &&
+ SWAP_STATE_WITH_PROCESS(new_state)) {
+
+ r = unit_watch_pidref(UNIT(s), &s->control_pid, /* exclusive= */ false);
+ if (r < 0)
+ return r;
+
+ r = swap_arm_timer(s, /* relative= */ false, usec_add(u->state_change_timestamp.monotonic, s->timeout_usec));
+ if (r < 0)
+ return r;
+ }
+
+ if (!IN_SET(new_state, SWAP_DEAD, SWAP_FAILED))
+ (void) unit_setup_exec_runtime(u);
+
+ swap_set_state(s, new_state);
+ return 0;
+}
+
+/* Unit vtable callback: writes a human-readable dump of the unit's state and
+ * configuration to f, each line prefixed with 'prefix' (systemd-analyze dump). */
+static void swap_dump(Unit *u, FILE *f, const char *prefix) {
+ Swap *s = SWAP(u);
+ SwapParameters *p;
+
+ assert(s);
+ assert(f);
+
+ /* Prefer /proc/swaps data over fragment data, mirroring swap_get_parameters(). */
+ if (s->from_proc_swaps)
+ p = &s->parameters_proc_swaps;
+ else if (s->from_fragment)
+ p = &s->parameters_fragment;
+ else
+ p = NULL;
+
+ fprintf(f,
+ "%sSwap State: %s\n"
+ "%sResult: %s\n"
+ "%sClean Result: %s\n"
+ "%sWhat: %s\n"
+ "%sFrom /proc/swaps: %s\n"
+ "%sFrom fragment: %s\n"
+ "%sExtrinsic: %s\n",
+ prefix, swap_state_to_string(s->state),
+ prefix, swap_result_to_string(s->result),
+ prefix, swap_result_to_string(s->clean_result),
+ prefix, s->what,
+ prefix, yes_no(s->from_proc_swaps),
+ prefix, yes_no(s->from_fragment),
+ prefix, yes_no(swap_is_extrinsic(u)));
+
+ if (s->devnode)
+ fprintf(f, "%sDevice Node: %s\n", prefix, s->devnode);
+
+ if (p)
+ fprintf(f,
+ "%sPriority: %i\n"
+ "%sOptions: %s\n",
+ prefix, p->priority,
+ prefix, strempty(p->options));
+
+ fprintf(f,
+ "%sTimeoutSec: %s\n",
+ prefix, FORMAT_TIMESPAN(s->timeout_usec, USEC_PER_SEC));
+
+ if (pidref_is_set(&s->control_pid))
+ fprintf(f,
+ "%sControl PID: "PID_FMT"\n",
+ prefix, s->control_pid.pid);
+
+ exec_context_dump(&s->exec_context, f, prefix);
+ kill_context_dump(&s->kill_context, f, prefix);
+ cgroup_context_dump(UNIT(s), f, prefix);
+}
+
+/* Spawns the given control command (swapon/swapoff) with the unit's exec
+ * context, arms the unit timeout, watches the child and returns its PidRef
+ * via ret_pid. */
+static int swap_spawn(Swap *s, ExecCommand *c, PidRef *ret_pid) {
+
+ _cleanup_(exec_params_shallow_clear) ExecParameters exec_params = EXEC_PARAMETERS_INIT(
+ EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN);
+ _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+ pid_t pid;
+ int r;
+
+ assert(s);
+ assert(c);
+ assert(ret_pid);
+
+ r = unit_prepare_exec(UNIT(s));
+ if (r < 0)
+ return r;
+
+ r = swap_arm_timer(s, /* relative= */ true, s->timeout_usec);
+ if (r < 0)
+ return r;
+
+ r = unit_set_exec_params(UNIT(s), &exec_params);
+ if (r < 0)
+ return r;
+
+ r = exec_spawn(UNIT(s),
+ c,
+ &s->exec_context,
+ &exec_params,
+ s->exec_runtime,
+ &s->cgroup_context,
+ &pid);
+ if (r < 0)
+ return r;
+
+ r = pidref_set_pid(&pidref, pid);
+ if (r < 0)
+ return r;
+
+ r = unit_watch_pidref(UNIT(s), &pidref, /* exclusive= */ true);
+ if (r < 0)
+ return r;
+
+ *ret_pid = TAKE_PIDREF(pidref);
+ return 0;
+}
+
+/* Finalizes the unit as dead or failed, recording the first failure result
+ * and releasing runtime resources. */
+static void swap_enter_dead(Swap *s, SwapResult f) {
+ assert(s);
+
+ /* Keep the first recorded failure; later results don't overwrite it. */
+ if (s->result == SWAP_SUCCESS)
+ s->result = f;
+
+ unit_log_result(UNIT(s), s->result == SWAP_SUCCESS, swap_result_to_string(s->result));
+ unit_warn_leftover_processes(UNIT(s), unit_log_leftover_process_stop);
+ swap_set_state(s, s->result != SWAP_SUCCESS ? SWAP_FAILED : SWAP_DEAD);
+
+ s->exec_runtime = exec_runtime_destroy(s->exec_runtime);
+
+ unit_destroy_runtime_data(UNIT(s), &s->exec_context);
+
+ unit_unref_uid_gid(UNIT(s), true);
+}
+
+/* Marks the swap as active, recording the first failure result. */
+static void swap_enter_active(Swap *s, SwapResult f) {
+ assert(s);
+
+ if (s->result == SWAP_SUCCESS)
+ s->result = f;
+
+ swap_set_state(s, SWAP_ACTIVE);
+}
+
+/* After an operation finished: if the kernel still lists the swap in
+ * /proc/swaps it stays active (and siblings with pending jobs are resolved
+ * recursively), otherwise it goes dead/failed. */
+static void swap_enter_dead_or_active(Swap *s, SwapResult f) {
+ assert(s);
+
+ if (s->from_proc_swaps) {
+ swap_enter_active(s, f);
+
+ LIST_FOREACH_OTHERS(same_devnode, other, s)
+ if (UNIT(other)->job)
+ swap_enter_dead_or_active(other, f);
+ } else
+ swap_enter_dead(s, f);
+}
+
+/* Picks the kill operation for the given target state: SIGTERM phase uses
+ * KILL_RESTART if a restart job is pending, otherwise KILL_TERMINATE; the
+ * SIGKILL phase always uses KILL_KILL. */
+static int state_to_kill_operation(Swap *s, SwapState state) {
+ if (state == SWAP_DEACTIVATING_SIGTERM) {
+ if (unit_has_job_type(UNIT(s), JOB_RESTART))
+ return KILL_RESTART;
+ else
+ return KILL_TERMINATE;
+ }
+
+ return KILL_KILL;
+}
+
+/* Kills the unit's processes for the given SIGTERM/SIGKILL state. If anything
+ * was signalled, waits for it under the timeout; if SIGTERM killed nothing,
+ * escalates to SIGKILL (when configured); otherwise finishes up directly. */
+static void swap_enter_signal(Swap *s, SwapState state, SwapResult f) {
+ int r;
+
+ assert(s);
+
+ if (s->result == SWAP_SUCCESS)
+ s->result = f;
+
+ r = unit_kill_context(
+ UNIT(s),
+ &s->kill_context,
+ state_to_kill_operation(s, state),
+ /* main_pid= */ NULL,
+ &s->control_pid,
+ /* main_pid_alien= */ false);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m");
+ goto fail;
+ }
+
+ /* r > 0: at least one process was signalled, wait for it to go away. */
+ if (r > 0) {
+ r = swap_arm_timer(s, /* relative= */ true, s->timeout_usec);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to install timer: %m");
+ goto fail;
+ }
+
+ swap_set_state(s, state);
+ } else if (state == SWAP_DEACTIVATING_SIGTERM && s->kill_context.send_sigkill)
+ swap_enter_signal(s, SWAP_DEACTIVATING_SIGKILL, SWAP_SUCCESS);
+ else
+ swap_enter_dead_or_active(s, SWAP_SUCCESS);
+
+ return;
+
+fail:
+ swap_enter_dead_or_active(s, SWAP_FAILURE_RESOURCES);
+}
+
+/* Builds and spawns the /sbin/swapon command line (merging Priority= into the
+ * -o options when needed) and transitions to SWAP_ACTIVATING. */
+static void swap_enter_activating(Swap *s) {
+ _cleanup_free_ char *opts = NULL;
+ int r;
+
+ assert(s);
+
+ unit_warn_leftover_processes(UNIT(s), unit_log_leftover_process_start);
+
+ s->control_command_id = SWAP_EXEC_ACTIVATE;
+ s->control_command = s->exec_command + SWAP_EXEC_ACTIVATE;
+
+ if (s->from_fragment) {
+ int priority = 0;
+
+ /* Priority may come from Options=pri=... or from Priority=; warn on both. */
+ r = fstab_find_pri(s->parameters_fragment.options, &priority);
+ if (r < 0)
+ log_unit_warning_errno(UNIT(s), r, "Failed to parse swap priority \"%s\", ignoring: %m", s->parameters_fragment.options);
+ else if (r > 0 && s->parameters_fragment.priority_set)
+ log_unit_warning(UNIT(s), "Duplicate swap priority configuration by Priority= and Options= fields.");
+
+ /* Options= has no pri= but Priority= is set: synthesize a pri= option. */
+ if (r <= 0 && s->parameters_fragment.priority_set) {
+ if (s->parameters_fragment.options)
+ r = asprintf(&opts, "%s,pri=%i", s->parameters_fragment.options, s->parameters_fragment.priority);
+ else
+ r = asprintf(&opts, "pri=%i", s->parameters_fragment.priority);
+ if (r < 0) {
+ r = log_oom();
+ goto fail;
+ }
+ }
+ }
+
+ r = exec_command_set(s->control_command, "/sbin/swapon", "--fixpgsz", NULL);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to initialize swapon command line: %m");
+ goto fail;
+ }
+
+ if (s->parameters_fragment.options || opts) {
+ r = exec_command_append(s->control_command, "-o",
+ opts ?: s->parameters_fragment.options, NULL);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to prepare swapon command line: %m");
+ goto fail;
+ }
+ }
+
+ r = exec_command_append(s->control_command, s->what, NULL);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to prepare swapon command line: %m");
+ goto fail;
+ }
+
+ swap_unwatch_control_pid(s);
+
+ r = swap_spawn(s, s->control_command, &s->control_pid);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'swapon' task: %m");
+ goto fail;
+ }
+
+ swap_set_state(s, SWAP_ACTIVATING);
+ return;
+
+fail:
+ swap_enter_dead_or_active(s, SWAP_FAILURE_RESOURCES);
+}
+
+/* Spawns /sbin/swapoff for this swap and transitions to SWAP_DEACTIVATING. */
+static void swap_enter_deactivating(Swap *s) {
+ int r;
+
+ assert(s);
+
+ s->control_command_id = SWAP_EXEC_DEACTIVATE;
+ s->control_command = s->exec_command + SWAP_EXEC_DEACTIVATE;
+
+ r = exec_command_set(s->control_command,
+ "/sbin/swapoff",
+ s->what,
+ NULL);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to prepare swapoff command line: %m");
+ goto fail;
+ }
+
+ swap_unwatch_control_pid(s);
+
+ r = swap_spawn(s, s->control_command, &s->control_pid);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(s), r, "Failed to spawn 'swapoff' task: %m");
+ goto fail;
+ }
+
+ swap_set_state(s, SWAP_DEACTIVATING);
+ return;
+
+fail:
+ swap_enter_dead_or_active(s, SWAP_FAILURE_RESOURCES);
+}
+
+/* Resets per-start-cycle state before beginning a new activation attempt. */
+static void swap_cycle_clear(Swap *s) {
+ assert(s);
+
+ s->result = SWAP_SUCCESS;
+ exec_command_reset_status_array(s->exec_command, _SWAP_EXEC_COMMAND_MAX);
+ UNIT(s)->reset_accounting = true;
+}
+
+/* Unit vtable callback: starts the swap. Returns -EAGAIN while a teardown is
+ * in flight (or another unit for the same devnode has a running job), 0 when
+ * already activating, 1 when activation was initiated. */
+static int swap_start(Unit *u) {
+ Swap *s = SWAP(u);
+ int r;
+
+ assert(s);
+
+ /* We cannot fulfill this request right now, try again later please! */
+ if (IN_SET(s->state,
+ SWAP_DEACTIVATING,
+ SWAP_DEACTIVATING_SIGTERM,
+ SWAP_DEACTIVATING_SIGKILL,
+ SWAP_CLEANING))
+ return -EAGAIN;
+
+ /* Already on it! */
+ if (s->state == SWAP_ACTIVATING)
+ return 0;
+
+ assert(IN_SET(s->state, SWAP_DEAD, SWAP_FAILED));
+
+ /* Manipulating swap inside a container is not permitted. */
+ if (detect_container() > 0)
+ return -EPERM;
+
+ /* If there's a job for another swap unit for the same node
+ * running, then let's not dispatch this one for now, and wait
+ * until that other job has finished. */
+ LIST_FOREACH_OTHERS(same_devnode, other, s)
+ if (UNIT(other)->job && UNIT(other)->job->state == JOB_RUNNING)
+ return -EAGAIN;
+
+ r = unit_acquire_invocation_id(u);
+ if (r < 0)
+ return r;
+
+ swap_cycle_clear(s);
+ swap_enter_activating(s);
+ return 1;
+}
+
+static int swap_stop(Unit *u) {
+ Swap *s = SWAP(u);
+
+ assert(s);
+
+ switch (s->state) {
+
+ case SWAP_DEACTIVATING:
+ case SWAP_DEACTIVATING_SIGTERM:
+ case SWAP_DEACTIVATING_SIGKILL:
+ /* Already on it */
+ return 0;
+
+ case SWAP_ACTIVATING:
+ case SWAP_ACTIVATING_DONE:
+ /* There's a control process pending, directly enter kill mode */
+ swap_enter_signal(s, SWAP_DEACTIVATING_SIGTERM, SWAP_SUCCESS);
+ return 0;
+
+ case SWAP_ACTIVE:
+ if (detect_container() > 0)
+ return -EPERM;
+
+ swap_enter_deactivating(s);
+ return 1;
+
+ case SWAP_CLEANING:
+ /* If we are currently cleaning, then abort it, brutally. */
+ swap_enter_signal(s, SWAP_DEACTIVATING_SIGKILL, SWAP_SUCCESS);
+ return 0;
+
+ default:
+ assert_not_reached();
+ }
+}
+
+static int swap_serialize(Unit *u, FILE *f, FDSet *fds) {
+ Swap *s = SWAP(u);
+
+ assert(s);
+ assert(f);
+ assert(fds);
+
+ (void) serialize_item(f, "state", swap_state_to_string(s->state));
+ (void) serialize_item(f, "result", swap_result_to_string(s->result));
+ (void) serialize_pidref(f, fds, "control-pid", &s->control_pid);
+
+ if (s->control_command_id >= 0)
+ (void) serialize_item(f, "control-command", swap_exec_command_to_string(s->control_command_id));
+
+ return 0;
+}
+
+static int swap_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
+ Swap *s = SWAP(u);
+
+ assert(s);
+ assert(fds);
+
+ if (streq(key, "state")) {
+ SwapState state;
+
+ state = swap_state_from_string(value);
+ if (state < 0)
+ log_unit_debug(u, "Failed to parse state value: %s", value);
+ else
+ s->deserialized_state = state;
+ } else if (streq(key, "result")) {
+ SwapResult f;
+
+ f = swap_result_from_string(value);
+ if (f < 0)
+ log_unit_debug(u, "Failed to parse result value: %s", value);
+ else if (f != SWAP_SUCCESS)
+ s->result = f;
+ } else if (streq(key, "control-pid")) {
+
+ pidref_done(&s->control_pid);
+ (void) deserialize_pidref(fds, value, &s->control_pid);
+
+ } else if (streq(key, "control-command")) {
+ SwapExecCommand id;
+
+ id = swap_exec_command_from_string(value);
+ if (id < 0)
+ log_unit_debug(u, "Failed to parse exec-command value: %s", value);
+ else {
+ s->control_command_id = id;
+ s->control_command = s->exec_command + id;
+ }
+ } else
+ log_unit_debug(u, "Unknown serialization key: %s", key);
+
+ return 0;
+}
+
+static void swap_sigchld_event(Unit *u, pid_t pid, int code, int status) {
+ Swap *s = SWAP(u);
+ SwapResult f;
+
+ assert(s);
+ assert(pid >= 0);
+
+ if (pid != s->control_pid.pid)
+ return;
+
+ /* Let's scan /proc/swaps before we process SIGCHLD. For the reasoning see the similar code in
+ * mount.c */
+ (void) swap_process_proc_swaps(u->manager);
+
+ pidref_done(&s->control_pid);
+
+ if (is_clean_exit(code, status, EXIT_CLEAN_COMMAND, NULL))
+ f = SWAP_SUCCESS;
+ else if (code == CLD_EXITED)
+ f = SWAP_FAILURE_EXIT_CODE;
+ else if (code == CLD_KILLED)
+ f = SWAP_FAILURE_SIGNAL;
+ else if (code == CLD_DUMPED)
+ f = SWAP_FAILURE_CORE_DUMP;
+ else
+ assert_not_reached();
+
+ if (s->result == SWAP_SUCCESS)
+ s->result = f;
+
+ if (s->control_command) {
+ exec_status_exit(&s->control_command->exec_status, &s->exec_context, pid, code, status);
+
+ s->control_command = NULL;
+ s->control_command_id = _SWAP_EXEC_COMMAND_INVALID;
+ }
+
+ unit_log_process_exit(
+ u,
+ "Swap process",
+ swap_exec_command_to_string(s->control_command_id),
+ f == SWAP_SUCCESS,
+ code, status);
+
+ switch (s->state) {
+
+ case SWAP_ACTIVATING:
+ case SWAP_ACTIVATING_DONE:
+
+ if (f == SWAP_SUCCESS || s->from_proc_swaps)
+ swap_enter_active(s, f);
+ else
+ swap_enter_dead(s, f);
+ break;
+
+ case SWAP_DEACTIVATING:
+ case SWAP_DEACTIVATING_SIGKILL:
+ case SWAP_DEACTIVATING_SIGTERM:
+
+ swap_enter_dead_or_active(s, f);
+ break;
+
+ case SWAP_CLEANING:
+ if (s->clean_result == SWAP_SUCCESS)
+ s->clean_result = f;
+
+ swap_enter_dead(s, SWAP_SUCCESS);
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ /* Notify clients about changed exit status */
+ unit_add_to_dbus_queue(u);
+}
+
+static int swap_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) {
+ Swap *s = SWAP(userdata);
+
+ assert(s);
+ assert(s->timer_event_source == source);
+
+ switch (s->state) {
+
+ case SWAP_ACTIVATING:
+ case SWAP_ACTIVATING_DONE:
+ log_unit_warning(UNIT(s), "Activation timed out. Stopping.");
+ swap_enter_signal(s, SWAP_DEACTIVATING_SIGTERM, SWAP_FAILURE_TIMEOUT);
+ break;
+
+ case SWAP_DEACTIVATING:
+ log_unit_warning(UNIT(s), "Deactivation timed out. Stopping.");
+ swap_enter_signal(s, SWAP_DEACTIVATING_SIGTERM, SWAP_FAILURE_TIMEOUT);
+ break;
+
+ case SWAP_DEACTIVATING_SIGTERM:
+ if (s->kill_context.send_sigkill) {
+ log_unit_warning(UNIT(s), "Swap process timed out. Killing.");
+ swap_enter_signal(s, SWAP_DEACTIVATING_SIGKILL, SWAP_FAILURE_TIMEOUT);
+ } else {
+ log_unit_warning(UNIT(s), "Swap process timed out. Skipping SIGKILL. Ignoring.");
+ swap_enter_dead_or_active(s, SWAP_FAILURE_TIMEOUT);
+ }
+ break;
+
+ case SWAP_DEACTIVATING_SIGKILL:
+ log_unit_warning(UNIT(s), "Swap process still around after SIGKILL. Ignoring.");
+ swap_enter_dead_or_active(s, SWAP_FAILURE_TIMEOUT);
+ break;
+
+ case SWAP_CLEANING:
+ log_unit_warning(UNIT(s), "Cleaning timed out. killing.");
+
+ if (s->clean_result == SWAP_SUCCESS)
+ s->clean_result = SWAP_FAILURE_TIMEOUT;
+
+ swap_enter_signal(s, SWAP_DEACTIVATING_SIGKILL, 0);
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ return 0;
+}
+
+static int swap_load_proc_swaps(Manager *m, bool set_flags) {
+ assert(m);
+
+ rewind(m->proc_swaps);
+
+ (void) fscanf(m->proc_swaps, "%*s %*s %*s %*s %*s\n");
+
+ for (unsigned i = 1;; i++) {
+ _cleanup_free_ char *dev = NULL, *d = NULL;
+ int prio = 0, k;
+
+ k = fscanf(m->proc_swaps,
+ "%ms " /* device/file */
+ "%*s " /* type of swap */
+ "%*s " /* swap size */
+ "%*s " /* used */
+ "%i\n", /* priority */
+ &dev, &prio);
+ if (k != 2) {
+ if (k == EOF)
+ break;
+
+ log_warning("Failed to parse /proc/swaps:%u, skipping.", i);
+ continue;
+ }
+
+ ssize_t l = cunescape(dev, UNESCAPE_RELAX, &d);
+ if (l < 0)
+ return log_error_errno(l, "Failed to unescape device path: %m");
+
+ device_found_node(m, d, DEVICE_FOUND_SWAP, DEVICE_FOUND_SWAP);
+
+ (void) swap_process_new(m, d, prio, set_flags);
+ }
+
+ return 0;
+}
+
+static int swap_process_proc_swaps(Manager *m) {
+ int r;
+
+ assert(m);
+
+ r = swap_load_proc_swaps(m, true);
+ if (r < 0) {
+ /* Reset flags, just in case, for late calls */
+ LIST_FOREACH(units_by_type, u, m->units_by_type[UNIT_SWAP]) {
+ Swap *swap = SWAP(u);
+
+ assert(swap);
+
+ swap->is_active = swap->just_activated = false;
+ }
+
+ return 0;
+ }
+
+ manager_dispatch_load_queue(m);
+
+ LIST_FOREACH(units_by_type, u, m->units_by_type[UNIT_SWAP]) {
+ Swap *swap = SWAP(u);
+
+ assert(swap);
+
+ if (!swap->is_active) {
+
+ swap_unset_proc_swaps(swap);
+
+ switch (swap->state) {
+
+ case SWAP_ACTIVE:
+ /* This has just been deactivated */
+ swap_enter_dead(swap, SWAP_SUCCESS);
+ break;
+
+ default:
+ /* Fire again */
+ swap_set_state(swap, swap->state);
+ break;
+ }
+
+ if (swap->what)
+ device_found_node(m, swap->what, DEVICE_NOT_FOUND, DEVICE_FOUND_SWAP);
+
+ } else if (swap->just_activated) {
+
+ /* New swap entry */
+
+ switch (swap->state) {
+
+ case SWAP_DEAD:
+ case SWAP_FAILED:
+ (void) unit_acquire_invocation_id(u);
+ swap_cycle_clear(swap);
+ swap_enter_active(swap, SWAP_SUCCESS);
+ break;
+
+ case SWAP_ACTIVATING:
+ swap_set_state(swap, SWAP_ACTIVATING_DONE);
+ break;
+
+ default:
+ /* Nothing really changed, but let's
+ * issue a notification call
+ * nonetheless, in case somebody is
+ * waiting for this. */
+ swap_set_state(swap, swap->state);
+ break;
+ }
+ }
+
+ /* Reset the flags for later calls */
+ swap->is_active = swap->just_activated = false;
+ }
+
+ return 1;
+}
+
+static int swap_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+ Manager *m = ASSERT_PTR(userdata);
+
+ assert(revents & EPOLLPRI);
+
+ return swap_process_proc_swaps(m);
+}
+
+static Unit *swap_following(Unit *u) {
+ Swap *s = SWAP(u);
+ Swap *first = NULL;
+
+ assert(s);
+
+ /* If the user configured the swap through /etc/fstab or
+ * a device unit, follow that. */
+
+ if (s->from_fragment)
+ return NULL;
+
+ LIST_FOREACH_OTHERS(same_devnode, other, s)
+ if (other->from_fragment)
+ return UNIT(other);
+
+ /* Otherwise, make everybody follow the unit that's named after
+ * the swap device in the kernel */
+
+ if (streq_ptr(s->what, s->devnode))
+ return NULL;
+
+ LIST_FOREACH(same_devnode, other, s->same_devnode_next)
+ if (streq_ptr(other->what, other->devnode))
+ return UNIT(other);
+
+ LIST_FOREACH_BACKWARDS(same_devnode, other, s->same_devnode_prev) {
+ if (streq_ptr(other->what, other->devnode))
+ return UNIT(other);
+
+ first = other;
+ }
+
+ /* Fall back to the first on the list */
+ return UNIT(first);
+}
+
+static int swap_following_set(Unit *u, Set **_set) {
+ Swap *s = SWAP(u);
+ _cleanup_set_free_ Set *set = NULL;
+ int r;
+
+ assert(s);
+ assert(_set);
+
+ if (LIST_JUST_US(same_devnode, s)) {
+ *_set = NULL;
+ return 0;
+ }
+
+ set = set_new(NULL);
+ if (!set)
+ return -ENOMEM;
+
+ LIST_FOREACH_OTHERS(same_devnode, other, s) {
+ r = set_put(set, other);
+ if (r < 0)
+ return r;
+ }
+
+ *_set = TAKE_PTR(set);
+ return 1;
+}
+
+static void swap_shutdown(Manager *m) {
+ assert(m);
+
+ m->swap_event_source = sd_event_source_disable_unref(m->swap_event_source);
+ m->proc_swaps = safe_fclose(m->proc_swaps);
+ m->swaps_by_devnode = hashmap_free(m->swaps_by_devnode);
+}
+
+static void swap_enumerate(Manager *m) {
+ int r;
+
+ assert(m);
+
+ if (!m->proc_swaps) {
+ m->proc_swaps = fopen("/proc/swaps", "re");
+ if (!m->proc_swaps) {
+ if (errno == ENOENT)
+ log_debug_errno(errno, "Not swap enabled, skipping enumeration.");
+ else
+ log_warning_errno(errno, "Failed to open /proc/swaps, ignoring: %m");
+
+ return;
+ }
+
+ r = sd_event_add_io(m->event, &m->swap_event_source, fileno(m->proc_swaps), EPOLLPRI, swap_dispatch_io, m);
+ if (r < 0) {
+ log_error_errno(r, "Failed to watch /proc/swaps: %m");
+ goto fail;
+ }
+
+ /* Dispatch this before we dispatch SIGCHLD, so that
+ * we always get the events from /proc/swaps before
+ * the SIGCHLD of /sbin/swapon. */
+ r = sd_event_source_set_priority(m->swap_event_source, SD_EVENT_PRIORITY_NORMAL-10);
+ if (r < 0) {
+ log_error_errno(r, "Failed to change /proc/swaps priority: %m");
+ goto fail;
+ }
+
+ (void) sd_event_source_set_description(m->swap_event_source, "swap-proc");
+ }
+
+ r = swap_load_proc_swaps(m, false);
+ if (r < 0)
+ goto fail;
+
+ return;
+
+fail:
+ swap_shutdown(m);
+}
+
+int swap_process_device_new(Manager *m, sd_device *dev) {
+ _cleanup_free_ char *e = NULL;
+ const char *dn;
+ Unit *u;
+ int r;
+
+ assert(m);
+ assert(dev);
+
+ if (sd_device_get_devname(dev, &dn) < 0)
+ return 0;
+
+ r = unit_name_from_path(dn, ".swap", &e);
+ if (r < 0) {
+ log_debug_errno(r, "Cannot convert device name '%s' to unit name, ignoring: %m", dn);
+ return 0;
+ }
+
+ u = manager_get_unit(m, e);
+ if (u)
+ r = swap_set_devnode(SWAP(u), dn);
+
+ FOREACH_DEVICE_DEVLINK(dev, devlink) {
+ _cleanup_free_ char *n = NULL;
+ int q;
+
+ q = unit_name_from_path(devlink, ".swap", &n);
+ if (q == -EINVAL) /* If the name is not convertible to unit name, we can't manage it */
+ continue;
+ if (q < 0)
+ return q;
+
+ u = manager_get_unit(m, n);
+ if (u) {
+ q = swap_set_devnode(SWAP(u), dn);
+ if (q < 0)
+ r = q;
+ }
+ }
+
+ return r;
+}
+
+int swap_process_device_remove(Manager *m, sd_device *dev) {
+ const char *dn;
+ int r;
+ Swap *s;
+
+ r = sd_device_get_devname(dev, &dn);
+ if (r < 0)
+ return 0;
+
+ while ((s = hashmap_get(m->swaps_by_devnode, dn))) {
+ int q;
+
+ q = swap_set_devnode(s, NULL);
+ if (q < 0)
+ r = q;
+ }
+
+ return r;
+}
+
+static void swap_reset_failed(Unit *u) {
+ Swap *s = SWAP(u);
+
+ assert(s);
+
+ if (s->state == SWAP_FAILED)
+ swap_set_state(s, SWAP_DEAD);
+
+ s->result = SWAP_SUCCESS;
+ s->clean_result = SWAP_SUCCESS;
+}
+
+static int swap_get_timeout(Unit *u, usec_t *timeout) {
+ Swap *s = SWAP(u);
+ usec_t t;
+ int r;
+
+ assert(s);
+ assert(u);
+
+ if (!s->timer_event_source)
+ return 0;
+
+ r = sd_event_source_get_time(s->timer_event_source, &t);
+ if (r < 0)
+ return r;
+ if (t == USEC_INFINITY)
+ return 0;
+
+ *timeout = t;
+ return 1;
+}
+
+static bool swap_supported(void) {
+ static int supported = -1;
+
+ /* If swap support is not available in the kernel, or we are
+ * running in a container, we don't support swap units, and any
+ * attempt to start one should fail immediately. */
+
+ if (supported < 0)
+ supported =
+ access("/proc/swaps", F_OK) >= 0 &&
+ detect_container() <= 0;
+
+ return supported;
+}
+
+static PidRef* swap_control_pid(Unit *u) {
+ return &ASSERT_PTR(SWAP(u))->control_pid;
+}
+
+static int swap_clean(Unit *u, ExecCleanMask mask) {
+ _cleanup_strv_free_ char **l = NULL;
+ Swap *s = SWAP(u);
+ int r;
+
+ assert(s);
+ assert(mask != 0);
+
+ if (s->state != SWAP_DEAD)
+ return -EBUSY;
+
+ r = exec_context_get_clean_directories(&s->exec_context, u->manager->prefix, mask, &l);
+ if (r < 0)
+ return r;
+
+ if (strv_isempty(l))
+ return -EUNATCH;
+
+ swap_unwatch_control_pid(s);
+ s->clean_result = SWAP_SUCCESS;
+ s->control_command = NULL;
+ s->control_command_id = _SWAP_EXEC_COMMAND_INVALID;
+
+ r = swap_arm_timer(s, /* relative= */ true, s->exec_context.timeout_clean_usec);
+ if (r < 0) {
+ log_unit_warning_errno(u, r, "Failed to install timer: %m");
+ goto fail;
+ }
+
+ r = unit_fork_and_watch_rm_rf(u, l, &s->control_pid);
+ if (r < 0) {
+ log_unit_warning_errno(u, r, "Failed to spawn cleaning task: %m");
+ goto fail;
+ }
+
+ swap_set_state(s, SWAP_CLEANING);
+ return 0;
+
+fail:
+ s->clean_result = SWAP_FAILURE_RESOURCES;
+ s->timer_event_source = sd_event_source_disable_unref(s->timer_event_source);
+ return r;
+}
+
+static int swap_can_clean(Unit *u, ExecCleanMask *ret) {
+ Swap *s = SWAP(u);
+
+ assert(s);
+
+ return exec_context_get_clean_mask(&s->exec_context, ret);
+}
+
+static int swap_can_start(Unit *u) {
+ Swap *s = SWAP(u);
+ int r;
+
+ assert(s);
+
+ r = unit_test_start_limit(u);
+ if (r < 0) {
+ swap_enter_dead(s, SWAP_FAILURE_START_LIMIT_HIT);
+ return r;
+ }
+
+ return 1;
+}
+
+int swap_get_priority(const Swap *s) {
+ assert(s);
+
+ if (s->from_proc_swaps && s->parameters_proc_swaps.priority_set)
+ return s->parameters_proc_swaps.priority;
+
+ if (s->from_fragment && s->parameters_fragment.priority_set)
+ return s->parameters_fragment.priority;
+
+ return -1;
+}
+
+const char* swap_get_options(const Swap *s) {
+ assert(s);
+
+ if (s->from_fragment)
+ return s->parameters_fragment.options;
+
+ return NULL;
+}
+
+static const char* const swap_exec_command_table[_SWAP_EXEC_COMMAND_MAX] = {
+ [SWAP_EXEC_ACTIVATE] = "ExecActivate",
+ [SWAP_EXEC_DEACTIVATE] = "ExecDeactivate",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(swap_exec_command, SwapExecCommand);
+
+static const char* const swap_result_table[_SWAP_RESULT_MAX] = {
+ [SWAP_SUCCESS] = "success",
+ [SWAP_FAILURE_RESOURCES] = "resources",
+ [SWAP_FAILURE_TIMEOUT] = "timeout",
+ [SWAP_FAILURE_EXIT_CODE] = "exit-code",
+ [SWAP_FAILURE_SIGNAL] = "signal",
+ [SWAP_FAILURE_CORE_DUMP] = "core-dump",
+ [SWAP_FAILURE_START_LIMIT_HIT] = "start-limit-hit",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(swap_result, SwapResult);
+
+const UnitVTable swap_vtable = {
+ .object_size = sizeof(Swap),
+ .exec_context_offset = offsetof(Swap, exec_context),
+ .cgroup_context_offset = offsetof(Swap, cgroup_context),
+ .kill_context_offset = offsetof(Swap, kill_context),
+ .exec_runtime_offset = offsetof(Swap, exec_runtime),
+
+ .sections =
+ "Unit\0"
+ "Swap\0"
+ "Install\0",
+ .private_section = "Swap",
+
+ .can_fail = true,
+
+ .init = swap_init,
+ .load = swap_load,
+ .done = swap_done,
+
+ .coldplug = swap_coldplug,
+
+ .dump = swap_dump,
+
+ .start = swap_start,
+ .stop = swap_stop,
+
+ .clean = swap_clean,
+ .can_clean = swap_can_clean,
+
+ .get_timeout = swap_get_timeout,
+
+ .serialize = swap_serialize,
+ .deserialize_item = swap_deserialize_item,
+
+ .active_state = swap_active_state,
+ .sub_state_to_string = swap_sub_state_to_string,
+
+ .will_restart = unit_will_restart_default,
+
+ .may_gc = swap_may_gc,
+ .is_extrinsic = swap_is_extrinsic,
+
+ .sigchld_event = swap_sigchld_event,
+
+ .reset_failed = swap_reset_failed,
+
+ .control_pid = swap_control_pid,
+
+ .bus_set_property = bus_swap_set_property,
+ .bus_commit_properties = bus_swap_commit_properties,
+
+ .following = swap_following,
+ .following_set = swap_following_set,
+
+ .enumerate = swap_enumerate,
+ .shutdown = swap_shutdown,
+ .supported = swap_supported,
+
+ .status_message_formats = {
+ .starting_stopping = {
+ [0] = "Activating swap %s...",
+ [1] = "Deactivating swap %s...",
+ },
+ .finished_start_job = {
+ [JOB_DONE] = "Activated swap %s.",
+ [JOB_FAILED] = "Failed to activate swap %s.",
+ [JOB_TIMEOUT] = "Timed out activating swap %s.",
+ },
+ .finished_stop_job = {
+ [JOB_DONE] = "Deactivated swap %s.",
+ [JOB_FAILED] = "Failed deactivating swap %s.",
+ [JOB_TIMEOUT] = "Timed out deactivating swap %s.",
+ },
+ },
+
+ .can_start = swap_can_start,
+
+ .notify_plymouth = true,
+};
diff --git a/src/core/swap.h b/src/core/swap.h
new file mode 100644
index 0000000..ef20f0f
--- /dev/null
+++ b/src/core/swap.h
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+/***
+ Copyright © 2010 Maarten Lankhorst
+***/
+
+#include "sd-device.h"
+
+#include "pidref.h"
+#include "unit.h"
+
+typedef struct Swap Swap;
+
+typedef enum SwapExecCommand {
+ SWAP_EXEC_ACTIVATE,
+ SWAP_EXEC_DEACTIVATE,
+ _SWAP_EXEC_COMMAND_MAX,
+ _SWAP_EXEC_COMMAND_INVALID = -EINVAL,
+} SwapExecCommand;
+
+typedef enum SwapResult {
+ SWAP_SUCCESS,
+ SWAP_FAILURE_RESOURCES,
+ SWAP_FAILURE_TIMEOUT,
+ SWAP_FAILURE_EXIT_CODE,
+ SWAP_FAILURE_SIGNAL,
+ SWAP_FAILURE_CORE_DUMP,
+ SWAP_FAILURE_START_LIMIT_HIT,
+ _SWAP_RESULT_MAX,
+ _SWAP_RESULT_INVALID = -EINVAL,
+} SwapResult;
+
+typedef struct SwapParameters {
+ char *what;
+ char *options;
+ int priority;
+ bool priority_set;
+} SwapParameters;
+
+struct Swap {
+ Unit meta;
+
+ char *what;
+
+ /* If the device has already shown up, this is the device
+ * node, which might be different from what, due to
+ * symlinks */
+ char *devnode;
+
+ SwapParameters parameters_proc_swaps;
+ SwapParameters parameters_fragment;
+
+ bool from_proc_swaps:1;
+ bool from_fragment:1;
+
+ /* Used while looking for swaps that vanished or got added
+ * from/to /proc/swaps */
+ bool is_active:1;
+ bool just_activated:1;
+
+ SwapResult result;
+ SwapResult clean_result;
+
+ usec_t timeout_usec;
+
+ ExecCommand exec_command[_SWAP_EXEC_COMMAND_MAX];
+ ExecContext exec_context;
+ KillContext kill_context;
+ CGroupContext cgroup_context;
+
+ ExecRuntime *exec_runtime;
+
+ SwapState state, deserialized_state;
+
+ ExecCommand* control_command;
+ SwapExecCommand control_command_id;
+ PidRef control_pid;
+
+ sd_event_source *timer_event_source;
+
+ /* In order to be able to distinguish dependencies on
+ different device nodes we might end up creating multiple
+ devices for the same swap. We chain them up here. */
+
+ LIST_FIELDS(struct Swap, same_devnode);
+};
+
+extern const UnitVTable swap_vtable;
+
+int swap_process_device_new(Manager *m, sd_device *dev);
+int swap_process_device_remove(Manager *m, sd_device *dev);
+
+int swap_get_priority(const Swap *s);
+const char* swap_get_options(const Swap *s);
+
+const char* swap_exec_command_to_string(SwapExecCommand i) _const_;
+SwapExecCommand swap_exec_command_from_string(const char *s) _pure_;
+
+const char* swap_result_to_string(SwapResult i) _const_;
+SwapResult swap_result_from_string(const char *s) _pure_;
+
+DEFINE_CAST(SWAP, Swap);
diff --git a/src/core/system.conf.in b/src/core/system.conf.in
new file mode 100644
index 0000000..05eb681
--- /dev/null
+++ b/src/core/system.conf.in
@@ -0,0 +1,83 @@
+# This file is part of systemd.
+#
+# systemd is free software; you can redistribute it and/or modify it under the
+# terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation; either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# Entries in this file show the compile time defaults. Local configuration
+# should be created by either modifying this file (or a copy of it placed in
+# /etc/ if the original file is shipped in /usr/), or by creating "drop-ins" in
+# /etc/systemd/system.conf.d/ directory. The latter is generally recommended.
+# Defaults can be restored by simply deleting the main configuration file and
+# all drop-ins located in /etc/.
+#
+# Use 'systemd-analyze cat-config systemd/system.conf' to display the full config.
+#
+# See systemd-system.conf(5) for details.
+
+[Manager]
+#LogLevel=info
+#LogTarget=journal-or-kmsg
+#LogColor=yes
+#LogLocation=no
+#LogTime=no
+#DumpCore=yes
+#ShowStatus=yes
+#CrashChangeVT=no
+#CrashShell=no
+#CrashReboot=no
+#CtrlAltDelBurstAction=reboot-force
+#CPUAffinity=
+#NUMAPolicy=default
+#NUMAMask=
+#RuntimeWatchdogSec=off
+#RuntimeWatchdogPreSec=off
+#RuntimeWatchdogPreGovernor=
+#RebootWatchdogSec=10min
+#KExecWatchdogSec=off
+#WatchdogDevice=
+#CapabilityBoundingSet=
+#NoNewPrivileges=no
+#SystemCallArchitectures=
+#TimerSlackNSec=
+#StatusUnitFormat={{STATUS_UNIT_FORMAT_DEFAULT_STR}}
+#DefaultTimerAccuracySec=1min
+#DefaultStandardOutput=journal
+#DefaultStandardError=inherit
+#DefaultTimeoutStartSec={{DEFAULT_TIMEOUT_SEC}}s
+#DefaultTimeoutStopSec={{DEFAULT_TIMEOUT_SEC}}s
+#DefaultTimeoutAbortSec=
+#DefaultDeviceTimeoutSec={{DEFAULT_TIMEOUT_SEC}}s
+#DefaultRestartSec=100ms
+#DefaultStartLimitIntervalSec=10s
+#DefaultStartLimitBurst=5
+#DefaultEnvironment=
+#DefaultCPUAccounting=yes
+#DefaultIOAccounting=no
+#DefaultIPAccounting=no
+#DefaultMemoryAccounting={{ 'yes' if MEMORY_ACCOUNTING_DEFAULT else 'no' }}
+#DefaultTasksAccounting=yes
+#DefaultTasksMax=15%
+#DefaultLimitCPU=
+#DefaultLimitFSIZE=
+#DefaultLimitDATA=
+#DefaultLimitSTACK=
+#DefaultLimitCORE=
+#DefaultLimitRSS=
+#DefaultLimitNOFILE=1024:{{HIGH_RLIMIT_NOFILE}}
+#DefaultLimitAS=
+#DefaultLimitNPROC=
+#DefaultLimitMEMLOCK=8M
+#DefaultLimitLOCKS=
+#DefaultLimitSIGPENDING=
+#DefaultLimitMSGQUEUE=
+#DefaultLimitNICE=
+#DefaultLimitRTPRIO=
+#DefaultLimitRTTIME=
+#DefaultMemoryPressureThresholdSec=200ms
+#DefaultMemoryPressureWatch=auto
+#DefaultOOMPolicy=stop
+#DefaultSmackProcessLabel=
+#ReloadLimitIntervalSec=
+#ReloadLimitBurst=
diff --git a/src/core/systemd.pc.in b/src/core/systemd.pc.in
new file mode 100644
index 0000000..f3b85b0
--- /dev/null
+++ b/src/core/systemd.pc.in
@@ -0,0 +1,108 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+#
+# This file is part of systemd.
+#
+# systemd is free software; you can redistribute it and/or modify it
+# under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or
+# (at your option) any later version.
+
+# Names with prefixes are preferred, and the run-together names should be
+# considered deprecated (though there is no plan to remove them). New names
+# shall have underscores.
+
+# root_prefix and rootprefix are deprecated since we dropped support for split-usr;
+# however, we used to install units in root_prefix, and a lot of downstream software
+# overrode this variable in their build system to support installing units elsewhere.
+# To stop those builds from silently breaking, we keep root_prefix around but have
+# it as an alias for prefix.
+root_prefix={{PREFIX_NOSLASH}}
+rootprefix=${root_prefix}
+prefix=${rootprefix}
+sysconf_dir={{SYSCONF_DIR}}
+sysconfdir=${sysconf_dir}
+
+systemd_util_dir=${prefix}/lib/systemd
+systemdutildir=${systemd_util_dir}
+
+systemd_system_unit_dir=${prefix}/lib/systemd/system
+systemdsystemunitdir=${systemd_system_unit_dir}
+
+systemd_system_preset_dir=${prefix}/lib/systemd/system-preset
+systemdsystempresetdir=${systemd_system_preset_dir}
+
+systemd_user_unit_dir=${prefix}/lib/systemd/user
+systemduserunitdir=${systemd_user_unit_dir}
+
+systemd_user_preset_dir=${prefix}/lib/systemd/user-preset
+systemduserpresetdir=${systemd_user_preset_dir}
+
+systemd_system_conf_dir=${sysconfdir}/systemd/system
+systemdsystemconfdir=${systemd_system_conf_dir}
+
+systemd_user_conf_dir=${sysconfdir}/systemd/user
+systemduserconfdir=${systemd_user_conf_dir}
+
+systemd_system_unit_path=${systemd_system_conf_dir}:/etc/systemd/system:/run/systemd/system:/usr/local/lib/systemd/system:${systemd_system_unit_dir}:/usr/lib/systemd/system:/lib/systemd/system
+systemdsystemunitpath=${systemd_system_unit_path}
+
+systemd_user_unit_path=${systemd_user_conf_dir}:/etc/systemd/user:/run/systemd/user:/usr/local/lib/systemd/user:/usr/local/share/systemd/user:${systemd_user_unit_dir}:/usr/lib/systemd/user:/usr/share/systemd/user
+systemduserunitpath=${systemd_user_unit_path}
+
+systemd_system_generator_dir=${prefix}/lib/systemd/system-generators
+systemdsystemgeneratordir=${systemd_system_generator_dir}
+
+systemd_user_generator_dir=${prefix}/lib/systemd/user-generators
+systemdusergeneratordir=${systemd_user_generator_dir}
+
+systemd_system_generator_path=/run/systemd/system-generators:/etc/systemd/system-generators:/usr/local/lib/systemd/system-generators:${systemd_system_generator_dir}
+systemdsystemgeneratorpath=${systemd_system_generator_path}
+
+systemd_user_generator_path=/run/systemd/user-generators:/etc/systemd/user-generators:/usr/local/lib/systemd/user-generators:${systemd_user_generator_dir}
+systemdusergeneratorpath=${systemd_user_generator_path}
+
+systemd_sleep_dir=${prefix}/lib/systemd/system-sleep
+systemdsleepdir=${systemd_sleep_dir}
+
+systemd_shutdown_dir=${prefix}/lib/systemd/system-shutdown
+systemdshutdowndir=${systemd_shutdown_dir}
+
+tmpfiles_dir=${prefix}/lib/tmpfiles.d
+tmpfilesdir=${tmpfiles_dir}
+
+user_tmpfiles_dir=${prefix}/share/user-tmpfiles.d
+
+sysusers_dir=${prefix}/lib/sysusers.d
+sysusersdir=${sysusers_dir}
+
+sysctl_dir=${prefix}/lib/sysctl.d
+sysctldir=${sysctl_dir}
+
+binfmt_dir=${prefix}/lib/binfmt.d
+binfmtdir=${binfmt_dir}
+
+modules_load_dir=${prefix}/lib/modules-load.d
+modulesloaddir=${modules_load_dir}
+
+catalog_dir=${prefix}/lib/systemd/catalog
+catalogdir=${catalog_dir}
+
+system_uid_max={{SYSTEM_UID_MAX}}
+systemuidmax=${system_uid_max}
+system_gid_max={{SYSTEM_GID_MAX}}
+systemgidmax=${system_gid_max}
+
+dynamic_uid_min={{DYNAMIC_UID_MIN}}
+dynamicuidmin=${dynamic_uid_min}
+dynamic_uid_max={{DYNAMIC_UID_MAX}}
+dynamicuidmax=${dynamic_uid_max}
+
+container_uid_base_min={{CONTAINER_UID_BASE_MIN}}
+containeruidbasemin=${container_uid_base_min}
+container_uid_base_max={{CONTAINER_UID_BASE_MAX}}
+containeruidbasemax=${container_uid_base_max}
+
+Name: systemd
+Description: systemd System and Service Manager
+URL: {{PROJECT_URL}}
+Version: {{PROJECT_VERSION}}
diff --git a/src/core/target.c b/src/core/target.c
new file mode 100644
index 0000000..8f2a331
--- /dev/null
+++ b/src/core/target.c
@@ -0,0 +1,216 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "dbus-target.h"
+#include "dbus-unit.h"
+#include "log.h"
+#include "serialize.h"
+#include "special.h"
+#include "string-util.h"
+#include "target.h"
+#include "unit-name.h"
+#include "unit.h"
+
+static const UnitActiveState state_translation_table[_TARGET_STATE_MAX] = {
+ [TARGET_DEAD] = UNIT_INACTIVE,
+ [TARGET_ACTIVE] = UNIT_ACTIVE
+};
+
+static void target_set_state(Target *t, TargetState state) {
+ TargetState old_state;
+ assert(t);
+
+ if (t->state != state)
+ bus_unit_send_pending_change_signal(UNIT(t), false);
+
+ old_state = t->state;
+ t->state = state;
+
+ if (state != old_state)
+ log_debug("%s changed %s -> %s",
+ UNIT(t)->id,
+ target_state_to_string(old_state),
+ target_state_to_string(state));
+
+ unit_notify(UNIT(t), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true);
+}
+
+static int target_add_default_dependencies(Target *t) {
+ _cleanup_free_ Unit **others = NULL;
+ int r, n_others;
+
+ assert(t);
+
+ if (!UNIT(t)->default_dependencies)
+ return 0;
+
+ /* Imply ordering for requirement dependencies on target units. Note that when the user created a
+ * contradicting ordering manually we won't add anything in here to make sure we don't create a
+ * loop.
+ *
+ * Note that quite likely iterating through these dependencies will add new dependencies, which
+ * conflicts with the hashmap-based iteration logic. Hence, instead of iterating through the
+ * dependencies and acting on them as we go, first take an "atomic snapshot" of sorts and iterate
+ * through that. */
+
+ n_others = unit_get_dependency_array(UNIT(t), UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE, &others);
+ if (n_others < 0)
+ return n_others;
+
+ for (int i = 0; i < n_others; i++) {
+ r = unit_add_default_target_dependency(others[i], UNIT(t));
+ if (r < 0)
+ return r;
+ }
+
+ if (unit_has_name(UNIT(t), SPECIAL_SHUTDOWN_TARGET))
+ return 0;
+
+ /* Make sure targets are unloaded on shutdown */
+ return unit_add_two_dependencies_by_name(UNIT(t), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+}
+
+static int target_load(Unit *u) {
+ Target *t = TARGET(u);
+ int r;
+
+ assert(t);
+
+ r = unit_load_fragment_and_dropin(u, true);
+ if (r < 0)
+ return r;
+
+ if (u->load_state != UNIT_LOADED)
+ return 0;
+
+ /* This is a new unit? Then let's add in some extras */
+ return target_add_default_dependencies(t);
+}
+
+static int target_coldplug(Unit *u) {
+ Target *t = TARGET(u);
+
+ assert(t);
+ assert(t->state == TARGET_DEAD);
+
+ if (t->deserialized_state != t->state)
+ target_set_state(t, t->deserialized_state);
+
+ return 0;
+}
+
+static void target_dump(Unit *u, FILE *f, const char *prefix) {
+ Target *t = TARGET(u);
+
+ assert(t);
+ assert(f);
+
+ fprintf(f,
+ "%sTarget State: %s\n",
+ prefix, target_state_to_string(t->state));
+}
+
+static int target_start(Unit *u) {
+ Target *t = TARGET(u);
+ int r;
+
+ assert(t);
+ assert(t->state == TARGET_DEAD);
+
+ r = unit_acquire_invocation_id(u);
+ if (r < 0)
+ return r;
+
+ target_set_state(t, TARGET_ACTIVE);
+ return 1;
+}
+
+static int target_stop(Unit *u) {
+ Target *t = TARGET(u);
+
+ assert(t);
+ assert(t->state == TARGET_ACTIVE);
+
+ target_set_state(t, TARGET_DEAD);
+ return 1;
+}
+
+static int target_serialize(Unit *u, FILE *f, FDSet *fds) {
+ Target *s = TARGET(u);
+
+ assert(s);
+ assert(f);
+ assert(fds);
+
+ (void) serialize_item(f, "state", target_state_to_string(s->state));
+ return 0;
+}
+
+static int target_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
+ Target *s = TARGET(u);
+
+ assert(s);
+ assert(u);
+ assert(key);
+ assert(value);
+ assert(fds);
+
+ if (streq(key, "state")) {
+ TargetState state;
+
+ state = target_state_from_string(value);
+ if (state < 0)
+ log_debug("Failed to parse state value %s", value);
+ else
+ s->deserialized_state = state;
+
+ } else
+ log_debug("Unknown serialization key '%s'", key);
+
+ return 0;
+}
+
+static UnitActiveState target_active_state(Unit *u) {
+ assert(u);
+
+ return state_translation_table[TARGET(u)->state];
+}
+
+static const char *target_sub_state_to_string(Unit *u) {
+ assert(u);
+
+ return target_state_to_string(TARGET(u)->state);
+}
+
+const UnitVTable target_vtable = {
+ .object_size = sizeof(Target),
+
+ .sections =
+ "Unit\0"
+ "Target\0"
+ "Install\0",
+
+ .can_fail = true,
+
+ .load = target_load,
+ .coldplug = target_coldplug,
+
+ .dump = target_dump,
+
+ .start = target_start,
+ .stop = target_stop,
+
+ .serialize = target_serialize,
+ .deserialize_item = target_deserialize_item,
+
+ .active_state = target_active_state,
+ .sub_state_to_string = target_sub_state_to_string,
+
+ .status_message_formats = {
+ .finished_start_job = {
+ [JOB_DONE] = "Reached target %s.",
+ },
+ .finished_stop_job = {
+ [JOB_DONE] = "Stopped target %s.",
+ },
+ },
+};
diff --git a/src/core/target.h b/src/core/target.h
new file mode 100644
index 0000000..bb909d6
--- /dev/null
+++ b/src/core/target.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "unit.h"
+
+typedef struct Target Target;
+
+struct Target {
+ Unit meta;
+
+ /* Current state, plus the state read back during deserialization
+ * which is applied at coldplug time. */
+ TargetState state, deserialized_state;
+};
+
+extern const UnitVTable target_vtable;
+
+DEFINE_CAST(TARGET, Target);
diff --git a/src/core/timer.c b/src/core/timer.c
new file mode 100644
index 0000000..3c41a25
--- /dev/null
+++ b/src/core/timer.c
@@ -0,0 +1,1106 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <errno.h>
+
+#include "alloc-util.h"
+#include "bus-error.h"
+#include "bus-util.h"
+#include "dbus-timer.h"
+#include "dbus-unit.h"
+#include "fs-util.h"
+#include "parse-util.h"
+#include "random-util.h"
+#include "serialize.h"
+#include "special.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "timer.h"
+#include "unit-name.h"
+#include "unit.h"
+#include "user-util.h"
+#include "virt.h"
+
+/* Map timer-specific states onto the generic unit active states. */
+static const UnitActiveState state_translation_table[_TIMER_STATE_MAX] = {
+ [TIMER_DEAD] = UNIT_INACTIVE,
+ [TIMER_WAITING] = UNIT_ACTIVE,
+ [TIMER_RUNNING] = UNIT_ACTIVE,
+ [TIMER_ELAPSED] = UNIT_ACTIVE,
+ [TIMER_FAILED] = UNIT_FAILED
+};
+
+static int timer_dispatch(sd_event_source *s, uint64_t usec, void *userdata);
+
+/* Early initialization of a freshly allocated Timer, before configuration is parsed. */
+static void timer_init(Unit *u) {
+ Timer *t = TIMER(u);
+
+ assert(u);
+ assert(u->load_state == UNIT_STUB);
+
+ /* No elapse scheduled yet; accuracy comes from the manager-wide default. */
+ t->next_elapse_monotonic_or_boottime = USEC_INFINITY;
+ t->next_elapse_realtime = USEC_INFINITY;
+ t->accuracy_usec = u->manager->defaults.timer_accuracy_usec;
+ t->remain_after_elapse = true;
+}
+
+/* Free every TimerValue on the timer's list, including any attached calendar spec. */
+void timer_free_values(Timer *t) {
+ TimerValue *v;
+
+ assert(t);
+
+ while ((v = LIST_POP(value, t->values))) {
+ calendar_spec_free(v->calendar_spec);
+ free(v);
+ }
+}
+
+/* Final cleanup when the unit is destroyed: values, event sources, stamp path. */
+static void timer_done(Unit *u) {
+ Timer *t = TIMER(u);
+
+ assert(t);
+
+ timer_free_values(t);
+
+ t->monotonic_event_source = sd_event_source_disable_unref(t->monotonic_event_source);
+ t->realtime_event_source = sd_event_source_disable_unref(t->realtime_event_source);
+
+ t->stamp_path = mfree(t->stamp_path);
+}
+
+/* Reject a loaded timer that has nothing that could ever trigger it. */
+static int timer_verify(Timer *t) {
+ assert(t);
+ assert(UNIT(t)->load_state == UNIT_LOADED);
+
+ if (!t->values && !t->on_clock_change && !t->on_timezone_change)
+ return log_unit_error_errno(UNIT(t), SYNTHETIC_ERRNO(ENOEXEC), "Timer unit lacks value setting. Refusing.");
+
+ return 0;
+}
+
+/* Add the implicit dependencies every timer gets unless DefaultDependencies=no:
+ * Before=timers.target; on system managers also After=/Requires=sysinit.target,
+ * and for calendar timers After=time-sync.target and time-set.target (so the
+ * clock is trustworthy first); finally Before=/Conflicts=shutdown.target. */
+static int timer_add_default_dependencies(Timer *t) {
+ int r;
+
+ assert(t);
+
+ if (!UNIT(t)->default_dependencies)
+ return 0;
+
+ r = unit_add_dependency_by_name(UNIT(t), UNIT_BEFORE, SPECIAL_TIMERS_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+ if (r < 0)
+ return r;
+
+ if (MANAGER_IS_SYSTEM(UNIT(t)->manager)) {
+ r = unit_add_two_dependencies_by_name(UNIT(t), UNIT_AFTER, UNIT_REQUIRES, SPECIAL_SYSINIT_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+ if (r < 0)
+ return r;
+
+ LIST_FOREACH(value, v, t->values) {
+ if (v->base != TIMER_CALENDAR)
+ continue;
+
+ FOREACH_STRING(target, SPECIAL_TIME_SYNC_TARGET, SPECIAL_TIME_SET_TARGET) {
+ r = unit_add_dependency_by_name(UNIT(t), UNIT_AFTER, target, true, UNIT_DEPENDENCY_DEFAULT);
+ if (r < 0)
+ return r;
+ }
+
+ /* One calendar value is enough; the clock deps are added only once. */
+ break;
+ }
+ }
+
+ return unit_add_two_dependencies_by_name(UNIT(t), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_SHUTDOWN_TARGET, true, UNIT_DEPENDENCY_DEFAULT);
+}
+
+/* If no trigger unit was configured explicitly, implicitly trigger the
+ * same-named .service unit, and order ourselves before it. */
+static int timer_add_trigger_dependencies(Timer *t) {
+ Unit *x;
+ int r;
+
+ assert(t);
+
+ if (UNIT_TRIGGER(UNIT(t)))
+ return 0;
+
+ r = unit_load_related_unit(UNIT(t), ".service", &x);
+ if (r < 0)
+ return r;
+
+ return unit_add_two_dependencies(UNIT(t), UNIT_BEFORE, UNIT_TRIGGERS, x, true, UNIT_DEPENDENCY_IMPLICIT);
+}
+
+/* For Persistent=yes timers, pick the on-disk stamp file path:
+ * /var/lib/systemd/timers/stamp-<unit> for the system manager, otherwise under
+ * $XDG_DATA_HOME (or ~/.local/share) for user managers. Also pulls in a mount
+ * dependency for /var/lib/systemd/timers in the system case. */
+static int timer_setup_persistent(Timer *t) {
+ _cleanup_free_ char *stamp_path = NULL;
+ int r;
+
+ assert(t);
+
+ if (!t->persistent)
+ return 0;
+
+ if (MANAGER_IS_SYSTEM(UNIT(t)->manager)) {
+
+ r = unit_require_mounts_for(UNIT(t), "/var/lib/systemd/timers", UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+
+ stamp_path = strjoin("/var/lib/systemd/timers/stamp-", UNIT(t)->id);
+ } else {
+ const char *e;
+
+ e = getenv("XDG_DATA_HOME");
+ if (e)
+ stamp_path = strjoin(e, "/systemd/timers/stamp-", UNIT(t)->id);
+ else {
+
+ _cleanup_free_ char *h = NULL;
+
+ r = get_home_dir(&h);
+ if (r < 0)
+ return log_unit_error_errno(UNIT(t), r, "Failed to determine home directory: %m");
+
+ stamp_path = strjoin(h, "/.local/share/systemd/timers/stamp-", UNIT(t)->id);
+ }
+ }
+
+ if (!stamp_path)
+ return log_oom();
+
+ return free_and_replace(t->stamp_path, stamp_path);
+}
+
+/* Hash for FixedRandomDelay=: a siphash over machine ID, system-vs-user manager,
+ * UID and unit name, so the "random" delay is stable across restarts for a given
+ * unit on a given machine but differs between units/machines. */
+static uint64_t timer_get_fixed_delay_hash(Timer *t) {
+ static const uint8_t hash_key[] = {
+ 0x51, 0x0a, 0xdb, 0x76, 0x29, 0x51, 0x42, 0xc2,
+ 0x80, 0x35, 0xea, 0xe6, 0x8e, 0x3a, 0x37, 0xbd
+ };
+
+ struct siphash state;
+ sd_id128_t machine_id;
+ uid_t uid;
+ int r;
+
+ assert(t);
+
+ uid = getuid();
+ r = sd_id128_get_machine(&machine_id);
+ if (r < 0) {
+ /* Best effort: fall back to a zero machine ID rather than failing. */
+ log_unit_debug_errno(UNIT(t), r,
+ "Failed to get machine ID for the fixed delay calculation, proceeding with 0: %m");
+ machine_id = SD_ID128_NULL;
+ }
+
+ siphash24_init(&state, hash_key);
+ siphash24_compress(&machine_id, sizeof(sd_id128_t), &state);
+ siphash24_compress_boolean(MANAGER_IS_SYSTEM(UNIT(t)->manager), &state);
+ siphash24_compress(&uid, sizeof(uid_t), &state);
+ siphash24_compress_string(UNIT(t)->id, &state);
+
+ return siphash24_finalize(&state);
+}
+
+/* Load the unit's configuration, then wire up trigger/persistence/default
+ * dependencies and validate the result. */
+static int timer_load(Unit *u) {
+ Timer *t = TIMER(u);
+ int r;
+
+ assert(u);
+ assert(u->load_state == UNIT_STUB);
+
+ r = unit_load_fragment_and_dropin(u, true);
+ if (r < 0)
+ return r;
+
+ if (u->load_state != UNIT_LOADED)
+ return 0;
+
+ /* This is a new unit? Then let's add in some extras */
+ r = timer_add_trigger_dependencies(t);
+ if (r < 0)
+ return r;
+
+ r = timer_setup_persistent(t);
+ if (r < 0)
+ return r;
+
+ r = timer_add_default_dependencies(t);
+ if (r < 0)
+ return r;
+
+ return timer_verify(t);
+}
+
+/* Dump human-readable timer state to f, each line prefixed with 'prefix'
+ * (used by "systemd-analyze dump" and friends). */
+static void timer_dump(Unit *u, FILE *f, const char *prefix) {
+ Timer *t = TIMER(u);
+ Unit *trigger;
+
+ trigger = UNIT_TRIGGER(u);
+
+ fprintf(f,
+ "%sTimer State: %s\n"
+ "%sResult: %s\n"
+ "%sUnit: %s\n"
+ "%sPersistent: %s\n"
+ "%sWakeSystem: %s\n"
+ "%sAccuracy: %s\n"
+ "%sRemainAfterElapse: %s\n"
+ "%sFixedRandomDelay: %s\n"
+ "%sOnClockChange: %s\n"
+ "%sOnTimeZoneChange: %s\n",
+ prefix, timer_state_to_string(t->state),
+ prefix, timer_result_to_string(t->result),
+ prefix, trigger ? trigger->id : "n/a",
+ prefix, yes_no(t->persistent),
+ prefix, yes_no(t->wake_system),
+ prefix, FORMAT_TIMESPAN(t->accuracy_usec, 1),
+ prefix, yes_no(t->remain_after_elapse),
+ prefix, yes_no(t->fixed_random_delay),
+ prefix, yes_no(t->on_clock_change),
+ prefix, yes_no(t->on_timezone_change));
+
+ /* One line per configured value: calendar values as a spec string,
+ * monotonic values as a timespan. */
+ LIST_FOREACH(value, v, t->values)
+ if (v->base == TIMER_CALENDAR) {
+ _cleanup_free_ char *p = NULL;
+
+ (void) calendar_spec_to_string(v->calendar_spec, &p);
+
+ fprintf(f,
+ "%s%s: %s\n",
+ prefix,
+ timer_base_to_string(v->base),
+ strna(p));
+ } else
+ fprintf(f,
+ "%s%s: %s\n",
+ prefix,
+ timer_base_to_string(v->base),
+ FORMAT_TIMESPAN(v->value, 0));
+}
+
+/* Central state-transition helper: records the new state, tears down event
+ * sources when leaving TIMER_WAITING, and notifies bus clients and the
+ * generic unit machinery of the change. */
+static void timer_set_state(Timer *t, TimerState state) {
+ TimerState old_state;
+ assert(t);
+
+ if (t->state != state)
+ bus_unit_send_pending_change_signal(UNIT(t), false);
+
+ old_state = t->state;
+ t->state = state;
+
+ if (state != TIMER_WAITING) {
+ /* Only the WAITING state keeps armed event sources around. */
+ t->monotonic_event_source = sd_event_source_disable_unref(t->monotonic_event_source);
+ t->realtime_event_source = sd_event_source_disable_unref(t->realtime_event_source);
+ t->next_elapse_monotonic_or_boottime = USEC_INFINITY;
+ t->next_elapse_realtime = USEC_INFINITY;
+ }
+
+ if (state != old_state)
+ log_unit_debug(UNIT(t), "Changed %s -> %s", timer_state_to_string(old_state), timer_state_to_string(state));
+
+ unit_notify(UNIT(t), state_translation_table[old_state], state_translation_table[state], /* reload_success = */ true);
+}
+
+static void timer_enter_waiting(Timer *t, bool time_change);
+
+/* Re-apply the deserialized state after a daemon reload/reexec. A timer that
+ * was WAITING gets its elapse times recomputed and its event sources re-armed. */
+static int timer_coldplug(Unit *u) {
+ Timer *t = TIMER(u);
+
+ assert(t);
+ assert(t->state == TIMER_DEAD);
+
+ if (t->deserialized_state == t->state)
+ return 0;
+
+ if (t->deserialized_state == TIMER_WAITING)
+ timer_enter_waiting(t, false);
+ else
+ timer_set_state(t, t->deserialized_state);
+
+ return 0;
+}
+
+/* Finish the timer with result f (the first non-success result sticks), and
+ * move to DEAD or FAILED accordingly. */
+static void timer_enter_dead(Timer *t, TimerResult f) {
+ assert(t);
+
+ if (t->result == TIMER_SUCCESS)
+ t->result = f;
+
+ unit_log_result(UNIT(t), t->result == TIMER_SUCCESS, timer_result_to_string(t->result));
+ timer_set_state(t, t->result != TIMER_SUCCESS ? TIMER_FAILED : TIMER_DEAD);
+}
+
+static void timer_enter_elapsed(Timer *t, bool leave_around) {
+ assert(t);
+
+ /* If a unit is marked with RemainAfterElapse=yes we leave it
+ * around even after it elapsed once, so that starting it
+ * later again does not necessarily mean immediate
+ * retriggering. We unconditionally leave units with
+ * TIMER_UNIT_ACTIVE or TIMER_UNIT_INACTIVE triggers around,
+ * since they might be restarted automatically at any time
+ * later on. */
+
+ if (t->remain_after_elapse || leave_around)
+ timer_set_state(t, TIMER_ELAPSED)
+ else
+ timer_enter_dead(t, TIMER_SUCCESS);
+}
+
+/* Apply RandomizedDelaySec=: push *v forward by a random (or, with
+ * FixedRandomDelay=, a stable per-unit hash-derived) offset below random_usec.
+ * Saturates just under USEC_INFINITY on overflow. */
+static void add_random(Timer *t, usec_t *v) {
+ usec_t add;
+
+ assert(t);
+ assert(v);
+
+ if (t->random_usec == 0)
+ return;
+ if (*v == USEC_INFINITY)
+ return;
+
+ add = (t->fixed_random_delay ? timer_get_fixed_delay_hash(t) : random_u64()) % t->random_usec;
+
+ if (*v + add < *v) /* overflow */
+ *v = (usec_t) -2; /* Highest possible value, that is not USEC_INFINITY */
+ else
+ *v += add;
+
+ log_unit_debug(UNIT(t), "Adding %s random time.", FORMAT_TIMESPAN(add, 0));
+}
+
+/* Core scheduling routine: walk all configured TimerValues, compute the next
+ * realtime (calendar) and monotonic/boottime elapse points, then arm or re-arm
+ * the corresponding sd-event time sources and enter TIMER_WAITING. If nothing
+ * can ever fire any more, go to ELAPSED/DEAD instead. 'time_change' is true
+ * when we are re-entered because of a clock jump; in that case already-elapsed
+ * one-shot monotonic values are not disabled. */
+static void timer_enter_waiting(Timer *t, bool time_change) {
+ bool found_monotonic = false, found_realtime = false;
+ bool leave_around = false;
+ triple_timestamp ts;
+ Unit *trigger;
+ int r;
+
+ assert(t);
+
+ trigger = UNIT_TRIGGER(UNIT(t));
+ if (!trigger) {
+ log_unit_error(UNIT(t), "Unit to trigger vanished.");
+ goto fail;
+ }
+
+ triple_timestamp_now(&ts);
+ t->next_elapse_monotonic_or_boottime = t->next_elapse_realtime = 0;
+
+ LIST_FOREACH(value, v, t->values) {
+ if (v->disabled)
+ continue;
+
+ if (v->base == TIMER_CALENDAR) {
+ usec_t b, rebased;
+
+ /* If we know the last time this was
+ * triggered, schedule the job based relative
+ * to that. If we don't, just start from
+ * the activation time. */
+
+ if (dual_timestamp_is_set(&t->last_trigger))
+ b = t->last_trigger.realtime;
+ else if (dual_timestamp_is_set(&UNIT(t)->inactive_exit_timestamp))
+ b = UNIT(t)->inactive_exit_timestamp.realtime;
+ else
+ b = ts.realtime;
+
+ r = calendar_spec_next_usec(v->calendar_spec, b, &v->next_elapse);
+ if (r < 0)
+ continue;
+
+ /* To make the delay due to RandomizedDelaySec= work even at boot, if the scheduled
+ * time has already passed, set the time when systemd first started as the scheduled
+ * time. Note that we base this on the monotonic timestamp of the boot, not the
+ * realtime one, since the wallclock might have been off during boot. */
+ rebased = map_clock_usec(UNIT(t)->manager->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic,
+ CLOCK_MONOTONIC, CLOCK_REALTIME);
+ if (v->next_elapse < rebased)
+ v->next_elapse = rebased;
+
+ if (!found_realtime)
+ t->next_elapse_realtime = v->next_elapse;
+ else
+ t->next_elapse_realtime = MIN(t->next_elapse_realtime, v->next_elapse);
+
+ found_realtime = true;
+
+ } else {
+ usec_t base;
+
+ /* Monotonic value: pick the reference point this value is
+ * measured from, then add the configured offset. */
+ switch (v->base) {
+
+ case TIMER_ACTIVE:
+ if (state_translation_table[t->state] == UNIT_ACTIVE)
+ base = UNIT(t)->inactive_exit_timestamp.monotonic;
+ else
+ base = ts.monotonic;
+ break;
+
+ case TIMER_BOOT:
+ if (detect_container() <= 0) {
+ /* CLOCK_MONOTONIC equals the uptime on Linux */
+ base = 0;
+ break;
+ }
+ /* In a container we don't want to include the time the host
+ * was already up when the container started, so count from
+ * our own startup. */
+ _fallthrough_;
+ case TIMER_STARTUP:
+ base = UNIT(t)->manager->timestamps[MANAGER_TIMESTAMP_USERSPACE].monotonic;
+ break;
+
+ case TIMER_UNIT_ACTIVE:
+ leave_around = true;
+ base = MAX(trigger->inactive_exit_timestamp.monotonic, t->last_trigger.monotonic);
+ if (base <= 0)
+ continue;
+ break;
+
+ case TIMER_UNIT_INACTIVE:
+ leave_around = true;
+ base = MAX(trigger->inactive_enter_timestamp.monotonic, t->last_trigger.monotonic);
+ if (base <= 0)
+ continue;
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ v->next_elapse = usec_add(usec_shift_clock(base, CLOCK_MONOTONIC, TIMER_MONOTONIC_CLOCK(t)), v->value);
+
+ if (dual_timestamp_is_set(&t->last_trigger) &&
+ !time_change &&
+ v->next_elapse < triple_timestamp_by_clock(&ts, TIMER_MONOTONIC_CLOCK(t)) &&
+ IN_SET(v->base, TIMER_ACTIVE, TIMER_BOOT, TIMER_STARTUP)) {
+ /* This is a one time trigger, disable it now */
+ v->disabled = true;
+ continue;
+ }
+
+ if (!found_monotonic)
+ t->next_elapse_monotonic_or_boottime = v->next_elapse;
+ else
+ t->next_elapse_monotonic_or_boottime = MIN(t->next_elapse_monotonic_or_boottime, v->next_elapse);
+
+ found_monotonic = true;
+ }
+ }
+
+ /* Nothing to wait for any more (and no clock/timezone triggers either)? */
+ if (!found_monotonic && !found_realtime && !t->on_timezone_change && !t->on_clock_change) {
+ log_unit_debug(UNIT(t), "Timer is elapsed.");
+ timer_enter_elapsed(t, leave_around);
+ return;
+ }
+
+ if (found_monotonic) {
+ usec_t left;
+
+ add_random(t, &t->next_elapse_monotonic_or_boottime);
+
+ left = usec_sub_unsigned(t->next_elapse_monotonic_or_boottime, triple_timestamp_by_clock(&ts, TIMER_MONOTONIC_CLOCK(t)));
+ log_unit_debug(UNIT(t), "Monotonic timer elapses in %s.", FORMAT_TIMESPAN(left, 0));
+
+ /* Reuse an existing event source if we have one, else create it. */
+ if (t->monotonic_event_source) {
+ r = sd_event_source_set_time(t->monotonic_event_source, t->next_elapse_monotonic_or_boottime);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(t), r, "Failed to reschedule monotonic event source: %m");
+ goto fail;
+ }
+
+ r = sd_event_source_set_enabled(t->monotonic_event_source, SD_EVENT_ONESHOT);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(t), r, "Failed to enable monotonic event source: %m");
+ goto fail;
+ }
+ } else {
+
+ r = sd_event_add_time(
+ UNIT(t)->manager->event,
+ &t->monotonic_event_source,
+ t->wake_system ? CLOCK_BOOTTIME_ALARM : CLOCK_MONOTONIC,
+ t->next_elapse_monotonic_or_boottime, t->accuracy_usec,
+ timer_dispatch, t);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(t), r, "Failed to add monotonic event source: %m");
+ goto fail;
+ }
+
+ (void) sd_event_source_set_description(t->monotonic_event_source, "timer-monotonic");
+ }
+
+ } else if (t->monotonic_event_source) {
+
+ r = sd_event_source_set_enabled(t->monotonic_event_source, SD_EVENT_OFF);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(t), r, "Failed to disable monotonic event source: %m");
+ goto fail;
+ }
+ }
+
+ if (found_realtime) {
+ add_random(t, &t->next_elapse_realtime);
+
+ log_unit_debug(UNIT(t), "Realtime timer elapses at %s.", FORMAT_TIMESTAMP(t->next_elapse_realtime));
+
+ if (t->realtime_event_source) {
+ r = sd_event_source_set_time(t->realtime_event_source, t->next_elapse_realtime);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(t), r, "Failed to reschedule realtime event source: %m");
+ goto fail;
+ }
+
+ r = sd_event_source_set_enabled(t->realtime_event_source, SD_EVENT_ONESHOT);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(t), r, "Failed to enable realtime event source: %m");
+ goto fail;
+ }
+ } else {
+ r = sd_event_add_time(
+ UNIT(t)->manager->event,
+ &t->realtime_event_source,
+ t->wake_system ? CLOCK_REALTIME_ALARM : CLOCK_REALTIME,
+ t->next_elapse_realtime, t->accuracy_usec,
+ timer_dispatch, t);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(t), r, "Failed to add realtime event source: %m");
+ goto fail;
+ }
+
+ (void) sd_event_source_set_description(t->realtime_event_source, "timer-realtime");
+ }
+
+ } else if (t->realtime_event_source) {
+
+ r = sd_event_source_set_enabled(t->realtime_event_source, SD_EVENT_OFF);
+ if (r < 0) {
+ log_unit_warning_errno(UNIT(t), r, "Failed to disable realtime event source: %m");
+ goto fail;
+ }
+ }
+
+ timer_set_state(t, TIMER_WAITING);
+ return;
+
+fail:
+ timer_enter_dead(t, TIMER_FAILURE_RESOURCES);
+}
+
+/* The timer fired: queue a start job for the trigger unit, record the trigger
+ * timestamp (also in the activation details and, for Persistent=, the on-disk
+ * stamp file), and enter TIMER_RUNNING. */
+static void timer_enter_running(Timer *t) {
+ _cleanup_(activation_details_unrefp) ActivationDetails *details = NULL;
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ Unit *trigger;
+ Job *job;
+ int r;
+
+ assert(t);
+
+ /* Don't start job if we are supposed to go down */
+ if (unit_stop_pending(UNIT(t)))
+ return;
+
+ trigger = UNIT_TRIGGER(UNIT(t));
+ if (!trigger) {
+ log_unit_error(UNIT(t), "Unit to trigger vanished.");
+ goto fail;
+ }
+
+ details = activation_details_new(UNIT(t));
+ if (!details) {
+ log_oom();
+ goto fail;
+ }
+
+ r = manager_add_job(UNIT(t)->manager, JOB_START, trigger, JOB_REPLACE, NULL, &error, &job);
+ if (r < 0) {
+ log_unit_warning(UNIT(t), "Failed to queue unit startup job: %s", bus_error_message(&error, r));
+ goto fail;
+ }
+
+ dual_timestamp_now(&t->last_trigger);
+ ACTIVATION_DETAILS_TIMER(details)->last_trigger = t->last_trigger;
+
+ job_set_activation_details(job, details);
+
+ /* Persistent=yes: remember the trigger time across reboots via the stamp file's mtime. */
+ if (t->stamp_path)
+ touch_file(t->stamp_path, true, t->last_trigger.realtime, UID_INVALID, GID_INVALID, MODE_INVALID);
+
+ timer_set_state(t, TIMER_RUNNING);
+ return;
+
+fail:
+ timer_enter_dead(t, TIMER_FAILURE_RESOURCES);
+}
+
+/* Start the timer unit: reset per-activation state, for Persistent= timers load
+ * the last trigger time from the stamp file (or create it on first ever start),
+ * then compute elapse times and enter the waiting state. */
+static int timer_start(Unit *u) {
+ Timer *t = TIMER(u);
+ int r;
+
+ assert(t);
+ assert(IN_SET(t->state, TIMER_DEAD, TIMER_FAILED));
+
+ r = unit_test_trigger_loaded(u);
+ if (r < 0)
+ return r;
+
+ r = unit_acquire_invocation_id(u);
+ if (r < 0)
+ return r;
+
+ t->last_trigger = DUAL_TIMESTAMP_NULL;
+
+ /* Reenable all timers that depend on unit activation time */
+ LIST_FOREACH(value, v, t->values)
+ if (v->base == TIMER_ACTIVE)
+ v->disabled = false;
+
+ if (t->stamp_path) {
+ struct stat st;
+
+ if (stat(t->stamp_path, &st) >= 0) {
+ usec_t ft;
+
+ /* Load the file timestamp, but only if it is actually in the past. If it is in the future,
+ * something is wrong with the system clock. */
+
+ ft = timespec_load(&st.st_mtim);
+ if (ft < now(CLOCK_REALTIME))
+ t->last_trigger.realtime = ft;
+ else
+ log_unit_warning(u, "Not using persistent file timestamp %s as it is in the future.",
+ FORMAT_TIMESTAMP(ft));
+
+ } else if (errno == ENOENT)
+ /* The timer has never run before, make sure a stamp file exists. */
+ (void) touch_file(t->stamp_path, true, USEC_INFINITY, UID_INVALID, GID_INVALID, MODE_INVALID);
+ }
+
+ t->result = TIMER_SUCCESS;
+ timer_enter_waiting(t, false);
+ return 1;
+}
+
+/* Stop the timer: tear down event sources (via the state change) and finish successfully. */
+static int timer_stop(Unit *u) {
+ Timer *t = TIMER(u);
+
+ assert(t);
+ assert(IN_SET(t->state, TIMER_WAITING, TIMER_RUNNING, TIMER_ELAPSED));
+
+ timer_enter_dead(t, TIMER_SUCCESS);
+ return 1;
+}
+
+/* Serialize state, result and last trigger timestamps for daemon reload/reexec. */
+static int timer_serialize(Unit *u, FILE *f, FDSet *fds) {
+ Timer *t = TIMER(u);
+
+ assert(u);
+ assert(f);
+ assert(fds);
+
+ (void) serialize_item(f, "state", timer_state_to_string(t->state));
+ (void) serialize_item(f, "result", timer_result_to_string(t->result));
+
+ if (dual_timestamp_is_set(&t->last_trigger))
+ (void) serialize_usec(f, "last-trigger-realtime", t->last_trigger.realtime);
+
+ if (t->last_trigger.monotonic > 0)
+ (void) serialize_usec(f, "last-trigger-monotonic", t->last_trigger.monotonic);
+
+ return 0;
+}
+
+/* Inverse of timer_serialize(): parse one key/value pair. Unknown keys and
+ * unparsable values are logged at debug level and otherwise ignored. */
+static int timer_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
+ Timer *t = TIMER(u);
+
+ assert(u);
+ assert(key);
+ assert(value);
+ assert(fds);
+
+ if (streq(key, "state")) {
+ TimerState state;
+
+ state = timer_state_from_string(value);
+ if (state < 0)
+ log_unit_debug(u, "Failed to parse state value: %s", value);
+ else
+ t->deserialized_state = state;
+
+ } else if (streq(key, "result")) {
+ TimerResult f;
+
+ f = timer_result_from_string(value);
+ if (f < 0)
+ log_unit_debug(u, "Failed to parse result value: %s", value);
+ else if (f != TIMER_SUCCESS)
+ t->result = f;
+
+ } else if (streq(key, "last-trigger-realtime"))
+ (void) deserialize_usec(value, &t->last_trigger.realtime);
+ else if (streq(key, "last-trigger-monotonic"))
+ (void) deserialize_usec(value, &t->last_trigger.monotonic);
+ else
+ log_unit_debug(u, "Unknown serialization key: %s", key);
+
+ return 0;
+}
+
+/* Map the timer-specific state to the generic unit active state. */
+static UnitActiveState timer_active_state(Unit *u) {
+ assert(u);
+
+ return state_translation_table[TIMER(u)->state];
+}
+
+/* The sub-state string of a timer is simply its state name. */
+static const char *timer_sub_state_to_string(Unit *u) {
+ assert(u);
+
+ return timer_state_to_string(TIMER(u)->state);
+}
+
+/* sd-event callback invoked when one of our time event sources elapses.
+ * Stale dispatches (timer no longer waiting) are ignored. */
+static int timer_dispatch(sd_event_source *s, uint64_t usec, void *userdata) {
+ Timer *t = TIMER(userdata);
+
+ assert(t);
+
+ if (t->state != TIMER_WAITING)
+ return 0;
+
+ log_unit_debug(UNIT(t), "Timer elapsed.");
+ timer_enter_running(t);
+ return 0;
+}
+
+/* Called whenever the trigger unit changes state: re-enable OnUnitActiveSec=/
+ * OnUnitInactiveSec= values and recompute our next elapse where appropriate. */
+static void timer_trigger_notify(Unit *u, Unit *other) {
+ Timer *t = TIMER(u);
+
+ assert(u);
+ assert(other);
+
+ /* Filter out invocations with bogus state */
+ assert(UNIT_IS_LOAD_COMPLETE(other->load_state));
+
+ /* Reenable all timers that depend on unit state */
+ LIST_FOREACH(value, v, t->values)
+ if (IN_SET(v->base, TIMER_UNIT_ACTIVE, TIMER_UNIT_INACTIVE))
+ v->disabled = false;
+
+ switch (t->state) {
+
+ case TIMER_WAITING:
+ case TIMER_ELAPSED:
+
+ /* Recalculate sleep time */
+ timer_enter_waiting(t, false);
+ break;
+
+ case TIMER_RUNNING:
+
+ /* The triggered unit finished: go back to waiting for the next elapse. */
+ if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(other))) {
+ log_unit_debug(UNIT(t), "Got notified about unit deactivation.");
+ timer_enter_waiting(t, false);
+ }
+ break;
+
+ case TIMER_DEAD:
+ case TIMER_FAILED:
+ break;
+
+ default:
+ assert_not_reached();
+ }
+}
+
+/* "systemctl reset-failed" support: clear a FAILED state and the stored result. */
+static void timer_reset_failed(Unit *u) {
+ Timer *t = TIMER(u);
+
+ assert(t);
+
+ if (t->state == TIMER_FAILED)
+ timer_set_state(t, TIMER_DEAD);
+
+ t->result = TIMER_SUCCESS;
+}
+
+/* Invoked on a system clock jump: either trigger immediately (OnClockChange=)
+ * or recompute the next elapse without disabling one-shot values. */
+static void timer_time_change(Unit *u) {
+ Timer *t = TIMER(u);
+ usec_t ts;
+
+ assert(u);
+
+ if (t->state != TIMER_WAITING)
+ return;
+
+ /* If we appear to have triggered in the future, the system clock must
+ * have been set backwards. So let's rewind our own clock and allow
+ * the future triggers to happen again :). Exactly the same as when
+ * you start a timer unit with Persistent=yes. */
+ ts = now(CLOCK_REALTIME);
+ if (t->last_trigger.realtime > ts)
+ t->last_trigger.realtime = ts;
+
+ if (t->on_clock_change) {
+ log_unit_debug(u, "Time change, triggering activation.");
+ timer_enter_running(t);
+ } else {
+ log_unit_debug(u, "Time change, recalculating next elapse.");
+ timer_enter_waiting(t, true);
+ }
+}
+
+/* Invoked on a timezone change: either trigger immediately (OnTimezoneChange=)
+ * or recompute the next elapse of calendar values under the new timezone. */
+static void timer_timezone_change(Unit *u) {
+ Timer *t = TIMER(u);
+
+ assert(u);
+
+ if (t->state != TIMER_WAITING)
+ return;
+
+ if (t->on_timezone_change) {
+ log_unit_debug(u, "Timezone change, triggering activation.");
+ timer_enter_running(t);
+ } else {
+ log_unit_debug(u, "Timezone change, recalculating next elapse.");
+ timer_enter_waiting(t, false);
+ }
+}
+
+/* "systemctl clean" support: remove the persistent stamp file. Only valid for
+ * dead timers and only for the "state" clean mask. */
+static int timer_clean(Unit *u, ExecCleanMask mask) {
+ Timer *t = TIMER(u);
+ int r;
+
+ assert(t);
+ assert(mask != 0);
+
+ if (t->state != TIMER_DEAD)
+ return -EBUSY;
+
+ if (mask != EXEC_CLEAN_STATE)
+ return -EUNATCH;
+
+ /* Recompute the stamp path, in case it was never set up for this unit. */
+ r = timer_setup_persistent(t);
+ if (r < 0)
+ return r;
+
+ if (!t->stamp_path)
+ return -EUNATCH;
+
+ if (unlink(t->stamp_path) && errno != ENOENT)
+ return log_unit_error_errno(u, errno, "Failed to clean stamp file of timer: %m");
+
+ return 0;
+}
+
+/* Report which clean masks apply: only Persistent= timers have state to clean. */
+static int timer_can_clean(Unit *u, ExecCleanMask *ret) {
+ Timer *t = TIMER(u);
+
+ assert(t);
+ assert(ret);
+
+ *ret = t->persistent ? EXEC_CLEAN_STATE : 0;
+ return 0;
+}
+
+/* Gate starting on the unit's start rate limit; a hit marks the timer failed. */
+static int timer_can_start(Unit *u) {
+ Timer *t = TIMER(u);
+ int r;
+
+ assert(t);
+
+ r = unit_test_start_limit(u);
+ if (r < 0) {
+ timer_enter_dead(t, TIMER_FAILURE_START_LIMIT_HIT);
+ return r;
+ }
+
+ return 1;
+}
+
+/* Serialize the timer-specific part of the activation details (the trigger timestamp). */
+static void activation_details_timer_serialize(ActivationDetails *details, FILE *f) {
+ ActivationDetailsTimer *t = ACTIVATION_DETAILS_TIMER(details);
+
+ assert(details);
+ assert(f);
+ assert(t);
+
+ (void) serialize_dual_timestamp(f, "activation-details-timer-last-trigger", &t->last_trigger);
+}
+
+/* Inverse of the above: parse the single key we serialize; any other key is -EINVAL. */
+static int activation_details_timer_deserialize(const char *key, const char *value, ActivationDetails **details) {
+ int r;
+
+ assert(key);
+ assert(value);
+
+ if (!details || !*details)
+ return -EINVAL;
+
+ ActivationDetailsTimer *t = ACTIVATION_DETAILS_TIMER(*details);
+ if (!t)
+ return -EINVAL;
+
+ if (!streq(key, "activation-details-timer-last-trigger"))
+ return -EINVAL;
+
+ r = deserialize_dual_timestamp(value, &t->last_trigger);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+/* Export the trigger timestamps as $TRIGGER_TIMER_*_USEC environment variables
+ * for the activated unit. Returns the number of variables added (0 or 2). */
+static int activation_details_timer_append_env(ActivationDetails *details, char ***strv) {
+ ActivationDetailsTimer *t = ACTIVATION_DETAILS_TIMER(details);
+ int r;
+
+ assert(details);
+ assert(strv);
+ assert(t);
+
+ if (!dual_timestamp_is_set(&t->last_trigger))
+ return 0;
+
+ r = strv_extendf(strv, "TRIGGER_TIMER_REALTIME_USEC=" USEC_FMT, t->last_trigger.realtime);
+ if (r < 0)
+ return r;
+
+ r = strv_extendf(strv, "TRIGGER_TIMER_MONOTONIC_USEC=" USEC_FMT, t->last_trigger.monotonic);
+ if (r < 0)
+ return r;
+
+ return 2; /* Return the number of variables added to the env block */
+}
+
+/* Same data as append_env, but as alternating key/value entries (four strv
+ * entries, i.e. two pairs) for D-Bus consumers. */
+static int activation_details_timer_append_pair(ActivationDetails *details, char ***strv) {
+ ActivationDetailsTimer *t = ACTIVATION_DETAILS_TIMER(details);
+ int r;
+
+ assert(details);
+ assert(strv);
+ assert(t);
+
+ if (!dual_timestamp_is_set(&t->last_trigger))
+ return 0;
+
+ r = strv_extend(strv, "trigger_timer_realtime_usec");
+ if (r < 0)
+ return r;
+
+ r = strv_extendf(strv, USEC_FMT, t->last_trigger.realtime);
+ if (r < 0)
+ return r;
+
+ r = strv_extend(strv, "trigger_timer_monotonic_usec");
+ if (r < 0)
+ return r;
+
+ r = strv_extendf(strv, USEC_FMT, t->last_trigger.monotonic);
+ if (r < 0)
+ return r;
+
+ return 2; /* Return the number of pairs added to the env block */
+}
+
+/* Return the next monotonic elapse point, converted back from the timer's
+ * scheduling clock (CLOCK_BOOTTIME_ALARM when WakeSystem=yes) to CLOCK_MONOTONIC. */
+uint64_t timer_next_elapse_monotonic(const Timer *t) {
+ assert(t);
+
+ return (uint64_t) usec_shift_clock(t->next_elapse_monotonic_or_boottime,
+ TIMER_MONOTONIC_CLOCK(t), CLOCK_MONOTONIC);
+}
+
+/* TimerBase <-> setting-name mapping (matches the [Timer] section directives). */
+static const char* const timer_base_table[_TIMER_BASE_MAX] = {
+ [TIMER_ACTIVE] = "OnActiveSec",
+ [TIMER_BOOT] = "OnBootSec",
+ [TIMER_STARTUP] = "OnStartupSec",
+ [TIMER_UNIT_ACTIVE] = "OnUnitActiveSec",
+ [TIMER_UNIT_INACTIVE] = "OnUnitInactiveSec",
+ [TIMER_CALENDAR] = "OnCalendar"
+};
+
+DEFINE_STRING_TABLE_LOOKUP(timer_base, TimerBase);
+
+/* Return the base name with a trailing "Sec" replaced by "USec" (e.g.
+ * "OnBootSec" -> "OnBootUSec"); names without the suffix are returned
+ * unchanged. Caller owns the returned string; NULL on OOM. */
+char* timer_base_to_usec_string(TimerBase i) {
+ _cleanup_free_ char *buf = NULL;
+ const char *s;
+ size_t l;
+
+ s = timer_base_to_string(i);
+
+ if (endswith(s, "Sec")) {
+ /* s/Sec/USec/ */
+ l = strlen(s);
+ /* One extra char for the inserted 'U', plus the NUL: l-3 + 5 == l+2. */
+ buf = new(char, l+2);
+ if (!buf)
+ return NULL;
+
+ memcpy(buf, s, l-3);
+ memcpy(buf+l-3, "USec", 5);
+ } else {
+ buf = strdup(s);
+ if (!buf)
+ return NULL;
+ }
+
+ return TAKE_PTR(buf);
+}
+
+/* Human-readable result strings, as reported in logs and over D-Bus. */
+static const char* const timer_result_table[_TIMER_RESULT_MAX] = {
+ [TIMER_SUCCESS] = "success",
+ [TIMER_FAILURE_RESOURCES] = "resources",
+ [TIMER_FAILURE_START_LIMIT_HIT] = "start-limit-hit",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(timer_result, TimerResult);
+
+/* Unit-type operations table for .timer units, hooked into the generic unit machinery. */
+const UnitVTable timer_vtable = {
+ .object_size = sizeof(Timer),
+
+ /* Configuration-file sections parsed for this unit type. */
+ .sections =
+ "Unit\0"
+ "Timer\0"
+ "Install\0",
+ .private_section = "Timer",
+
+ .can_transient = true,
+ .can_fail = true,
+ .can_trigger = true,
+
+ .init = timer_init,
+ .done = timer_done,
+ .load = timer_load,
+
+ .coldplug = timer_coldplug,
+
+ .dump = timer_dump,
+
+ .start = timer_start,
+ .stop = timer_stop,
+
+ .clean = timer_clean,
+ .can_clean = timer_can_clean,
+
+ .serialize = timer_serialize,
+ .deserialize_item = timer_deserialize_item,
+
+ .active_state = timer_active_state,
+ .sub_state_to_string = timer_sub_state_to_string,
+
+ .trigger_notify = timer_trigger_notify,
+
+ .reset_failed = timer_reset_failed,
+ .time_change = timer_time_change,
+ .timezone_change = timer_timezone_change,
+
+ .bus_set_property = bus_timer_set_property,
+
+ .can_start = timer_can_start,
+};
+
+/* Operations for the timer-specific ActivationDetails subclass. */
+const ActivationDetailsVTable activation_details_timer_vtable = {
+ .object_size = sizeof(ActivationDetailsTimer),
+
+ .serialize = activation_details_timer_serialize,
+ .deserialize = activation_details_timer_deserialize,
+ .append_env = activation_details_timer_append_env,
+ .append_pair = activation_details_timer_append_pair,
+};
diff --git a/src/core/timer.h b/src/core/timer.h
new file mode 100644
index 0000000..76d45b2
--- /dev/null
+++ b/src/core/timer.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct Timer Timer;
+typedef struct ActivationDetailsTimer ActivationDetailsTimer;
+
+#include "calendarspec.h"
+#include "unit.h"
+
+/* Which event a TimerValue is anchored to; mirrors the [Timer] section
+ * directives OnActiveSec=, OnBootSec=, OnStartupSec=, OnUnitActiveSec=,
+ * OnUnitInactiveSec= and OnCalendar=. */
+typedef enum TimerBase {
+ TIMER_ACTIVE,
+ TIMER_BOOT,
+ TIMER_STARTUP,
+ TIMER_UNIT_ACTIVE,
+ TIMER_UNIT_INACTIVE,
+ TIMER_CALENDAR,
+ _TIMER_BASE_MAX,
+ _TIMER_BASE_INVALID = -EINVAL,
+} TimerBase;
+
+/* One configured trigger expression of a timer unit. */
+typedef struct TimerValue {
+ TimerBase base;
+ /* Set once a one-shot monotonic value has fired; skipped on recalculation. */
+ bool disabled;
+
+ usec_t value; /* only for monotonic events */
+ CalendarSpec *calendar_spec; /* only for calendar events */
+ usec_t next_elapse;
+
+ LIST_FIELDS(struct TimerValue, value);
+} TimerValue;
+
+/* Outcome of the most recent activation cycle. */
+typedef enum TimerResult {
+ TIMER_SUCCESS,
+ TIMER_FAILURE_RESOURCES,
+ TIMER_FAILURE_START_LIMIT_HIT,
+ _TIMER_RESULT_MAX,
+ _TIMER_RESULT_INVALID = -EINVAL,
+} TimerResult;
+
+struct Timer {
+ Unit meta;
+
+ /* AccuracySec= and RandomizedDelaySec= settings. */
+ usec_t accuracy_usec;
+ usec_t random_usec;
+
+ /* All configured trigger values, plus the computed next elapse points
+ * and the timestamp of the last trigger. */
+ LIST_HEAD(TimerValue, values);
+ usec_t next_elapse_realtime;
+ usec_t next_elapse_monotonic_or_boottime;
+ dual_timestamp last_trigger;
+
+ TimerState state, deserialized_state;
+
+ /* Armed while in TIMER_WAITING; torn down in every other state. */
+ sd_event_source *monotonic_event_source;
+ sd_event_source *realtime_event_source;
+
+ TimerResult result;
+
+ bool persistent;
+ bool wake_system;
+ bool remain_after_elapse;
+ bool on_clock_change;
+ bool on_timezone_change;
+ bool fixed_random_delay;
+
+ /* Path of the Persistent= stamp file, or NULL. */
+ char *stamp_path;
+};
+
+/* ActivationDetails subclass carrying the trigger timestamp to the activated unit. */
+struct ActivationDetailsTimer {
+ ActivationDetails meta;
+ dual_timestamp last_trigger;
+};
+
+/* The clock monotonic values are scheduled against: the wakeup-capable
+ * boottime clock when WakeSystem=yes, plain CLOCK_MONOTONIC otherwise. */
+#define TIMER_MONOTONIC_CLOCK(t) ((t)->wake_system ? CLOCK_BOOTTIME_ALARM : CLOCK_MONOTONIC)
+
+uint64_t timer_next_elapse_monotonic(const Timer *t);
+
+void timer_free_values(Timer *t);
+
+extern const UnitVTable timer_vtable;
+extern const ActivationDetailsVTable activation_details_timer_vtable;
+
+const char *timer_base_to_string(TimerBase i) _const_;
+TimerBase timer_base_from_string(const char *s) _pure_;
+
+char* timer_base_to_usec_string(TimerBase i);
+
+const char* timer_result_to_string(TimerResult i) _const_;
+TimerResult timer_result_from_string(const char *s) _pure_;
+
+DEFINE_CAST(TIMER, Timer);
+DEFINE_ACTIVATION_DETAILS_CAST(ACTIVATION_DETAILS_TIMER, ActivationDetailsTimer, TIMER);
diff --git a/src/core/transaction.c b/src/core/transaction.c
new file mode 100644
index 0000000..a81c40f
--- /dev/null
+++ b/src/core/transaction.c
@@ -0,0 +1,1261 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fcntl.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "bus-common-errors.h"
+#include "bus-error.h"
+#include "dbus-unit.h"
+#include "strv.h"
+#include "terminal-util.h"
+#include "transaction.h"
+
+static void transaction_unlink_job(Transaction *tr, Job *j, bool delete_dependencies);
+
+/* Removes a single job from the transaction and releases it. If delete_dependencies
+ * is true, jobs that were only pulled in because of this one are dropped as well. */
+static void transaction_delete_job(Transaction *tr, Job *j, bool delete_dependencies) {
+        assert(tr);
+        assert(j);
+
+        transaction_unlink_job(tr, j, delete_dependencies);
+        job_free(j);
+}
+
+/* Deletes all jobs associated with a certain unit from the transaction,
+ * recursively dropping jobs that were only pulled in by them. */
+static void transaction_delete_unit(Transaction *tr, Unit *u) {
+        Job *j;
+
+        /* Consistency fix: every other helper in this file asserts its parameters;
+         * this one dereferenced tr (via hashmap_get) without checking. */
+        assert(tr);
+        assert(u);
+
+        /* Each deletion unlinks the list head from tr->jobs, so keep fetching
+         * until no job for this unit remains. */
+        while ((j = hashmap_get(tr->jobs, u)))
+                transaction_delete_job(tr, j, true);
+}
+
+/* Drops every job in the transaction. Dependencies are not chased recursively,
+ * since the loop will reach every job anyway. */
+static void transaction_abort(Transaction *tr) {
+        assert(tr);
+
+        for (Job *j; (j = hashmap_first(tr->jobs)); )
+                transaction_delete_job(tr, j, /* delete_dependencies= */ false);
+
+        assert(hashmap_isempty(tr->jobs));
+}
+
+/* Recursive sweep that marks every job the anchor job directly or indirectly
+ * depends on via links that "matter". Each visited job is stamped with the
+ * current generation so it is processed at most once per sweep. */
+static void transaction_find_jobs_that_matter_to_anchor(Job *j, unsigned generation) {
+        assert(j);
+
+        j->matters_to_anchor = true;
+        j->generation = generation;
+
+        LIST_FOREACH(subject, l, j->subject_list) {
+                /* Skip links that don't matter, and jobs already marked in this sweep. */
+                if (!l->matters || l->object->generation == generation)
+                        continue;
+
+                transaction_find_jobs_that_matter_to_anchor(l->object, generation);
+        }
+}
+
+/* Merges 'other' into 'j' and then deletes 'other': 'j' takes over the merged
+ * job type 't' plus all dependency links of 'other'. Both jobs must belong to
+ * the same unit, and 'j' must not be installed yet. */
+static void transaction_merge_and_delete_job(Transaction *tr, Job *j, Job *other, JobType t) {
+        JobDependency *last;
+
+        assert(j);
+        assert(other);
+        assert(j->unit == other->unit);
+        assert(!j->installed);
+
+        j->type = t;
+        j->state = JOB_WAITING;
+        j->irreversible = j->irreversible || other->irreversible;
+        j->matters_to_anchor = j->matters_to_anchor || other->matters_to_anchor;
+
+        /* Patch us in as new owner of the JobDependency objects on the subject side */
+        last = NULL;
+        LIST_FOREACH(subject, l, other->subject_list) {
+                assert(l->subject == other);
+                l->subject = j;
+                last = l; /* remember the tail so we can splice the lists below */
+        }
+
+        /* Merge both subject lists: prepend other's list in front of j's */
+        if (last) {
+                last->subject_next = j->subject_list;
+                if (j->subject_list)
+                        j->subject_list->subject_prev = last;
+                j->subject_list = other->subject_list;
+        }
+
+        /* Patch us in as new owner of the JobDependency objects on the object side */
+        last = NULL;
+        LIST_FOREACH(object, l, other->object_list) {
+                assert(l->object == other);
+                l->object = j;
+                last = l;
+        }
+
+        /* Merge both object lists */
+        if (last) {
+                last->object_next = j->object_list;
+                if (j->object_list)
+                        j->object_list->object_prev = last;
+                j->object_list = other->object_list;
+        }
+
+        /* Kill the other job. Its dependency objects now belong to 'j', so
+         * detach them first so they are not freed along with it. */
+        other->subject_list = NULL;
+        other->object_list = NULL;
+        transaction_delete_job(tr, other, true);
+}
+
+/* Returns true if this job is pulled in by at least one dependency link
+ * flagged as conflicting (i.e. a ConflictedBy= style relation). */
+static bool job_is_conflicted_by(Job *j) {
+        assert(j);
+
+        LIST_FOREACH(object, l, j->object_list)
+                if (l->conflicts)
+                        return true;
+
+        return false;
+}
+
+/* Tries to delete one item in the linked list
+ * j->transaction_next->transaction_next->... that conflicts with another one,
+ * in an attempt to make an inconsistent transaction work. Returns 0 if a job
+ * was dropped, -ENOEXEC if both conflicting jobs matter to the anchor, and
+ * -EINVAL if no conflicting pair was found at all. */
+static int delete_one_unmergeable_job(Transaction *tr, Job *job) {
+        assert(job);
+
+        /* We rely here on the fact that if a merged with b does not
+         * merge with c, then neither a nor b merges with c either */
+        LIST_FOREACH(transaction, j, job)
+                LIST_FOREACH(transaction, k, j->transaction_next) {
+                        Job *d;
+
+                        /* Is this one mergeable? Then skip it */
+                        if (job_type_is_mergeable(j->type, k->type))
+                                continue;
+
+                        /* Ok, we found two that conflict, let's see if we can
+                         * drop one of them */
+                        if (!j->matters_to_anchor && !k->matters_to_anchor) {
+
+                                /* Both jobs don't matter, so let's
+                                 * find the one that is smarter to
+                                 * remove. Let's think positive and
+                                 * rather remove stops than starts --
+                                 * except if something is being
+                                 * stopped because it is conflicted by
+                                 * another unit in which case we
+                                 * rather remove the start. */
+
+                                log_unit_debug(j->unit,
+                                               "Looking at job %s/%s conflicted_by=%s",
+                                               j->unit->id, job_type_to_string(j->type),
+                                               yes_no(j->type == JOB_STOP && job_is_conflicted_by(j)));
+                                log_unit_debug(k->unit,
+                                               "Looking at job %s/%s conflicted_by=%s",
+                                               k->unit->id, job_type_to_string(k->type),
+                                               yes_no(k->type == JOB_STOP && job_is_conflicted_by(k)));
+
+                                if (j->type == JOB_STOP) {
+
+                                        if (job_is_conflicted_by(j))
+                                                d = k;
+                                        else
+                                                d = j;
+
+                                } else if (k->type == JOB_STOP) {
+
+                                        if (job_is_conflicted_by(k))
+                                                d = j;
+                                        else
+                                                d = k;
+                                } else
+                                        d = j;
+
+                        } else if (!j->matters_to_anchor)
+                                d = j;
+                        else if (!k->matters_to_anchor)
+                                d = k;
+                        else
+                                /* Both matter to the anchor: nothing can be dropped safely. */
+                                return -ENOEXEC;
+
+                        /* Ok, we can drop one, so let's do so. */
+                        log_unit_debug(d->unit,
+                                       "Fixing conflicting jobs %s/%s,%s/%s by deleting job %s/%s",
+                                       j->unit->id, job_type_to_string(j->type),
+                                       k->unit->id, job_type_to_string(k->type),
+                                       d->unit->id, job_type_to_string(d->type));
+                        transaction_delete_job(tr, d, true);
+                        return 0;
+                }
+
+        return -EINVAL;
+}
+
+/* Collapses each unit's list of transaction jobs into a single job. Returns
+ * -EAGAIN if an unmergeable job was dropped (caller should garbage-collect and
+ * call again), or a bus error if the conflict cannot be resolved. */
+static int transaction_merge_jobs(Transaction *tr, sd_bus_error *e) {
+        Job *j;
+        int r;
+
+        assert(tr);
+
+        /* First step, check whether any of the jobs for one specific
+         * task conflict. If so, try to drop one of them. */
+        HASHMAP_FOREACH(j, tr->jobs) {
+                JobType t;
+
+                t = j->type;
+                LIST_FOREACH(transaction, k, j->transaction_next) {
+                        if (job_type_merge_and_collapse(&t, k->type, j->unit) >= 0)
+                                continue;
+
+                        /* OK, we could not merge all jobs for this
+                         * action. Let's see if we can get rid of one
+                         * of them */
+
+                        r = delete_one_unmergeable_job(tr, j);
+                        if (r >= 0)
+                                /* Ok, we managed to drop one, now
+                                 * let's ask our callers to call us
+                                 * again after garbage collecting */
+                                return -EAGAIN;
+
+                        /* We couldn't merge anything. Failure */
+                        return sd_bus_error_setf(e, BUS_ERROR_TRANSACTION_JOBS_CONFLICTING,
+                                                 "Transaction contains conflicting jobs '%s' and '%s' for %s. "
+                                                 "Probably contradicting requirement dependencies configured.",
+                                                 job_type_to_string(t),
+                                                 job_type_to_string(k->type),
+                                                 k->unit->id);
+                }
+        }
+
+        /* Second step, merge the jobs. */
+        HASHMAP_FOREACH(j, tr->jobs) {
+                JobType t = j->type;
+
+                /* Merge all transaction jobs for j->unit */
+                LIST_FOREACH(transaction, k, j->transaction_next)
+                        assert_se(job_type_merge_and_collapse(&t, k->type, j->unit) == 0);
+
+                Job *k;
+                while ((k = j->transaction_next)) {
+                        /* Keep the anchor job object alive: if the anchor comes next in the
+                         * list, merge the current head into it rather than vice versa. */
+                        if (tr->anchor_job == k) {
+                                transaction_merge_and_delete_job(tr, k, j, t);
+                                j = k;
+                        } else
+                                transaction_merge_and_delete_job(tr, j, k, t);
+                }
+
+                assert(!j->transaction_next);
+                assert(!j->transaction_prev);
+        }
+
+        return 0;
+}
+
+static void transaction_drop_redundant(Transaction *tr) {
+        bool again;
+
+        /* Goes through the transaction and removes all jobs of the units whose jobs are all noops. If not
+         * all of a unit's jobs are redundant, they are kept. */
+
+        assert(tr);
+
+        do {
+                Job *j;
+
+                again = false;
+
+                HASHMAP_FOREACH(j, tr->jobs) {
+                        bool keep = false;
+
+                        /* Keep the whole list if any job in it is the anchor, is not a noop
+                         * given the unit's current state, or conflicts with an installed job. */
+                        LIST_FOREACH(transaction, k, j)
+                                if (tr->anchor_job == k ||
+                                    !job_type_is_redundant(k->type, unit_active_state(k->unit)) ||
+                                    (k->unit->job && job_type_is_conflicting(k->type, k->unit->job->type))) {
+                                        keep = true;
+                                        break;
+                                }
+
+                        if (!keep) {
+                                log_trace("Found redundant job %s/%s, dropping from transaction.",
+                                          j->unit->id, job_type_to_string(j->type));
+                                /* Deleting invalidates the hashmap iterator, hence restart the scan. */
+                                transaction_delete_job(tr, j, false);
+                                again = true;
+                                break;
+                        }
+                }
+        } while (again);
+}
+
+/* Checks whether at least one job in this unit's transaction job list matters
+ * to the anchor. Must be called on the head of the list. */
+static bool job_matters_to_anchor(Job *job) {
+        assert(job);
+        assert(!job->transaction_prev);
+
+        LIST_FOREACH(transaction, j, job)
+                if (j->matters_to_anchor)
+                        return true;
+
+        return false;
+}
+
+/* Builds one newline-separated string "<field><id>\n<field><id>..." from the
+ * (unit id, job type) pairs, suitable as a single structured log field.
+ * Returns NULL on allocation failure, and an empty string for no pairs. */
+static char* merge_unit_ids(const char* unit_log_field, char * const* pairs) {
+        _cleanup_free_ char *ans = NULL;
+        size_t size = 0;
+
+        assert(unit_log_field);
+
+        STRV_FOREACH_PAIR(unit_id, job_type, pairs) {
+                size_t next;
+
+                /* Overwrite the previous entry's terminating NUL with a newline,
+                 * turning the concatenated entries into one multi-line string. */
+                if (size > 0)
+                        ans[size - 1] = '\n';
+
+                next = strlen(unit_log_field) + strlen(*unit_id);
+                if (!GREEDY_REALLOC(ans, size + next + 1))
+                        return NULL;
+
+                /* sprintf is safe here: the buffer was just grown to fit exactly next+1 bytes. */
+                sprintf(ans + size, "%s%s", unit_log_field, *unit_id);
+                size += next + 1;
+        }
+
+        /* No pairs at all: return an empty (but non-NULL) string. */
+        if (!ans)
+                return strdup("");
+
+        return TAKE_PTR(ans);
+}
+
+/* Does a recursive sweep through the ordering graph, looking for a cycle. If we
+ * find a cycle we try to break it by deleting a job that does not matter to the
+ * anchor. Returns -EAGAIN after deleting (caller should re-verify), a bus error
+ * when the cycle cannot be broken, and 0 when no cycle passes through 'j'. */
+static int transaction_verify_order_one(Transaction *tr, Job *j, Job *from, unsigned generation, sd_bus_error *e) {
+
+        static const UnitDependencyAtom directions[] = {
+                UNIT_ATOM_BEFORE,
+                UNIT_ATOM_AFTER,
+        };
+
+        int r;
+
+        assert(tr);
+        assert(j);
+        assert(!j->transaction_prev);
+
+        /* Have we seen this before? */
+        if (j->generation == generation) {
+                Job *k, *delete = NULL;
+                _cleanup_free_ char **array = NULL, *unit_ids = NULL;
+
+                /* If the marker is NULL we have been here already and decided the job was loop-free from
+                 * here. Hence shortcut things and return right-away. */
+                if (!j->marker)
+                        return 0;
+
+                /* So, the marker is not NULL and we already have been here. We have a cycle. Let's try to
+                 * break it. We go backwards in our path and try to find a suitable job to remove. We use the
+                 * marker to find our way back, since smart how we are we stored our way back in there. */
+                for (k = from; k; k = ((k->generation == generation && k->marker != k) ? k->marker : NULL)) {
+
+                        /* For logging below */
+                        if (strv_push_pair(&array, k->unit->id, (char*) job_type_to_string(k->type)) < 0)
+                                log_oom();
+
+                        /* Candidate for deletion: still part of this transaction and
+                         * not required by the anchor. Take the first one found. */
+                        if (!delete && hashmap_contains(tr->jobs, k->unit) && !job_matters_to_anchor(k))
+                                /* Ok, we can drop this one, so let's do so. */
+                                delete = k;
+
+                        /* Check if this in fact was the beginning of the cycle */
+                        if (k == j)
+                                break;
+                }
+
+                unit_ids = merge_unit_ids(j->manager->unit_log_field, array); /* ignore error */
+
+                STRV_FOREACH_PAIR(unit_id, job_type, array)
+                        /* logging for j not k here to provide a consistent narrative */
+                        log_struct(LOG_WARNING,
+                                   LOG_UNIT_MESSAGE(j->unit,
+                                                    "Found %s on %s/%s",
+                                                    unit_id == array ? "ordering cycle" : "dependency",
+                                                    *unit_id, *job_type),
+                                   "%s", strna(unit_ids));
+
+                if (delete) {
+                        const char *status;
+                        /* logging for j not k here to provide a consistent narrative */
+                        log_struct(LOG_ERR,
+                                   LOG_UNIT_MESSAGE(j->unit,
+                                                    "Job %s/%s deleted to break ordering cycle starting with %s/%s",
+                                                    delete->unit->id, job_type_to_string(delete->type),
+                                                    j->unit->id, job_type_to_string(j->type)),
+                                   "%s", strna(unit_ids));
+
+                        if (log_get_show_color())
+                                status = ANSI_HIGHLIGHT_RED " SKIP " ANSI_NORMAL;
+                        else
+                                status = " SKIP ";
+
+                        unit_status_printf(delete->unit,
+                                           STATUS_TYPE_NOTICE,
+                                           status,
+                                           "Ordering cycle found, skipping %s",
+                                           unit_status_string(delete->unit, NULL));
+                        transaction_delete_unit(tr, delete->unit);
+                        return -EAGAIN;
+                }
+
+                log_struct(LOG_ERR,
+                           LOG_UNIT_MESSAGE(j->unit, "Unable to break cycle starting with %s/%s",
+                                            j->unit->id, job_type_to_string(j->type)),
+                           "%s", strna(unit_ids));
+
+                return sd_bus_error_setf(e, BUS_ERROR_TRANSACTION_ORDER_IS_CYCLIC,
+                                         "Transaction order is cyclic. See system logs for details.");
+        }
+
+        /* Make the marker point to where we come from, so that we can
+         * find our way backwards if we want to break a cycle. We use
+         * a special marker for the beginning: we point to
+         * ourselves. */
+        j->marker = from ?: j;
+        j->generation = generation;
+
+        /* Actual ordering of jobs depends on the unit ordering dependency and job types. We need to traverse
+         * the graph over 'before' edges in the actual job execution order. We traverse over both unit
+         * ordering dependencies and we test with job_compare() whether it is the 'before' edge in the job
+         * execution ordering. */
+        for (size_t d = 0; d < ELEMENTSOF(directions); d++) {
+                Unit *u;
+
+                UNIT_FOREACH_DEPENDENCY(u, j->unit, directions[d]) {
+                        Job *o;
+
+                        /* Is there a job for this unit? */
+                        o = hashmap_get(tr->jobs, u);
+                        if (!o) {
+                                /* Ok, there is no job for this in the transaction, but maybe there is
+                                 * already one running? */
+                                o = u->job;
+                                if (!o)
+                                        continue;
+                        }
+
+                        /* Cut traversing if the job j is not really *before* o. */
+                        if (job_compare(j, o, directions[d]) >= 0)
+                                continue;
+
+                        r = transaction_verify_order_one(tr, o, j, generation, e);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        /* Ok, let's backtrack, and remember that this entry is not on
+         * our path anymore. */
+        j->marker = NULL;
+
+        return 0;
+}
+
+/* Checks whether the ordering graph of the transaction is cyclic. If it is,
+ * tries to fix that up by dropping one of the jobs (see verify_order_one). */
+static int transaction_verify_order(Transaction *tr, unsigned *generation, sd_bus_error *e) {
+        Job *j;
+        unsigned g;
+        int r;
+
+        assert(tr);
+        assert(generation);
+
+        /* Use a fresh generation for this sweep, so stale markers from earlier walks are ignored. */
+        g = (*generation)++;
+
+        HASHMAP_FOREACH(j, tr->jobs) {
+                r = transaction_verify_order_one(tr, j, NULL, g, e);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+/* Drops jobs that are not required by any other job (i.e. have an empty
+ * object list), except for the anchor job itself. */
+static void transaction_collect_garbage(Transaction *tr) {
+        bool again;
+
+        assert(tr);
+
+        do {
+                Job *j;
+
+                again = false;
+
+                HASHMAP_FOREACH(j, tr->jobs) {
+                        if (tr->anchor_job == j)
+                                continue;
+
+                        if (!j->object_list) {
+                                log_trace("Garbage collecting job %s/%s", j->unit->id, job_type_to_string(j->type));
+                                /* Deletion invalidates the iterator, so restart the scan. */
+                                transaction_delete_job(tr, j, true);
+                                again = true;
+                                break;
+                        }
+
+                        log_trace("Keeping job %s/%s because of %s/%s",
+                                  j->unit->id, job_type_to_string(j->type),
+                                  j->object_list->subject ? j->object_list->subject->unit->id : "root",
+                                  j->object_list->subject ? job_type_to_string(j->object_list->subject->type) : "root");
+                }
+
+        } while (again);
+}
+
+/* Checks whether applying this transaction would replace already queued jobs
+ * in a way the caller forbade: in JOB_FAIL mode any conflicting queued job is
+ * an error, otherwise only irreversible queued jobs are protected. */
+static int transaction_is_destructive(Transaction *tr, JobMode mode, sd_bus_error *e) {
+        Job *j;
+
+        assert(tr);
+
+        HASHMAP_FOREACH(j, tr->jobs) {
+
+                /* Assume merged */
+                assert(!j->transaction_prev);
+                assert(!j->transaction_next);
+
+                if (j->unit->job && (mode == JOB_FAIL || j->unit->job->irreversible) &&
+                    job_type_is_conflicting(j->unit->job->type, j->type))
+                        return sd_bus_error_setf(e, BUS_ERROR_TRANSACTION_IS_DESTRUCTIVE,
+                                                 "Transaction for %s/%s is destructive (%s has '%s' job queued, but '%s' is included in transaction).",
+                                                 tr->anchor_job->unit->id, job_type_to_string(tr->anchor_job->type),
+                                                 j->unit->id, job_type_to_string(j->unit->job->type), job_type_to_string(j->type));
+        }
+
+        return 0;
+}
+
+/* Drops all unnecessary jobs that reverse already active jobs or that stop a
+ * running service, keeping anything that matters to the anchor. */
+static void transaction_minimize_impact(Transaction *tr) {
+        Job *head;
+
+        assert(tr);
+
+/* Restart the whole scan after each deletion, since deleting a job (and its
+ * dependencies) invalidates both iterators. */
+rescan:
+        HASHMAP_FOREACH(head, tr->jobs) {
+                LIST_FOREACH(transaction, j, head) {
+                        bool stops_running_service, changes_existing_job;
+
+                        /* If it matters, we shouldn't drop it */
+                        if (j->matters_to_anchor)
+                                continue;
+
+                        /* Would this stop a running service?
+                         * Would this change an existing job?
+                         * If so, let's drop this entry */
+
+                        stops_running_service =
+                                j->type == JOB_STOP && UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(j->unit));
+
+                        changes_existing_job =
+                                j->unit->job &&
+                                job_type_is_conflicting(j->type, j->unit->job->type);
+
+                        if (!stops_running_service && !changes_existing_job)
+                                continue;
+
+                        if (stops_running_service)
+                                log_unit_debug(j->unit,
+                                               "%s/%s would stop a running service.",
+                                               j->unit->id, job_type_to_string(j->type));
+
+                        if (changes_existing_job)
+                                log_unit_debug(j->unit,
+                                               "%s/%s would change existing job.",
+                                               j->unit->id, job_type_to_string(j->type));
+
+                        /* Ok, let's get rid of this */
+                        log_unit_debug(j->unit,
+                                       "Deleting %s/%s to minimize impact.",
+                                       j->unit->id, job_type_to_string(j->type));
+
+                        transaction_delete_job(tr, j, true);
+                        goto rescan;
+                }
+        }
+}
+
+/* Moves the transaction jobs into the manager's set of active jobs, installing
+ * them for execution. On failure everything registered so far is rolled back.
+ * In isolate/flush modes, installed jobs not part of the transaction are
+ * cancelled first. */
+static int transaction_apply(
+                Transaction *tr,
+                Manager *m,
+                JobMode mode,
+                Set *affected_jobs) {
+
+        Job *j;
+        int r;
+
+        if (IN_SET(mode, JOB_ISOLATE, JOB_FLUSH)) {
+
+                /* When isolating first kill all installed jobs which
+                 * aren't part of the new transaction */
+                HASHMAP_FOREACH(j, m->jobs) {
+                        assert(j->installed);
+
+                        if (j->unit->ignore_on_isolate)
+                                continue;
+
+                        if (hashmap_contains(tr->jobs, j->unit))
+                                continue;
+
+                        /* Not invalidating recursively. Avoids triggering
+                         * OnFailure= actions of dependent jobs. Also avoids
+                         * invalidating our iterator. */
+                        job_finish_and_invalidate(j, JOB_CANCELED, false, false);
+                }
+        }
+
+        /* Register every transaction job in the manager's job table, keyed by job id. */
+        HASHMAP_FOREACH(j, tr->jobs) {
+                /* Assume merged */
+                assert(!j->transaction_prev);
+                assert(!j->transaction_next);
+
+                r = hashmap_ensure_put(&m->jobs, NULL, UINT32_TO_PTR(j->id), j);
+                if (r < 0)
+                        goto rollback;
+        }
+
+        while ((j = hashmap_steal_first(tr->jobs))) {
+                Job *installed_job;
+
+                /* Clean the job dependencies */
+                transaction_unlink_job(tr, j, false);
+
+                /* When RestartMode=direct is used, the service being restarted don't enter the inactive/failed
+                 * state, i.e. unit_process_job -> job_finish_and_invalidate is never called, and the previous
+                 * job might still be running (especially for Type=oneshot services). We need to refuse
+                 * late merge and re-enqueue the anchor job. */
+                installed_job = job_install(j,
+                                            /* refuse_late_merge = */ mode == JOB_RESTART_DEPENDENCIES && j == tr->anchor_job);
+                if (installed_job != j) {
+                        /* j has been merged into a previously installed job */
+                        if (tr->anchor_job == j)
+                                tr->anchor_job = installed_job;
+
+                        hashmap_remove_value(m->jobs, UINT32_TO_PTR(j->id), j);
+                        free_and_replace_full(j, installed_job, job_free);
+                }
+
+                job_add_to_run_queue(j);
+                job_add_to_dbus_queue(j);
+                job_start_timer(j, false);
+                job_shutdown_magic(j);
+
+                /* When 'affected' is specified, let's track all in it all jobs that were touched because of
+                 * this transaction. */
+                if (affected_jobs)
+                        (void) set_put(affected_jobs, j);
+        }
+
+        return 0;
+
+rollback:
+        /* Undo the registrations made above; the jobs themselves stay owned by the transaction. */
+        HASHMAP_FOREACH(j, tr->jobs)
+                hashmap_remove_value(m->jobs, UINT32_TO_PTR(j->id), j);
+
+        return r;
+}
+
+/* This applies the changes recorded in tr->jobs to the actual list of jobs, if
+ * possible: the transaction is minimized, de-duplicated, checked for ordering
+ * cycles and destructiveness, and finally installed into the manager. */
+int transaction_activate(
+                Transaction *tr,
+                Manager *m,
+                JobMode mode,
+                Set *affected_jobs,
+                sd_bus_error *e) {
+
+        Job *j;
+        int r;
+        unsigned generation = 1;
+
+        assert(tr);
+
+        /* Reset the generation counter of all installed jobs. The detection of cycles
+         * looks at installed jobs. If they had a non-zero generation from some previous
+         * walk of the graph, the algorithm would break. */
+        HASHMAP_FOREACH(j, m->jobs)
+                j->generation = 0;
+
+        /* First step: figure out which jobs matter */
+        transaction_find_jobs_that_matter_to_anchor(tr->anchor_job, generation++);
+
+        /* Second step: Try not to stop any running services if
+         * we don't have to. Don't try to reverse running
+         * jobs if we don't have to. */
+        if (mode == JOB_FAIL)
+                transaction_minimize_impact(tr);
+
+        /* Third step: Drop redundant jobs */
+        transaction_drop_redundant(tr);
+
+        for (;;) {
+                /* Fourth step: Let's remove unneeded jobs that might
+                 * be lurking. */
+                if (mode != JOB_ISOLATE)
+                        transaction_collect_garbage(tr);
+
+                /* Fifth step: verify order makes sense and correct
+                 * cycles if necessary and possible */
+                r = transaction_verify_order(tr, &generation, e);
+                if (r >= 0)
+                        break;
+
+                if (r != -EAGAIN)
+                        return log_warning_errno(r, "Requested transaction contains an unfixable cyclic ordering dependency: %s", bus_error_message(e, r));
+
+                /* Let's see if the resulting transaction ordering
+                 * graph is still cyclic... */
+        }
+
+        for (;;) {
+                /* Sixth step: let's drop unmergeable entries if
+                 * necessary and possible, merge entries we can
+                 * merge */
+                r = transaction_merge_jobs(tr, e);
+                if (r >= 0)
+                        break;
+
+                if (r != -EAGAIN)
+                        return log_warning_errno(r, "Requested transaction contains unmergeable jobs: %s", bus_error_message(e, r));
+
+                /* Seventh step: an entry got dropped, let's garbage
+                 * collect its dependencies. */
+                if (mode != JOB_ISOLATE)
+                        transaction_collect_garbage(tr);
+
+                /* Let's see if the resulting transaction still has
+                 * unmergeable entries ... */
+        }
+
+        /* Eighth step: Drop redundant jobs again, if the merging now allows us to drop more. */
+        transaction_drop_redundant(tr);
+
+        /* Ninth step: check whether we can actually apply this */
+        r = transaction_is_destructive(tr, mode, e);
+        if (r < 0)
+                return log_notice_errno(r, "Requested transaction contradicts existing jobs: %s", bus_error_message(e, r));
+
+        /* Tenth step: apply changes */
+        r = transaction_apply(tr, m, mode, affected_jobs);
+        if (r < 0)
+                return log_warning_errno(r, "Failed to apply transaction: %m");
+
+        assert(hashmap_isempty(tr->jobs));
+
+        /* Are there any jobs now? Then make sure we have the idle pipe around. We don't really care too much
+         * whether this works or not, as the idle pipe is a feature for cosmetics, not actually useful for
+         * anything beyond that. */
+        if (!hashmap_isempty(m->jobs))
+                (void) manager_allocate_idle_pipe(m);
+
+        return 0;
+}
+
+/* Looks for an existing prospective job of the given type for the unit and
+ * returns that. If it doesn't exist it is created and prepended to the unit's
+ * job list in the transaction. *is_new (optional) reports which case applied.
+ * Returns NULL on allocation failure. */
+static Job* transaction_add_one_job(Transaction *tr, JobType type, Unit *unit, bool *is_new) {
+        Job *j, *f;
+
+        assert(tr);
+        assert(unit);
+
+        /* Current head of the unit's prospective job list (may be NULL). */
+        f = hashmap_get(tr->jobs, unit);
+
+        LIST_FOREACH(transaction, i, f) {
+                assert(i->unit == unit);
+
+                if (i->type == type) {
+                        if (is_new)
+                                *is_new = false;
+                        return i;
+                }
+        }
+
+        j = job_new(unit, type);
+        if (!j)
+                return NULL;
+
+        j->generation = 0;
+        j->marker = NULL;
+        j->matters_to_anchor = false;
+        j->irreversible = tr->irreversible;
+
+        LIST_PREPEND(transaction, f, j);
+
+        /* The new job is now the list head, so re-register it in the hashmap;
+         * undo the prepend if that fails. */
+        if (hashmap_replace(tr->jobs, unit, f) < 0) {
+                LIST_REMOVE(transaction, f, j);
+                job_free(j);
+                return NULL;
+        }
+
+        if (is_new)
+                *is_new = true;
+
+        log_trace("Added job %s/%s to transaction.", unit->id, job_type_to_string(type));
+
+        return j;
+}
+
+/* Detaches a job from the transaction's data structures: removes it from the
+ * per-unit job list (fixing up the hashmap if it was the head), and frees all
+ * its dependency links. If delete_dependencies is true, jobs that depended on
+ * this one via a "matters" link are deleted recursively. */
+static void transaction_unlink_job(Transaction *tr, Job *j, bool delete_dependencies) {
+        assert(tr);
+        assert(j);
+
+        if (j->transaction_prev)
+                j->transaction_prev->transaction_next = j->transaction_next;
+        else if (j->transaction_next)
+                /* j was the list head: make its successor the new head in the hashmap */
+                hashmap_replace(tr->jobs, j->unit, j->transaction_next);
+        else
+                /* j was the only job for this unit */
+                hashmap_remove_value(tr->jobs, j->unit, j);
+
+        if (j->transaction_next)
+                j->transaction_next->transaction_prev = j->transaction_prev;
+
+        j->transaction_prev = j->transaction_next = NULL;
+
+        while (j->subject_list)
+                job_dependency_free(j->subject_list);
+
+        while (j->object_list) {
+                /* Remember the dependent job before freeing the link, so we can delete it below. */
+                Job *other = j->object_list->matters ? j->object_list->subject : NULL;
+
+                job_dependency_free(j->object_list);
+
+                if (other && delete_dependencies) {
+                        log_unit_debug(other->unit,
+                                       "Deleting job %s/%s as dependency of job %s/%s",
+                                       other->unit->id, job_type_to_string(other->type),
+                                       j->unit->id, job_type_to_string(j->type));
+                        transaction_delete_job(tr, other, delete_dependencies);
+                }
+        }
+}
+
+/* For every unit the given unit propagates reloads to, enqueue a (collapsed)
+ * try-reload job on behalf of 'by'. Failures are logged and otherwise ignored. */
+void transaction_add_propagate_reload_jobs(
+                Transaction *tr,
+                Unit *unit,
+                Job *by,
+                TransactionAddFlags flags) {
+
+        Unit *dep;
+        int r;
+
+        assert(tr);
+        assert(unit);
+
+        UNIT_FOREACH_DEPENDENCY(dep, unit, UNIT_ATOM_PROPAGATES_RELOAD_TO) {
+                _cleanup_(sd_bus_error_free) sd_bus_error e = SD_BUS_ERROR_NULL;
+                JobType nt;
+
+                /* Collapse JOB_TRY_RELOAD for the target unit; skip units for which it is a no-op. */
+                nt = job_type_collapse(JOB_TRY_RELOAD, dep);
+                if (nt == JOB_NOP)
+                        continue;
+
+                r = transaction_add_job_and_dependencies(tr, nt, dep, by, flags, &e);
+                if (r < 0)
+                        log_unit_warning(dep,
+                                         "Cannot add dependency reload job, ignoring: %s",
+                                         bus_error_message(&e, r));
+        }
+}
+
+/* Decides which job type to propagate for a graceful stop, given the unit's
+ * existing transaction job list 'j' (may be NULL): if a stop or restart is
+ * already queued nothing needs doing (JOB_NOP); if a start is queued we
+ * propagate a restart instead of a stop to avoid a job type conflict;
+ * otherwise a plain stop is used. */
+static JobType job_type_propagate_stop_graceful(Job *j) {
+        JobType type = JOB_STOP;
+
+        /* LIST_FOREACH handles j == NULL by iterating zero times, leaving JOB_STOP. */
+        LIST_FOREACH(transaction, i, j) {
+                if (IN_SET(i->type, JOB_STOP, JOB_RESTART))
+                        /* Nothing to worry about, an appropriate job is in-place */
+                        return JOB_NOP;
+
+                if (i->type == JOB_START)
+                        /* This unit is pulled in by other dependency types in this transaction. We will run
+                         * into job type conflict if we enqueue a stop job, so let's enqueue a restart job
+                         * instead. */
+                        type = JOB_RESTART;
+
+                /* We don't care about other job types */
+        }
+
+        return type;
+}
+
+/* Adds a job of the given type for 'unit' to the transaction, linked to the
+ * requesting job 'by' (or as the anchor job if 'by' is NULL), and then
+ * recursively adds jobs for all of the unit's relevant dependencies. Returns 0
+ * on success, a negative errno / bus error otherwise. */
+int transaction_add_job_and_dependencies(
+                Transaction *tr,
+                JobType type,
+                Unit *unit,
+                Job *by,
+                TransactionAddFlags flags,
+                sd_bus_error *e) {
+
+        bool is_new;
+        Job *ret;
+        int r;
+
+        assert(tr);
+        assert(type < _JOB_TYPE_MAX);
+        assert(type < _JOB_TYPE_MAX_IN_TRANSACTION);
+        assert(unit);
+
+        /* Before adding jobs for this unit, let's ensure that its state has been loaded. This matters when
+         * jobs are spawned as part of coldplugging itself (see e. g. path_coldplug()). This way, we
+         * "recursively" coldplug units, ensuring that we do not look at state of not-yet-coldplugged
+         * units. */
+        if (MANAGER_IS_RELOADING(unit->manager))
+                unit_coldplug(unit);
+
+        if (by)
+                log_trace("Pulling in %s/%s from %s/%s", unit->id, job_type_to_string(type), by->unit->id, job_type_to_string(by->type));
+
+        /* Safety check that the unit is a valid state, i.e. not in UNIT_STUB or UNIT_MERGED which should only be set
+         * temporarily. */
+        if (!UNIT_IS_LOAD_COMPLETE(unit->load_state))
+                return sd_bus_error_setf(e, BUS_ERROR_LOAD_FAILED, "Unit %s is not loaded properly.", unit->id);
+
+        if (type != JOB_STOP) {
+                r = bus_unit_validate_load_state(unit, e);
+                /* The time-based cache allows to start new units without daemon-reload, but if they are
+                 * already referenced (because of dependencies or ordering) then we have to force a load of
+                 * the fragment. As an optimization, check first if anything in the usual paths was modified
+                 * since the last time the cache was loaded. Also check if the last time an attempt to load
+                 * the unit was made was before the most recent cache refresh, so that we know we need to try
+                 * again — even if the cache is current, it might have been updated in a different context
+                 * before we had a chance to retry loading this particular unit.
+                 *
+                 * Given building up the transaction is a synchronous operation, attempt
+                 * to load the unit immediately. */
+                if (r < 0 && manager_unit_cache_should_retry_load(unit)) {
+                        sd_bus_error_free(e);
+                        unit->load_state = UNIT_STUB;
+                        r = unit_load(unit);
+                        if (r < 0 || unit->load_state == UNIT_STUB)
+                                unit->load_state = UNIT_NOT_FOUND;
+                        r = bus_unit_validate_load_state(unit, e);
+                }
+                if (r < 0)
+                        return r;
+        }
+
+        if (!unit_job_is_applicable(unit, type))
+                return sd_bus_error_setf(e, BUS_ERROR_JOB_TYPE_NOT_APPLICABLE,
+                                         "Job type %s is not applicable for unit %s.",
+                                         job_type_to_string(type), unit->id);
+
+        /* First add the job. */
+        ret = transaction_add_one_job(tr, type, unit, &is_new);
+        if (!ret)
+                return -ENOMEM;
+
+        if (FLAGS_SET(flags, TRANSACTION_IGNORE_ORDER))
+                ret->ignore_order = true;
+
+        /* Then, add a link to the job. */
+        if (by) {
+                if (!job_dependency_new(by, ret, FLAGS_SET(flags, TRANSACTION_MATTERS), FLAGS_SET(flags, TRANSACTION_CONFLICTS)))
+                        return -ENOMEM;
+        } else {
+                /* If the job has no parent job, it is the anchor job. */
+                assert(!tr->anchor_job);
+                tr->anchor_job = ret;
+        }
+
+        /* An existing job, explicitly skipped requirements, or a NOP job: no recursion needed. */
+        if (!is_new || FLAGS_SET(flags, TRANSACTION_IGNORE_REQUIREMENTS) || type == JOB_NOP)
+                return 0;
+
+        _cleanup_set_free_ Set *following = NULL;
+        Unit *dep;
+
+        /* If we are following some other unit, make sure we add all dependencies of everybody following. */
+        if (unit_following_set(ret->unit, &following) > 0)
+                SET_FOREACH(dep, following) {
+                        r = transaction_add_job_and_dependencies(tr, type, dep, ret, flags & TRANSACTION_IGNORE_ORDER, e);
+                        if (r < 0) {
+                                log_unit_full_errno(dep, r == -ERFKILL ? LOG_INFO : LOG_WARNING, r,
+                                                    "Cannot add dependency job, ignoring: %s",
+                                                    bus_error_message(e, r));
+                                sd_bus_error_free(e);
+                        }
+                }
+
+        /* Finally, recursively add in all dependencies. */
+        if (IN_SET(type, JOB_START, JOB_RESTART)) {
+                /* Hard requirements: failure here (other than inapplicable job type) aborts the whole add. */
+                UNIT_FOREACH_DEPENDENCY(dep, ret->unit, UNIT_ATOM_PULL_IN_START) {
+                        r = transaction_add_job_and_dependencies(tr, JOB_START, dep, ret, TRANSACTION_MATTERS | (flags & TRANSACTION_IGNORE_ORDER), e);
+                        if (r < 0) {
+                                if (r != -EBADR) /* job type not applicable */
+                                        goto fail;
+
+                                sd_bus_error_free(e);
+                        }
+                }
+
+                /* Soft requirements (e.g. Wants=): failures are logged and ignored. */
+                UNIT_FOREACH_DEPENDENCY(dep, ret->unit, UNIT_ATOM_PULL_IN_START_IGNORED) {
+                        r = transaction_add_job_and_dependencies(tr, JOB_START, dep, ret, flags & TRANSACTION_IGNORE_ORDER, e);
+                        if (r < 0) {
+                                /* unit masked, job type not applicable and unit not found are not considered
+                                 * as errors. */
+                                log_unit_full_errno(dep,
+                                                    IN_SET(r, -ERFKILL, -EBADR, -ENOENT) ? LOG_DEBUG : LOG_WARNING,
+                                                    r, "Cannot add dependency job, ignoring: %s",
+                                                    bus_error_message(e, r));
+                                sd_bus_error_free(e);
+                        }
+                }
+
+                UNIT_FOREACH_DEPENDENCY(dep, ret->unit, UNIT_ATOM_PULL_IN_VERIFY) {
+                        r = transaction_add_job_and_dependencies(tr, JOB_VERIFY_ACTIVE, dep, ret, TRANSACTION_MATTERS | (flags & TRANSACTION_IGNORE_ORDER), e);
+                        if (r < 0) {
+                                if (r != -EBADR) /* job type not applicable */
+                                        goto fail;
+
+                                sd_bus_error_free(e);
+                        }
+                }
+
+                /* Conflicts= style dependencies: pull in stop jobs, marked as conflicting links. */
+                UNIT_FOREACH_DEPENDENCY(dep, ret->unit, UNIT_ATOM_PULL_IN_STOP) {
+                        r = transaction_add_job_and_dependencies(tr, JOB_STOP, dep, ret, TRANSACTION_MATTERS | TRANSACTION_CONFLICTS | (flags & TRANSACTION_IGNORE_ORDER), e);
+                        if (r < 0) {
+                                if (r != -EBADR) /* job type not applicable */
+                                        goto fail;
+
+                                sd_bus_error_free(e);
+                        }
+                }
+
+                UNIT_FOREACH_DEPENDENCY(dep, ret->unit, UNIT_ATOM_PULL_IN_STOP_IGNORED) {
+                        r = transaction_add_job_and_dependencies(tr, JOB_STOP, dep, ret, flags & TRANSACTION_IGNORE_ORDER, e);
+                        if (r < 0) {
+                                log_unit_warning(dep,
+                                                 "Cannot add dependency job, ignoring: %s",
+                                                 bus_error_message(e, r));
+                                sd_bus_error_free(e);
+                        }
+                }
+        }
+
+        if (IN_SET(type, JOB_RESTART, JOB_STOP) || (type == JOB_START && FLAGS_SET(flags, TRANSACTION_PROPAGATE_START_AS_RESTART))) {
+                bool is_stop = type == JOB_STOP;
+
+                UNIT_FOREACH_DEPENDENCY(dep, ret->unit, is_stop ? UNIT_ATOM_PROPAGATE_STOP : UNIT_ATOM_PROPAGATE_RESTART) {
+                        /* We propagate RESTART only as TRY_RESTART, in order not to start dependencies that
+                         * are not around. */
+                        JobType nt;
+
+                        nt = job_type_collapse(is_stop ? JOB_STOP : JOB_TRY_RESTART, dep);
+                        if (nt == JOB_NOP)
+                                continue;
+
+                        r = transaction_add_job_and_dependencies(tr, nt, dep, ret, TRANSACTION_MATTERS | (flags & TRANSACTION_IGNORE_ORDER), e);
+                        if (r < 0) {
+                                if (r != -EBADR) /* job type not applicable */
+                                        return r;
+
+                                sd_bus_error_free(e);
+                        }
+                }
+
+                /* Process UNIT_ATOM_PROPAGATE_STOP_GRACEFUL (PropagatesStopTo=) units. We need to wait until
+                 * all other dependencies are processed, i.e. we're the anchor job or already in the recursion
+                 * that handles it. */
+                if (!by || FLAGS_SET(flags, TRANSACTION_PROCESS_PROPAGATE_STOP_GRACEFUL))
+                        UNIT_FOREACH_DEPENDENCY(dep, ret->unit, UNIT_ATOM_PROPAGATE_STOP_GRACEFUL) {
+                                JobType nt;
+                                Job *j;
+
+                                j = hashmap_get(tr->jobs, dep);
+                                nt = job_type_propagate_stop_graceful(j);
+
+                                if (nt == JOB_NOP)
+                                        continue;
+
+                                r = transaction_add_job_and_dependencies(tr, nt, dep, ret, TRANSACTION_MATTERS | (flags & TRANSACTION_IGNORE_ORDER) | TRANSACTION_PROCESS_PROPAGATE_STOP_GRACEFUL, e);
+                                if (r < 0) {
+                                        if (r != -EBADR) /* job type not applicable */
+                                                return r;
+
+                                        sd_bus_error_free(e);
+                                }
+                        }
+        }
+
+        if (type == JOB_RELOAD)
+                transaction_add_propagate_reload_jobs(tr, ret->unit, ret, flags & TRANSACTION_IGNORE_ORDER);
+
+        /* JOB_VERIFY_ACTIVE requires no dependency handling */
+
+        return 0;
+
+fail:
+        /* Recursive call failed to add required jobs so let's drop top level job as well. */
+        log_unit_debug_errno(unit, r, "Cannot add dependency job to transaction, deleting job %s/%s again: %s",
+                             unit->id, job_type_to_string(type), bus_error_message(e, r));
+
+        transaction_delete_job(tr, ret, /* delete_dependencies= */ false);
+        return r;
+}
+
+/* A unit shall be stopped on isolate unless it opted out via
+ * IgnoreOnIsolate= or the transaction already lists jobs for it. */
+static bool shall_stop_on_isolate(Transaction *tr, Unit *u) {
+        assert(tr);
+        assert(u);
+
+        return !u->ignore_on_isolate && !hashmap_contains(tr->jobs, u);
+}
+
+/* For an isolate request, adds stop jobs for every unit that is currently
+ * active (or has a job) and is not kept by the new transaction. Units
+ * triggered by units we keep are kept too. Errors per unit are logged and
+ * ignored; always returns 0. */
+int transaction_add_isolate_jobs(Transaction *tr, Manager *m) {
+        Unit *u;
+        char *k;
+        int r;
+
+        assert(tr);
+        assert(m);
+
+        HASHMAP_FOREACH_KEY(u, k, m->units) {
+                _cleanup_(sd_bus_error_free) sd_bus_error e = SD_BUS_ERROR_NULL;
+                Unit *o;
+
+                /* Ignore aliases (the unit table maps every alias name to the same Unit object) */
+                if (u->id != k)
+                        continue;
+
+                /* No need to stop inactive units */
+                if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u)) && !u->job)
+                        continue;
+
+                if (!shall_stop_on_isolate(tr, u))
+                        continue;
+
+                /* Keep units that are triggered by units we want to keep around. */
+                bool keep = false;
+                UNIT_FOREACH_DEPENDENCY(o, u, UNIT_ATOM_TRIGGERED_BY)
+                        if (!shall_stop_on_isolate(tr, o)) {
+                                keep = true;
+                                break;
+                        }
+                if (keep)
+                        continue;
+
+                r = transaction_add_job_and_dependencies(tr, JOB_STOP, u, tr->anchor_job, TRANSACTION_MATTERS, &e);
+                if (r < 0)
+                        log_unit_warning_errno(u, r, "Cannot add isolate job, ignoring: %s", bus_error_message(&e, r));
+        }
+
+        return 0;
+}
+
+/* Adds stop jobs for all units that trigger the given unit (e.g. via a path or
+ * timer relation), skipping inactive units and units that already have jobs in
+ * the transaction. Errors are logged and ignored; always returns 0. */
+int transaction_add_triggering_jobs(Transaction *tr, Unit *u) {
+        Unit *trigger;
+        int r;
+
+        assert(tr);
+        assert(u);
+
+        UNIT_FOREACH_DEPENDENCY(trigger, u, UNIT_ATOM_TRIGGERED_BY) {
+                _cleanup_(sd_bus_error_free) sd_bus_error e = SD_BUS_ERROR_NULL;
+
+                /* No need to stop inactive units */
+                if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(trigger)) && !trigger->job)
+                        continue;
+
+                /* Is there already something listed for this? */
+                if (hashmap_contains(tr->jobs, trigger))
+                        continue;
+
+                r = transaction_add_job_and_dependencies(tr, JOB_STOP, trigger, tr->anchor_job, TRANSACTION_MATTERS, &e);
+                if (r < 0)
+                        log_unit_warning_errno(u, r, "Cannot add triggered by job, ignoring: %s", bus_error_message(&e, r));
+        }
+
+        return 0;
+}
+
+/* Allocates an empty transaction. If 'irreversible' is set, all jobs created
+ * through it will be marked irreversible. Returns NULL on allocation failure. */
+Transaction *transaction_new(bool irreversible) {
+        Transaction *tr;
+
+        tr = new0(Transaction, 1);
+        if (!tr)
+                return NULL;
+
+        tr->jobs = hashmap_new(NULL);
+        if (!tr->jobs)
+                return mfree(tr);
+
+        tr->irreversible = irreversible;
+
+        return tr;
+}
+
+/* Frees a transaction. It must already be empty — either applied or aborted —
+ * by the time this is called. Always returns NULL, NULL-safe. */
+Transaction *transaction_free(Transaction *tr) {
+        if (!tr)
+                return NULL;
+
+        assert(hashmap_isempty(tr->jobs));
+
+        hashmap_free(tr->jobs);
+        return mfree(tr);
+}
+
+/* Convenience destructor: drops any remaining jobs, then frees the
+ * transaction. NULL-safe; always returns NULL. */
+Transaction *transaction_abort_and_free(Transaction *tr) {
+        if (tr)
+                transaction_abort(tr);
+
+        return transaction_free(tr);
+}
diff --git a/src/core/transaction.h b/src/core/transaction.h
new file mode 100644
index 0000000..151e02d
--- /dev/null
+++ b/src/core/transaction.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct Transaction Transaction;
+
+#include "hashmap.h"
+#include "job.h"
+#include "manager.h"
+#include "unit.h"
+
+struct Transaction {
+ /* Jobs to be added */
+ Hashmap *jobs; /* Unit object => Job object list 1:1 */
+ Job *anchor_job; /* the job the user asked for */
+ bool irreversible;
+};
+
+Transaction *transaction_new(bool irreversible);
+Transaction *transaction_free(Transaction *tr);
+Transaction *transaction_abort_and_free(Transaction *tr);
+DEFINE_TRIVIAL_CLEANUP_FUNC(Transaction*, transaction_abort_and_free);
+
+typedef enum TransactionAddFlags {
+ TRANSACTION_MATTERS = 1 << 0,
+ TRANSACTION_CONFLICTS = 1 << 1,
+ TRANSACTION_IGNORE_REQUIREMENTS = 1 << 2,
+ TRANSACTION_IGNORE_ORDER = 1 << 3,
+
+ /* Propagate a START job to other units like a RESTART */
+ TRANSACTION_PROPAGATE_START_AS_RESTART = 1 << 4,
+
+ /* Indicate that we're in the recursion for processing UNIT_ATOM_PROPAGATE_STOP_GRACEFUL units */
+ TRANSACTION_PROCESS_PROPAGATE_STOP_GRACEFUL = 1 << 5,
+} TransactionAddFlags;
+
+void transaction_add_propagate_reload_jobs(
+ Transaction *tr,
+ Unit *unit, Job *by,
+ TransactionAddFlags flags);
+
+int transaction_add_job_and_dependencies(
+ Transaction *tr,
+ JobType type,
+ Unit *unit,
+ Job *by,
+ TransactionAddFlags flags,
+ sd_bus_error *e);
+
+int transaction_activate(Transaction *tr, Manager *m, JobMode mode, Set *affected, sd_bus_error *e);
+int transaction_add_isolate_jobs(Transaction *tr, Manager *m);
+int transaction_add_triggering_jobs(Transaction *tr, Unit *u);
diff --git a/src/core/unit-dependency-atom.c b/src/core/unit-dependency-atom.c
new file mode 100644
index 0000000..35b279b
--- /dev/null
+++ b/src/core/unit-dependency-atom.c
@@ -0,0 +1,251 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "unit-dependency-atom.h"
+
+static const UnitDependencyAtom atom_map[_UNIT_DEPENDENCY_MAX] = {
+ /* A table that maps high-level dependency types to low-level dependency "atoms". The latter actually
+ * describe specific facets of dependency behaviour. The former combine them into one user-facing
+ * concept. Atoms are a bit mask, though a bunch of dependency types have only a single bit set.
+ *
+ * Typically when the user configures a dependency they go via dependency type, but when we act on
+ * them we go by atom.
+ *
+ * NB: when you add a new dependency type here, make sure to also add one to the (best-effort)
+ * reverse table in unit_dependency_from_unique_atom() further down. */
+
+ [UNIT_REQUIRES] = UNIT_ATOM_PULL_IN_START |
+ UNIT_ATOM_RETROACTIVE_START_REPLACE |
+ UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE |
+ UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE,
+
+ [UNIT_REQUISITE] = UNIT_ATOM_PULL_IN_VERIFY |
+ UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE |
+ UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE,
+
+ [UNIT_WANTS] = UNIT_ATOM_PULL_IN_START_IGNORED |
+ UNIT_ATOM_RETROACTIVE_START_FAIL |
+ UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE |
+ UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE,
+
+ [UNIT_BINDS_TO] = UNIT_ATOM_PULL_IN_START |
+ UNIT_ATOM_RETROACTIVE_START_REPLACE |
+ UNIT_ATOM_CANNOT_BE_ACTIVE_WITHOUT |
+ UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE |
+ UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE,
+
+ [UNIT_PART_OF] = UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE,
+
+ [UNIT_UPHOLDS] = UNIT_ATOM_PULL_IN_START_IGNORED |
+ UNIT_ATOM_RETROACTIVE_START_REPLACE |
+ UNIT_ATOM_ADD_START_WHEN_UPHELD_QUEUE |
+ UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE |
+ UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE,
+
+ [UNIT_REQUIRED_BY] = UNIT_ATOM_PROPAGATE_STOP |
+ UNIT_ATOM_PROPAGATE_RESTART |
+ UNIT_ATOM_PROPAGATE_START_FAILURE |
+ UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED |
+ UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES,
+
+ [UNIT_REQUISITE_OF] = UNIT_ATOM_PROPAGATE_STOP |
+ UNIT_ATOM_PROPAGATE_RESTART |
+ UNIT_ATOM_PROPAGATE_START_FAILURE |
+ UNIT_ATOM_PROPAGATE_INACTIVE_START_AS_FAILURE |
+ UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED |
+ UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES,
+
+ [UNIT_WANTED_BY] = UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES |
+ UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED,
+
+ [UNIT_BOUND_BY] = UNIT_ATOM_RETROACTIVE_STOP_ON_STOP |
+ UNIT_ATOM_PROPAGATE_STOP |
+ UNIT_ATOM_PROPAGATE_RESTART |
+ UNIT_ATOM_PROPAGATE_START_FAILURE |
+ UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED |
+ UNIT_ATOM_ADD_CANNOT_BE_ACTIVE_WITHOUT_QUEUE |
+ UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES,
+
+ [UNIT_UPHELD_BY] = UNIT_ATOM_START_STEADILY |
+ UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES |
+ UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED,
+
+ [UNIT_CONSISTS_OF] = UNIT_ATOM_PROPAGATE_STOP |
+ UNIT_ATOM_PROPAGATE_RESTART,
+
+ [UNIT_CONFLICTS] = UNIT_ATOM_PULL_IN_STOP |
+ UNIT_ATOM_RETROACTIVE_STOP_ON_START,
+
+ [UNIT_CONFLICTED_BY] = UNIT_ATOM_PULL_IN_STOP_IGNORED |
+ UNIT_ATOM_RETROACTIVE_STOP_ON_START |
+ UNIT_ATOM_PROPAGATE_STOP_FAILURE,
+
+ [UNIT_PROPAGATES_STOP_TO] = UNIT_ATOM_RETROACTIVE_STOP_ON_STOP |
+ UNIT_ATOM_PROPAGATE_STOP_GRACEFUL,
+
+ /* These are simple dependency types: they consist of a single atom only */
+ [UNIT_ON_FAILURE] = UNIT_ATOM_ON_FAILURE,
+ [UNIT_ON_SUCCESS] = UNIT_ATOM_ON_SUCCESS,
+ [UNIT_ON_FAILURE_OF] = UNIT_ATOM_ON_FAILURE_OF,
+ [UNIT_ON_SUCCESS_OF] = UNIT_ATOM_ON_SUCCESS_OF,
+ [UNIT_BEFORE] = UNIT_ATOM_BEFORE,
+ [UNIT_AFTER] = UNIT_ATOM_AFTER,
+ [UNIT_TRIGGERS] = UNIT_ATOM_TRIGGERS,
+ [UNIT_TRIGGERED_BY] = UNIT_ATOM_TRIGGERED_BY,
+ [UNIT_PROPAGATES_RELOAD_TO] = UNIT_ATOM_PROPAGATES_RELOAD_TO,
+ [UNIT_JOINS_NAMESPACE_OF] = UNIT_ATOM_JOINS_NAMESPACE_OF,
+ [UNIT_REFERENCES] = UNIT_ATOM_REFERENCES,
+ [UNIT_REFERENCED_BY] = UNIT_ATOM_REFERENCED_BY,
+ [UNIT_IN_SLICE] = UNIT_ATOM_IN_SLICE,
+ [UNIT_SLICE_OF] = UNIT_ATOM_SLICE_OF,
+
+ /* These are dependency types without effect on our state engine. We maintain them only to make
+ * things discoverable/debuggable as they are the inverse dependencies to some of the above. As they
+ * have no effect of their own, they all map to no atoms at all, i.e. the value 0. */
+ [UNIT_RELOAD_PROPAGATED_FROM] = 0,
+ [UNIT_STOP_PROPAGATED_FROM] = 0,
+};
+
+UnitDependencyAtom unit_dependency_to_atom(UnitDependency d) {
+ if (d < 0)
+ return _UNIT_DEPENDENCY_ATOM_INVALID;
+
+ assert(d < _UNIT_DEPENDENCY_MAX);
+
+ return atom_map[d];
+}
+
+UnitDependency unit_dependency_from_unique_atom(UnitDependencyAtom atom) {
+
+ /* This is a "best-effort" function that maps the specified 'atom' mask to a dependency type that is
+ * is equal to or has a superset of bits set if that's uniquely possible. The idea is that this
+ * function is used when iterating through deps that have a specific atom: if there's exactly one
+ * dependency type of the specific atom we don't need iterate through all deps a unit has, but can
+ * pinpoint things directly.
+ *
+ * This function will return _UNIT_DEPENDENCY_INVALID in case the specified value is not known or not
+ * uniquely defined, i.e. there are multiple dependencies with the atom or the combination set. */
+
+ switch ((int64_t) atom) {
+
+ /* Note that we can't list UNIT_REQUIRES here since it's a true subset of UNIT_BINDS_TO, and
+ * hence its atom bits not uniquely mappable. */
+
+ case UNIT_ATOM_PULL_IN_VERIFY |
+ UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE |
+ UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE:
+ case UNIT_ATOM_PULL_IN_VERIFY: /* a single dep type uses this atom */
+ return UNIT_REQUISITE;
+
+ case UNIT_ATOM_PULL_IN_START_IGNORED |
+ UNIT_ATOM_RETROACTIVE_START_FAIL |
+ UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE |
+ UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE:
+ case UNIT_ATOM_RETROACTIVE_START_FAIL:
+ return UNIT_WANTS;
+
+ case UNIT_ATOM_PULL_IN_START |
+ UNIT_ATOM_RETROACTIVE_START_REPLACE |
+ UNIT_ATOM_CANNOT_BE_ACTIVE_WITHOUT |
+ UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE |
+ UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE:
+ case UNIT_ATOM_CANNOT_BE_ACTIVE_WITHOUT:
+ return UNIT_BINDS_TO;
+
+ case UNIT_ATOM_PULL_IN_START_IGNORED |
+ UNIT_ATOM_RETROACTIVE_START_REPLACE |
+ UNIT_ATOM_ADD_START_WHEN_UPHELD_QUEUE |
+ UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE |
+ UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE:
+ case UNIT_ATOM_ADD_START_WHEN_UPHELD_QUEUE:
+ return UNIT_UPHOLDS;
+
+ case UNIT_ATOM_PROPAGATE_STOP |
+ UNIT_ATOM_PROPAGATE_RESTART |
+ UNIT_ATOM_PROPAGATE_START_FAILURE |
+ UNIT_ATOM_PROPAGATE_INACTIVE_START_AS_FAILURE |
+ UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED |
+ UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES:
+ case UNIT_ATOM_PROPAGATE_INACTIVE_START_AS_FAILURE:
+ return UNIT_REQUISITE_OF;
+
+ case UNIT_ATOM_RETROACTIVE_STOP_ON_STOP |
+ UNIT_ATOM_PROPAGATE_STOP |
+ UNIT_ATOM_PROPAGATE_RESTART |
+ UNIT_ATOM_PROPAGATE_START_FAILURE |
+ UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED |
+ UNIT_ATOM_ADD_CANNOT_BE_ACTIVE_WITHOUT_QUEUE |
+ UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES:
+ case UNIT_ATOM_ADD_CANNOT_BE_ACTIVE_WITHOUT_QUEUE:
+ return UNIT_BOUND_BY;
+
+ case UNIT_ATOM_START_STEADILY |
+ UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES |
+ UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED:
+ case UNIT_ATOM_START_STEADILY:
+ return UNIT_UPHELD_BY;
+
+ case UNIT_ATOM_PULL_IN_STOP |
+ UNIT_ATOM_RETROACTIVE_STOP_ON_START:
+ case UNIT_ATOM_PULL_IN_STOP:
+ return UNIT_CONFLICTS;
+
+ case UNIT_ATOM_PULL_IN_STOP_IGNORED |
+ UNIT_ATOM_RETROACTIVE_STOP_ON_START |
+ UNIT_ATOM_PROPAGATE_STOP_FAILURE:
+ case UNIT_ATOM_PULL_IN_STOP_IGNORED:
+ case UNIT_ATOM_PROPAGATE_STOP_FAILURE:
+ return UNIT_CONFLICTED_BY;
+
+ case UNIT_ATOM_RETROACTIVE_STOP_ON_STOP |
+ UNIT_ATOM_PROPAGATE_STOP_GRACEFUL:
+ case UNIT_ATOM_PROPAGATE_STOP_GRACEFUL:
+ return UNIT_PROPAGATES_STOP_TO;
+
+ /* And now, the simple ones */
+
+ case UNIT_ATOM_ON_FAILURE:
+ return UNIT_ON_FAILURE;
+
+ case UNIT_ATOM_ON_SUCCESS:
+ return UNIT_ON_SUCCESS;
+
+ case UNIT_ATOM_ON_SUCCESS_OF:
+ return UNIT_ON_SUCCESS_OF;
+
+ case UNIT_ATOM_ON_FAILURE_OF:
+ return UNIT_ON_FAILURE_OF;
+
+ case UNIT_ATOM_BEFORE:
+ return UNIT_BEFORE;
+
+ case UNIT_ATOM_AFTER:
+ return UNIT_AFTER;
+
+ case UNIT_ATOM_TRIGGERS:
+ return UNIT_TRIGGERS;
+
+ case UNIT_ATOM_TRIGGERED_BY:
+ return UNIT_TRIGGERED_BY;
+
+ case UNIT_ATOM_PROPAGATES_RELOAD_TO:
+ return UNIT_PROPAGATES_RELOAD_TO;
+
+ case UNIT_ATOM_JOINS_NAMESPACE_OF:
+ return UNIT_JOINS_NAMESPACE_OF;
+
+ case UNIT_ATOM_REFERENCES:
+ return UNIT_REFERENCES;
+
+ case UNIT_ATOM_REFERENCED_BY:
+ return UNIT_REFERENCED_BY;
+
+ case UNIT_ATOM_IN_SLICE:
+ return UNIT_IN_SLICE;
+
+ case UNIT_ATOM_SLICE_OF:
+ return UNIT_SLICE_OF;
+
+ default:
+ return _UNIT_DEPENDENCY_INVALID;
+ }
+}
diff --git a/src/core/unit-dependency-atom.h b/src/core/unit-dependency-atom.h
new file mode 100644
index 0000000..96f00ca
--- /dev/null
+++ b/src/core/unit-dependency-atom.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <errno.h>
+
+#include "unit-def.h"
+
+/* Flags that identify the various "atomic" behaviours a specific dependency type implies. Each dependency is
+ * a combination of one or more of these flags that define what they actually entail. */
+typedef enum UnitDependencyAtom {
+
+ /* This unit pulls in the other unit as JOB_START job into the transaction, and if that doesn't work
+ * the transaction fails. */
+ UNIT_ATOM_PULL_IN_START = UINT64_C(1) << 0,
+ /* Similar, but if it doesn't work, ignore. */
+ UNIT_ATOM_PULL_IN_START_IGNORED = UINT64_C(1) << 1,
+ /* Pull in a JOB_VERIFY job into the transaction, i.e. pull in JOB_VERIFY rather than
+ * JOB_START. i.e. check the unit is started but don't pull it in. */
+ UNIT_ATOM_PULL_IN_VERIFY = UINT64_C(1) << 2,
+
+ /* Pull in a JOB_STOP job for the other job into transactions, and fail if that doesn't work. */
+ UNIT_ATOM_PULL_IN_STOP = UINT64_C(1) << 3,
+ /* Same, but don't fail, ignore it. */
+ UNIT_ATOM_PULL_IN_STOP_IGNORED = UINT64_C(1) << 4,
+
+ /* If our enters inactive state, add the other unit to the StopWhenUneeded= queue */
+ UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE = UINT64_C(1) << 5,
+ /* Pin the other unit i.e. ensure StopWhenUneeded= won't trigger for the other unit as long as we are
+ * not in inactive state */
+ UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED = UINT64_C(1) << 6,
+
+ /* Stop our unit if the other unit happens to inactive */
+ UNIT_ATOM_CANNOT_BE_ACTIVE_WITHOUT = UINT64_C(1) << 7,
+ /* If our unit enters inactive state, add the other unit to the BoundBy= queue */
+ UNIT_ATOM_ADD_CANNOT_BE_ACTIVE_WITHOUT_QUEUE = UINT64_C(1) << 8,
+
+ /* Start this unit whenever we find it inactive and the other unit active */
+ UNIT_ATOM_START_STEADILY = UINT64_C(1) << 9,
+ /* Whenever our unit becomes active, add other unit to start_when_upheld_queue */
+ UNIT_ATOM_ADD_START_WHEN_UPHELD_QUEUE = UINT64_C(1) << 10,
+
+ /* If our unit unexpectedly becomes active, retroactively start the other unit too, in "replace" job
+ * mode */
+ UNIT_ATOM_RETROACTIVE_START_REPLACE = UINT64_C(1) << 11,
+ /* Similar, but in "fail" job mode */
+ UNIT_ATOM_RETROACTIVE_START_FAIL = UINT64_C(1) << 12,
+ /* If our unit unexpectedly becomes active, retroactively stop the other unit too */
+ UNIT_ATOM_RETROACTIVE_STOP_ON_START = UINT64_C(1) << 13,
+ /* If our unit unexpectedly becomes inactive, retroactively stop the other unit too */
+ UNIT_ATOM_RETROACTIVE_STOP_ON_STOP = UINT64_C(1) << 14,
+
+ /* If a start job for this unit fails, propagate the failure to start job of other unit too */
+ UNIT_ATOM_PROPAGATE_START_FAILURE = UINT64_C(1) << 15,
+ /* If a stop job for this unit fails, propagate the failure to any stop job of the other unit too */
+ UNIT_ATOM_PROPAGATE_STOP_FAILURE = UINT64_C(1) << 16,
+ /* If our start job succeeded but the unit is inactive then (think: oneshot units), propagate this as
+ * failure to the other unit. */
+ UNIT_ATOM_PROPAGATE_INACTIVE_START_AS_FAILURE = UINT64_C(1) << 17,
+ /* When putting together a transaction, propagate JOB_STOP from our unit to the other. */
+ UNIT_ATOM_PROPAGATE_STOP = UINT64_C(1) << 18,
+ /* Like UNIT_ATOM_PROPAGATE_STOP, but enqueues a restart job if there's already a start job (avoids
+ * job type conflict). */
+ UNIT_ATOM_PROPAGATE_STOP_GRACEFUL = UINT64_C(1) << 19,
+ /* When putting together a transaction, propagate JOB_RESTART from our unit to the other. */
+ UNIT_ATOM_PROPAGATE_RESTART = UINT64_C(1) << 20,
+
+ /* Add the other unit to the default target dependency queue */
+ UNIT_ATOM_ADD_DEFAULT_TARGET_DEPENDENCY_QUEUE = UINT64_C(1) << 21,
+ /* Recheck default target deps on other units (which are target units) */
+ UNIT_ATOM_DEFAULT_TARGET_DEPENDENCIES = UINT64_C(1) << 22,
+
+ /* The remaining atoms map 1:1 to the equally named high-level deps */
+ UNIT_ATOM_ON_FAILURE = UINT64_C(1) << 23,
+ UNIT_ATOM_ON_SUCCESS = UINT64_C(1) << 24,
+ UNIT_ATOM_ON_FAILURE_OF = UINT64_C(1) << 25,
+ UNIT_ATOM_ON_SUCCESS_OF = UINT64_C(1) << 26,
+ UNIT_ATOM_BEFORE = UINT64_C(1) << 27,
+ UNIT_ATOM_AFTER = UINT64_C(1) << 28,
+ UNIT_ATOM_TRIGGERS = UINT64_C(1) << 29,
+ UNIT_ATOM_TRIGGERED_BY = UINT64_C(1) << 30,
+ UNIT_ATOM_PROPAGATES_RELOAD_TO = UINT64_C(1) << 31,
+ UNIT_ATOM_JOINS_NAMESPACE_OF = UINT64_C(1) << 32,
+ UNIT_ATOM_REFERENCES = UINT64_C(1) << 33,
+ UNIT_ATOM_REFERENCED_BY = UINT64_C(1) << 34,
+ UNIT_ATOM_IN_SLICE = UINT64_C(1) << 35,
+ UNIT_ATOM_SLICE_OF = UINT64_C(1) << 36,
+ _UNIT_DEPENDENCY_ATOM_MAX = (UINT64_C(1) << 37) - 1,
+ _UNIT_DEPENDENCY_ATOM_INVALID = -EINVAL,
+} UnitDependencyAtom;
+
+UnitDependencyAtom unit_dependency_to_atom(UnitDependency d);
+UnitDependency unit_dependency_from_unique_atom(UnitDependencyAtom atom);
diff --git a/src/core/unit-printf.c b/src/core/unit-printf.c
new file mode 100644
index 0000000..9f95984
--- /dev/null
+++ b/src/core/unit-printf.c
@@ -0,0 +1,265 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "cgroup-util.h"
+#include "format-util.h"
+#include "macro.h"
+#include "specifier.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit-name.h"
+#include "unit-printf.h"
+#include "unit.h"
+#include "user-util.h"
+
+static int specifier_prefix_and_instance(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+ const Unit *u = ASSERT_PTR(userdata);
+
+ return unit_name_to_prefix_and_instance(u->id, ret);
+}
+
+static int specifier_prefix(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+ const Unit *u = ASSERT_PTR(userdata);
+
+ return unit_name_to_prefix(u->id, ret);
+}
+
+static int specifier_prefix_unescaped(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+ _cleanup_free_ char *p = NULL;
+ const Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ r = unit_name_to_prefix(u->id, &p);
+ if (r < 0)
+ return r;
+
+ return unit_name_unescape(p, ret);
+}
+
+static int specifier_instance_unescaped(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+ const Unit *u = ASSERT_PTR(userdata);
+
+ return unit_name_unescape(strempty(u->instance), ret);
+}
+
+static int specifier_last_component(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+ const Unit *u = ASSERT_PTR(userdata);
+ _cleanup_free_ char *prefix = NULL;
+ char *dash;
+ int r;
+
+ r = unit_name_to_prefix(u->id, &prefix);
+ if (r < 0)
+ return r;
+
+ dash = strrchr(prefix, '-');
+ if (dash)
+ return specifier_string(specifier, dash + 1, root, userdata, ret);
+
+ *ret = TAKE_PTR(prefix);
+ return 0;
+}
+
+static int specifier_last_component_unescaped(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+ _cleanup_free_ char *p = NULL;
+ int r;
+
+ r = specifier_last_component(specifier, data, root, userdata, &p);
+ if (r < 0)
+ return r;
+
+ return unit_name_unescape(p, ret);
+}
+
+static int specifier_filename(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+ const Unit *u = ASSERT_PTR(userdata);
+
+ if (u->instance)
+ return unit_name_path_unescape(u->instance, ret);
+ else
+ return unit_name_to_path(u->id, ret);
+}
+
+static void bad_specifier(const Unit *u, char specifier) {
+ log_unit_warning(u, "Specifier '%%%c' used in unit configuration, which is deprecated. Please update your unit file, as it does not work as intended.", specifier);
+}
+
+static int specifier_cgroup(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+ const Unit *u = ASSERT_PTR(userdata);
+
+ bad_specifier(u, specifier);
+
+ if (u->cgroup_path) {
+ char *n;
+
+ n = strdup(u->cgroup_path);
+ if (!n)
+ return -ENOMEM;
+
+ *ret = n;
+ return 0;
+ }
+
+ return unit_default_cgroup_path(u, ret);
+}
+
+static int specifier_cgroup_root(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+ const Unit *u = ASSERT_PTR(userdata);
+ char *n;
+
+ bad_specifier(u, specifier);
+
+ n = strdup(u->manager->cgroup_root);
+ if (!n)
+ return -ENOMEM;
+
+ *ret = n;
+ return 0;
+}
+
+static int specifier_cgroup_slice(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+ const Unit *u = ASSERT_PTR(userdata), *slice;
+ char *n;
+
+ bad_specifier(u, specifier);
+
+ slice = UNIT_GET_SLICE(u);
+ if (slice) {
+ if (slice->cgroup_path)
+ n = strdup(slice->cgroup_path);
+ else
+ return unit_default_cgroup_path(slice, ret);
+ } else
+ n = strdup(u->manager->cgroup_root);
+ if (!n)
+ return -ENOMEM;
+
+ *ret = n;
+ return 0;
+}
+
+static int specifier_special_directory(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+ const Unit *u = ASSERT_PTR(userdata);
+ char *n;
+
+ n = strdup(u->manager->prefix[PTR_TO_UINT(data)]);
+ if (!n)
+ return -ENOMEM;
+
+ *ret = n;
+ return 0;
+}
+
+static int specifier_credentials_dir(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+ const Unit *u = ASSERT_PTR(userdata);
+ char *d;
+
+ assert(ret);
+
+ d = strjoin(u->manager->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
+ if (!d)
+ return -ENOMEM;
+
+ *ret = d;
+ return 0;
+}
+
+int unit_name_printf(const Unit *u, const char* format, char **ret) {
+ /*
+ * This will use the passed string as format string and replace the following specifiers (which should all be
+ * safe for inclusion in unit names):
+ *
+ * %n: the full id of the unit (foo-aaa@bar.waldo)
+ * %N: the id of the unit without the suffix (foo-aaa@bar)
+ * %p: the prefix (foo-aaa)
+ * %i: the instance (bar)
+ * %j: the last component of the prefix (aaa)
+ */
+
+ const Specifier table[] = {
+ { 'i', specifier_string, u->instance },
+ { 'j', specifier_last_component, NULL },
+ { 'n', specifier_string, u->id },
+ { 'N', specifier_prefix_and_instance, NULL },
+ { 'p', specifier_prefix, NULL },
+
+ COMMON_SYSTEM_SPECIFIERS,
+
+ COMMON_CREDS_SPECIFIERS(u->manager->runtime_scope),
+ {}
+ };
+
+ assert(u);
+ assert(format);
+ assert(ret);
+
+ return specifier_printf(format, UNIT_NAME_MAX, table, NULL, u, ret);
+}
+
+int unit_full_printf_full(const Unit *u, const char *format, size_t max_length, char **ret) {
+ /* This is similar to unit_name_printf() but also supports unescaping. Also, adds a couple of
+ * additional codes (which are likely not suitable for unescaped inclusion in unit names):
+ *
+ * %f: the unescaped instance if set, otherwise the id unescaped as path
+ *
+ * %c: cgroup path of unit (deprecated)
+ * %r: where units in this slice are placed in the cgroup tree (deprecated)
+ * %R: the root of this systemd's instance tree (deprecated)
+ *
+ * %C: the cache directory root (e.g. /var/cache or $XDG_CACHE_HOME)
+ * %d: the credentials directory ($CREDENTIALS_DIRECTORY)
+ * %E: the configuration directory root (e.g. /etc or $XDG_CONFIG_HOME)
+ * %L: the log directory root (e.g. /var/log or $XDG_STATE_HOME/log)
+ * %S: the state directory root (e.g. /var/lib or $XDG_STATE_HOME)
+ * %t: the runtime directory root (e.g. /run or $XDG_RUNTIME_DIR)
+ *
+ * %h: the homedir of the running user
+ * %s: the shell of the running user
+ *
+ * NOTICE: When you add new entries here, please be careful: specifiers which depend on settings of
+ * the unit file itself are broken by design, as they would resolve differently depending on whether
+ * they are used before or after the relevant configuration setting. Hence: don't add them.
+ */
+
+ assert(u);
+ assert(format);
+ assert(ret);
+
+ const Specifier table[] = {
+ { 'i', specifier_string, u->instance },
+ { 'I', specifier_instance_unescaped, NULL },
+ { 'j', specifier_last_component, NULL },
+ { 'J', specifier_last_component_unescaped, NULL },
+ { 'n', specifier_string, u->id },
+ { 'N', specifier_prefix_and_instance, NULL },
+ { 'p', specifier_prefix, NULL },
+ { 'P', specifier_prefix_unescaped, NULL },
+
+ { 'f', specifier_filename, NULL },
+ { 'y', specifier_real_path, u->fragment_path },
+ { 'Y', specifier_real_directory, u->fragment_path },
+
+ { 'c', specifier_cgroup, NULL }, /* deprecated, see 1b89b0c499cd4bf0ff389caab4ecaae6e75f9d4e */
+ { 'r', specifier_cgroup_slice, NULL }, /* deprecated, see 1b89b0c499cd4bf0ff389caab4ecaae6e75f9d4e */
+ { 'R', specifier_cgroup_root, NULL }, /* deprecated, see 1b89b0c499cd4bf0ff389caab4ecaae6e75f9d4e */
+
+ { 'C', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_CACHE) },
+ { 'd', specifier_credentials_dir, NULL },
+ { 'E', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_CONFIGURATION) },
+ { 'L', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_LOGS) },
+ { 'S', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_STATE) },
+ { 't', specifier_special_directory, UINT_TO_PTR(EXEC_DIRECTORY_RUNTIME) },
+
+ { 'h', specifier_user_home, NULL },
+ { 's', specifier_user_shell, NULL },
+
+ COMMON_SYSTEM_SPECIFIERS,
+
+ COMMON_CREDS_SPECIFIERS(u->manager->runtime_scope),
+
+ COMMON_TMP_SPECIFIERS,
+ {}
+ };
+
+ return specifier_printf(format, max_length, table, NULL, u, ret);
+}
diff --git a/src/core/unit-printf.h b/src/core/unit-printf.h
new file mode 100644
index 0000000..2df07db
--- /dev/null
+++ b/src/core/unit-printf.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "creds-util.h"
+#include "env-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "unit.h"
+
+int unit_name_printf(const Unit *u, const char* text, char **ret);
+int unit_full_printf_full(const Unit *u, const char *text, size_t max_length, char **ret);
+static inline int unit_full_printf(const Unit *u, const char *text, char **ret) {
+ return unit_full_printf_full(u, text, LONG_LINE_MAX, ret);
+}
+static inline int unit_path_printf(const Unit *u, const char *text, char **ret) {
+ return unit_full_printf_full(u, text, PATH_MAX-1, ret);
+}
+static inline int unit_fd_printf(const Unit *u, const char *text, char **ret) {
+ return unit_full_printf_full(u, text, FDNAME_MAX, ret);
+}
+static inline int unit_cred_printf(const Unit *u, const char *text, char **ret) {
+ return unit_full_printf_full(u, text, CREDENTIAL_NAME_MAX, ret);
+}
+static inline int unit_env_printf(const Unit *u, const char *text, char **ret) {
+ return unit_full_printf_full(u, text, sc_arg_max(), ret);
+}
diff --git a/src/core/unit-serialize.c b/src/core/unit-serialize.c
new file mode 100644
index 0000000..fe4221c
--- /dev/null
+++ b/src/core/unit-serialize.c
@@ -0,0 +1,890 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bpf-socket-bind.h"
+#include "bus-util.h"
+#include "dbus.h"
+#include "fileio-label.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "parse-util.h"
+#include "restrict-ifaces.h"
+#include "serialize.h"
+#include "string-table.h"
+#include "unit-serialize.h"
+#include "user-util.h"
+
+/* Serialize a cgroup controller mask as a "key=value" line, with the mask rendered as a
+ * string by cg_mask_to_string(). A zero mask writes nothing and returns 0. */
+static int serialize_cgroup_mask(FILE *f, const char *key, CGroupMask mask) {
+ _cleanup_free_ char *s = NULL;
+ int r;
+
+ assert(f);
+ assert(key);
+
+ if (mask == 0)
+ return 0;
+
+ r = cg_mask_to_string(mask, &s);
+ if (r < 0)
+ return log_error_errno(r, "Failed to format cgroup mask: %m");
+
+ return serialize_item(f, key, s);
+}
+
+/* Make sure our values fit in the bitfield: every UnitMarker bit must be representable
+ * in the Unit.markers field. */
+assert_cc(_UNIT_MARKER_MAX <= sizeof(((Unit){}).markers) * 8);
+
+/* Serialize the set of unit markers as "markers=<name> <name> ...". Nothing is written
+ * when no marker is set.
+ *
+ * Note: a separator is written after each marker name. The previous code concatenated the
+ * names back-to-back, which deserialize_markers() — splitting the value on whitespace via
+ * extract_first_word() — could not take apart: with more than one marker set the combined
+ * string was one unknown word and all markers were silently lost across reload/reexec.
+ * The trailing space before the newline is harmless to the word-splitting parser. */
+static int serialize_markers(FILE *f, unsigned markers) {
+ assert(f);
+
+ if (markers == 0)
+ return 0;
+
+ fputs("markers=", f);
+ for (UnitMarker m = 0; m < _UNIT_MARKER_MAX; m++)
+ if (FLAGS_SET(markers, 1u << m)) {
+ fputs(unit_marker_to_string(m), f);
+ fputc(' ', f);
+ }
+ fputc('\n', f);
+ return 0;
+}
+
+/* Parse a whitespace-separated list of marker names from 'value' and OR the corresponding
+ * bits into u->markers. Unknown marker names are logged at debug level and skipped.
+ * Returns 0 when the input is exhausted, or a negative errno from word extraction. */
+static int deserialize_markers(Unit *u, const char *value) {
+ assert(u);
+ assert(value);
+ int r;
+
+ for (const char *p = value;;) {
+ _cleanup_free_ char *word = NULL;
+
+ r = extract_first_word(&p, &word, NULL, 0);
+ if (r <= 0)
+ return r;
+
+ UnitMarker m = unit_marker_from_string(word);
+ if (m < 0) {
+ /* 'm' is the negative errno-style value returned by the string table
+ * lookup, passed through for the log message. */
+ log_unit_debug_errno(u, m, "Unknown unit marker \"%s\", ignoring.", word);
+ continue;
+ }
+
+ u->markers |= 1u << m;
+ }
+}
+
+/* Field names used as serialization keys for per-metric accounting counters. These strings
+ * are matched verbatim on deserialization, so they should stay stable: changing them would
+ * prevent reading state written by a previous instance across reload/reexec. */
+static const char* const ip_accounting_metric_field_table[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = {
+ [CGROUP_IP_INGRESS_BYTES] = "ip-accounting-ingress-bytes",
+ [CGROUP_IP_INGRESS_PACKETS] = "ip-accounting-ingress-packets",
+ [CGROUP_IP_EGRESS_BYTES] = "ip-accounting-egress-bytes",
+ [CGROUP_IP_EGRESS_PACKETS] = "ip-accounting-egress-packets",
+};
+
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP(ip_accounting_metric_field, CGroupIPAccountingMetric);
+
+/* IO counters carry two fields per metric: the "-base" value accumulated before the
+ * current cgroup incarnation, and the "-last" value most recently read. */
+static const char* const io_accounting_metric_field_base_table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
+ [CGROUP_IO_READ_BYTES] = "io-accounting-read-bytes-base",
+ [CGROUP_IO_WRITE_BYTES] = "io-accounting-write-bytes-base",
+ [CGROUP_IO_READ_OPERATIONS] = "io-accounting-read-operations-base",
+ [CGROUP_IO_WRITE_OPERATIONS] = "io-accounting-write-operations-base",
+};
+
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP(io_accounting_metric_field_base, CGroupIOAccountingMetric);
+
+static const char* const io_accounting_metric_field_last_table[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
+ [CGROUP_IO_READ_BYTES] = "io-accounting-read-bytes-last",
+ [CGROUP_IO_WRITE_BYTES] = "io-accounting-write-bytes-last",
+ [CGROUP_IO_READ_OPERATIONS] = "io-accounting-read-operations-last",
+ [CGROUP_IO_WRITE_OPERATIONS] = "io-accounting-write-operations-last",
+};
+
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP(io_accounting_metric_field_last, CGroupIOAccountingMetric);
+
+/* Sized _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST + 1: only the cached memory metrics
+ * (peaks) are serialized, not the full metric range. */
+static const char* const memory_accounting_metric_field_last_table[_CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST + 1] = {
+ [CGROUP_MEMORY_PEAK] = "memory-accounting-peak",
+ [CGROUP_MEMORY_SWAP_PEAK] = "memory-accounting-swap-peak",
+};
+
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP(memory_accounting_metric_field_last, CGroupMemoryAccountingMetric);
+
+/* Serialize this unit's runtime state to 'f', for consumption by unit_deserialize_state()
+ * after a daemon reload/reexec. Record layout: a start-marker line holding the unit id,
+ * then "key=value" lines (plus type-specific and job records), terminated by an empty
+ * end-marker line. File descriptors that must survive are parked in 'fds'. When
+ * 'switching_root' is set, jobs are not serialized and some unit types opt out entirely. */
+int unit_serialize_state(Unit *u, FILE *f, FDSet *fds, bool switching_root) {
+ int r;
+
+ assert(u);
+ assert(f);
+ assert(fds);
+
+ if (switching_root && UNIT_VTABLE(u)->exclude_from_switch_root_serialization) {
+ /* In the new root, paths for mounts and automounts will be different, so it doesn't make
+ * much sense to serialize things. API file systems will be moved to the new root, but we
+ * don't have mount units for those. */
+ log_unit_debug(u, "not serializing before switch-root");
+ return 0;
+ }
+
+ /* Start marker */
+ fputs(u->id, f);
+ fputc('\n', f);
+
+ /* A unit type must implement both directions of (de)serialization, or neither. */
+ assert(!!UNIT_VTABLE(u)->serialize == !!UNIT_VTABLE(u)->deserialize_item);
+
+ if (UNIT_VTABLE(u)->serialize) {
+ r = UNIT_VTABLE(u)->serialize(u, f, fds);
+ if (r < 0)
+ return r;
+ }
+
+ (void) serialize_dual_timestamp(f, "state-change-timestamp", &u->state_change_timestamp);
+
+ (void) serialize_dual_timestamp(f, "inactive-exit-timestamp", &u->inactive_exit_timestamp);
+ (void) serialize_dual_timestamp(f, "active-enter-timestamp", &u->active_enter_timestamp);
+ (void) serialize_dual_timestamp(f, "active-exit-timestamp", &u->active_exit_timestamp);
+ (void) serialize_dual_timestamp(f, "inactive-enter-timestamp", &u->inactive_enter_timestamp);
+
+ (void) serialize_dual_timestamp(f, "condition-timestamp", &u->condition_timestamp);
+ (void) serialize_dual_timestamp(f, "assert-timestamp", &u->assert_timestamp);
+
+ (void) serialize_ratelimit(f, "start-ratelimit", &u->start_ratelimit);
+ (void) serialize_ratelimit(f, "auto-start-stop-ratelimit", &u->auto_start_stop_ratelimit);
+
+ /* Condition/assert results are only meaningful if they were ever evaluated, which the
+ * corresponding timestamp being set indicates. */
+ if (dual_timestamp_is_set(&u->condition_timestamp))
+ (void) serialize_bool(f, "condition-result", u->condition_result);
+
+ if (dual_timestamp_is_set(&u->assert_timestamp))
+ (void) serialize_bool(f, "assert-result", u->assert_result);
+
+ (void) serialize_bool(f, "transient", u->transient);
+ (void) serialize_bool(f, "in-audit", u->in_audit);
+
+ (void) serialize_bool(f, "exported-invocation-id", u->exported_invocation_id);
+ (void) serialize_bool(f, "exported-log-level-max", u->exported_log_level_max);
+ (void) serialize_bool(f, "exported-log-extra-fields", u->exported_log_extra_fields);
+ (void) serialize_bool(f, "exported-log-rate-limit-interval", u->exported_log_ratelimit_interval);
+ (void) serialize_bool(f, "exported-log-rate-limit-burst", u->exported_log_ratelimit_burst);
+
+ /* Counters that use a "max" sentinel for "never sampled" are only written when set. */
+ (void) serialize_item_format(f, "cpu-usage-base", "%" PRIu64, u->cpu_usage_base);
+ if (u->cpu_usage_last != NSEC_INFINITY)
+ (void) serialize_item_format(f, "cpu-usage-last", "%" PRIu64, u->cpu_usage_last);
+
+ if (u->managed_oom_kill_last > 0)
+ (void) serialize_item_format(f, "managed-oom-kill-last", "%" PRIu64, u->managed_oom_kill_last);
+
+ if (u->oom_kill_last > 0)
+ (void) serialize_item_format(f, "oom-kill-last", "%" PRIu64, u->oom_kill_last);
+
+ for (CGroupIOAccountingMetric im = 0; im < _CGROUP_IO_ACCOUNTING_METRIC_MAX; im++) {
+ (void) serialize_item_format(f, io_accounting_metric_field_base_to_string(im), "%" PRIu64, u->io_accounting_base[im]);
+
+ if (u->io_accounting_last[im] != UINT64_MAX)
+ (void) serialize_item_format(f, io_accounting_metric_field_last_to_string(im), "%" PRIu64, u->io_accounting_last[im]);
+ }
+
+ /* Only the cached memory metrics (<= CACHED_LAST) are persisted. */
+ for (CGroupMemoryAccountingMetric metric = 0; metric <= _CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST; metric++) {
+ uint64_t v;
+
+ r = unit_get_memory_accounting(u, metric, &v);
+ if (r >= 0)
+ (void) serialize_item_format(f, memory_accounting_metric_field_last_to_string(metric), "%" PRIu64, v);
+ }
+
+ if (u->cgroup_path)
+ (void) serialize_item(f, "cgroup", u->cgroup_path);
+
+ (void) serialize_bool(f, "cgroup-realized", u->cgroup_realized);
+ (void) serialize_cgroup_mask(f, "cgroup-realized-mask", u->cgroup_realized_mask);
+ (void) serialize_cgroup_mask(f, "cgroup-enabled-mask", u->cgroup_enabled_mask);
+ (void) serialize_cgroup_mask(f, "cgroup-invalidated-mask", u->cgroup_invalidated_mask);
+
+ /* BPF programs/links survive the reexec as pinned fds stashed in 'fds'. */
+ (void) bpf_serialize_socket_bind(u, f, fds);
+
+ (void) bpf_program_serialize_attachment(f, fds, "ip-bpf-ingress-installed", u->ip_bpf_ingress_installed);
+ (void) bpf_program_serialize_attachment(f, fds, "ip-bpf-egress-installed", u->ip_bpf_egress_installed);
+ (void) bpf_program_serialize_attachment(f, fds, "bpf-device-control-installed", u->bpf_device_control_installed);
+ (void) bpf_program_serialize_attachment_set(f, fds, "ip-bpf-custom-ingress-installed", u->ip_bpf_custom_ingress_installed);
+ (void) bpf_program_serialize_attachment_set(f, fds, "ip-bpf-custom-egress-installed", u->ip_bpf_custom_egress_installed);
+
+ (void) serialize_restrict_network_interfaces(u, f, fds);
+
+ if (uid_is_valid(u->ref_uid))
+ (void) serialize_item_format(f, "ref-uid", UID_FMT, u->ref_uid);
+ if (gid_is_valid(u->ref_gid))
+ (void) serialize_item_format(f, "ref-gid", GID_FMT, u->ref_gid);
+
+ if (!sd_id128_is_null(u->invocation_id))
+ (void) serialize_item_format(f, "invocation-id", SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id));
+
+ (void) serialize_item_format(f, "freezer-state", "%s", freezer_state_to_string(unit_freezer_state(u)));
+ (void) serialize_markers(f, u->markers);
+
+ bus_track_serialize(u->bus_track, f, "ref");
+
+ for (CGroupIPAccountingMetric m = 0; m < _CGROUP_IP_ACCOUNTING_METRIC_MAX; m++) {
+ uint64_t v;
+
+ r = unit_get_ip_accounting(u, m, &v);
+ if (r >= 0)
+ (void) serialize_item_format(f, ip_accounting_metric_field_to_string(m), "%" PRIu64, v);
+ }
+
+ if (!switching_root) {
+ /* Both the installed job and the nop job are written under the same "job"
+ * marker line, each followed by its own serialized record. */
+ if (u->job) {
+ fputs("job\n", f);
+ job_serialize(u->job, f);
+ }
+
+ if (u->nop_job) {
+ fputs("job\n", f);
+ job_serialize(u->nop_job, f);
+ }
+ }
+
+ /* End marker */
+ fputc('\n', f);
+ return 0;
+}
+
+/* Read one serialized job record from 'f', reconstruct the Job object and install it on
+ * the unit. The job is freed automatically on any failure path; on success ownership is
+ * transferred via TAKE_PTR() once installation succeeded. */
+static int unit_deserialize_job(Unit *u, FILE *f) {
+ _cleanup_(job_freep) Job *j = NULL;
+ int r;
+
+ assert(u);
+ assert(f);
+
+ j = job_new_raw(u);
+ if (!j)
+ return log_oom();
+
+ r = job_deserialize(j, f);
+ if (r < 0)
+ return r;
+
+ r = job_install_deserialized(j);
+ if (r < 0)
+ return r;
+
+ TAKE_PTR(j);
+ return 0;
+}
+
+/* Match serialized key 'l' against 'key'; on a match, parse 'v' with parse_func(v) — a
+ * parser that returns the parsed value, or a negative errno on failure — and store the
+ * result in 'target'. Parse failures are logged at debug level and otherwise ignored.
+ * The statement expression evaluates to whether the key matched, so callers can use it
+ * directly in an if/else-if dispatch chain. */
+#define MATCH_DESERIALIZE(key, l, v, parse_func, target) \
+ ({ \
+ bool _deserialize_matched = streq(l, key); \
+ if (_deserialize_matched) { \
+ int _deserialize_r = parse_func(v); \
+ if (_deserialize_r < 0) \
+ log_unit_debug_errno(u, _deserialize_r, \
+ "Failed to parse \"%s=%s\", ignoring.", l, v); \
+ else \
+ target = _deserialize_r; \
+ } \
+ _deserialize_matched; \
+ })
+
+/* Like MATCH_DESERIALIZE, but for parse_func(v, &target) style parsers that write the
+ * result through an out-pointer and return a negative errno on failure. (Also fixed here:
+ * the log message now ends with "ignoring." like its sibling above, and the stray empty
+ * statement after the if-block is gone.) */
+#define MATCH_DESERIALIZE_IMMEDIATE(key, l, v, parse_func, target) \
+ ({ \
+ bool _deserialize_matched = streq(l, key); \
+ if (_deserialize_matched) { \
+ int _deserialize_r = parse_func(v, &target); \
+ if (_deserialize_r < 0) \
+ log_unit_debug_errno(u, _deserialize_r, \
+ "Failed to parse \"%s=%s\", ignoring.", l, v); \
+ } \
+ _deserialize_matched; \
+ })
+
+/* Read back one unit's state from 'f', as written by unit_serialize_state(), consuming
+ * lines until the empty end-marker line or EOF. Known keys are handled here; unknown keys
+ * fall through to the exec-runtime compat handler and finally to the unit type's
+ * deserialize_item() callback. Parse failures are generally logged and ignored so that a
+ * partially unreadable state file does not abort the reload. */
+int unit_deserialize_state(Unit *u, FILE *f, FDSet *fds) {
+ int r;
+
+ assert(u);
+ assert(f);
+ assert(fds);
+
+ for (;;) {
+ _cleanup_free_ char *l = NULL;
+ ssize_t m; /* holds string-table lookups, which may be negative */
+ size_t k;
+ char *v;
+
+ r = deserialize_read_line(f, &l);
+ if (r < 0)
+ return r;
+ if (r == 0) /* eof or end marker */
+ break;
+
+ /* Split "key=value" in place; a line without '=' yields an empty value,
+ * with 'v' pointing at the terminating NUL inside 'l'. */
+ k = strcspn(l, "=");
+
+ if (l[k] == '=') {
+ l[k] = 0;
+ v = l+k+1;
+ } else
+ v = l+k;
+
+ if (streq(l, "job")) {
+ if (v[0] == '\0') {
+ /* New-style serialized job */
+ r = unit_deserialize_job(u, f);
+ if (r < 0)
+ return r;
+ } else /* Legacy for pre-44 */
+ log_unit_warning(u, "Update from too old systemd versions are unsupported, cannot deserialize job: %s", v);
+ continue;
+ } else if (streq(l, "state-change-timestamp")) {
+ (void) deserialize_dual_timestamp(v, &u->state_change_timestamp);
+ continue;
+ } else if (streq(l, "inactive-exit-timestamp")) {
+ (void) deserialize_dual_timestamp(v, &u->inactive_exit_timestamp);
+ continue;
+ } else if (streq(l, "active-enter-timestamp")) {
+ (void) deserialize_dual_timestamp(v, &u->active_enter_timestamp);
+ continue;
+ } else if (streq(l, "active-exit-timestamp")) {
+ (void) deserialize_dual_timestamp(v, &u->active_exit_timestamp);
+ continue;
+ } else if (streq(l, "inactive-enter-timestamp")) {
+ (void) deserialize_dual_timestamp(v, &u->inactive_enter_timestamp);
+ continue;
+ } else if (streq(l, "condition-timestamp")) {
+ (void) deserialize_dual_timestamp(v, &u->condition_timestamp);
+ continue;
+ } else if (streq(l, "assert-timestamp")) {
+ (void) deserialize_dual_timestamp(v, &u->assert_timestamp);
+ continue;
+
+ } else if (streq(l, "start-ratelimit")) {
+ deserialize_ratelimit(&u->start_ratelimit, l, v);
+ continue;
+ } else if (streq(l, "auto-start-stop-ratelimit")) {
+ deserialize_ratelimit(&u->auto_start_stop_ratelimit, l, v);
+ continue;
+
+ } else if (MATCH_DESERIALIZE("condition-result", l, v, parse_boolean, u->condition_result))
+ continue;
+
+ else if (MATCH_DESERIALIZE("assert-result", l, v, parse_boolean, u->assert_result))
+ continue;
+
+ else if (MATCH_DESERIALIZE("transient", l, v, parse_boolean, u->transient))
+ continue;
+
+ else if (MATCH_DESERIALIZE("in-audit", l, v, parse_boolean, u->in_audit))
+ continue;
+
+ else if (MATCH_DESERIALIZE("exported-invocation-id", l, v, parse_boolean, u->exported_invocation_id))
+ continue;
+
+ else if (MATCH_DESERIALIZE("exported-log-level-max", l, v, parse_boolean, u->exported_log_level_max))
+ continue;
+
+ else if (MATCH_DESERIALIZE("exported-log-extra-fields", l, v, parse_boolean, u->exported_log_extra_fields))
+ continue;
+
+ else if (MATCH_DESERIALIZE("exported-log-rate-limit-interval", l, v, parse_boolean, u->exported_log_ratelimit_interval))
+ continue;
+
+ else if (MATCH_DESERIALIZE("exported-log-rate-limit-burst", l, v, parse_boolean, u->exported_log_ratelimit_burst))
+ continue;
+
+ /* "cpuacct-usage-base" is the older spelling of the same field, accepted
+ * for compatibility with state written by previous versions. */
+ else if (MATCH_DESERIALIZE_IMMEDIATE("cpu-usage-base", l, v, safe_atou64, u->cpu_usage_base) ||
+ MATCH_DESERIALIZE_IMMEDIATE("cpuacct-usage-base", l, v, safe_atou64, u->cpu_usage_base))
+ continue;
+
+ else if (MATCH_DESERIALIZE_IMMEDIATE("cpu-usage-last", l, v, safe_atou64, u->cpu_usage_last))
+ continue;
+
+ else if (MATCH_DESERIALIZE_IMMEDIATE("managed-oom-kill-last", l, v, safe_atou64, u->managed_oom_kill_last))
+ continue;
+
+ else if (MATCH_DESERIALIZE_IMMEDIATE("oom-kill-last", l, v, safe_atou64, u->oom_kill_last))
+ continue;
+
+ else if (streq(l, "cgroup")) {
+ r = unit_set_cgroup_path(u, v);
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to set cgroup path %s, ignoring: %m", v);
+
+ (void) unit_watch_cgroup(u);
+ (void) unit_watch_cgroup_memory(u);
+
+ continue;
+
+ } else if (MATCH_DESERIALIZE("cgroup-realized", l, v, parse_boolean, u->cgroup_realized))
+ continue;
+
+ else if (MATCH_DESERIALIZE_IMMEDIATE("cgroup-realized-mask", l, v, cg_mask_from_string, u->cgroup_realized_mask))
+ continue;
+
+ else if (MATCH_DESERIALIZE_IMMEDIATE("cgroup-enabled-mask", l, v, cg_mask_from_string, u->cgroup_enabled_mask))
+ continue;
+
+ else if (MATCH_DESERIALIZE_IMMEDIATE("cgroup-invalidated-mask", l, v, cg_mask_from_string, u->cgroup_invalidated_mask))
+ continue;
+
+ /* BPF attachments come back as fds parked in 'fds' during serialization. */
+ else if (STR_IN_SET(l, "ipv4-socket-bind-bpf-link-fd", "ipv6-socket-bind-bpf-link-fd")) {
+ int fd;
+
+ fd = deserialize_fd(fds, v);
+ if (fd >= 0)
+ (void) bpf_socket_bind_add_initial_link_fd(u, fd);
+ continue;
+
+ } else if (streq(l, "ip-bpf-ingress-installed")) {
+ (void) bpf_program_deserialize_attachment(v, fds, &u->ip_bpf_ingress_installed);
+ continue;
+ } else if (streq(l, "ip-bpf-egress-installed")) {
+ (void) bpf_program_deserialize_attachment(v, fds, &u->ip_bpf_egress_installed);
+ continue;
+ } else if (streq(l, "bpf-device-control-installed")) {
+ (void) bpf_program_deserialize_attachment(v, fds, &u->bpf_device_control_installed);
+ continue;
+
+ } else if (streq(l, "ip-bpf-custom-ingress-installed")) {
+ (void) bpf_program_deserialize_attachment_set(v, fds, &u->ip_bpf_custom_ingress_installed);
+ continue;
+ } else if (streq(l, "ip-bpf-custom-egress-installed")) {
+ (void) bpf_program_deserialize_attachment_set(v, fds, &u->ip_bpf_custom_egress_installed);
+ continue;
+
+ } else if (streq(l, "restrict-ifaces-bpf-fd")) {
+ int fd;
+
+ fd = deserialize_fd(fds, v);
+ if (fd >= 0)
+ (void) restrict_network_interfaces_add_initial_link_fd(u, fd);
+
+ continue;
+
+ } else if (streq(l, "ref-uid")) {
+ uid_t uid;
+
+ r = parse_uid(v, &uid);
+ if (r < 0)
+ log_unit_debug(u, "Failed to parse \"%s=%s\", ignoring.", l, v);
+ else
+ unit_ref_uid_gid(u, uid, GID_INVALID);
+ continue;
+
+ } else if (streq(l, "ref-gid")) {
+ gid_t gid;
+
+ r = parse_gid(v, &gid);
+ if (r < 0)
+ log_unit_debug(u, "Failed to parse \"%s=%s\", ignoring.", l, v);
+ else
+ unit_ref_uid_gid(u, UID_INVALID, gid);
+ continue;
+
+ } else if (streq(l, "ref")) {
+ r = strv_extend(&u->deserialized_refs, v);
+ if (r < 0)
+ return log_oom();
+ continue;
+
+ } else if (streq(l, "invocation-id")) {
+ sd_id128_t id;
+
+ r = sd_id128_from_string(v, &id);
+ if (r < 0)
+ log_unit_debug(u, "Failed to parse \"%s=%s\", ignoring.", l, v);
+ else {
+ r = unit_set_invocation_id(u, id);
+ if (r < 0)
+ log_unit_warning_errno(u, r, "Failed to set invocation ID for unit: %m");
+ }
+
+ continue;
+
+ } else if (MATCH_DESERIALIZE("freezer-state", l, v, freezer_state_from_string, u->freezer_state))
+ continue;
+
+ else if (streq(l, "markers")) {
+ r = deserialize_markers(u, v);
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to deserialize \"%s=%s\", ignoring: %m", l, v);
+ continue;
+ }
+
+ /* Check if this is a memory accounting metric serialization field */
+ m = memory_accounting_metric_field_last_from_string(l);
+ if (m >= 0) {
+ uint64_t c;
+
+ r = safe_atou64(v, &c);
+ if (r < 0)
+ log_unit_debug(u, "Failed to parse memory accounting last value %s, ignoring.", v);
+ else
+ u->memory_accounting_last[m] = c;
+ continue;
+ }
+
+ /* Check if this is an IP accounting metric serialization field */
+ m = ip_accounting_metric_field_from_string(l);
+ if (m >= 0) {
+ uint64_t c;
+
+ r = safe_atou64(v, &c);
+ if (r < 0)
+ log_unit_debug(u, "Failed to parse IP accounting value %s, ignoring.", v);
+ else
+ u->ip_accounting_extra[m] = c;
+ continue;
+ }
+
+ m = io_accounting_metric_field_base_from_string(l);
+ if (m >= 0) {
+ uint64_t c;
+
+ r = safe_atou64(v, &c);
+ if (r < 0)
+ log_unit_debug(u, "Failed to parse IO accounting base value %s, ignoring.", v);
+ else
+ u->io_accounting_base[m] = c;
+ continue;
+ }
+
+ m = io_accounting_metric_field_last_from_string(l);
+ if (m >= 0) {
+ uint64_t c;
+
+ r = safe_atou64(v, &c);
+ if (r < 0)
+ log_unit_debug(u, "Failed to parse IO accounting last value %s, ignoring.", v);
+ else
+ u->io_accounting_last[m] = c;
+ continue;
+ }
+
+ r = exec_shared_runtime_deserialize_compat(u, l, v, fds);
+ if (r < 0) {
+ log_unit_warning(u, "Failed to deserialize runtime parameter '%s', ignoring.", l);
+ continue;
+ } else if (r > 0)
+ /* Returns positive if key was handled by the call */
+ continue;
+
+ /* Last resort: hand the key to the unit type's own deserializer. */
+ if (UNIT_VTABLE(u)->deserialize_item) {
+ r = UNIT_VTABLE(u)->deserialize_item(u, l, v, fds);
+ if (r < 0)
+ log_unit_warning(u, "Failed to deserialize unit parameter '%s', ignoring.", l);
+ }
+ }
+
+ /* Versions before 228 did not carry a state change timestamp. In this case, take the current
+ * time. This is useful, so that timeouts based on this timestamp don't trigger too early, and is
+ * in-line with the logic from before 228 where the base for timeouts was not persistent across
+ * reboots. */
+
+ if (!dual_timestamp_is_set(&u->state_change_timestamp))
+ dual_timestamp_now(&u->state_change_timestamp);
+
+ /* Let's make sure that everything that is deserialized also gets any potential new cgroup settings
+ * applied after we are done. For that we invalidate anything already realized, so that we can
+ * realize it again. */
+ if (u->cgroup_realized) {
+ unit_invalidate_cgroup(u, _CGROUP_MASK_ALL);
+ unit_invalidate_cgroup_bpf(u);
+ }
+
+ return 0;
+}
+
+/* Skip serialized data for one unit without interpreting it (used when the unit is
+ * unknown to the current instance). Returns 1 if the end marker (an empty line) was
+ * found, 0 on EOF, or a negative errno on read failure. */
+int unit_deserialize_state_skip(FILE *f) {
+ int r;
+
+ assert(f);
+
+ /* Skip serialized data for this unit. We don't know what it is. */
+
+ for (;;) {
+ _cleanup_free_ char *line = NULL;
+
+ r = read_stripped_line(f, LONG_LINE_MAX, &line);
+ if (r < 0)
+ return log_error_errno(r, "Failed to read serialization line: %m");
+ if (r == 0)
+ return 0;
+
+ /* End marker */
+ if (isempty(line))
+ return 1;
+ }
+}
+
+/* Print each bit set in 'mask' as "<kind>-<name>" (e.g. "origin-file"), space-separated.
+ * '*space' tracks across calls whether a separating space is needed, so two invocations
+ * (origin + destination) can share one output list. The final assert guarantees that the
+ * table above covers every bit that can appear in a UnitDependencyMask. */
+static void print_unit_dependency_mask(FILE *f, const char *kind, UnitDependencyMask mask, bool *space) {
+ const struct {
+ UnitDependencyMask mask;
+ const char *name;
+ } table[] = {
+ { UNIT_DEPENDENCY_FILE, "file" },
+ { UNIT_DEPENDENCY_IMPLICIT, "implicit" },
+ { UNIT_DEPENDENCY_DEFAULT, "default" },
+ { UNIT_DEPENDENCY_UDEV, "udev" },
+ { UNIT_DEPENDENCY_PATH, "path" },
+ { UNIT_DEPENDENCY_MOUNT_FILE, "mount-file" },
+ { UNIT_DEPENDENCY_MOUNTINFO, "mountinfo" },
+ { UNIT_DEPENDENCY_PROC_SWAP, "proc-swap" },
+ { UNIT_DEPENDENCY_SLICE_PROPERTY, "slice-property" },
+ };
+
+ assert(f);
+ assert(kind);
+ assert(space);
+
+ for (size_t i = 0; i < ELEMENTSOF(table); i++) {
+
+ /* All bits handled already — nothing more to print. */
+ if (mask == 0)
+ break;
+
+ if (FLAGS_SET(mask, table[i].mask)) {
+ if (*space)
+ fputc(' ', f);
+ else
+ *space = true;
+
+ fputs(kind, f);
+ fputs("-", f);
+ fputs(table[i].name, f);
+
+ /* Clear the bit so the mask==0 early-out and final check work. */
+ mask &= ~table[i].mask;
+ }
+ }
+
+ assert(mask == 0);
+}
+
+/* Write a human-readable multi-line description of the unit — identity, timestamps,
+ * cgroup state, dependencies, load-state specific settings and its jobs — to 'f', each
+ * line prefixed with 'prefix'. Intended for debug dumps; the output format is not parsed
+ * programmatically. */
+void unit_dump(Unit *u, FILE *f, const char *prefix) {
+ char *t;
+ const char *prefix2;
+ Unit *following;
+ _cleanup_set_free_ Set *following_set = NULL;
+ CGroupMask m;
+ int r;
+
+ assert(u);
+ assert(u->type >= 0);
+
+ prefix = strempty(prefix);
+ /* prefix2 = prefix plus one extra tab, for nested (per-type and job) dumps. */
+ prefix2 = strjoina(prefix, "\t");
+
+ fprintf(f,
+ "%s-> Unit %s:\n",
+ prefix, u->id);
+
+ SET_FOREACH(t, u->aliases)
+ fprintf(f, "%s\tAlias: %s\n", prefix, t);
+
+ fprintf(f,
+ "%s\tDescription: %s\n"
+ "%s\tInstance: %s\n"
+ "%s\tUnit Load State: %s\n"
+ "%s\tUnit Active State: %s\n"
+ "%s\tState Change Timestamp: %s\n"
+ "%s\tInactive Exit Timestamp: %s\n"
+ "%s\tActive Enter Timestamp: %s\n"
+ "%s\tActive Exit Timestamp: %s\n"
+ "%s\tInactive Enter Timestamp: %s\n"
+ "%s\tMay GC: %s\n"
+ "%s\tNeed Daemon Reload: %s\n"
+ "%s\tTransient: %s\n"
+ "%s\tPerpetual: %s\n"
+ "%s\tGarbage Collection Mode: %s\n",
+ prefix, unit_description(u),
+ prefix, strna(u->instance),
+ prefix, unit_load_state_to_string(u->load_state),
+ prefix, unit_active_state_to_string(unit_active_state(u)),
+ prefix, strna(FORMAT_TIMESTAMP(u->state_change_timestamp.realtime)),
+ prefix, strna(FORMAT_TIMESTAMP(u->inactive_exit_timestamp.realtime)),
+ prefix, strna(FORMAT_TIMESTAMP(u->active_enter_timestamp.realtime)),
+ prefix, strna(FORMAT_TIMESTAMP(u->active_exit_timestamp.realtime)),
+ prefix, strna(FORMAT_TIMESTAMP(u->inactive_enter_timestamp.realtime)),
+ prefix, yes_no(unit_may_gc(u)),
+ prefix, yes_no(unit_need_daemon_reload(u)),
+ prefix, yes_no(u->transient),
+ prefix, yes_no(u->perpetual),
+ prefix, collect_mode_to_string(u->collect_mode));
+
+ if (u->markers != 0) {
+ fprintf(f, "%s\tMarkers:", prefix);
+
+ for (UnitMarker marker = 0; marker < _UNIT_MARKER_MAX; marker++)
+ if (FLAGS_SET(u->markers, 1u << marker))
+ fprintf(f, " %s", unit_marker_to_string(marker));
+ fputs("\n", f);
+ }
+
+ /* cgroup details are only meaningful for unit types that own a cgroup. */
+ if (UNIT_HAS_CGROUP_CONTEXT(u)) {
+ fprintf(f,
+ "%s\tSlice: %s\n"
+ "%s\tCGroup: %s\n"
+ "%s\tCGroup realized: %s\n",
+ prefix, strna(unit_slice_name(u)),
+ prefix, strna(u->cgroup_path),
+ prefix, yes_no(u->cgroup_realized));
+
+ if (u->cgroup_realized_mask != 0) {
+ _cleanup_free_ char *s = NULL;
+ (void) cg_mask_to_string(u->cgroup_realized_mask, &s);
+ fprintf(f, "%s\tCGroup realized mask: %s\n", prefix, strnull(s));
+ }
+
+ if (u->cgroup_enabled_mask != 0) {
+ _cleanup_free_ char *s = NULL;
+ (void) cg_mask_to_string(u->cgroup_enabled_mask, &s);
+ fprintf(f, "%s\tCGroup enabled mask: %s\n", prefix, strnull(s));
+ }
+
+ m = unit_get_own_mask(u);
+ if (m != 0) {
+ _cleanup_free_ char *s = NULL;
+ (void) cg_mask_to_string(m, &s);
+ fprintf(f, "%s\tCGroup own mask: %s\n", prefix, strnull(s));
+ }
+
+ m = unit_get_members_mask(u);
+ if (m != 0) {
+ _cleanup_free_ char *s = NULL;
+ (void) cg_mask_to_string(m, &s);
+ fprintf(f, "%s\tCGroup members mask: %s\n", prefix, strnull(s));
+ }
+
+ m = unit_get_delegate_mask(u);
+ if (m != 0) {
+ _cleanup_free_ char *s = NULL;
+ (void) cg_mask_to_string(m, &s);
+ fprintf(f, "%s\tCGroup delegate mask: %s\n", prefix, strnull(s));
+ }
+ }
+
+ if (!sd_id128_is_null(u->invocation_id))
+ fprintf(f, "%s\tInvocation ID: " SD_ID128_FORMAT_STR "\n",
+ prefix, SD_ID128_FORMAT_VAL(u->invocation_id));
+
+ STRV_FOREACH(j, u->documentation)
+ fprintf(f, "%s\tDocumentation: %s\n", prefix, *j);
+
+ if (u->access_selinux_context)
+ fprintf(f, "%s\tAccess SELinux Context: %s\n", prefix, u->access_selinux_context);
+
+ following = unit_following(u);
+ if (following)
+ fprintf(f, "%s\tFollowing: %s\n", prefix, following->id);
+
+ r = unit_following_set(u, &following_set);
+ if (r >= 0) {
+ Unit *other;
+
+ SET_FOREACH(other, following_set)
+ fprintf(f, "%s\tFollowing Set Member: %s\n", prefix, other->id);
+ }
+
+ if (u->fragment_path)
+ fprintf(f, "%s\tFragment Path: %s\n", prefix, u->fragment_path);
+
+ if (u->source_path)
+ fprintf(f, "%s\tSource Path: %s\n", prefix, u->source_path);
+
+ STRV_FOREACH(j, u->dropin_paths)
+ fprintf(f, "%s\tDropIn Path: %s\n", prefix, *j);
+
+ if (u->failure_action != EMERGENCY_ACTION_NONE)
+ fprintf(f, "%s\tFailure Action: %s\n", prefix, emergency_action_to_string(u->failure_action));
+ if (u->failure_action_exit_status >= 0)
+ fprintf(f, "%s\tFailure Action Exit Status: %i\n", prefix, u->failure_action_exit_status);
+ if (u->success_action != EMERGENCY_ACTION_NONE)
+ fprintf(f, "%s\tSuccess Action: %s\n", prefix, emergency_action_to_string(u->success_action));
+ if (u->success_action_exit_status >= 0)
+ fprintf(f, "%s\tSuccess Action Exit Status: %i\n", prefix, u->success_action_exit_status);
+
+ if (u->job_timeout != USEC_INFINITY)
+ fprintf(f, "%s\tJob Timeout: %s\n", prefix, FORMAT_TIMESPAN(u->job_timeout, 0));
+
+ if (u->job_timeout_action != EMERGENCY_ACTION_NONE)
+ fprintf(f, "%s\tJob Timeout Action: %s\n", prefix, emergency_action_to_string(u->job_timeout_action));
+
+ if (u->job_timeout_reboot_arg)
+ fprintf(f, "%s\tJob Timeout Reboot Argument: %s\n", prefix, u->job_timeout_reboot_arg);
+
+ condition_dump_list(u->conditions, f, prefix, condition_type_to_string);
+ condition_dump_list(u->asserts, f, prefix, assert_type_to_string);
+
+ if (dual_timestamp_is_set(&u->condition_timestamp))
+ fprintf(f,
+ "%s\tCondition Timestamp: %s\n"
+ "%s\tCondition Result: %s\n",
+ prefix, strna(FORMAT_TIMESTAMP(u->condition_timestamp.realtime)),
+ prefix, yes_no(u->condition_result));
+
+ if (dual_timestamp_is_set(&u->assert_timestamp))
+ fprintf(f,
+ "%s\tAssert Timestamp: %s\n"
+ "%s\tAssert Result: %s\n",
+ prefix, strna(FORMAT_TIMESTAMP(u->assert_timestamp.realtime)),
+ prefix, yes_no(u->assert_result));
+
+ /* Dependencies, each annotated with the origin/destination masks that record why the
+ * dependency exists. */
+ for (UnitDependency d = 0; d < _UNIT_DEPENDENCY_MAX; d++) {
+ UnitDependencyInfo di;
+ Unit *other;
+
+ HASHMAP_FOREACH_KEY(di.data, other, unit_get_dependencies(u, d)) {
+ bool space = false;
+
+ fprintf(f, "%s\t%s: %s (", prefix, unit_dependency_to_string(d), other->id);
+
+ print_unit_dependency_mask(f, "origin", di.origin_mask, &space);
+ print_unit_dependency_mask(f, "destination", di.destination_mask, &space);
+
+ fputs(")\n", f);
+ }
+ }
+
+ if (!hashmap_isempty(u->requires_mounts_for)) {
+ UnitDependencyInfo di;
+ const char *path;
+
+ HASHMAP_FOREACH_KEY(di.data, path, u->requires_mounts_for) {
+ bool space = false;
+
+ fprintf(f, "%s\tRequiresMountsFor: %s (", prefix, path);
+
+ print_unit_dependency_mask(f, "origin", di.origin_mask, &space);
+ print_unit_dependency_mask(f, "destination", di.destination_mask, &space);
+
+ fputs(")\n", f);
+ }
+ }
+
+ if (u->load_state == UNIT_LOADED) {
+
+ fprintf(f,
+ "%s\tStopWhenUnneeded: %s\n"
+ "%s\tRefuseManualStart: %s\n"
+ "%s\tRefuseManualStop: %s\n"
+ "%s\tDefaultDependencies: %s\n"
+ "%s\tSurviveFinalKillSignal: %s\n"
+ "%s\tOnSuccessJobMode: %s\n"
+ "%s\tOnFailureJobMode: %s\n"
+ "%s\tIgnoreOnIsolate: %s\n",
+ prefix, yes_no(u->stop_when_unneeded),
+ prefix, yes_no(u->refuse_manual_start),
+ prefix, yes_no(u->refuse_manual_stop),
+ prefix, yes_no(u->default_dependencies),
+ prefix, yes_no(u->survive_final_kill_signal),
+ prefix, job_mode_to_string(u->on_success_job_mode),
+ prefix, job_mode_to_string(u->on_failure_job_mode),
+ prefix, yes_no(u->ignore_on_isolate));
+
+ if (UNIT_VTABLE(u)->dump)
+ UNIT_VTABLE(u)->dump(u, f, prefix2);
+
+ } else if (u->load_state == UNIT_MERGED)
+ fprintf(f,
+ "%s\tMerged into: %s\n",
+ prefix, u->merged_into->id);
+ else if (u->load_state == UNIT_ERROR) {
+ /* %m formats the errno value we just stored from the load error. */
+ errno = abs(u->load_error);
+ fprintf(f, "%s\tLoad Error Code: %m\n", prefix);
+ }
+
+ for (const char *n = sd_bus_track_first(u->bus_track); n; n = sd_bus_track_next(u->bus_track))
+ fprintf(f, "%s\tBus Ref: %s\n", prefix, n);
+
+ if (u->job)
+ job_dump(u->job, f, prefix2);
+
+ if (u->nop_job)
+ job_dump(u->nop_job, f, prefix2);
+}
diff --git a/src/core/unit-serialize.h b/src/core/unit-serialize.h
new file mode 100644
index 0000000..ab8a8e3
--- /dev/null
+++ b/src/core/unit-serialize.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdio.h>
+
+#include "unit.h"
+#include "fdset.h"
+
+/* These functions serialize state for our own usage, i.e.: across a reload/reexec, rather than for being
+ * passed to a child process. */
+
+/* 'switching_root': when true, jobs are not serialized and unit types that set
+ * exclude_from_switch_root_serialization are skipped entirely. (The prototype previously
+ * named this parameter 'serialize_jobs', which contradicted the definition and inverted
+ * its meaning — passing true actually suppresses job serialization.) */
+int unit_serialize_state(Unit *u, FILE *f, FDSet *fds, bool switching_root);
+int unit_deserialize_state(Unit *u, FILE *f, FDSet *fds);
+int unit_deserialize_state_skip(FILE *f);
+
+void unit_dump(Unit *u, FILE *f, const char *prefix);
diff --git a/src/core/unit.c b/src/core/unit.c
new file mode 100644
index 0000000..2fc9f5a
--- /dev/null
+++ b/src/core/unit.c
@@ -0,0 +1,6617 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/prctl.h>
+#include <unistd.h>
+
+#include "sd-id128.h"
+#include "sd-messages.h"
+
+#include "all-units.h"
+#include "alloc-util.h"
+#include "bpf-firewall.h"
+#include "bpf-foreign.h"
+#include "bpf-socket-bind.h"
+#include "bus-common-errors.h"
+#include "bus-internal.h"
+#include "bus-util.h"
+#include "cgroup-setup.h"
+#include "cgroup-util.h"
+#include "chase.h"
+#include "core-varlink.h"
+#include "dbus-unit.h"
+#include "dbus.h"
+#include "dropin.h"
+#include "env-util.h"
+#include "escape.h"
+#include "exec-credential.h"
+#include "execute.h"
+#include "fd-util.h"
+#include "fileio-label.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "id128-util.h"
+#include "install.h"
+#include "iovec-util.h"
+#include "label-util.h"
+#include "load-dropin.h"
+#include "load-fragment.h"
+#include "log.h"
+#include "logarithm.h"
+#include "macro.h"
+#include "mkdir-label.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "rm-rf.h"
+#include "serialize.h"
+#include "set.h"
+#include "signal-util.h"
+#include "sparse-endian.h"
+#include "special.h"
+#include "specifier.h"
+#include "stat-util.h"
+#include "stdio-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "terminal-util.h"
+#include "tmpfile-util.h"
+#include "umask-util.h"
+#include "unit-name.h"
+#include "unit.h"
+#include "user-util.h"
+#include "virt.h"
+#if BPF_FRAMEWORK
+#include "bpf-link.h"
+#endif
+
+/* Thresholds for logging at INFO level about resource consumption */
+#define MENTIONWORTHY_CPU_NSEC (1 * NSEC_PER_SEC)
+#define MENTIONWORTHY_IO_BYTES (1024 * 1024ULL)
+#define MENTIONWORTHY_IP_BYTES (0ULL)
+
+/* Thresholds for logging at NOTICE level about resource consumption */
+#define NOTICEWORTHY_CPU_NSEC (10*60 * NSEC_PER_SEC) /* 10 minutes */
+#define NOTICEWORTHY_IO_BYTES (10 * 1024 * 1024ULL) /* 10 MB */
+#define NOTICEWORTHY_IP_BYTES (128 * 1024 * 1024ULL) /* 128 MB */
+
+const UnitVTable * const unit_vtable[_UNIT_TYPE_MAX] = {
+ [UNIT_SERVICE] = &service_vtable,
+ [UNIT_SOCKET] = &socket_vtable,
+ [UNIT_TARGET] = &target_vtable,
+ [UNIT_DEVICE] = &device_vtable,
+ [UNIT_MOUNT] = &mount_vtable,
+ [UNIT_AUTOMOUNT] = &automount_vtable,
+ [UNIT_SWAP] = &swap_vtable,
+ [UNIT_TIMER] = &timer_vtable,
+ [UNIT_PATH] = &path_vtable,
+ [UNIT_SLICE] = &slice_vtable,
+ [UNIT_SCOPE] = &scope_vtable,
+};
+
+/* Allocate and zero-initialize a new Unit object of 'size' bytes (must be >=
+ * sizeof(Unit), so type-specific state may follow the common header). All
+ * fields whose default is not all-zeroes are set here. Returns NULL on OOM. */
+Unit* unit_new(Manager *m, size_t size) {
+ Unit *u;
+
+ assert(m);
+ assert(size >= sizeof(Unit));
+
+ u = malloc0(size);
+ if (!u)
+ return NULL;
+
+ u->manager = m;
+ u->type = _UNIT_TYPE_INVALID;
+ u->default_dependencies = true;
+ u->unit_file_state = _UNIT_FILE_STATE_INVALID;
+ u->unit_file_preset = -1;
+ u->on_failure_job_mode = JOB_REPLACE;
+ u->on_success_job_mode = JOB_FAIL;
+ /* -1 == no inotify watch established yet */
+ u->cgroup_control_inotify_wd = -1;
+ u->cgroup_memory_inotify_wd = -1;
+ u->job_timeout = USEC_INFINITY;
+ u->job_running_timeout = USEC_INFINITY;
+ u->ref_uid = UID_INVALID;
+ u->ref_gid = GID_INVALID;
+ u->cpu_usage_last = NSEC_INFINITY;
+
+ unit_reset_memory_accounting_last(u);
+
+ unit_reset_io_accounting_last(u);
+
+ u->cgroup_invalidated_mask |= CGROUP_MASK_BPF_FIREWALL;
+ u->failure_action_exit_status = u->success_action_exit_status = -1;
+
+ /* -EBADF marks all BPF map/accounting fds as "not opened yet" */
+ u->ip_accounting_ingress_map_fd = -EBADF;
+ u->ip_accounting_egress_map_fd = -EBADF;
+
+ u->ipv4_allow_map_fd = -EBADF;
+ u->ipv6_allow_map_fd = -EBADF;
+ u->ipv4_deny_map_fd = -EBADF;
+ u->ipv6_deny_map_fd = -EBADF;
+
+ u->last_section_private = -1;
+
+ /* Start rate limiting is configurable via manager defaults... */
+ u->start_ratelimit = (const RateLimit) {
+ m->defaults.start_limit_interval,
+ m->defaults.start_limit_burst,
+ };
+
+ /* ...while the auto start/stop rate limit is hard-coded: 16 times per 10s */
+ u->auto_start_stop_ratelimit = (const RateLimit) { .interval = 10 * USEC_PER_SEC, .burst = 16 };
+
+ return u;
+}
+
+/* Convenience wrapper: allocate a new Unit and register 'name' as its id.
+ * On success *ret takes ownership; on failure the partially built unit is
+ * freed by the cleanup handler. Returns 0 or a negative errno. */
+int unit_new_for_name(Manager *m, size_t size, const char *name, Unit **ret) {
+ _cleanup_(unit_freep) Unit *u = NULL;
+ int r;
+
+ u = unit_new(m, size);
+ if (!u)
+ return -ENOMEM;
+
+ r = unit_add_name(u, name);
+ if (r < 0)
+ return r;
+
+ *ret = TAKE_PTR(u);
+
+ return r;
+}
+
+/* Returns true if 'name' is the unit's primary id or one of its aliases. */
+bool unit_has_name(const Unit *u, const char *name) {
+ assert(u);
+ assert(name);
+
+ return streq_ptr(name, u->id) ||
+ set_contains(u->aliases, name);
+}
+
+/* One-time initialization once the unit's type is known: seed the cgroup,
+ * exec and kill contexts (where the type has them) with manager-wide
+ * defaults, then hand off to the type-specific init() vtable hook. */
+static void unit_init(Unit *u) {
+ CGroupContext *cc;
+ ExecContext *ec;
+ KillContext *kc;
+
+ assert(u);
+ assert(u->manager);
+ assert(u->type >= 0);
+
+ cc = unit_get_cgroup_context(u);
+ if (cc) {
+ cgroup_context_init(cc);
+
+ /* Copy in the manager defaults into the cgroup
+ * context, _before_ the rest of the settings have
+ * been initialized */
+
+ cc->cpu_accounting = u->manager->defaults.cpu_accounting;
+ cc->io_accounting = u->manager->defaults.io_accounting;
+ cc->blockio_accounting = u->manager->defaults.blockio_accounting;
+ cc->memory_accounting = u->manager->defaults.memory_accounting;
+ cc->tasks_accounting = u->manager->defaults.tasks_accounting;
+ cc->ip_accounting = u->manager->defaults.ip_accounting;
+
+ /* Slices are exempt from the default tasks limit */
+ if (u->type != UNIT_SLICE)
+ cc->tasks_max = u->manager->defaults.tasks_max;
+
+ cc->memory_pressure_watch = u->manager->defaults.memory_pressure_watch;
+ cc->memory_pressure_threshold_usec = u->manager->defaults.memory_pressure_threshold_usec;
+ }
+
+ ec = unit_get_exec_context(u);
+ if (ec) {
+ exec_context_init(ec);
+
+ if (u->manager->defaults.oom_score_adjust_set) {
+ ec->oom_score_adjust = u->manager->defaults.oom_score_adjust;
+ ec->oom_score_adjust_set = true;
+ }
+
+ if (MANAGER_IS_SYSTEM(u->manager))
+ ec->keyring_mode = EXEC_KEYRING_SHARED;
+ else {
+ ec->keyring_mode = EXEC_KEYRING_INHERIT;
+
+ /* User manager might have its umask redefined by PAM or UMask=. In this
+ * case let the units it manages inherit this value by default. They can
+ * still tune this value through their own unit file */
+ (void) get_process_umask(0, &ec->umask);
+ }
+ }
+
+ kc = unit_get_kill_context(u);
+ if (kc)
+ kill_context_init(kc);
+
+ if (UNIT_VTABLE(u)->init)
+ UNIT_VTABLE(u)->init(u);
+}
+
+/* Insert 'donated_name' into u->aliases, taking ownership of the string on
+ * success. Returns 0 on success, a negative errno otherwise (the caller keeps
+ * ownership on failure). */
+static int unit_add_alias(Unit *u, char *donated_name) {
+ int r;
+
+ /* Make sure that u->names is allocated. We may leave u->names
+ * empty if we fail later, but this is not a problem. */
+ r = set_ensure_put(&u->aliases, &string_hash_ops, donated_name);
+ if (r < 0)
+ return r;
+ assert(r > 0);
+
+ return 0;
+}
+
+/* Register 'text' as a name for this unit. A template name is first
+ * instantiated with u->instance. The name is validated (unit type, instance
+ * match, aliasing rules, manager-wide name cap) and then either becomes the
+ * unit's primary id (if it has none yet) or is added as an alias.
+ * Returns 0 on success (including when the name is already present), or a
+ * negative errno. */
+int unit_add_name(Unit *u, const char *text) {
+ _cleanup_free_ char *name = NULL, *instance = NULL;
+ UnitType t;
+ int r;
+
+ assert(u);
+ assert(text);
+
+ if (unit_name_is_valid(text, UNIT_NAME_TEMPLATE)) {
+ if (!u->instance)
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL),
+ "instance is not set when adding name '%s': %m", text);
+
+ r = unit_name_replace_instance(text, u->instance, &name);
+ if (r < 0)
+ return log_unit_debug_errno(u, r,
+ "failed to build instance name from '%s': %m", text);
+ } else {
+ name = strdup(text);
+ if (!name)
+ return -ENOMEM;
+ }
+
+ /* Already one of our names? Nothing to do. */
+ if (unit_has_name(u, name))
+ return 0;
+
+ if (hashmap_contains(u->manager->units, name))
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EEXIST),
+ "unit already exist when adding name '%s': %m", name);
+
+ if (!unit_name_is_valid(name, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL),
+ "name '%s' is invalid: %m", name);
+
+ t = unit_name_to_type(name);
+ if (t < 0)
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL),
+ "failed to derive unit type from name '%s': %m", name);
+
+ if (u->type != _UNIT_TYPE_INVALID && t != u->type)
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL),
+ "unit type is illegal: u->type(%d) and t(%d) for name '%s': %m",
+ u->type, t, name);
+
+ r = unit_name_to_instance(name, &instance);
+ if (r < 0)
+ return log_unit_debug_errno(u, r, "failed to extract instance from name '%s': %m", name);
+
+ if (instance && !unit_type_may_template(t))
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL), "templates are not allowed for name '%s': %m", name);
+
+ /* Ensure that this unit either has no instance, or that the instance matches. */
+ if (u->type != _UNIT_TYPE_INVALID && !streq_ptr(u->instance, instance))
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EINVAL),
+ "cannot add name %s, the instances don't match (\"%s\" != \"%s\").",
+ name, instance, u->instance);
+
+ if (u->id && !unit_type_may_alias(t))
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EEXIST),
+ "cannot add name %s, aliases are not allowed for %s units.",
+ name, unit_type_to_string(t));
+
+ if (hashmap_size(u->manager->units) >= MANAGER_MAX_NAMES)
+ return log_unit_warning_errno(u, SYNTHETIC_ERRNO(E2BIG), "cannot add name, manager has too many units: %m");
+
+ /* Add name to the global hashmap first, because that's easier to undo */
+ r = hashmap_put(u->manager->units, name, u);
+ if (r < 0)
+ return log_unit_debug_errno(u, r, "add unit to hashmap failed for name '%s': %m", text);
+
+ if (u->id) {
+ r = unit_add_alias(u, name); /* unit_add_alias() takes ownership of the name on success */
+ if (r < 0) {
+ hashmap_remove(u->manager->units, name);
+ return r;
+ }
+ TAKE_PTR(name);
+
+ } else {
+ /* A new name, we don't need the set yet. */
+ assert(u->type == _UNIT_TYPE_INVALID);
+ assert(!u->instance);
+
+ u->type = t;
+ u->id = TAKE_PTR(name);
+ u->instance = TAKE_PTR(instance);
+
+ LIST_PREPEND(units_by_type, u->manager->units_by_type[t], u);
+ unit_init(u);
+ }
+
+ unit_add_to_dbus_queue(u);
+ return 0;
+}
+
+/* Promote one of the unit's aliases (given by 'name', possibly a template
+ * resolved against u->instance) to be the primary id; the previous id, if
+ * any, becomes an alias. Returns 0 on success, -ENOENT if 'name' is not a
+ * known alias, or another negative errno. */
+int unit_choose_id(Unit *u, const char *name) {
+ _cleanup_free_ char *t = NULL;
+ char *s;
+ int r;
+
+ assert(u);
+ assert(name);
+
+ if (unit_name_is_valid(name, UNIT_NAME_TEMPLATE)) {
+ if (!u->instance)
+ return -EINVAL;
+
+ r = unit_name_replace_instance(name, u->instance, &t);
+ if (r < 0)
+ return r;
+
+ name = t;
+ }
+
+ if (streq_ptr(u->id, name))
+ return 0; /* Nothing to do. */
+
+ /* Selects one of the aliases of this unit as the id */
+ s = set_get(u->aliases, (char*) name);
+ if (!s)
+ return -ENOENT;
+
+ if (u->id) {
+ /* Swap: the old id goes into the alias set, 'name' leaves it */
+ r = set_remove_and_put(u->aliases, name, u->id);
+ if (r < 0)
+ return r;
+ } else
+ assert_se(set_remove(u->aliases, name)); /* see set_get() above… */
+
+ u->id = s; /* Old u->id is now stored in the set, and s is not stored anywhere */
+ unit_add_to_dbus_queue(u);
+
+ return 0;
+}
+
+/* Replace the unit's Description= string (NULL/empty clears it). Queues a
+ * D-Bus change notification only if the value actually changed. */
+int unit_set_description(Unit *u, const char *description) {
+ int r;
+
+ assert(u);
+
+ r = free_and_strdup(&u->description, empty_to_null(description));
+ if (r < 0)
+ return r;
+ if (r > 0)
+ unit_add_to_dbus_queue(u);
+
+ return 0;
+}
+
+/* Returns true if any unit listed in this unit's OnSuccess= or OnFailure=
+ * dependencies currently has a job (or nop job) pending. */
+static bool unit_success_failure_handler_has_jobs(Unit *unit) {
+ Unit *other;
+
+ UNIT_FOREACH_DEPENDENCY(other, unit, UNIT_ATOM_ON_SUCCESS)
+ if (other->job || other->nop_job)
+ return true;
+
+ UNIT_FOREACH_DEPENDENCY(other, unit, UNIT_ATOM_ON_FAILURE)
+ if (other->job || other->nop_job)
+ return true;
+
+ return false;
+}
+
+/* Release runtime resources held by a unit that is inactive/failed and will
+ * not restart: destroy its runtime directory (unless preserved) and invoke
+ * the type-specific release_resources() hook. No-op for busy or perpetual
+ * units. */
+void unit_release_resources(Unit *u) {
+ UnitActiveState state;
+ ExecContext *ec;
+
+ assert(u);
+
+ if (u->job || u->nop_job)
+ return;
+
+ if (u->perpetual)
+ return;
+
+ state = unit_active_state(u);
+ if (!IN_SET(state, UNIT_INACTIVE, UNIT_FAILED))
+ return;
+
+ if (unit_will_restart(u))
+ return;
+
+ ec = unit_get_exec_context(u);
+ if (ec && ec->runtime_directory_preserve_mode == EXEC_PRESERVE_RESTART)
+ exec_context_destroy_runtime_directory(ec, u->manager->prefix[EXEC_DIRECTORY_RUNTIME]);
+
+ if (UNIT_VTABLE(u)->release_resources)
+ UNIT_VTABLE(u)->release_resources(u);
+}
+
+bool unit_may_gc(Unit *u) {
+ UnitActiveState state;
+ int r;
+
+ assert(u);
+
+ /* Checks whether the unit is ready to be unloaded for garbage collection. Returns true when the
+ * unit may be collected, and false if there's some reason to keep it loaded.
+ *
+ * References from other units are *not* checked here. Instead, this is done in unit_gc_sweep(), but
+ * using markers to properly collect dependency loops.
+ */
+
+ if (u->job || u->nop_job)
+ return false;
+
+ if (u->perpetual)
+ return false;
+
+ /* if we saw a cgroup empty event for this unit, stay around until we processed it so that we remove
+ * the empty cgroup if possible. Similar, process any pending OOM events if they are already queued
+ * before we release the unit. */
+ if (u->in_cgroup_empty_queue || u->in_cgroup_oom_queue)
+ return false;
+
+ /* Make sure to send out D-Bus events before we unload the unit */
+ if (u->in_dbus_queue)
+ return false;
+
+ /* Somebody on the bus still holds a reference to this unit */
+ if (sd_bus_track_count(u->bus_track) > 0)
+ return false;
+
+ state = unit_active_state(u);
+
+ /* But we keep the unit object around for longer when it is referenced or configured to not be
+ * gc'ed */
+ switch (u->collect_mode) {
+
+ case COLLECT_INACTIVE:
+ if (state != UNIT_INACTIVE)
+ return false;
+
+ break;
+
+ case COLLECT_INACTIVE_OR_FAILED:
+ if (!IN_SET(state, UNIT_INACTIVE, UNIT_FAILED))
+ return false;
+
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ /* Check if any OnFailure= or OnSuccess= jobs may be pending */
+ if (unit_success_failure_handler_has_jobs(u))
+ return false;
+
+ if (u->cgroup_path) {
+ /* If the unit has a cgroup, then check whether there's anything in it. If so, we should stay
+ * around. Units with active processes should never be collected. */
+
+ r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
+ if (r < 0)
+ log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", empty_to_root(u->cgroup_path));
+ if (r <= 0)
+ return false;
+ }
+
+ /* Finally, give the unit type itself a veto */
+ if (!UNIT_VTABLE(u)->may_gc)
+ return true;
+
+ return UNIT_VTABLE(u)->may_gc(u);
+}
+
+/* Queue the unit for loading; no-op unless it is still a stub and not yet
+ * queued. */
+void unit_add_to_load_queue(Unit *u) {
+ assert(u);
+ assert(u->type != _UNIT_TYPE_INVALID);
+
+ if (u->load_state != UNIT_STUB || u->in_load_queue)
+ return;
+
+ LIST_PREPEND(load_queue, u->manager->load_queue, u);
+ u->in_load_queue = true;
+}
+
+/* Queue the unit for freeing by the manager's cleanup pass (idempotent). */
+void unit_add_to_cleanup_queue(Unit *u) {
+ assert(u);
+
+ if (u->in_cleanup_queue)
+ return;
+
+ LIST_PREPEND(cleanup_queue, u->manager->cleanup_queue, u);
+ u->in_cleanup_queue = true;
+}
+
+/* Queue the unit for garbage collection, if it passes unit_may_gc() and is
+ * not already queued for gc or cleanup. */
+void unit_add_to_gc_queue(Unit *u) {
+ assert(u);
+
+ if (u->in_gc_queue || u->in_cleanup_queue)
+ return;
+
+ if (!unit_may_gc(u))
+ return;
+
+ LIST_PREPEND(gc_queue, u->manager->gc_unit_queue, u);
+ u->in_gc_queue = true;
+}
+
+/* Queue the unit for a D-Bus properties-changed broadcast. Skipped entirely
+ * (but marked as sent) when no bus client is subscribed. */
+void unit_add_to_dbus_queue(Unit *u) {
+ assert(u);
+ assert(u->type != _UNIT_TYPE_INVALID);
+
+ if (u->load_state == UNIT_STUB || u->in_dbus_queue)
+ return;
+
+ /* Shortcut things if nobody cares */
+ if (sd_bus_track_count(u->manager->subscribed) <= 0 &&
+ sd_bus_track_count(u->bus_track) <= 0 &&
+ set_isempty(u->manager->private_buses)) {
+ u->sent_dbus_new_signal = true;
+ return;
+ }
+
+ LIST_PREPEND(dbus_queue, u->manager->dbus_unit_queue, u);
+ u->in_dbus_queue = true;
+}
+
+/* Queue an active StopWhenUnneeded= unit for the "still needed?" check. */
+void unit_submit_to_stop_when_unneeded_queue(Unit *u) {
+ assert(u);
+
+ if (u->in_stop_when_unneeded_queue)
+ return;
+
+ if (!u->stop_when_unneeded)
+ return;
+
+ if (!UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u)))
+ return;
+
+ LIST_PREPEND(stop_when_unneeded_queue, u->manager->stop_when_unneeded_queue, u);
+ u->in_stop_when_unneeded_queue = true;
+}
+
+/* Queue an inactive/failed unit that is upheld (Upholds=) by another unit,
+ * so the manager can (re)start it. */
+void unit_submit_to_start_when_upheld_queue(Unit *u) {
+ assert(u);
+
+ if (u->in_start_when_upheld_queue)
+ return;
+
+ if (!UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u)))
+ return;
+
+ if (!unit_has_dependency(u, UNIT_ATOM_START_STEADILY, NULL))
+ return;
+
+ LIST_PREPEND(start_when_upheld_queue, u->manager->start_when_upheld_queue, u);
+ u->in_start_when_upheld_queue = true;
+}
+
+/* Queue an active unit that is bound (BindsTo=) to another unit, so the
+ * manager can stop it if the unit it requires has gone away. */
+void unit_submit_to_stop_when_bound_queue(Unit *u) {
+ assert(u);
+
+ if (u->in_stop_when_bound_queue)
+ return;
+
+ if (!UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(u)))
+ return;
+
+ if (!unit_has_dependency(u, UNIT_ATOM_CANNOT_BE_ACTIVE_WITHOUT, NULL))
+ return;
+
+ LIST_PREPEND(stop_when_bound_queue, u->manager->stop_when_bound_queue, u);
+ u->in_stop_when_bound_queue = true;
+}
+
+/* Returns true if unit_release_resources() would have anything to do for
+ * this unit: either a type-specific hook exists, or a runtime directory
+ * preserved across restarts needs to be destroyed eventually. */
+static bool unit_can_release_resources(Unit *u) {
+ ExecContext *ec;
+
+ assert(u);
+
+ if (UNIT_VTABLE(u)->release_resources)
+ return true;
+
+ ec = unit_get_exec_context(u);
+ if (ec && ec->runtime_directory_preserve_mode == EXEC_PRESERVE_RESTART)
+ return true;
+
+ return false;
+}
+
+/* Queue the unit for resource release, unless it is busy, perpetual, already
+ * queued, or has nothing releasable (see unit_can_release_resources()). */
+void unit_submit_to_release_resources_queue(Unit *u) {
+ assert(u);
+
+ if (u->in_release_resources_queue)
+ return;
+
+ if (u->job || u->nop_job)
+ return;
+
+ if (u->perpetual)
+ return;
+
+ if (!unit_can_release_resources(u))
+ return;
+
+ LIST_PREPEND(release_resources_queue, u->manager->release_resources_queue, u);
+ u->in_release_resources_queue = true;
+}
+
+static void unit_clear_dependencies(Unit *u) {
+ assert(u);
+
+ /* Removes all dependencies configured on u and their reverse dependencies. */
+
+ /* Outer loop: one per-dependency-type hashmap at a time */
+ for (Hashmap *deps; (deps = hashmap_steal_first(u->dependencies));) {
+
+ for (Unit *other; (other = hashmap_steal_first_key(deps));) {
+ Hashmap *other_deps;
+
+ /* Drop the reverse edges 'other' holds on us, for every type */
+ HASHMAP_FOREACH(other_deps, other->dependencies)
+ hashmap_remove(other_deps, u);
+
+ /* Losing a reference may make 'other' collectable */
+ unit_add_to_gc_queue(other);
+ }
+
+ hashmap_free(deps);
+ }
+
+ u->dependencies = hashmap_free(u->dependencies);
+}
+
+/* For a transient unit, delete its generated fragment file and any drop-ins
+ * that live below the manager's transient config directory (best-effort;
+ * errors are ignored). */
+static void unit_remove_transient(Unit *u) {
+ assert(u);
+
+ if (!u->transient)
+ return;
+
+ if (u->fragment_path)
+ (void) unlink(u->fragment_path);
+
+ STRV_FOREACH(i, u->dropin_paths) {
+ _cleanup_free_ char *p = NULL, *pp = NULL;
+
+ if (path_extract_directory(*i, &p) < 0) /* Get the drop-in directory from the drop-in file */
+ continue;
+
+ if (path_extract_directory(p, &pp) < 0) /* Get the config directory from the drop-in directory */
+ continue;
+
+ /* Only drop transient drop-ins */
+ if (!path_equal(u->manager->lookup_paths.transient, pp))
+ continue;
+
+ /* rmdir() only succeeds once the drop-in dir is empty, which is fine */
+ (void) unlink(*i);
+ (void) rmdir(p);
+ }
+}
+
+/* Tear down this unit's RequiresMountsFor= bookkeeping: for every recorded
+ * path, deregister the unit from the manager's reverse map under the path
+ * and all of its prefixes, freeing map entries that become empty. */
+static void unit_free_requires_mounts_for(Unit *u) {
+ assert(u);
+
+ for (;;) {
+ _cleanup_free_ char *path = NULL;
+
+ path = hashmap_steal_first_key(u->requires_mounts_for);
+ if (!path)
+ break;
+ else {
+ /* VLA buffer for PATH_FOREACH_PREFIX_MORE to iterate prefixes in */
+ char s[strlen(path) + 1];
+
+ PATH_FOREACH_PREFIX_MORE(s, path) {
+ char *y;
+ Set *x;
+
+ /* y receives the map's own copy of the key, needed for freeing below */
+ x = hashmap_get2(u->manager->units_requiring_mounts_for, s, (void**) &y);
+ if (!x)
+ continue;
+
+ (void) set_remove(x, u);
+
+ if (set_isempty(x)) {
+ (void) hashmap_remove(u->manager->units_requiring_mounts_for, y);
+ free(y);
+ set_free(x);
+ }
+ }
+ }
+ }
+
+ u->requires_mounts_for = hashmap_free(u->requires_mounts_for);
+}
+
+/* Run the type-specific done() hook and tear down the unit's exec and cgroup
+ * contexts. Safe to call on a unit whose type was never established. */
+static void unit_done(Unit *u) {
+ ExecContext *ec;
+ CGroupContext *cc;
+
+ assert(u);
+
+ if (u->type < 0)
+ return;
+
+ if (UNIT_VTABLE(u)->done)
+ UNIT_VTABLE(u)->done(u);
+
+ ec = unit_get_exec_context(u);
+ if (ec)
+ exec_context_done(ec);
+
+ cc = unit_get_cgroup_context(u);
+ if (cc)
+ cgroup_context_done(cc);
+}
+
+/* Destroy a Unit: detach it from every manager-side data structure (name
+ * hashmaps, per-type lists, all work queues), cancel its jobs, drop bus and
+ * BPF state, release its cgroup, and free all owned memory. NULL-tolerant;
+ * always returns NULL so callers can write u = unit_free(u). */
+Unit* unit_free(Unit *u) {
+ Unit *slice;
+ char *t;
+
+ if (!u)
+ return NULL;
+
+ sd_event_source_disable_unref(u->auto_start_stop_event_source);
+
+ u->transient_file = safe_fclose(u->transient_file);
+
+ /* During reload the unit will be recreated, so keep its transient files */
+ if (!MANAGER_IS_RELOADING(u->manager))
+ unit_remove_transient(u);
+
+ bus_unit_send_removed_signal(u);
+
+ unit_done(u);
+
+ unit_dequeue_rewatch_pids(u);
+
+ u->match_bus_slot = sd_bus_slot_unref(u->match_bus_slot);
+ u->bus_track = sd_bus_track_unref(u->bus_track);
+ u->deserialized_refs = strv_free(u->deserialized_refs);
+ u->pending_freezer_invocation = sd_bus_message_unref(u->pending_freezer_invocation);
+
+ unit_free_requires_mounts_for(u);
+
+ /* Deregister all of our names from the global unit hashmap */
+ SET_FOREACH(t, u->aliases)
+ hashmap_remove_value(u->manager->units, t, u);
+ if (u->id)
+ hashmap_remove_value(u->manager->units, u->id, u);
+
+ if (!sd_id128_is_null(u->invocation_id))
+ hashmap_remove_value(u->manager->units_by_invocation_id, &u->invocation_id, u);
+
+ if (u->job) {
+ Job *j = u->job;
+ job_uninstall(j);
+ job_free(j);
+ }
+
+ if (u->nop_job) {
+ Job *j = u->nop_job;
+ job_uninstall(j);
+ job_free(j);
+ }
+
+ /* A unit is being dropped from the tree, make sure our family is realized properly. Do this after we
+ * detach the unit from slice tree in order to eliminate its effect on controller masks. */
+ slice = UNIT_GET_SLICE(u);
+ unit_clear_dependencies(u);
+ if (slice)
+ unit_add_family_to_cgroup_realize_queue(slice);
+
+ if (u->on_console)
+ manager_unref_console(u->manager);
+
+ fdset_free(u->initial_socket_bind_link_fds);
+#if BPF_FRAMEWORK
+ bpf_link_free(u->ipv4_socket_bind_link);
+ bpf_link_free(u->ipv6_socket_bind_link);
+#endif
+
+ unit_release_cgroup(u);
+
+ if (!MANAGER_IS_RELOADING(u->manager))
+ unit_unlink_state_files(u);
+
+ unit_unref_uid_gid(u, false);
+
+ (void) manager_update_failed_units(u->manager, u, false);
+ set_remove(u->manager->startup_units, u);
+
+ unit_unwatch_all_pids(u);
+
+ while (u->refs_by_target)
+ unit_ref_unset(u->refs_by_target);
+
+ /* Unlink from the per-type list and from every work queue we may sit on */
+ if (u->type != _UNIT_TYPE_INVALID)
+ LIST_REMOVE(units_by_type, u->manager->units_by_type[u->type], u);
+
+ if (u->in_load_queue)
+ LIST_REMOVE(load_queue, u->manager->load_queue, u);
+
+ if (u->in_dbus_queue)
+ LIST_REMOVE(dbus_queue, u->manager->dbus_unit_queue, u);
+
+ if (u->in_cleanup_queue)
+ LIST_REMOVE(cleanup_queue, u->manager->cleanup_queue, u);
+
+ if (u->in_gc_queue)
+ LIST_REMOVE(gc_queue, u->manager->gc_unit_queue, u);
+
+ if (u->in_cgroup_realize_queue)
+ LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
+
+ if (u->in_cgroup_empty_queue)
+ LIST_REMOVE(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
+
+ if (u->in_cgroup_oom_queue)
+ LIST_REMOVE(cgroup_oom_queue, u->manager->cgroup_oom_queue, u);
+
+ if (u->in_target_deps_queue)
+ LIST_REMOVE(target_deps_queue, u->manager->target_deps_queue, u);
+
+ if (u->in_stop_when_unneeded_queue)
+ LIST_REMOVE(stop_when_unneeded_queue, u->manager->stop_when_unneeded_queue, u);
+
+ if (u->in_start_when_upheld_queue)
+ LIST_REMOVE(start_when_upheld_queue, u->manager->start_when_upheld_queue, u);
+
+ if (u->in_stop_when_bound_queue)
+ LIST_REMOVE(stop_when_bound_queue, u->manager->stop_when_bound_queue, u);
+
+ if (u->in_release_resources_queue)
+ LIST_REMOVE(release_resources_queue, u->manager->release_resources_queue, u);
+
+ bpf_firewall_close(u);
+
+ hashmap_free(u->bpf_foreign_by_key);
+
+ bpf_program_free(u->bpf_device_control_installed);
+
+#if BPF_FRAMEWORK
+ bpf_link_free(u->restrict_ifaces_ingress_bpf_link);
+ bpf_link_free(u->restrict_ifaces_egress_bpf_link);
+#endif
+ /* NOTE(review): "restric" (missing 't') appears to match the field's
+ * declaration elsewhere in the project — verify against unit.h before
+ * renaming. */
+ fdset_free(u->initial_restric_ifaces_link_fds);
+
+ condition_free_list(u->conditions);
+ condition_free_list(u->asserts);
+
+ free(u->description);
+ strv_free(u->documentation);
+ free(u->fragment_path);
+ free(u->source_path);
+ strv_free(u->dropin_paths);
+ free(u->instance);
+
+ free(u->job_timeout_reboot_arg);
+ free(u->reboot_arg);
+
+ free(u->access_selinux_context);
+
+ set_free_free(u->aliases);
+ free(u->id);
+
+ activation_details_unref(u->activation_details);
+
+ return mfree(u);
+}
+
+/* Accessor for the unit's cached freezer state. */
+FreezerState unit_freezer_state(Unit *u) {
+ assert(u);
+
+ return u->freezer_state;
+}
+
+/* Query the kernel's view of the unit's freezer state by reading the
+ * "frozen" key from its cgroup.events file. On success stores
+ * FREEZER_RUNNING, FREEZER_FROZEN, or _FREEZER_STATE_INVALID (if the key was
+ * absent or unrecognized) in *ret and returns 0; returns a negative errno on
+ * read failure. */
+int unit_freezer_state_kernel(Unit *u, FreezerState *ret) {
+ char *values[1] = {};
+ int r;
+
+ assert(u);
+
+ r = cg_get_keyed_attribute(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events",
+ STRV_MAKE("frozen"), values);
+ if (r < 0)
+ return r;
+
+ r = _FREEZER_STATE_INVALID;
+
+ if (values[0]) {
+ if (streq(values[0], "0"))
+ r = FREEZER_RUNNING;
+ else if (streq(values[0], "1"))
+ r = FREEZER_FROZEN;
+ }
+
+ free(values[0]);
+ *ret = r;
+
+ return 0;
+}
+
+/* Return the unit's active state, following merge redirections first and
+ * then delegating to the type-specific active_state() hook. */
+UnitActiveState unit_active_state(Unit *u) {
+ assert(u);
+
+ if (u->load_state == UNIT_MERGED)
+ return unit_active_state(unit_follow_merge(u));
+
+ /* After a reload it might happen that a unit is not correctly
+ * loaded but still has a process around. That's why we won't
+ * shortcut failed loading to UNIT_INACTIVE_FAILED. */
+
+ return UNIT_VTABLE(u)->active_state(u);
+}
+
+/* Return the type-specific human-readable sub-state string for the unit. */
+const char* unit_sub_state_to_string(Unit *u) {
+ assert(u);
+
+ return UNIT_VTABLE(u)->sub_state_to_string(u);
+}
+
+/* Absorb all of other's names into u: other's id becomes an alias of u, all
+ * of other's aliases move over, and the manager's global name hashmap is
+ * repointed to u for every name. On success 'other' is left without names. */
+static int unit_merge_names(Unit *u, Unit *other) {
+ char *name;
+ int r;
+
+ assert(u);
+ assert(other);
+
+ r = unit_add_alias(u, other->id);
+ if (r < 0)
+ return r;
+
+ r = set_move(u->aliases, other->aliases);
+ if (r < 0) {
+ /* Roll back the alias added above */
+ set_remove(u->aliases, other->id);
+ return r;
+ }
+
+ /* other->id string is now owned by u->aliases */
+ TAKE_PTR(other->id);
+ other->aliases = set_free_free(other->aliases);
+
+ SET_FOREACH(name, u->aliases)
+ assert_se(hashmap_replace(u->manager->units, name, u) == 0);
+
+ return 0;
+}
+
+/* Pre-allocate room in u's dependency hashmaps so that a subsequent merge of
+ * other's dependencies into u cannot fail with -ENOMEM mid-way. */
+static int unit_reserve_dependencies(Unit *u, Unit *other) {
+ size_t n_reserve;
+ Hashmap* deps;
+ void *d;
+ int r;
+
+ assert(u);
+ assert(other);
+
+ /* Let's reserve some space in the dependency hashmaps so that later on merging the units cannot
+ * fail.
+ *
+ * First make some room in the per dependency type hashmaps. Using the summed size of both units'
+ * hashmaps is an estimate that is likely too high since they probably use some of the same
+ * types. But it's never too low, and that's all we need. */
+
+ n_reserve = MIN(hashmap_size(other->dependencies), LESS_BY((size_t) _UNIT_DEPENDENCY_MAX, hashmap_size(u->dependencies)));
+ if (n_reserve > 0) {
+ r = hashmap_ensure_allocated(&u->dependencies, NULL);
+ if (r < 0)
+ return r;
+
+ r = hashmap_reserve(u->dependencies, n_reserve);
+ if (r < 0)
+ return r;
+ }
+
+ /* Now, enlarge our per dependency type hashmaps by the number of entries in the same hashmap of the
+ * other unit's dependencies.
+ *
+ * NB: If u does not have a dependency set allocated for some dependency type, there is no need to
+ * reserve anything for. In that case other's set will be transferred as a whole to u by
+ * complete_move(). */
+
+ HASHMAP_FOREACH_KEY(deps, d, u->dependencies) {
+ Hashmap *other_deps;
+
+ other_deps = hashmap_get(other->dependencies, d);
+
+ /* hashmap_size(NULL) is 0, so a missing set on other's side is harmless */
+ r = hashmap_reserve(deps, hashmap_size(other_deps));
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+/* Returns true for dependency types whose silent removal during a unit merge
+ * is surprising enough to deserve a log warning. */
+static bool unit_should_warn_about_dependency(UnitDependency dependency) {
+ /* Only warn about some unit types */
+ return IN_SET(dependency,
+ UNIT_CONFLICTS,
+ UNIT_CONFLICTED_BY,
+ UNIT_BEFORE,
+ UNIT_AFTER,
+ UNIT_ON_SUCCESS,
+ UNIT_ON_FAILURE,
+ UNIT_TRIGGERS,
+ UNIT_TRIGGERED_BY);
+}
+
+/* Insert or update the UnitDependencyInfo for 'other' in one per-type
+ * dependency hashmap, OR-ing in the given origin/destination masks. The info
+ * struct is stored bit-packed inside the hashmap's value pointer (hence the
+ * sizeof check). Returns 1 on change, 0 if the masks were already set, or a
+ * negative errno. */
+static int unit_per_dependency_type_hashmap_update(
+ Hashmap *per_type,
+ Unit *other,
+ UnitDependencyMask origin_mask,
+ UnitDependencyMask destination_mask) {
+
+ UnitDependencyInfo info;
+ int r;
+
+ assert(other);
+ assert_cc(sizeof(void*) == sizeof(info));
+
+ /* Acquire the UnitDependencyInfo entry for the Unit* we are interested in, and update it if it
+ * exists, or insert it anew if not. */
+
+ info.data = hashmap_get(per_type, other);
+ if (info.data) {
+ /* Entry already exists. Add in our mask. */
+
+ if (FLAGS_SET(origin_mask, info.origin_mask) &&
+ FLAGS_SET(destination_mask, info.destination_mask))
+ return 0; /* NOP */
+
+ info.origin_mask |= origin_mask;
+ info.destination_mask |= destination_mask;
+
+ r = hashmap_update(per_type, other, info.data);
+ } else {
+ info = (UnitDependencyInfo) {
+ .origin_mask = origin_mask,
+ .destination_mask = destination_mask,
+ };
+
+ r = hashmap_put(per_type, other, info.data);
+ }
+ if (r < 0)
+ return r;
+
+ return 1;
+}
+
+/* Move every dependency edge of 'other' onto 'u', dropping (with a warning
+ * for noteworthy types) edges between the two units themselves, and fixing
+ * up third-party units' reverse edges so they point at 'u'. Must be preceded
+ * by unit_reserve_dependencies() so the hashmap updates cannot fail. */
+static void unit_merge_dependencies(Unit *u, Unit *other) {
+ Hashmap *deps;
+ void *dt; /* Actually of type UnitDependency, except that we don't bother casting it here,
+ * since the hashmaps all want it as void pointer. */
+
+ assert(u);
+ assert(other);
+
+ if (u == other)
+ return;
+
+ /* First, remove dependency to other. */
+ HASHMAP_FOREACH_KEY(deps, dt, u->dependencies) {
+ if (hashmap_remove(deps, other) && unit_should_warn_about_dependency(UNIT_DEPENDENCY_FROM_PTR(dt)))
+ log_unit_warning(u, "Dependency %s=%s is dropped, as %s is merged into %s.",
+ unit_dependency_to_string(UNIT_DEPENDENCY_FROM_PTR(dt)),
+ other->id, other->id, u->id);
+
+ if (hashmap_isempty(deps))
+ hashmap_free(hashmap_remove(u->dependencies, dt));
+ }
+
+ for (;;) {
+ _cleanup_hashmap_free_ Hashmap *other_deps = NULL;
+ UnitDependencyInfo di_back;
+ Unit *back;
+
+ /* Let's focus on one dependency type at a time, that 'other' has defined. */
+ other_deps = hashmap_steal_first_key_and_value(other->dependencies, &dt);
+ if (!other_deps)
+ break; /* done! */
+
+ deps = hashmap_get(u->dependencies, dt);
+
+ /* Now iterate through all dependencies of this dependency type, of 'other'. We refer to the
+ * referenced units as 'back'. */
+ HASHMAP_FOREACH_KEY(di_back.data, back, other_deps) {
+ Hashmap *back_deps;
+ void *back_dt;
+
+ if (back == u) {
+ /* This is a dependency pointing back to the unit we want to merge with?
+ * Suppress it (but warn) */
+ if (unit_should_warn_about_dependency(UNIT_DEPENDENCY_FROM_PTR(dt)))
+ log_unit_warning(u, "Dependency %s=%s in %s is dropped, as %s is merged into %s.",
+ unit_dependency_to_string(UNIT_DEPENDENCY_FROM_PTR(dt)),
+ u->id, other->id, other->id, u->id);
+
+ hashmap_remove(other_deps, back);
+ continue;
+ }
+
+ /* Now iterate through all deps of 'back', and fix the ones pointing to 'other' to
+ * point to 'u' instead. */
+ HASHMAP_FOREACH_KEY(back_deps, back_dt, back->dependencies) {
+ UnitDependencyInfo di_move;
+
+ di_move.data = hashmap_remove(back_deps, other);
+ if (!di_move.data)
+ continue;
+
+ /* Cannot fail thanks to the earlier reservation */
+ assert_se(unit_per_dependency_type_hashmap_update(
+ back_deps,
+ u,
+ di_move.origin_mask,
+ di_move.destination_mask) >= 0);
+ }
+
+ /* The target unit already has dependencies of this type, let's then merge this individually. */
+ if (deps)
+ assert_se(unit_per_dependency_type_hashmap_update(
+ deps,
+ back,
+ di_back.origin_mask,
+ di_back.destination_mask) >= 0);
+ }
+
+ /* Now all references towards 'other' of the current type 'dt' are corrected to point to 'u'.
+ * Let's now move the deps of type 'dt' from 'other' to 'u'. If the unit does not have
+ * dependencies of this type, let's move them per type wholesale. */
+ if (!deps)
+ assert_se(hashmap_put(u->dependencies, dt, TAKE_PTR(other_deps)) >= 0);
+ }
+
+ other->dependencies = hashmap_free(other->dependencies);
+}
+
+/* Merge unit 'other' into 'u': redirect references, move dependencies and
+ * names, and mark 'other' as UNIT_MERGED pointing at 'u'. Only allowed when
+ * both units have the same type and instance, the type supports aliases, and
+ * 'other' is an unloaded/inactive stub with no jobs. Returns 0 on success
+ * (also when other already is u), -EEXIST/-EINVAL when merging is not
+ * permitted, or another negative errno. */
+int unit_merge(Unit *u, Unit *other) {
+ int r;
+
+ assert(u);
+ assert(other);
+ assert(u->manager == other->manager);
+ assert(u->type != _UNIT_TYPE_INVALID);
+
+ other = unit_follow_merge(other);
+
+ if (other == u)
+ return 0;
+
+ if (u->type != other->type)
+ return -EINVAL;
+
+ if (!unit_type_may_alias(u->type)) /* Merging only applies to unit names that support aliases */
+ return -EEXIST;
+
+ if (!IN_SET(other->load_state, UNIT_STUB, UNIT_NOT_FOUND))
+ return -EEXIST;
+
+ if (!streq_ptr(u->instance, other->instance))
+ return -EINVAL;
+
+ if (other->job)
+ return -EEXIST;
+
+ if (other->nop_job)
+ return -EEXIST;
+
+ if (!UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(other)))
+ return -EEXIST;
+
+ /* Make reservations to ensure merge_dependencies() won't fail. We don't rollback reservations if we
+ * fail. We don't have a way to undo reservations. A reservation is not a leak. */
+ r = unit_reserve_dependencies(u, other);
+ if (r < 0)
+ return r;
+
+ /* Redirect all references */
+ while (other->refs_by_target)
+ unit_ref_set(other->refs_by_target, other->refs_by_target->source, u);
+
+ /* Merge dependencies */
+ unit_merge_dependencies(u, other);
+
+ /* Merge names. It is better to do that after merging deps, otherwise the log message contains n/a. */
+ r = unit_merge_names(u, other);
+ if (r < 0)
+ return r;
+
+ other->load_state = UNIT_MERGED;
+ other->merged_into = u;
+
+ if (!u->activation_details)
+ u->activation_details = activation_details_ref(other->activation_details);
+
+ /* If there is still some data attached to the other node, we
+ * don't need it anymore, and can free it. */
+ if (other->load_state != UNIT_STUB)
+ if (UNIT_VTABLE(other)->done)
+ UNIT_VTABLE(other)->done(other);
+
+ unit_add_to_dbus_queue(u);
+ unit_add_to_cleanup_queue(other);
+
+ return 0;
+}
+
+int unit_merge_by_name(Unit *u, const char *name) {
+ _cleanup_free_ char *s = NULL;
+ Unit *other;
+ int r;
+
+ /* Either add name to u, or if a unit with name already exists, merge it with u.
+ * If name is a template, do the same for name@instance, where instance is u's instance. */
+
+ assert(u);
+ assert(name);
+
+ if (unit_name_is_valid(name, UNIT_NAME_TEMPLATE)) {
+ if (!u->instance)
+ return -EINVAL;
+
+ r = unit_name_replace_instance(name, u->instance, &s);
+ if (r < 0)
+ return r;
+
+ name = s;
+ }
+
+ other = manager_get_unit(u->manager, name);
+ if (other)
+ return unit_merge(u, other);
+
+ return unit_add_name(u, name);
+}
+
+/* Resolve a possibly merged-away unit to the unit it was merged into, following the chain to its end. */
+Unit* unit_follow_merge(Unit *u) {
+ assert(u);
+
+ /* merged_into must be non-NULL whenever load_state is UNIT_MERGED, hence assert_se(). */
+ while (u->load_state == UNIT_MERGED)
+ assert_se(u = u->merged_into);
+
+ return u;
+}
+
+/* Add the implicit dependencies a unit acquires through its ExecContext settings: mount
+ * dependencies for the various configured directories, ordering/wants on helper services
+ * (remount-fs, tmpfiles, udev), and journald socket dependencies when journal/kmsg logging
+ * or a log namespace is requested. */
+int unit_add_exec_dependencies(Unit *u, ExecContext *c) {
+ int r;
+
+ assert(u);
+ assert(c);
+
+ /* Unlike unit_add_dependency() or friends, this always returns 0 on success. */
+
+ if (c->working_directory && !c->working_directory_missing_ok) {
+ r = unit_require_mounts_for(u, c->working_directory, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->root_directory) {
+ r = unit_require_mounts_for(u, c->root_directory, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->root_image) {
+ r = unit_require_mounts_for(u, c->root_image, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+ }
+
+ /* Require the mounts backing each configured StateDirectory=/CacheDirectory=/… path. */
+ for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
+ if (!u->manager->prefix[dt])
+ continue;
+
+ for (size_t i = 0; i < c->directories[dt].n_items; i++) {
+ _cleanup_free_ char *p = NULL;
+
+ p = path_join(u->manager->prefix[dt], c->directories[dt].items[i].path);
+ if (!p)
+ return -ENOMEM;
+
+ r = unit_require_mounts_for(u, p, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ /* The dependencies below reference system services, hence only apply to the system manager. */
+ if (!MANAGER_IS_SYSTEM(u->manager))
+ return 0;
+
+ /* For the following three directory types we need write access, and /var/ is possibly on the root
+ * fs. Hence order after systemd-remount-fs.service, to ensure things are writable. */
+ if (c->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
+ c->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
+ c->directories[EXEC_DIRECTORY_LOGS].n_items > 0) {
+ r = unit_add_dependency_by_name(u, UNIT_AFTER, SPECIAL_REMOUNT_FS_SERVICE, true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->private_tmp) {
+
+ /* FIXME: for now we make a special case for /tmp and add a weak dependency on
+ * tmp.mount so /tmp being masked is supported. However there's no reason to treat
+ * /tmp specifically and masking other mount units should be handled more
+ * gracefully too, see PR#16894. */
+ r = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_WANTS, "tmp.mount", true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+
+ r = unit_require_mounts_for(u, "/var/tmp", UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+
+ r = unit_add_dependency_by_name(u, UNIT_AFTER, SPECIAL_TMPFILES_SETUP_SERVICE, true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+ }
+
+ if (c->root_image) {
+ /* We need to wait for /dev/loopX to appear when doing RootImage=, hence let's add an
+ * implicit dependency on udev */
+
+ r = unit_add_dependency_by_name(u, UNIT_AFTER, SPECIAL_UDEVD_SERVICE, true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+ }
+
+ /* If neither stdout nor stderr go to the journal/kmsg and no log namespace is set, we are done. */
+ if (!IN_SET(c->std_output,
+ EXEC_OUTPUT_JOURNAL, EXEC_OUTPUT_JOURNAL_AND_CONSOLE,
+ EXEC_OUTPUT_KMSG, EXEC_OUTPUT_KMSG_AND_CONSOLE) &&
+ !IN_SET(c->std_error,
+ EXEC_OUTPUT_JOURNAL, EXEC_OUTPUT_JOURNAL_AND_CONSOLE,
+ EXEC_OUTPUT_KMSG, EXEC_OUTPUT_KMSG_AND_CONSOLE) &&
+ !c->log_namespace)
+ return 0;
+
+ /* If syslog or kernel logging is requested (or log namespacing is), make sure our own logging daemon
+ * is run first. */
+
+ if (c->log_namespace) {
+ _cleanup_free_ char *socket_unit = NULL, *varlink_socket_unit = NULL;
+
+ /* Namespaced logging requires the per-namespace journald socket units. */
+ r = unit_name_build_from_type("systemd-journald", c->log_namespace, UNIT_SOCKET, &socket_unit);
+ if (r < 0)
+ return r;
+
+ r = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_REQUIRES, socket_unit, true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+
+ r = unit_name_build_from_type("systemd-journald-varlink", c->log_namespace, UNIT_SOCKET, &varlink_socket_unit);
+ if (r < 0)
+ return r;
+
+ r = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_REQUIRES, varlink_socket_unit, true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+ } else {
+ r = unit_add_dependency_by_name(u, UNIT_AFTER, SPECIAL_JOURNALD_SOCKET, true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+ }
+
+ r = unit_add_default_credential_dependencies(u, c);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+/* Return a human-readable description of the unit: the configured Description= if set,
+ * otherwise the unit id (or "n/a" if even that is unset, via strna()). Never returns NULL. */
+const char* unit_description(Unit *u) {
+ assert(u);
+
+ if (u->description)
+ return u->description;
+
+ return strna(u->id);
+}
+
+const char* unit_status_string(Unit *u, char **ret_combined_buffer) {
+ assert(u);
+ assert(u->id);
+
+ /* Return u->id, u->description, or "{u->id} - {u->description}".
+ * Versions with u->description are only used if it is set.
+ * The last option is used if configured and the caller provided the 'ret_combined_buffer'
+ * pointer.
+ *
+ * Note that *ret_combined_buffer may be set to NULL. The caller owns any returned buffer
+ * and must free it. */
+
+ if (!u->description ||
+ u->manager->status_unit_format == STATUS_UNIT_FORMAT_NAME ||
+ (u->manager->status_unit_format == STATUS_UNIT_FORMAT_COMBINED && !ret_combined_buffer) ||
+ streq(u->description, u->id)) {
+
+ if (ret_combined_buffer)
+ *ret_combined_buffer = NULL;
+ return u->id;
+ }
+
+ if (ret_combined_buffer) {
+ if (u->manager->status_unit_format == STATUS_UNIT_FORMAT_COMBINED) {
+ *ret_combined_buffer = strjoin(u->id, " - ", u->description);
+ if (*ret_combined_buffer)
+ return *ret_combined_buffer;
+ log_oom(); /* Fall back to ->description */
+ } else
+ *ret_combined_buffer = NULL;
+ }
+
+ return u->description;
+}
+
+/* Common implementation for multiple backends.
+ * Loads the unit's fragment file and drop-in snippets. If 'fragment_required' is true and no
+ * fragment was found (unit still in UNIT_STUB), returns -ENOENT; otherwise a missing fragment
+ * just marks the unit UNIT_LOADED. Also records the mtime of the source file, if any. */
+int unit_load_fragment_and_dropin(Unit *u, bool fragment_required) {
+ int r;
+
+ assert(u);
+
+ /* Load a .{service,socket,...} file */
+ r = unit_load_fragment(u);
+ if (r < 0)
+ return r;
+
+ if (u->load_state == UNIT_STUB) {
+ if (fragment_required)
+ return -ENOENT;
+
+ u->load_state = UNIT_LOADED;
+ }
+
+ /* Load drop-in directory data. If u is an alias, we might be reloading the
+ * target unit needlessly. But we cannot be sure which drops-ins have already
+ * been loaded and which not, at least without doing complicated book-keeping,
+ * so let's always reread all drop-ins. */
+ r = unit_load_dropin(unit_follow_merge(u));
+ if (r < 0)
+ return r;
+
+ if (u->source_path) {
+ struct stat st;
+
+ /* A stat() failure simply resets the recorded source mtime. */
+ if (stat(u->source_path, &st) >= 0)
+ u->source_mtime = timespec_load(&st.st_mtim);
+ else
+ u->source_mtime = 0;
+ }
+
+ return 0;
+}
+
+/* Enqueue the unit on the manager's target-dependency queue, unless it is already queued. */
+void unit_add_to_target_deps_queue(Unit *u) {
+ Manager *m = ASSERT_PTR(ASSERT_PTR(u)->manager);
+
+ if (u->in_target_deps_queue)
+ return;
+
+ LIST_PREPEND(target_deps_queue, m->target_deps_queue, u);
+ u->in_target_deps_queue = true;
+}
+
+/* Order 'target' after 'u' (default dependency), if both sides are loaded targets/units that
+ * permit default dependencies and no ordering loop would be created. Returns 0 when skipped. */
+int unit_add_default_target_dependency(Unit *u, Unit *target) {
+ assert(u);
+ assert(target);
+
+ if (target->type != UNIT_TARGET)
+ return 0;
+
+ /* Only add the dependency if both units are loaded, so that
+ * that loop check below is reliable */
+ if (u->load_state != UNIT_LOADED ||
+ target->load_state != UNIT_LOADED)
+ return 0;
+
+ /* If either side wants no automatic dependencies, then let's
+ * skip this */
+ if (!u->default_dependencies ||
+ !target->default_dependencies)
+ return 0;
+
+ /* Don't create loops */
+ if (unit_has_dependency(target, UNIT_ATOM_BEFORE, u))
+ return 0;
+
+ return unit_add_dependency(target, UNIT_AFTER, u, true, UNIT_DEPENDENCY_DEFAULT);
+}
+
+/* Add After=/Requires= on the unit's slice (or the root slice if none is configured),
+ * for all unit types that have a cgroup context. */
+static int unit_add_slice_dependencies(Unit *u) {
+ Unit *slice;
+ assert(u);
+
+ if (!UNIT_HAS_CGROUP_CONTEXT(u))
+ return 0;
+
+ /* Slice units are implicitly ordered against their parent slices (as this relationship is encoded in the
+ name), while all other units are ordered based on configuration (as in their case Slice= configures the
+ relationship). */
+ UnitDependencyMask mask = u->type == UNIT_SLICE ? UNIT_DEPENDENCY_IMPLICIT : UNIT_DEPENDENCY_FILE;
+
+ slice = UNIT_GET_SLICE(u);
+ if (slice)
+ return unit_add_two_dependencies(u, UNIT_AFTER, UNIT_REQUIRES, slice, true, mask);
+
+ /* The root slice itself has no parent to depend on. */
+ if (unit_has_name(u, SPECIAL_ROOT_SLICE))
+ return 0;
+
+ return unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_REQUIRES, SPECIAL_ROOT_SLICE, true, mask);
+}
+
+/* For every path in u->requires_mounts_for, add After= (and Requires= for units with a fragment)
+ * on the mount units covering the path and each of its prefixes.
+ * Returns > 0 if any dependency was added, 0 if nothing changed, negative errno on failure. */
+static int unit_add_mount_dependencies(Unit *u) {
+ UnitDependencyInfo di;
+ const char *path;
+ bool changed = false;
+ int r;
+
+ assert(u);
+
+ HASHMAP_FOREACH_KEY(di.data, path, u->requires_mounts_for) {
+ char prefix[strlen(path) + 1];
+
+ PATH_FOREACH_PREFIX_MORE(prefix, path) {
+ _cleanup_free_ char *p = NULL;
+ Unit *m;
+
+ r = unit_name_from_path(prefix, ".mount", &p);
+ if (r == -EINVAL)
+ continue; /* If the path cannot be converted to a mount unit name, then it's
+ * not manageable as a unit by systemd, and hence we don't need a
+ * dependency on it. Let's thus silently ignore the issue. */
+ if (r < 0)
+ return r;
+
+ m = manager_get_unit(u->manager, p);
+ if (!m) {
+ /* Make sure to load the mount unit if it exists. If so the dependencies on
+ * this unit will be added later during the loading of the mount unit. */
+ (void) manager_load_unit_prepare(u->manager, p, NULL, NULL, &m);
+ continue;
+ }
+ if (m == u)
+ continue;
+
+ if (m->load_state != UNIT_LOADED)
+ continue;
+
+ r = unit_add_dependency(u, UNIT_AFTER, m, true, di.origin_mask);
+ if (r < 0)
+ return r;
+ changed = changed || r > 0;
+
+ /* Only require the mount if it is backed by an actual unit file. */
+ if (m->fragment_path) {
+ r = unit_add_dependency(u, UNIT_REQUIRES, m, true, di.origin_mask);
+ if (r < 0)
+ return r;
+ changed = changed || r > 0;
+ }
+ }
+ }
+
+ return changed;
+}
+
+/* If the unit's cgroup context requests ManagedOOM* kill behaviour, add After=/Wants= on
+ * systemd-oomd.service — but only on a pure cgroup-v2 system with the memory controller
+ * available, since oomd depends on both. */
+static int unit_add_oomd_dependencies(Unit *u) {
+ CGroupContext *c;
+ CGroupMask mask;
+ int r;
+
+ assert(u);
+
+ if (!u->default_dependencies)
+ return 0;
+
+ c = unit_get_cgroup_context(u);
+ if (!c)
+ return 0;
+
+ bool wants_oomd = c->moom_swap == MANAGED_OOM_KILL || c->moom_mem_pressure == MANAGED_OOM_KILL;
+ if (!wants_oomd)
+ return 0;
+
+ if (!cg_all_unified())
+ return 0;
+
+ r = cg_mask_supported(&mask);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to determine supported controllers: %m");
+
+ if (!FLAGS_SET(mask, CGROUP_MASK_MEMORY))
+ return 0;
+
+ return unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_WANTS, "systemd-oomd.service", true, UNIT_DEPENDENCY_FILE);
+}
+
+/* Register the unit in the manager's startup_units set if it carries Startup*= cgroup settings. */
+static int unit_add_startup_units(Unit *u) {
+ if (!unit_has_startup_cgroup_constraints(u))
+ return 0;
+
+ return set_ensure_put(&u->manager->startup_units, NULL, u);
+}
+
+/* Refuse (with -ENOEXEC) configurations combining JobMode=isolate with more than one unit in the
+ * given dependency list (OnFailure=/OnSuccess=), since isolate only makes sense for a single job.
+ * 'job_mode_setting' and 'dependency_name' are only used for the error message. */
+static int unit_validate_on_failure_job_mode(
+ Unit *u,
+ const char *job_mode_setting,
+ JobMode job_mode,
+ const char *dependency_name,
+ UnitDependencyAtom atom) {
+
+ Unit *other, *found = NULL;
+
+ if (job_mode != JOB_ISOLATE)
+ return 0;
+
+ UNIT_FOREACH_DEPENDENCY(other, u, atom) {
+ if (!found)
+ found = other;
+ else if (found != other)
+ return log_unit_error_errno(
+ u, SYNTHETIC_ERRNO(ENOEXEC),
+ "More than one %s dependencies specified but %sisolate set. Refusing.",
+ dependency_name, job_mode_setting);
+ }
+
+ return 0;
+}
+
+/* Load a unit that is still in UNIT_STUB state: finalize any transient file, call the type's
+ * load() vtable hook, and on success add the implicit slice/mount/oomd/startup dependencies and
+ * validate OnFailure=/OnSuccess= job modes. On failure the unit is put into UNIT_NOT_FOUND,
+ * UNIT_BAD_SETTING (for -ENOEXEC) or UNIT_ERROR state, and the error is returned. */
+int unit_load(Unit *u) {
+ int r;
+
+ assert(u);
+
+ /* Drop the unit from the load queue, if queued — we are processing it right now. */
+ if (u->in_load_queue) {
+ LIST_REMOVE(load_queue, u->manager->load_queue, u);
+ u->in_load_queue = false;
+ }
+
+ if (u->type == _UNIT_TYPE_INVALID)
+ return -EINVAL;
+
+ if (u->load_state != UNIT_STUB)
+ return 0;
+
+ if (u->transient_file) {
+ /* Finalize transient file: if this is a transient unit file, as soon as we reach unit_load() the setup
+ * is complete, hence let's synchronize the unit file we just wrote to disk. */
+
+ r = fflush_and_check(u->transient_file);
+ if (r < 0)
+ goto fail;
+
+ u->transient_file = safe_fclose(u->transient_file);
+ u->fragment_mtime = now(CLOCK_REALTIME);
+ }
+
+ r = UNIT_VTABLE(u)->load(u);
+ if (r < 0)
+ goto fail;
+
+ assert(u->load_state != UNIT_STUB);
+
+ if (u->load_state == UNIT_LOADED) {
+ unit_add_to_target_deps_queue(u);
+
+ r = unit_add_slice_dependencies(u);
+ if (r < 0)
+ goto fail;
+
+ r = unit_add_mount_dependencies(u);
+ if (r < 0)
+ goto fail;
+
+ r = unit_add_oomd_dependencies(u);
+ if (r < 0)
+ goto fail;
+
+ r = unit_add_startup_units(u);
+ if (r < 0)
+ goto fail;
+
+ r = unit_validate_on_failure_job_mode(u, "OnSuccessJobMode=", u->on_success_job_mode, "OnSuccess=", UNIT_ATOM_ON_SUCCESS);
+ if (r < 0)
+ goto fail;
+
+ r = unit_validate_on_failure_job_mode(u, "OnFailureJobMode=", u->on_failure_job_mode, "OnFailure=", UNIT_ATOM_ON_FAILURE);
+ if (r < 0)
+ goto fail;
+
+ if (u->job_running_timeout != USEC_INFINITY && u->job_running_timeout > u->job_timeout)
+ log_unit_warning(u, "JobRunningTimeoutSec= is greater than JobTimeoutSec=, it has no effect.");
+
+ /* We finished loading, let's ensure our parents recalculate the members mask */
+ unit_invalidate_cgroup_members_masks(u);
+ }
+
+ assert((u->load_state != UNIT_MERGED) == !u->merged_into);
+
+ unit_add_to_dbus_queue(unit_follow_merge(u));
+ unit_add_to_gc_queue(u);
+ (void) manager_varlink_send_managed_oom_update(u);
+
+ return 0;
+
+fail:
+ /* We convert ENOEXEC errors to the UNIT_BAD_SETTING load state here. Configuration parsing code
+ * should hence return ENOEXEC to ensure units are placed in this state after loading. */
+
+ u->load_state = u->load_state == UNIT_STUB ? UNIT_NOT_FOUND :
+ r == -ENOEXEC ? UNIT_BAD_SETTING :
+ UNIT_ERROR;
+ u->load_error = r;
+
+ /* Record the timestamp on the cache, so that if the cache gets updated between now and the next time
+ * an attempt is made to load this unit, we know we need to check again. */
+ if (u->load_state == UNIT_NOT_FOUND)
+ u->fragment_not_found_timestamp_hash = u->manager->unit_cache_timestamp_hash;
+
+ unit_add_to_dbus_queue(u);
+ unit_add_to_gc_queue(u);
+
+ return log_unit_debug_errno(u, r, "Failed to load configuration: %m");
+}
+
+/* Logging callback for condition_test_list(): logs with the unit's id/invocation-id fields when
+ * 'userdata' carries a Unit, plain otherwise. Returns the negative errno value (like the log_*
+ * helpers do), also when the message is suppressed by the unit's log level. */
+_printf_(7, 8)
+static int log_unit_internal(void *userdata, int level, int error, const char *file, int line, const char *func, const char *format, ...) {
+ Unit *u = userdata;
+ va_list ap;
+ int r;
+
+ if (u && !unit_log_level_test(u, level))
+ return -ERRNO_VALUE(error);
+
+ va_start(ap, format);
+ if (u)
+ r = log_object_internalv(level, error, file, line, func,
+ u->manager->unit_log_field,
+ u->id,
+ u->manager->invocation_log_field,
+ u->invocation_id_string,
+ format, ap);
+ else
+ r = log_internalv(level, error, file, line, func, format, ap);
+ va_end(ap);
+
+ return r;
+}
+
+/* Evaluate the unit's Condition*= list, store the result and timestamp on the unit, and return it.
+ * If the effective environment cannot be determined, the conditions are treated as met (true). */
+static bool unit_test_condition(Unit *u) {
+ _cleanup_strv_free_ char **env = NULL;
+ int r;
+
+ assert(u);
+
+ dual_timestamp_now(&u->condition_timestamp);
+
+ r = manager_get_effective_environment(u->manager, &env);
+ if (r < 0) {
+ log_unit_error_errno(u, r, "Failed to determine effective environment: %m");
+ u->condition_result = true;
+ } else
+ u->condition_result = condition_test_list(
+ u->conditions,
+ env,
+ condition_type_to_string,
+ log_unit_internal,
+ u);
+
+ unit_add_to_dbus_queue(u);
+ return u->condition_result;
+}
+
+/* Evaluate the unit's Assert*= list, store the result and timestamp on the unit, and return it.
+ * Unlike unit_test_condition(), an environment lookup failure yields CONDITION_ERROR (falsy). */
+static bool unit_test_assert(Unit *u) {
+ _cleanup_strv_free_ char **env = NULL;
+ int r;
+
+ assert(u);
+
+ dual_timestamp_now(&u->assert_timestamp);
+
+ r = manager_get_effective_environment(u->manager, &env);
+ if (r < 0) {
+ log_unit_error_errno(u, r, "Failed to determine effective environment: %m");
+ u->assert_result = CONDITION_ERROR;
+ } else
+ u->assert_result = condition_test_list(
+ u->asserts,
+ env,
+ assert_type_to_string,
+ log_unit_internal,
+ u);
+
+ unit_add_to_dbus_queue(u);
+ return u->assert_result;
+}
+
+/* Print a console status line for the unit, optionally colorizing/combining the identifier
+ * according to the manager's StatusUnitFormat= setting. */
+void unit_status_printf(Unit *u, StatusType status_type, const char *status, const char *format, const char *ident) {
+ if (log_get_show_color()) {
+ /* A space in 'ident' indicates it is already the combined "id - description" form. */
+ if (u->manager->status_unit_format == STATUS_UNIT_FORMAT_COMBINED && strchr(ident, ' '))
+ ident = strjoina(ANSI_HIGHLIGHT, u->id, ANSI_NORMAL, " - ", u->description);
+ else
+ ident = strjoina(ANSI_HIGHLIGHT, ident, ANSI_NORMAL);
+ }
+
+ DISABLE_WARNING_FORMAT_NONLITERAL;
+ manager_status_printf(u->manager, status_type, status, format, ident);
+ REENABLE_WARNING;
+}
+
+/* Check the unit's start rate limit. Returns 0 if starting is allowed; otherwise marks the
+ * start limit as hit, triggers the configured StartLimitAction= and returns -ECANCELED. */
+int unit_test_start_limit(Unit *u) {
+ const char *reason;
+
+ assert(u);
+
+ if (ratelimit_below(&u->start_ratelimit)) {
+ u->start_limit_hit = false;
+ return 0;
+ }
+
+ log_unit_warning(u, "Start request repeated too quickly.");
+ u->start_limit_hit = true;
+
+ reason = strjoina("unit ", u->id, " failed");
+
+ emergency_action(u->manager, u->start_limit_action,
+ EMERGENCY_ACTION_IS_WATCHDOG|EMERGENCY_ACTION_WARN,
+ u->reboot_arg, -1, reason);
+
+ return -ECANCELED;
+}
+
+static bool unit_verify_deps(Unit *u) {
+ Unit *other;
+
+ assert(u);
+
+ /* Checks whether all BindsTo= dependencies of this unit are fulfilled — if they are also combined
+ * with After=. We do not check Requires= or Requisite= here as they only should have an effect on
+ * the job processing, but do not have any effect afterwards. We don't check BindsTo= dependencies
+ * that are not used in conjunction with After= as for them any such check would make things entirely
+ * racy.
+ *
+ * Returns true if starting may proceed, false if some ordered BindsTo= dependency is not active. */
+
+ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_CANNOT_BE_ACTIVE_WITHOUT) {
+
+ if (!unit_has_dependency(u, UNIT_ATOM_AFTER, other))
+ continue;
+
+ if (!UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(other))) {
+ log_unit_notice(u, "Bound to unit %s, but unit isn't active.", other->id);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/* Errors that aren't really errors:
+ * -EALREADY: Unit is already started.
+ * -ECOMM: Condition failed
+ * -EAGAIN: An operation is already in progress. Retry later.
+ *
+ * Errors that are real errors:
+ * -EBADR: This unit type does not support starting.
+ * -ECANCELED: Start limit hit, too many requests for now
+ * -EPROTO: Assert failed
+ * -EINVAL: Unit not loaded
+ * -EOPNOTSUPP: Unit type not supported
+ * -ENOLINK: The necessary dependencies are not fulfilled.
+ * -ESTALE: This unit has been started before and can't be started a second time
+ * -ENOENT: This is a triggering unit and unit to trigger is not loaded
+ */
+int unit_start(Unit *u, ActivationDetails *details) {
+ UnitActiveState state;
+ Unit *following;
+ int r;
+
+ assert(u);
+
+ /* Let's hold off running start jobs for mount units when /proc/self/mountinfo monitor is ratelimited. */
+ if (UNIT_VTABLE(u)->subsystem_ratelimited) {
+ r = UNIT_VTABLE(u)->subsystem_ratelimited(u->manager);
+ if (r < 0)
+ return r;
+ if (r > 0)
+ return -EAGAIN;
+ }
+
+ /* If this is already started, then this will succeed. Note that this will even succeed if this unit
+ * is not startable by the user. This is relied on to detect when we need to wait for units and when
+ * waiting is finished. */
+ state = unit_active_state(u);
+ if (UNIT_IS_ACTIVE_OR_RELOADING(state))
+ return -EALREADY;
+ if (state == UNIT_MAINTENANCE)
+ return -EAGAIN;
+
+ /* Units that aren't loaded cannot be started */
+ if (u->load_state != UNIT_LOADED)
+ return -EINVAL;
+
+ /* Refuse starting scope units more than once */
+ if (UNIT_VTABLE(u)->once_only && dual_timestamp_is_set(&u->inactive_enter_timestamp))
+ return -ESTALE;
+
+ /* If the conditions were unmet, don't do anything at all. If we already are activating this call might
+ * still be useful to speed up activation in case there is some hold-off time, but we don't want to
+ * recheck the condition in that case. */
+ if (state != UNIT_ACTIVATING &&
+ !unit_test_condition(u))
+ return log_unit_debug_errno(u, SYNTHETIC_ERRNO(ECOMM), "Starting requested but condition not met. Not starting unit.");
+
+ /* If the asserts failed, fail the entire job */
+ if (state != UNIT_ACTIVATING &&
+ !unit_test_assert(u))
+ return log_unit_notice_errno(u, SYNTHETIC_ERRNO(EPROTO), "Starting requested but asserts failed.");
+
+ /* Units of types that aren't supported cannot be started. Note that we do this test only after the
+ * condition checks, so that we rather return condition check errors (which are usually not
+ * considered a true failure) than "not supported" errors (which are considered a failure).
+ */
+ if (!unit_type_supported(u->type))
+ return -EOPNOTSUPP;
+
+ /* Let's make sure that the deps really are in order before we start this. Normally the job engine
+ * should have taken care of this already, but let's check this here again. After all, our
+ * dependencies might not be in effect anymore, due to a reload or due to an unmet condition. */
+ if (!unit_verify_deps(u))
+ return -ENOLINK;
+
+ /* Forward to the main object, if we aren't it. */
+ following = unit_following(u);
+ if (following) {
+ log_unit_debug(u, "Redirecting start request from %s to %s.", u->id, following->id);
+ return unit_start(following, details);
+ }
+
+ /* Check our ability to start early so that failure conditions don't cause us to enter a busy loop. */
+ if (UNIT_VTABLE(u)->can_start) {
+ r = UNIT_VTABLE(u)->can_start(u);
+ if (r < 0)
+ return r;
+ }
+
+ /* If it is stopped, but we cannot start it, then fail */
+ if (!UNIT_VTABLE(u)->start)
+ return -EBADR;
+
+ /* We don't suppress calls to ->start() here when we are already starting, to allow this request to
+ * be used as a "hurry up" call, for example when the unit is in some "auto restart" state where it
+ * waits for a holdoff timer to elapse before it will start again. */
+
+ unit_add_to_dbus_queue(u);
+ /* Make sure a frozen unit gets thawed before we attempt to start it. */
+ unit_cgroup_freezer_action(u, FREEZER_THAW);
+
+ if (!u->activation_details) /* Older details object wins */
+ u->activation_details = activation_details_ref(details);
+
+ return UNIT_VTABLE(u)->start(u);
+}
+
+/* Whether the unit can (currently) be started at all: it must be loaded, of a supported type,
+ * not a once-only (scope-like) unit that already ran, and its type must implement start(). */
+bool unit_can_start(Unit *u) {
+ assert(u);
+
+ if (u->load_state != UNIT_LOADED)
+ return false;
+
+ if (!unit_type_supported(u->type))
+ return false;
+
+ /* Scope units may be started only once */
+ if (UNIT_VTABLE(u)->once_only && dual_timestamp_is_set(&u->inactive_exit_timestamp))
+ return false;
+
+ return !!UNIT_VTABLE(u)->start;
+}
+
+/* A unit may be isolated if it is startable and has AllowIsolate= enabled. */
+bool unit_can_isolate(Unit *u) {
+ assert(u);
+
+ return unit_can_start(u) &&
+ u->allow_isolate;
+}
+
+/* Errors:
+ * -EBADR: This unit type does not support stopping.
+ * -EALREADY: Unit is already stopped.
+ * -EAGAIN: An operation is already in progress. Retry later.
+ */
+int unit_stop(Unit *u) {
+ UnitActiveState state;
+ Unit *following;
+
+ assert(u);
+
+ state = unit_active_state(u);
+ if (UNIT_IS_INACTIVE_OR_FAILED(state))
+ return -EALREADY;
+
+ /* Forward to the main object, if we aren't it. */
+ following = unit_following(u);
+ if (following) {
+ log_unit_debug(u, "Redirecting stop request from %s to %s.", u->id, following->id);
+ return unit_stop(following);
+ }
+
+ if (!UNIT_VTABLE(u)->stop)
+ return -EBADR;
+
+ unit_add_to_dbus_queue(u);
+ /* Thaw the unit first, so that the stop logic can actually run. */
+ unit_cgroup_freezer_action(u, FREEZER_THAW);
+
+ return UNIT_VTABLE(u)->stop(u);
+}
+
+bool unit_can_stop(Unit *u) {
+ assert(u);
+
+ /* Note: if we return true here, it does not mean that the unit may be successfully stopped.
+ * Extrinsic units follow external state and they may stop following external state changes
+ * (hence we return true here), but an attempt to do this through the manager will fail. */
+
+ if (!unit_type_supported(u->type))
+ return false;
+
+ /* Perpetual units (e.g. the root slice) are never stoppable. */
+ if (u->perpetual)
+ return false;
+
+ return !!UNIT_VTABLE(u)->stop;
+}
+
+/* Errors:
+ * -EBADR: This unit type does not support reloading.
+ * -ENOEXEC: Unit is not started.
+ * -EAGAIN: An operation is already in progress. Retry later.
+ */
+int unit_reload(Unit *u) {
+ UnitActiveState state;
+ Unit *following;
+
+ assert(u);
+
+ if (u->load_state != UNIT_LOADED)
+ return -EINVAL;
+
+ if (!unit_can_reload(u))
+ return -EBADR;
+
+ state = unit_active_state(u);
+ if (state == UNIT_RELOADING)
+ return -EAGAIN;
+
+ if (state != UNIT_ACTIVE)
+ return log_unit_warning_errno(u, SYNTHETIC_ERRNO(ENOEXEC), "Unit cannot be reloaded because it is inactive.");
+
+ /* Forward to the main object, if we aren't it. */
+ following = unit_following(u);
+ if (following) {
+ log_unit_debug(u, "Redirecting reload request from %s to %s.", u->id, following->id);
+ return unit_reload(following);
+ }
+
+ unit_add_to_dbus_queue(u);
+
+ if (!UNIT_VTABLE(u)->reload) {
+ /* Unit doesn't have a reload function, but we need to propagate the reload anyway */
+ unit_notify(u, unit_active_state(u), unit_active_state(u), /* reload_success = */ true);
+ return 0;
+ }
+
+ /* Thaw the unit so the reload can take effect. */
+ unit_cgroup_freezer_action(u, FREEZER_THAW);
+
+ return UNIT_VTABLE(u)->reload(u);
+}
+
+/* Whether the unit supports reloading: defer to the type's can_reload() hook if present,
+ * otherwise reloadable if it propagates reloads to other units or implements reload(). */
+bool unit_can_reload(Unit *u) {
+ assert(u);
+
+ if (UNIT_VTABLE(u)->can_reload)
+ return UNIT_VTABLE(u)->can_reload(u);
+
+ if (unit_has_dependency(u, UNIT_ATOM_PROPAGATES_RELOAD_TO, NULL))
+ return true;
+
+ return UNIT_VTABLE(u)->reload;
+}
+
+/* Whether a StopWhenUnneeded= unit may be stopped now: it must be stably active with no job,
+ * and no unit that pins it may be active, transitioning, queued or about to restart. */
+bool unit_is_unneeded(Unit *u) {
+ Unit *other;
+ assert(u);
+
+ if (!u->stop_when_unneeded)
+ return false;
+
+ /* Don't clean up while the unit is transitioning or is even inactive. */
+ if (unit_active_state(u) != UNIT_ACTIVE)
+ return false;
+ if (u->job)
+ return false;
+
+ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_PINS_STOP_WHEN_UNNEEDED) {
+ /* If a dependent unit has a job queued, is active or transitioning, or is marked for
+ * restart, then don't clean this one up. */
+
+ if (other->job)
+ return false;
+
+ if (!UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(other)))
+ return false;
+
+ if (unit_will_restart(other))
+ return false;
+ }
+
+ return true;
+}
+
+bool unit_is_upheld_by_active(Unit *u, Unit **ret_culprit) {
+ Unit *other;
+
+ assert(u);
+
+ /* Checks if the unit needs to be started because it currently is not running, but some other unit
+ * that is active declared an Uphold= dependencies on it. If so, returns true and (optionally)
+ * stores the upholding unit in *ret_culprit; otherwise returns false with *ret_culprit = NULL. */
+
+ if (!UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u)) || u->job) {
+ if (ret_culprit)
+ *ret_culprit = NULL;
+ return false;
+ }
+
+ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_START_STEADILY) {
+ /* Skip upholders that have a job pending — their state is about to change anyway. */
+ if (other->job)
+ continue;
+
+ if (UNIT_IS_ACTIVE_OR_RELOADING(unit_active_state(other))) {
+ if (ret_culprit)
+ *ret_culprit = other;
+ return true;
+ }
+ }
+
+ if (ret_culprit)
+ *ret_culprit = NULL;
+ return false;
+}
+
+bool unit_is_bound_by_inactive(Unit *u, Unit **ret_culprit) {
+ Unit *other;
+
+ assert(u);
+
+ /* Checks whether this unit is bound to another unit that is inactive, i.e. whether we should stop
+ * because the other unit is down. If so, returns true and (optionally) stores the inactive unit
+ * in *ret_culprit; otherwise returns false with *ret_culprit = NULL. */
+
+ if (unit_active_state(u) != UNIT_ACTIVE || u->job) {
+ /* Don't clean up while the unit is transitioning or is even inactive. */
+ if (ret_culprit)
+ *ret_culprit = NULL;
+ return false;
+ }
+
+ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_CANNOT_BE_ACTIVE_WITHOUT) {
+ /* Skip bound-to units that have a job pending — their state is about to change anyway. */
+ if (other->job)
+ continue;
+
+ if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(other))) {
+ if (ret_culprit)
+ *ret_culprit = other;
+
+ return true;
+ }
+ }
+
+ if (ret_culprit)
+ *ret_culprit = NULL;
+ return false;
+}
+
+static void check_unneeded_dependencies(Unit *u) {
+ Unit *other;
+ assert(u);
+
+ /* Add all units this unit depends on to the queue that processes StopWhenUnneeded= behaviour. */
+
+ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_ADD_STOP_WHEN_UNNEEDED_QUEUE)
+ unit_submit_to_stop_when_unneeded_queue(other);
+}
+
+static void check_uphold_dependencies(Unit *u) {
+ Unit *other;
+ assert(u);
+
+ /* Add all units this unit depends on to the queue that processes Uphold= behaviour. */
+
+ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_ADD_START_WHEN_UPHELD_QUEUE)
+ unit_submit_to_start_when_upheld_queue(other);
+}
+
+static void check_bound_by_dependencies(Unit *u) {
+ Unit *other;
+ assert(u);
+
+ /* Add all units this unit depends on to the queue that processes BindsTo= stop behaviour. */
+
+ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_ADD_CANNOT_BE_ACTIVE_WITHOUT_QUEUE)
+ unit_submit_to_stop_when_bound_queue(other);
+}
+
+/* Invoked when a unit became active outside of job processing: enqueue start jobs for its
+ * Requires=/BindsTo=/Wants= dependencies (unless ordered After= us — then the job engine
+ * handled them) and stop jobs for its Conflicts= partners. Job errors are ignored. */
+static void retroactively_start_dependencies(Unit *u) {
+ Unit *other;
+
+ assert(u);
+ assert(UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(u)));
+
+ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_RETROACTIVE_START_REPLACE) /* Requires= + BindsTo= */
+ if (!unit_has_dependency(u, UNIT_ATOM_AFTER, other) &&
+ !UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(other)))
+ manager_add_job(u->manager, JOB_START, other, JOB_REPLACE, NULL, NULL, NULL);
+
+ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_RETROACTIVE_START_FAIL) /* Wants= */
+ if (!unit_has_dependency(u, UNIT_ATOM_AFTER, other) &&
+ !UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(other)))
+ manager_add_job(u->manager, JOB_START, other, JOB_FAIL, NULL, NULL, NULL);
+
+ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_RETROACTIVE_STOP_ON_START) /* Conflicts= (and inverse) */
+ if (!UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(other)))
+ manager_add_job(u->manager, JOB_STOP, other, JOB_REPLACE, NULL, NULL, NULL);
+}
+
+/* Invoked when a unit stopped outside of job processing: enqueue stop jobs for units that are
+ * bound to it (BoundBy=). Job errors are ignored. */
+static void retroactively_stop_dependencies(Unit *u) {
+ Unit *other;
+
+ assert(u);
+ assert(UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(u)));
+
+ /* Pull down units which are bound to us recursively if enabled */
+ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_RETROACTIVE_STOP_ON_STOP) /* BoundBy= */
+ if (!UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(other)))
+ manager_add_job(u->manager, JOB_STOP, other, JOB_REPLACE, NULL, NULL, NULL);
+}
+
+/* Enqueue start jobs for the units listed in the given OnFailure=/OnSuccess= dependency atom,
+ * using 'job_mode'. Per-unit enqueue failures are logged and ignored. 'dependency_name' is used
+ * only for log messages. */
+void unit_start_on_failure(
+ Unit *u,
+ const char *dependency_name,
+ UnitDependencyAtom atom,
+ JobMode job_mode) {
+
+ int n_jobs = -1; /* -1 while no dependency has been seen yet, so the header is logged only once */
+ Unit *other;
+ int r;
+
+ assert(u);
+ assert(dependency_name);
+ assert(IN_SET(atom, UNIT_ATOM_ON_SUCCESS, UNIT_ATOM_ON_FAILURE));
+
+ /* Act on OnFailure= and OnSuccess= dependencies */
+
+ UNIT_FOREACH_DEPENDENCY(other, u, atom) {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+
+ if (n_jobs < 0) {
+ log_unit_info(u, "Triggering %s dependencies.", dependency_name);
+ n_jobs = 0;
+ }
+
+ r = manager_add_job(u->manager, JOB_START, other, job_mode, NULL, &error, NULL);
+ if (r < 0)
+ log_unit_warning_errno(
+ u, r, "Failed to enqueue %s job, ignoring: %s",
+ dependency_name, bus_error_message(&error, r));
+ n_jobs ++;
+ }
+
+ if (n_jobs >= 0)
+ log_unit_debug(u, "Triggering %s dependencies done (%i %s).",
+ dependency_name, n_jobs, n_jobs == 1 ? "job" : "jobs");
+}
+
+/* Notify every unit that is triggered by this one (via its trigger_notify() vtable hook)
+ * that this unit's state changed. */
+void unit_trigger_notify(Unit *u) {
+ Unit *other;
+
+ assert(u);
+
+ UNIT_FOREACH_DEPENDENCY(other, u, UNIT_ATOM_TRIGGERED_BY)
+ if (UNIT_VTABLE(other)->trigger_notify)
+ UNIT_VTABLE(other)->trigger_notify(other, u);
+}
+
+/* Raise (numerically lower) a syslog log level to LOG_NOTICE or LOG_INFO if the corresponding
+ * condition holds; never lowers the level. Note LOG_NOTICE < LOG_INFO numerically. */
+static int raise_level(int log_level, bool condition_info, bool condition_notice) {
+ if (condition_notice && log_level > LOG_NOTICE)
+ return LOG_NOTICE;
+ if (condition_info && log_level > LOG_INFO)
+ return LOG_INFO;
+ return log_level;
+}
+
+static int unit_log_resources(Unit *u) {
+ struct iovec iovec[1 + 2 + _CGROUP_IP_ACCOUNTING_METRIC_MAX + _CGROUP_IO_ACCOUNTING_METRIC_MAX + 4];
+ bool any_traffic = false, have_ip_accounting = false, any_io = false, have_io_accounting = false;
+ _cleanup_free_ char *igress = NULL, *egress = NULL, *rr = NULL, *wr = NULL;
+ int log_level = LOG_DEBUG; /* May be raised if resources consumed over a threshold */
+ size_t n_message_parts = 0, n_iovec = 0;
+ char* message_parts[1 + 2 + 2 + 2 + 1], *t;
+ nsec_t nsec = NSEC_INFINITY;
+ uint64_t memory_peak = UINT64_MAX, memory_swap_peak = UINT64_MAX;
+ int r;
+ const char* const ip_fields[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = {
+ [CGROUP_IP_INGRESS_BYTES] = "IP_METRIC_INGRESS_BYTES",
+ [CGROUP_IP_INGRESS_PACKETS] = "IP_METRIC_INGRESS_PACKETS",
+ [CGROUP_IP_EGRESS_BYTES] = "IP_METRIC_EGRESS_BYTES",
+ [CGROUP_IP_EGRESS_PACKETS] = "IP_METRIC_EGRESS_PACKETS",
+ };
+ const char* const io_fields[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
+ [CGROUP_IO_READ_BYTES] = "IO_METRIC_READ_BYTES",
+ [CGROUP_IO_WRITE_BYTES] = "IO_METRIC_WRITE_BYTES",
+ [CGROUP_IO_READ_OPERATIONS] = "IO_METRIC_READ_OPERATIONS",
+ [CGROUP_IO_WRITE_OPERATIONS] = "IO_METRIC_WRITE_OPERATIONS",
+ };
+
+ assert(u);
+
+ /* Invoked whenever a unit enters failed or dead state. Logs information about consumed resources if resource
+ * accounting was enabled for a unit. It does this in two ways: a friendly human readable string with reduced
+ * information and the complete data in structured fields. */
+
+ (void) unit_get_cpu_usage(u, &nsec);
+ if (nsec != NSEC_INFINITY) {
+ /* Format the CPU time for inclusion in the structured log message */
+ if (asprintf(&t, "CPU_USAGE_NSEC=%" PRIu64, nsec) < 0) {
+ r = log_oom();
+ goto finish;
+ }
+ iovec[n_iovec++] = IOVEC_MAKE_STRING(t);
+
+ /* Format the CPU time for inclusion in the human language message string */
+ t = strjoin("consumed ", FORMAT_TIMESPAN(nsec / NSEC_PER_USEC, USEC_PER_MSEC), " CPU time");
+ if (!t) {
+ r = log_oom();
+ goto finish;
+ }
+
+ message_parts[n_message_parts++] = t;
+
+ log_level = raise_level(log_level,
+ nsec > MENTIONWORTHY_CPU_NSEC,
+ nsec > NOTICEWORTHY_CPU_NSEC);
+ }
+
+ (void) unit_get_memory_accounting(u, CGROUP_MEMORY_PEAK, &memory_peak);
+ if (memory_peak != UINT64_MAX) {
+ /* Format peak memory for inclusion in the structured log message */
+ if (asprintf(&t, "MEMORY_PEAK=%" PRIu64, memory_peak) < 0) {
+ r = log_oom();
+ goto finish;
+ }
+ iovec[n_iovec++] = IOVEC_MAKE_STRING(t);
+
+ /* Format peak memory for inclusion in the human language message string */
+ t = strjoin(FORMAT_BYTES(memory_peak), " memory peak");
+ if (!t) {
+ r = log_oom();
+ goto finish;
+ }
+ message_parts[n_message_parts++] = t;
+ }
+
+ (void) unit_get_memory_accounting(u, CGROUP_MEMORY_SWAP_PEAK, &memory_swap_peak);
+ if (memory_swap_peak != UINT64_MAX) {
+ /* Format peak swap memory for inclusion in the structured log message */
+ if (asprintf(&t, "MEMORY_SWAP_PEAK=%" PRIu64, memory_swap_peak) < 0) {
+ r = log_oom();
+ goto finish;
+ }
+ iovec[n_iovec++] = IOVEC_MAKE_STRING(t);
+
+ /* Format peak swap memory for inclusion in the human language message string */
+ t = strjoin(FORMAT_BYTES(memory_swap_peak), " memory swap peak");
+ if (!t) {
+ r = log_oom();
+ goto finish;
+ }
+ message_parts[n_message_parts++] = t;
+ }
+
+ for (CGroupIOAccountingMetric k = 0; k < _CGROUP_IO_ACCOUNTING_METRIC_MAX; k++) {
+ uint64_t value = UINT64_MAX;
+
+ assert(io_fields[k]);
+
+ (void) unit_get_io_accounting(u, k, k > 0, &value);
+ if (value == UINT64_MAX)
+ continue;
+
+ have_io_accounting = true;
+ if (value > 0)
+ any_io = true;
+
+ /* Format IO accounting data for inclusion in the structured log message */
+ if (asprintf(&t, "%s=%" PRIu64, io_fields[k], value) < 0) {
+ r = log_oom();
+ goto finish;
+ }
+ iovec[n_iovec++] = IOVEC_MAKE_STRING(t);
+
+ /* Format the IO accounting data for inclusion in the human language message string, but only
+ * for the bytes counters (and not for the operations counters) */
+ if (k == CGROUP_IO_READ_BYTES) {
+ assert(!rr);
+ rr = strjoin("read ", strna(FORMAT_BYTES(value)), " from disk");
+ if (!rr) {
+ r = log_oom();
+ goto finish;
+ }
+ } else if (k == CGROUP_IO_WRITE_BYTES) {
+ assert(!wr);
+ wr = strjoin("written ", strna(FORMAT_BYTES(value)), " to disk");
+ if (!wr) {
+ r = log_oom();
+ goto finish;
+ }
+ }
+
+ if (IN_SET(k, CGROUP_IO_READ_BYTES, CGROUP_IO_WRITE_BYTES))
+ log_level = raise_level(log_level,
+ value > MENTIONWORTHY_IO_BYTES,
+ value > NOTICEWORTHY_IO_BYTES);
+ }
+
+ if (have_io_accounting) {
+ if (any_io) {
+ if (rr)
+ message_parts[n_message_parts++] = TAKE_PTR(rr);
+ if (wr)
+ message_parts[n_message_parts++] = TAKE_PTR(wr);
+
+ } else {
+ char *k;
+
+ k = strdup("no IO");
+ if (!k) {
+ r = log_oom();
+ goto finish;
+ }
+
+ message_parts[n_message_parts++] = k;
+ }
+ }
+
+ for (CGroupIPAccountingMetric m = 0; m < _CGROUP_IP_ACCOUNTING_METRIC_MAX; m++) {
+ uint64_t value = UINT64_MAX;
+
+ assert(ip_fields[m]);
+
+ (void) unit_get_ip_accounting(u, m, &value);
+ if (value == UINT64_MAX)
+ continue;
+
+ have_ip_accounting = true;
+ if (value > 0)
+ any_traffic = true;
+
+ /* Format IP accounting data for inclusion in the structured log message */
+ if (asprintf(&t, "%s=%" PRIu64, ip_fields[m], value) < 0) {
+ r = log_oom();
+ goto finish;
+ }
+ iovec[n_iovec++] = IOVEC_MAKE_STRING(t);
+
+ /* Format the IP accounting data for inclusion in the human language message string, but only for the
+ * bytes counters (and not for the packets counters) */
+ if (m == CGROUP_IP_INGRESS_BYTES) {
+ assert(!igress);
+ igress = strjoin("received ", strna(FORMAT_BYTES(value)), " IP traffic");
+ if (!igress) {
+ r = log_oom();
+ goto finish;
+ }
+ } else if (m == CGROUP_IP_EGRESS_BYTES) {
+ assert(!egress);
+ egress = strjoin("sent ", strna(FORMAT_BYTES(value)), " IP traffic");
+ if (!egress) {
+ r = log_oom();
+ goto finish;
+ }
+ }
+
+ if (IN_SET(m, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
+ log_level = raise_level(log_level,
+ value > MENTIONWORTHY_IP_BYTES,
+ value > NOTICEWORTHY_IP_BYTES);
+ }
+
+ /* This check is here because it is the earliest point following all possible log_level assignments. If
+ * log_level is assigned anywhere after this point, move this check. */
+ if (!unit_log_level_test(u, log_level)) {
+ r = 0;
+ goto finish;
+ }
+
+ if (have_ip_accounting) {
+ if (any_traffic) {
+ if (igress)
+ message_parts[n_message_parts++] = TAKE_PTR(igress);
+ if (egress)
+ message_parts[n_message_parts++] = TAKE_PTR(egress);
+
+ } else {
+ char *k;
+
+ k = strdup("no IP traffic");
+ if (!k) {
+ r = log_oom();
+ goto finish;
+ }
+
+ message_parts[n_message_parts++] = k;
+ }
+ }
+
+ /* Is there any accounting data available at all? */
+ if (n_iovec == 0) {
+ r = 0;
+ goto finish;
+ }
+
+ if (n_message_parts == 0)
+ t = strjoina("MESSAGE=", u->id, ": Completed.");
+ else {
+ _cleanup_free_ char *joined = NULL;
+
+ message_parts[n_message_parts] = NULL;
+
+ joined = strv_join(message_parts, ", ");
+ if (!joined) {
+ r = log_oom();
+ goto finish;
+ }
+
+ joined[0] = ascii_toupper(joined[0]);
+ t = strjoina("MESSAGE=", u->id, ": ", joined, ".");
+ }
+
+ /* The following four fields we allocate on the stack or are static strings, we hence don't want to free them,
+ * and hence don't increase n_iovec for them */
+ iovec[n_iovec] = IOVEC_MAKE_STRING(t);
+ iovec[n_iovec + 1] = IOVEC_MAKE_STRING("MESSAGE_ID=" SD_MESSAGE_UNIT_RESOURCES_STR);
+
+ t = strjoina(u->manager->unit_log_field, u->id);
+ iovec[n_iovec + 2] = IOVEC_MAKE_STRING(t);
+
+ t = strjoina(u->manager->invocation_log_field, u->invocation_id_string);
+ iovec[n_iovec + 3] = IOVEC_MAKE_STRING(t);
+
+ log_unit_struct_iovec(u, log_level, iovec, n_iovec + 4);
+ r = 0;
+
+finish:
+ free_many_charp(message_parts, n_message_parts);
+
+ for (size_t i = 0; i < n_iovec; i++)
+ free(iovec[i].iov_base);
+
+ return r;
+
+}
+
+/* Keeps the manager's console reference count in sync with whether this unit currently needs the console. */
+static void unit_update_on_console(Unit *u) {
+        assert(u);
+
+        bool needs = unit_needs_console(u);
+        if (needs == u->on_console)
+                return; /* nothing changed */
+
+        u->on_console = needs;
+        if (!needs)
+                manager_unref_console(u->manager);
+        else
+                manager_ref_console(u->manager);
+}
+
+/* Emits the "unit started" audit record, but only for unit types that define an audit start message. */
+static void unit_emit_audit_start(Unit *u) {
+        int type;
+
+        assert(u);
+
+        type = UNIT_VTABLE(u)->audit_start_message_type;
+        if (type <= 0)
+                return;
+
+        /* We just finished starting up: record that, and remember we did so for the matching stop record. */
+        manager_send_unit_audit(u->manager, u, type, /* success= */ true);
+        u->in_audit = true;
+}
+
+/* Emits the "unit stopped" audit record for units that define audit messages. Guarantees that audit
+ * start/stop records always come in pairs, even when no start record was written earlier. */
+static void unit_emit_audit_stop(Unit *u, UnitActiveState state) {
+        int start_type, stop_type;
+        bool success;
+
+        assert(u);
+
+        start_type = UNIT_VTABLE(u)->audit_start_message_type;
+        if (start_type <= 0)
+                return;
+
+        stop_type = UNIT_VTABLE(u)->audit_stop_message_type;
+        success = state == UNIT_INACTIVE;
+
+        if (!u->in_audit) {
+                /* No start record was written; write one now so that the pair is complete. */
+                manager_send_unit_audit(u->manager, u, start_type, /* success= */ success);
+
+                if (success)
+                        manager_send_unit_audit(u->manager, u, stop_type, /* success= */ true);
+
+                return;
+        }
+
+        /* Normal case: we saw the start earlier, now record that shutdown finished. */
+        manager_send_unit_audit(u->manager, u, stop_type, /* success= */ success);
+        u->in_audit = false;
+}
+
+/* Propagates the unit's new active state 'ns' into its pending job 'j': finishes the job when the state
+ * satisfies it, fails/invalidates it when the state contradicts it. Returns true if the state change was
+ * "unexpected", i.e. not something the running job was waiting for, so the caller can apply retroactive
+ * dependency handling. */
+static bool unit_process_job(Job *j, UnitActiveState ns, bool reload_success) {
+        bool unexpected = false;
+        JobResult result;
+
+        assert(j);
+
+        if (j->state == JOB_WAITING)
+                /* So we reached a different state for this job. Let's see if we can run it now if it failed previously
+                 * due to EAGAIN. */
+                job_add_to_run_queue(j);
+
+        /* Let's check whether the unit's new state constitutes a finished job, or maybe contradicts a running job and
+         * hence needs to invalidate jobs. */
+
+        switch (j->type) {
+
+        case JOB_START:
+        case JOB_VERIFY_ACTIVE:
+
+                /* Active/reloading satisfies a start job; any non-activating state while the job runs is
+                 * unexpected, and an inactive/failed state also terminates the job. */
+                if (UNIT_IS_ACTIVE_OR_RELOADING(ns))
+                        job_finish_and_invalidate(j, JOB_DONE, true, false);
+                else if (j->state == JOB_RUNNING && ns != UNIT_ACTIVATING) {
+                        unexpected = true;
+
+                        if (UNIT_IS_INACTIVE_OR_FAILED(ns)) {
+                                if (ns == UNIT_FAILED)
+                                        result = JOB_FAILED;
+                                else
+                                        result = JOB_DONE;
+
+                                job_finish_and_invalidate(j, result, true, false);
+                        }
+                }
+
+                break;
+
+        case JOB_RELOAD:
+        case JOB_RELOAD_OR_START:
+        case JOB_TRY_RELOAD:
+
+                /* A reload job only completes once the unit is fully active again; its result is taken from
+                 * 'reload_success'. Activating/reloading states are still in-progress, everything else is
+                 * unexpected. */
+                if (j->state == JOB_RUNNING) {
+                        if (ns == UNIT_ACTIVE)
+                                job_finish_and_invalidate(j, reload_success ? JOB_DONE : JOB_FAILED, true, false);
+                        else if (!IN_SET(ns, UNIT_ACTIVATING, UNIT_RELOADING)) {
+                                unexpected = true;
+
+                                if (UNIT_IS_INACTIVE_OR_FAILED(ns))
+                                        job_finish_and_invalidate(j, ns == UNIT_FAILED ? JOB_FAILED : JOB_DONE, true, false);
+                        }
+                }
+
+                break;
+
+        case JOB_STOP:
+        case JOB_RESTART:
+        case JOB_TRY_RESTART:
+
+                /* Inactive or failed satisfies a stop job; any non-deactivating state while running means
+                 * the stop attempt failed. */
+                if (UNIT_IS_INACTIVE_OR_FAILED(ns))
+                        job_finish_and_invalidate(j, JOB_DONE, true, false);
+                else if (j->state == JOB_RUNNING && ns != UNIT_DEACTIVATING) {
+                        unexpected = true;
+                        job_finish_and_invalidate(j, JOB_FAILED, true, false);
+                }
+
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        return unexpected;
+}
+
+/* Central state-change notification, invoked by the unit type implementations whenever a unit's low-level
+ * state changes. 'os' is the old and 'ns' the new high-level active state; 'reload_success' qualifies a
+ * finished reload. Propagates the change to the pending job, dependencies, audit/plymouth, resource
+ * logging, emergency actions and the various deferred-work queues. */
+void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, bool reload_success) {
+        const char *reason;
+        Manager *m;
+
+        assert(u);
+        assert(os < _UNIT_ACTIVE_STATE_MAX);
+        assert(ns < _UNIT_ACTIVE_STATE_MAX);
+
+        /* Note that this is called for all low-level state changes, even if they might map to the same high-level
+         * UnitActiveState! That means that ns == os is an expected behavior here. For example: if a mount point is
+         * remounted this function will be called too! */
+
+        m = u->manager;
+
+        /* Let's enqueue the change signal early. In case this unit has a job associated we want that this unit is in
+         * the bus queue, so that any job change signal queued will force out the unit change signal first. */
+        unit_add_to_dbus_queue(u);
+
+        /* Update systemd-oomd on the property/state change */
+        if (os != ns) {
+                /* Always send an update if the unit is going into an inactive state so systemd-oomd knows to stop
+                 * monitoring.
+                 * Also send an update whenever the unit goes active; this is to handle a case where an override file
+                 * sets one of the ManagedOOM*= properties to "kill", then later removes it. systemd-oomd needs to
+                 * know to stop monitoring when the unit changes from "kill" -> "auto" on daemon-reload, but we don't
+                 * have the information on the property. Thus, indiscriminately send an update. */
+                if (UNIT_IS_INACTIVE_OR_FAILED(ns) || UNIT_IS_ACTIVE_OR_RELOADING(ns))
+                        (void) manager_varlink_send_managed_oom_update(u);
+        }
+
+        /* Update timestamps for state changes */
+        if (!MANAGER_IS_RELOADING(m)) {
+                dual_timestamp_now(&u->state_change_timestamp);
+
+                if (UNIT_IS_INACTIVE_OR_FAILED(os) && !UNIT_IS_INACTIVE_OR_FAILED(ns))
+                        u->inactive_exit_timestamp = u->state_change_timestamp;
+                else if (!UNIT_IS_INACTIVE_OR_FAILED(os) && UNIT_IS_INACTIVE_OR_FAILED(ns))
+                        u->inactive_enter_timestamp = u->state_change_timestamp;
+
+                if (!UNIT_IS_ACTIVE_OR_RELOADING(os) && UNIT_IS_ACTIVE_OR_RELOADING(ns))
+                        u->active_enter_timestamp = u->state_change_timestamp;
+                else if (UNIT_IS_ACTIVE_OR_RELOADING(os) && !UNIT_IS_ACTIVE_OR_RELOADING(ns))
+                        u->active_exit_timestamp = u->state_change_timestamp;
+        }
+
+        /* Keep track of failed units */
+        (void) manager_update_failed_units(m, u, ns == UNIT_FAILED);
+
+        /* Make sure the cgroup and state files are always removed when we become inactive */
+        if (UNIT_IS_INACTIVE_OR_FAILED(ns)) {
+                SET_FLAG(u->markers,
+                         (1u << UNIT_MARKER_NEEDS_RELOAD)|(1u << UNIT_MARKER_NEEDS_RESTART),
+                         false);
+                unit_prune_cgroup(u);
+                unit_unlink_state_files(u);
+        } else if (ns != os && ns == UNIT_RELOADING)
+                SET_FLAG(u->markers, 1u << UNIT_MARKER_NEEDS_RELOAD, false);
+
+        unit_update_on_console(u);
+
+        if (!MANAGER_IS_RELOADING(m)) {
+                bool unexpected;
+
+                /* Let's propagate state changes to the job */
+                if (u->job)
+                        unexpected = unit_process_job(u->job, ns, reload_success);
+                else
+                        unexpected = true; /* no job pending: any state change is by definition unrequested */
+
+                /* If this state change happened without being requested by a job, then let's retroactively start or
+                 * stop dependencies. We skip that step when deserializing, since we don't want to create any
+                 * additional jobs just because something is already activated. */
+
+                if (unexpected) {
+                        if (UNIT_IS_INACTIVE_OR_FAILED(os) && UNIT_IS_ACTIVE_OR_ACTIVATING(ns))
+                                retroactively_start_dependencies(u);
+                        else if (UNIT_IS_ACTIVE_OR_ACTIVATING(os) && UNIT_IS_INACTIVE_OR_DEACTIVATING(ns))
+                                retroactively_stop_dependencies(u);
+                }
+
+                if (ns != os && ns == UNIT_FAILED) {
+                        log_unit_debug(u, "Unit entered failed state.");
+                        unit_start_on_failure(u, "OnFailure=", UNIT_ATOM_ON_FAILURE, u->on_failure_job_mode);
+                }
+
+                if (UNIT_IS_ACTIVE_OR_RELOADING(ns) && !UNIT_IS_ACTIVE_OR_RELOADING(os)) {
+                        /* This unit just finished starting up */
+
+                        unit_emit_audit_start(u);
+                        manager_send_unit_plymouth(m, u);
+                }
+
+                if (UNIT_IS_INACTIVE_OR_FAILED(ns) && !UNIT_IS_INACTIVE_OR_FAILED(os)) {
+                        /* This unit just stopped/failed. */
+
+                        unit_emit_audit_stop(u, ns);
+                        unit_log_resources(u);
+                }
+
+                if (ns == UNIT_INACTIVE && !IN_SET(os, UNIT_FAILED, UNIT_INACTIVE, UNIT_MAINTENANCE))
+                        unit_start_on_failure(u, "OnSuccess=", UNIT_ATOM_ON_SUCCESS, u->on_success_job_mode);
+        }
+
+        manager_recheck_journal(m);
+        manager_recheck_dbus(m);
+
+        unit_trigger_notify(u);
+
+        if (!MANAGER_IS_RELOADING(m)) {
+                /* Trigger the configured emergency action on failure/success transitions. */
+                if (os != UNIT_FAILED && ns == UNIT_FAILED) {
+                        reason = strjoina("unit ", u->id, " failed");
+                        emergency_action(m, u->failure_action, 0, u->reboot_arg, unit_failure_action_exit_status(u), reason);
+                } else if (!UNIT_IS_INACTIVE_OR_FAILED(os) && ns == UNIT_INACTIVE) {
+                        reason = strjoina("unit ", u->id, " succeeded");
+                        emergency_action(m, u->success_action, 0, u->reboot_arg, unit_success_action_exit_status(u), reason);
+                }
+        }
+
+        /* And now, add the unit or depending units to various queues that will act on the new situation if
+         * needed. These queues generally check for continuous state changes rather than events (like most of
+         * the state propagation above), and do work deferred instead of instantly, since they typically
+         * don't want to run during reloading, and usually involve checking combined state of multiple units
+         * at once. */
+
+        if (UNIT_IS_INACTIVE_OR_FAILED(ns)) {
+                /* Stop unneeded units and bound-by units regardless if going down was expected or not */
+                check_unneeded_dependencies(u);
+                check_bound_by_dependencies(u);
+
+                /* Maybe someone wants us to remain up? */
+                unit_submit_to_start_when_upheld_queue(u);
+
+                /* Maybe the unit should be GC'ed now? */
+                unit_add_to_gc_queue(u);
+
+                /* Maybe we can release some resources now? */
+                unit_submit_to_release_resources_queue(u);
+        }
+
+        if (UNIT_IS_ACTIVE_OR_RELOADING(ns)) {
+                /* Start uphold units regardless if going up was expected or not */
+                check_uphold_dependencies(u);
+
+                /* Maybe we finished startup and are now ready for being stopped because unneeded? */
+                unit_submit_to_stop_when_unneeded_queue(u);
+
+                /* Maybe we finished startup, but something we needed has vanished? Let's die then. (This happens
+                 * when something BindsTo= to a Type=oneshot unit, as these units go directly from starting to
+                 * inactive, without ever entering started.) */
+                unit_submit_to_stop_when_bound_queue(u);
+        }
+}
+
+/* Adds a specific PID to the set of PIDs this unit watches. The manager keeps a two-level lookup: the
+ * plain 'watch_pids' hashmap maps a PID to a single unit, and the overflow 'watch_pids_more' hashmap maps
+ * a PID to a NULL-terminated Unit* array when several units watch the same PID. If 'exclusive' is true,
+ * watches other units hold on this PID are dropped first. Returns 0 on success (also when the PID was
+ * already watched), negative errno on failure. */
+int unit_watch_pidref(Unit *u, PidRef *pid, bool exclusive) {
+        _cleanup_(pidref_freep) PidRef *pid_dup = NULL;
+        int r;
+
+        /* Adds a specific PID to the set of PIDs this unit watches. */
+
+        assert(u);
+        assert(pidref_is_set(pid));
+
+        /* Caller might be sure that this PID belongs to this unit only. Let's take this
+         * opportunity to remove any stalled references to this PID as they can be created
+         * easily (when watching a process which is not our direct child). */
+        if (exclusive)
+                manager_unwatch_pidref(u->manager, pid);
+
+        if (set_contains(u->pids, pid)) /* early exit if already being watched */
+                return 0;
+
+        r = pidref_dup(pid, &pid_dup);
+        if (r < 0)
+                return r;
+
+        /* First, insert into the set of PIDs maintained by the unit */
+        r = set_ensure_put(&u->pids, &pidref_hash_ops_free, pid_dup);
+        if (r < 0)
+                return r;
+
+        pid = TAKE_PTR(pid_dup); /* continue with our copy now that we have installed it properly in our set */
+
+        /* Second, insert it into the simple global table, see if that works */
+        r = hashmap_ensure_put(&u->manager->watch_pids, &pidref_hash_ops_free, pid, u);
+        if (r != -EEXIST) /* 0 on success or a real error: either way we are done; -EEXIST means slot taken */
+                return r;
+
+        /* OK, the key is already assigned to a different unit. That's fine, then add us via the second
+         * hashmap that points to an array. */
+
+        PidRef *old_pid = NULL;
+        Unit **array = hashmap_get2(u->manager->watch_pids_more, pid, (void**) &old_pid);
+
+        /* Count entries in array */
+        size_t n = 0;
+        for (; array && array[n]; n++)
+                ;
+
+        /* Allocate a new array */
+        _cleanup_free_ Unit **new_array = new(Unit*, n + 2);
+        if (!new_array)
+                return -ENOMEM;
+
+        /* Append us to the end */
+        memcpy_safe(new_array, array, sizeof(Unit*) * n);
+        new_array[n] = u;
+        new_array[n+1] = NULL;
+
+        /* Make sure the hashmap is allocated */
+        r = hashmap_ensure_allocated(&u->manager->watch_pids_more, &pidref_hash_ops_free);
+        if (r < 0)
+                return r;
+
+        /* Add or replace the old array, keeping the existing key if one was already installed */
+        r = hashmap_replace(u->manager->watch_pids_more, old_pid ?: pid, new_array);
+        if (r < 0)
+                return r;
+
+        TAKE_PTR(new_array); /* Now part of the hash table */
+        free(array); /* Which means we can now delete the old version */
+        return 0;
+}
+
+/* Plain-PID convenience wrapper: builds a PidRef for 'pid', then delegates to unit_watch_pidref(). */
+int unit_watch_pid(Unit *u, pid_t pid, bool exclusive) {
+        _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+        int r;
+
+        assert(u);
+        assert(pid_is_valid(pid));
+
+        r = pidref_set_pid(&pidref, pid);
+        return r < 0 ? r : unit_watch_pidref(u, &pidref, exclusive);
+}
+
+/* Stops watching 'pid' on behalf of this unit: drops it from the unit's own PID set and from the manager's
+ * two-level PID lookup (either the simple 'watch_pids' hashmap or the 'watch_pids_more' overflow array),
+ * re-keying or freeing the overflow array as needed. */
+void unit_unwatch_pidref(Unit *u, PidRef *pid) {
+        assert(u);
+        assert(pidref_is_set(pid));
+
+        /* Remove from the set we maintain for this unit. (And destroy the returned pid eventually) */
+        _cleanup_(pidref_freep) PidRef *pid1 = set_remove(u->pids, pid);
+        if (!pid1)
+                return; /* Early exit if this PID was never watched by us */
+
+        /* First let's drop the unit from the simple hash table, if it is included there */
+        PidRef *pid2 = NULL;
+        Unit *uu = hashmap_get2(u->manager->watch_pids, pid, (void**) &pid2);
+
+        /* Quick validation: iff we are in the watch_pids table then the PidRef object must be the same as in our local pids set */
+        assert((uu == u) == (pid1 == pid2));
+
+        if (uu == u)
+                /* OK, we are in the first table. Let's remove it there then, and we are done already. */
+                assert_se(hashmap_remove_value(u->manager->watch_pids, pid2, uu));
+        else {
+                /* We weren't in the first table, then let's consult the 2nd table that points to an array */
+                PidRef *pid3 = NULL;
+                Unit **array = hashmap_get2(u->manager->watch_pids_more, pid, (void**) &pid3);
+
+                /* Let's iterate through the array, dropping our own entry */
+                size_t m = 0, n = 0;
+                for (; array && array[n]; n++)
+                        if (array[n] != u)
+                                array[m++] = array[n];
+                if (n == m)
+                        return; /* Not there */
+
+                array[m] = NULL; /* set trailing NULL marker on the new end */
+
+                if (m == 0) {
+                        /* The array is now empty, remove the entire entry */
+                        assert_se(hashmap_remove_value(u->manager->watch_pids_more, pid3, array));
+                        free(array);
+                } else {
+                        /* The array is not empty, but let's make sure the entry is not keyed by the PidRef
+                         * we will delete, but by the PidRef object of the Unit that is now first in the
+                         * array. */
+
+                        PidRef *new_pid3 = ASSERT_PTR(set_get(array[0]->pids, pid));
+                        assert_se(hashmap_replace(u->manager->watch_pids_more, new_pid3, array) >= 0);
+                }
+        }
+}
+
+/* Plain-PID convenience wrapper around unit_unwatch_pidref(). */
+void unit_unwatch_pid(Unit *u, pid_t pid) {
+        unit_unwatch_pidref(u, &PIDREF_MAKE_FROM_PID(pid));
+}
+
+/* Drops every watched PID for this unit, then releases the (now empty) PID set itself. */
+void unit_unwatch_all_pids(Unit *u) {
+        assert(u);
+
+        for (PidRef *p; (p = set_first(u->pids)); )
+                unit_unwatch_pidref(u, p);
+
+        u->pids = set_free(u->pids);
+}
+
+static void unit_tidy_watch_pids(Unit *u) {
+        PidRef *except1, *except2, *e;
+
+        assert(u);
+
+        /* Cleans dead PIDs from our list */
+
+        /* Never drop the main and control PIDs, even if they look dead — other logic owns their lifecycle. */
+        except1 = unit_main_pid(u);
+        except2 = unit_control_pid(u);
+
+        SET_FOREACH(e, u->pids) {
+                if (pidref_equal(except1, e) || pidref_equal(except2, e))
+                        continue;
+
+                /* <= 0: the process is already waited for/gone (or the check itself failed), so stop watching.
+                 * NOTE(review): unit_unwatch_pidref() removes 'e' from u->pids while SET_FOREACH is iterating
+                 * it — this relies on the set iterator tolerating removal of the current entry; confirm. */
+                if (pidref_is_unwaited(e) <= 0)
+                        unit_unwatch_pidref(u, e);
+        }
+}
+
+/* Deferred-event callback scheduled by unit_enqueue_rewatch_pids(): prunes dead PIDs, re-subscribes to
+ * everything that is still alive, and synthesizes a cgroup-empty event in case nothing is left. */
+static int on_rewatch_pids_event(sd_event_source *s, void *userdata) {
+        Unit *u = ASSERT_PTR(userdata);
+
+        assert(s);
+
+        unit_tidy_watch_pids(u);
+        unit_watch_all_pids(u);
+
+        /* If the PID set is empty now, then let's finish this off. */
+        unit_synthesize_cgroup_empty_event(u);
+
+        return 0;
+}
+
+/* Schedules a low-priority deferred job (see on_rewatch_pids_event()) that tidies and refreshes the set of
+ * watched PIDs. Only needed on legacy cgroup setups; on the unified hierarchy proper notifications make
+ * this unnecessary. Returns 0 on success or when not applicable, negative errno on failure. */
+int unit_enqueue_rewatch_pids(Unit *u) {
+        int r;
+
+        assert(u);
+
+        if (!u->cgroup_path)
+                return -ENOENT;
+
+        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
+        if (r < 0)
+                return r;
+        if (r > 0) /* On unified we can use proper notifications */
+                return 0;
+
+        /* Enqueues a low-priority job that will clean up dead PIDs from our list of PIDs to watch and subscribe to new
+         * PIDs that might have appeared. We do this in a delayed job because the work might be quite slow, as it
+         * involves issuing kill(pid, 0) on all processes we watch. */
+
+        if (!u->rewatch_pids_event_source) {
+                _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
+
+                r = sd_event_add_defer(u->manager->event, &s, on_rewatch_pids_event, u);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to allocate event source for tidying watched PIDs: %m");
+
+                /* Idle priority: only run when nothing more important is pending. */
+                r = sd_event_source_set_priority(s, SD_EVENT_PRIORITY_IDLE);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to adjust priority of event source for tidying watched PIDs: %m");
+
+                (void) sd_event_source_set_description(s, "tidy-watch-pids");
+
+                u->rewatch_pids_event_source = TAKE_PTR(s);
+        }
+
+        /* Fire (or re-arm) the deferred source exactly once. */
+        r = sd_event_source_set_enabled(u->rewatch_pids_event_source, SD_EVENT_ONESHOT);
+        if (r < 0)
+                return log_error_errno(r, "Failed to enable event source for tidying watched PIDs: %m");
+
+        return 0;
+}
+
+/* Cancels and releases the deferred PID-tidying event source set up by unit_enqueue_rewatch_pids(), if any. */
+void unit_dequeue_rewatch_pids(Unit *u) {
+        int r;
+        assert(u);
+
+        if (!u->rewatch_pids_event_source)
+                return;
+
+        /* Disable first; a failure here is non-fatal since we unref the source right after anyway. */
+        r = sd_event_source_set_enabled(u->rewatch_pids_event_source, SD_EVENT_OFF);
+        if (r < 0)
+                log_warning_errno(r, "Failed to disable event source for tidying watched PIDs, ignoring: %m");
+
+        u->rewatch_pids_event_source = sd_event_source_disable_unref(u->rewatch_pids_event_source);
+}
+
+/* Decides whether a job of type 'j' may be enqueued for unit 'u' at all. */
+bool unit_job_is_applicable(Unit *u, JobType j) {
+        assert(u);
+        assert(j >= 0 && j < _JOB_TYPE_MAX);
+
+        switch (j) {
+
+        case JOB_STOP:
+                /* Perpetual units can never be stopped (neither explicitly nor due to external events),
+                 * hence it makes no sense to permit enqueuing such a request. */
+                return !u->perpetual;
+
+        case JOB_VERIFY_ACTIVE:
+        case JOB_START:
+        case JOB_NOP:
+                /* Deliberately not gated on unit_can_start(): units like .device are not startable by us
+                 * but may appear due to external events, so enqueuing jobs for them still makes sense. */
+                return true;
+
+        case JOB_RESTART:
+        case JOB_TRY_RESTART:
+                return unit_can_stop(u) && unit_can_start(u);
+
+        case JOB_RELOAD:
+        case JOB_TRY_RELOAD:
+                return unit_can_reload(u);
+
+        case JOB_RELOAD_OR_START:
+                return unit_can_reload(u) && unit_can_start(u);
+
+        default:
+                assert_not_reached();
+        }
+}
+
+/* Returns the inner hashmap (Unit* → UnitDependencyInfo) for dependency type 'd', allocating and
+ * registering it in u->dependencies on first use. Returns NULL on allocation failure. */
+static Hashmap *unit_get_dependency_hashmap_per_type(Unit *u, UnitDependency d) {
+        Hashmap *deps;
+
+        assert(u);
+        assert(d >= 0 && d < _UNIT_DEPENDENCY_MAX);
+
+        deps = hashmap_get(u->dependencies, UNIT_DEPENDENCY_TO_PTR(d));
+        if (deps)
+                return deps;
+
+        _cleanup_hashmap_free_ Hashmap *h = hashmap_new(NULL);
+        if (!h)
+                return NULL;
+
+        if (hashmap_ensure_put(&u->dependencies, NULL, UNIT_DEPENDENCY_TO_PTR(d), h) < 0)
+                return NULL;
+
+        return TAKE_PTR(h);
+}
+
+/* Bit flags returned by unit_add_dependency_impl(): indicate which side(s) of the dependency actually
+ * changed, and hence which unit(s) need a D-Bus property-change notification. */
+typedef enum NotifyDependencyFlags {
+        NOTIFY_DEPENDENCY_UPDATE_FROM = 1 << 0, /* the origin unit's dependency info changed */
+        NOTIFY_DEPENDENCY_UPDATE_TO   = 1 << 1, /* the target unit's dependency info changed */
+} NotifyDependencyFlags;
+
+/* Records dependency 'd' from 'u' on 'other' (with provenance 'mask'), plus the matching inverse
+ * dependency on the other side. Returns a non-negative NotifyDependencyFlags value describing which side
+ * actually changed (0 if the dependency already existed with this mask), or negative errno on failure. */
+static int unit_add_dependency_impl(
+                Unit *u,
+                UnitDependency d,
+                Unit *other,
+                UnitDependencyMask mask) {
+
+        /* For every dependency type, the type to record on the target unit. */
+        static const UnitDependency inverse_table[_UNIT_DEPENDENCY_MAX] = {
+                [UNIT_REQUIRES]               = UNIT_REQUIRED_BY,
+                [UNIT_REQUISITE]              = UNIT_REQUISITE_OF,
+                [UNIT_WANTS]                  = UNIT_WANTED_BY,
+                [UNIT_BINDS_TO]               = UNIT_BOUND_BY,
+                [UNIT_PART_OF]                = UNIT_CONSISTS_OF,
+                [UNIT_UPHOLDS]                = UNIT_UPHELD_BY,
+                [UNIT_REQUIRED_BY]            = UNIT_REQUIRES,
+                [UNIT_REQUISITE_OF]           = UNIT_REQUISITE,
+                [UNIT_WANTED_BY]              = UNIT_WANTS,
+                [UNIT_BOUND_BY]               = UNIT_BINDS_TO,
+                [UNIT_CONSISTS_OF]            = UNIT_PART_OF,
+                [UNIT_UPHELD_BY]              = UNIT_UPHOLDS,
+                [UNIT_CONFLICTS]              = UNIT_CONFLICTED_BY,
+                [UNIT_CONFLICTED_BY]          = UNIT_CONFLICTS,
+                [UNIT_BEFORE]                 = UNIT_AFTER,
+                [UNIT_AFTER]                  = UNIT_BEFORE,
+                [UNIT_ON_SUCCESS]             = UNIT_ON_SUCCESS_OF,
+                [UNIT_ON_SUCCESS_OF]          = UNIT_ON_SUCCESS,
+                [UNIT_ON_FAILURE]             = UNIT_ON_FAILURE_OF,
+                [UNIT_ON_FAILURE_OF]          = UNIT_ON_FAILURE,
+                [UNIT_TRIGGERS]               = UNIT_TRIGGERED_BY,
+                [UNIT_TRIGGERED_BY]           = UNIT_TRIGGERS,
+                [UNIT_PROPAGATES_RELOAD_TO]   = UNIT_RELOAD_PROPAGATED_FROM,
+                [UNIT_RELOAD_PROPAGATED_FROM] = UNIT_PROPAGATES_RELOAD_TO,
+                [UNIT_PROPAGATES_STOP_TO]     = UNIT_STOP_PROPAGATED_FROM,
+                [UNIT_STOP_PROPAGATED_FROM]   = UNIT_PROPAGATES_STOP_TO,
+                [UNIT_JOINS_NAMESPACE_OF]     = UNIT_JOINS_NAMESPACE_OF, /* symmetric! 👓 */
+                [UNIT_REFERENCES]             = UNIT_REFERENCED_BY,
+                [UNIT_REFERENCED_BY]          = UNIT_REFERENCES,
+                [UNIT_IN_SLICE]               = UNIT_SLICE_OF,
+                [UNIT_SLICE_OF]               = UNIT_IN_SLICE,
+        };
+
+        Hashmap *u_deps, *other_deps;
+        UnitDependencyInfo u_info, u_info_old, other_info, other_info_old;
+        NotifyDependencyFlags flags = 0;
+        int r;
+
+        assert(u);
+        assert(other);
+        assert(d >= 0 && d < _UNIT_DEPENDENCY_MAX);
+        assert(inverse_table[d] >= 0 && inverse_table[d] < _UNIT_DEPENDENCY_MAX);
+        assert(mask > 0 && mask < _UNIT_DEPENDENCY_MASK_FULL);
+
+        /* Ensure the following two hashmaps for each unit exist:
+         * - the top-level dependency hashmap that maps UnitDependency → Hashmap(Unit* → UnitDependencyInfo),
+         * - the inner hashmap, that maps Unit* → UnitDependencyInfo, for the specified dependency type. */
+        u_deps = unit_get_dependency_hashmap_per_type(u, d);
+        if (!u_deps)
+                return -ENOMEM;
+
+        other_deps = unit_get_dependency_hashmap_per_type(other, inverse_table[d]);
+        if (!other_deps)
+                return -ENOMEM;
+
+        /* Save the original dependency info. (UnitDependencyInfo evidently aliases the mask fields with the
+         * .data pointer, so the hashmap value is read/written through .data — confirm against the type's
+         * declaration.) */
+        u_info.data = u_info_old.data = hashmap_get(u_deps, other);
+        other_info.data = other_info_old.data = hashmap_get(other_deps, u);
+
+        /* Update dependency info. */
+        u_info.origin_mask |= mask;
+        other_info.destination_mask |= mask;
+
+        /* Save updated dependency info. */
+        if (u_info.data != u_info_old.data) {
+                r = hashmap_replace(u_deps, other, u_info.data);
+                if (r < 0)
+                        return r;
+
+                flags = NOTIFY_DEPENDENCY_UPDATE_FROM;
+        }
+
+        if (other_info.data != other_info_old.data) {
+                r = hashmap_replace(other_deps, u, other_info.data);
+                if (r < 0) {
+                        if (u_info.data != u_info_old.data) {
+                                /* Restore the old dependency. */
+                                if (u_info_old.data)
+                                        (void) hashmap_update(u_deps, other, u_info_old.data);
+                                else
+                                        hashmap_remove(u_deps, other);
+                        }
+                        return r;
+                }
+
+                flags |= NOTIFY_DEPENDENCY_UPDATE_TO;
+        }
+
+        return flags;
+}
+
+/* Public entry point for adding dependency 'd' from 'u' on 'other'. Follows unit merges, then validates the
+ * combination: some invalid cases are silently dropped with a warning (self-dependency, Before= on a device,
+ * OnFailure= for types that cannot fail), others are refused with -EINVAL. On success records the dependency
+ * (plus a UNIT_REFERENCES edge if 'add_reference') and queues D-Bus change notifications for whichever side
+ * changed. Returns > 0 if anything changed, 0 on no-op, negative errno on failure. */
+int unit_add_dependency(
+                Unit *u,
+                UnitDependency d,
+                Unit *other,
+                bool add_reference,
+                UnitDependencyMask mask) {
+
+        UnitDependencyAtom a;
+        int r;
+
+        /* Helper to know whether sending a notification is necessary or not: if the dependency is already
+         * there, no need to notify! */
+        NotifyDependencyFlags notify_flags;
+
+        assert(u);
+        assert(d >= 0 && d < _UNIT_DEPENDENCY_MAX);
+        assert(other);
+
+        u = unit_follow_merge(u);
+        other = unit_follow_merge(other);
+        a = unit_dependency_to_atom(d);
+        assert(a >= 0);
+
+        /* We won't allow dependencies on ourselves. We will not consider them an error however. */
+        if (u == other) {
+                if (unit_should_warn_about_dependency(d))
+                        log_unit_warning(u, "Dependency %s=%s is dropped.",
+                                         unit_dependency_to_string(d), u->id);
+                return 0;
+        }
+
+        if (u->manager && FLAGS_SET(u->manager->test_run_flags, MANAGER_TEST_RUN_IGNORE_DEPENDENCIES))
+                return 0;
+
+        /* Note that ordering a device unit after a unit is permitted since it allows to start its job
+         * running timeout at a specific time. */
+        if (FLAGS_SET(a, UNIT_ATOM_BEFORE) && other->type == UNIT_DEVICE) {
+                log_unit_warning(u, "Dependency Before=%s ignored (.device units cannot be delayed)", other->id);
+                return 0;
+        }
+
+        if (FLAGS_SET(a, UNIT_ATOM_ON_FAILURE) && !UNIT_VTABLE(u)->can_fail) {
+                log_unit_warning(u, "Requested dependency OnFailure=%s ignored (%s units cannot fail).", other->id, unit_type_to_string(u->type));
+                return 0;
+        }
+
+        if (FLAGS_SET(a, UNIT_ATOM_TRIGGERS) && !UNIT_VTABLE(u)->can_trigger)
+                return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL),
+                                            "Requested dependency Triggers=%s refused (%s units cannot trigger other units).", other->id, unit_type_to_string(u->type));
+        if (FLAGS_SET(a, UNIT_ATOM_TRIGGERED_BY) && !UNIT_VTABLE(other)->can_trigger)
+                return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL),
+                                            "Requested dependency TriggeredBy=%s refused (%s units cannot trigger other units).", other->id, unit_type_to_string(other->type));
+
+        if (FLAGS_SET(a, UNIT_ATOM_IN_SLICE) && other->type != UNIT_SLICE)
+                return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL),
+                                            "Requested dependency Slice=%s refused (%s is not a slice unit).", other->id, other->id);
+        if (FLAGS_SET(a, UNIT_ATOM_SLICE_OF) && u->type != UNIT_SLICE)
+                return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL),
+                                            "Requested dependency SliceOf=%s refused (%s is not a slice unit).", other->id, u->id);
+
+        if (FLAGS_SET(a, UNIT_ATOM_IN_SLICE) && !UNIT_HAS_CGROUP_CONTEXT(u))
+                return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL),
+                                            "Requested dependency Slice=%s refused (%s is not a cgroup unit).", other->id, u->id);
+
+        if (FLAGS_SET(a, UNIT_ATOM_SLICE_OF) && !UNIT_HAS_CGROUP_CONTEXT(other))
+                return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL),
+                                            "Requested dependency SliceOf=%s refused (%s is not a cgroup unit).", other->id, other->id);
+
+        r = unit_add_dependency_impl(u, d, other, mask);
+        if (r < 0)
+                return r;
+        notify_flags = r;
+
+        if (add_reference) {
+                r = unit_add_dependency_impl(u, UNIT_REFERENCES, other, mask);
+                if (r < 0)
+                        return r;
+                notify_flags |= r;
+        }
+
+        if (FLAGS_SET(notify_flags, NOTIFY_DEPENDENCY_UPDATE_FROM))
+                unit_add_to_dbus_queue(u);
+        if (FLAGS_SET(notify_flags, NOTIFY_DEPENDENCY_UPDATE_TO))
+                unit_add_to_dbus_queue(other);
+
+        return notify_flags != 0;
+}
+
+/* Registers up to two dependency types on 'other' in one call; a negative dependency type is skipped.
+ * Returns > 0 if at least one dependency was newly added, 0 if nothing changed, < 0 on error. */
+int unit_add_two_dependencies(Unit *u, UnitDependency d, UnitDependency e, Unit *other, bool add_reference, UnitDependencyMask mask) {
+        int ret_d = 0, ret_e = 0;
+
+        assert(u);
+        assert(d >= 0 || e >= 0);
+
+        if (d >= 0) {
+                ret_d = unit_add_dependency(u, d, other, add_reference, mask);
+                if (ret_d < 0)
+                        return ret_d;
+        }
+
+        if (e >= 0) {
+                ret_e = unit_add_dependency(u, e, other, add_reference, mask);
+                if (ret_e < 0)
+                        return ret_e;
+        }
+
+        return ret_d > 0 || ret_e > 0;
+}
+
+/* If 'name' is a template unit name, instantiates it with this unit's instance string (or, lacking one,
+ * with this unit's own prefix); otherwise it is passed through unchanged. On success *ret points either to
+ * 'name' itself or to a freshly allocated string stored in *buf for the caller to free. */
+static int resolve_template(Unit *u, const char *name, char **buf, const char **ret) {
+        int r;
+
+        assert(u);
+        assert(name);
+        assert(buf);
+        assert(ret);
+
+        if (!unit_name_is_valid(name, UNIT_NAME_TEMPLATE)) {
+                *buf = NULL;
+                *ret = name;
+                return 0;
+        }
+
+        if (u->instance)
+                r = unit_name_replace_instance(name, u->instance, buf);
+        else {
+                _cleanup_free_ char *prefix = NULL;
+
+                r = unit_name_to_prefix(u->id, &prefix);
+                if (r < 0)
+                        return r;
+
+                r = unit_name_replace_instance(name, prefix, buf);
+        }
+        if (r < 0)
+                return r;
+
+        *ret = *buf;
+        return 0;
+}
+
+/* Like unit_add_dependency(), but identifies the target by (possibly template) unit name, loading the unit
+ * on demand. */
+int unit_add_dependency_by_name(Unit *u, UnitDependency d, const char *name, bool add_reference, UnitDependencyMask mask) {
+        _cleanup_free_ char *buf = NULL;
+        Unit *other;
+        int r;
+
+        assert(u);
+        assert(name);
+
+        /* Resolve template names against our own instance first. */
+        r = resolve_template(u, name, &buf, &name);
+        if (r < 0)
+                return r;
+
+        /* In dependency-less test runs there is nothing to record. */
+        if (u->manager && FLAGS_SET(u->manager->test_run_flags, MANAGER_TEST_RUN_IGNORE_DEPENDENCIES))
+                return 0;
+
+        r = manager_load_unit(u->manager, name, NULL, NULL, &other);
+        if (r < 0)
+                return r;
+
+        return unit_add_dependency(u, d, other, add_reference, mask);
+}
+
+/* Like unit_add_two_dependencies(), but identifies the target by (possibly template) unit name, loading the
+ * unit on demand. */
+int unit_add_two_dependencies_by_name(Unit *u, UnitDependency d, UnitDependency e, const char *name, bool add_reference, UnitDependencyMask mask) {
+        _cleanup_free_ char *buf = NULL;
+        Unit *other;
+        int r;
+
+        assert(u);
+        assert(name);
+
+        /* Resolve template names against our own instance first. */
+        r = resolve_template(u, name, &buf, &name);
+        if (r < 0)
+                return r;
+
+        /* In dependency-less test runs there is nothing to record. */
+        if (u->manager && FLAGS_SET(u->manager->test_run_flags, MANAGER_TEST_RUN_IGNORE_DEPENDENCIES))
+                return 0;
+
+        r = manager_load_unit(u->manager, name, NULL, NULL, &other);
+        if (r < 0)
+                return r;
+
+        return unit_add_two_dependencies(u, d, e, other, add_reference, mask);
+}
+
+ /* Overrides the unit search path by exporting $SYSTEMD_UNIT_PATH.
+ * This is mostly for debug purposes. Returns 0 or a negative errno. */
+ int set_unit_path(const char *p) {
+ return RET_NERRNO(setenv("SYSTEMD_UNIT_PATH", p, 1));
+ }
+
+ /* Returns a newly allocated D-Bus object path for @u, or NULL if the unit has
+ * no id yet (or on allocation failure). Caller owns the result. */
+ char *unit_dbus_path(Unit *u) {
+ assert(u);
+
+ return u->id ? unit_dbus_path_from_name(u->id) : NULL;
+ }
+
+ /* Returns a newly allocated D-Bus object path derived from @u's invocation ID
+ * string, or NULL if no invocation ID is set. Caller owns the result. */
+ char *unit_dbus_path_invocation_id(Unit *u) {
+ assert(u);
+
+ return sd_id128_is_null(u->invocation_id) ? NULL : unit_dbus_path_from_name(u->invocation_id_string);
+ }
+
+ /* Stores @id as @u's invocation ID and (re)registers the unit in the manager's
+ * units_by_invocation_id hashmap. A null @id clears the invocation ID. */
+ int unit_set_invocation_id(Unit *u, sd_id128_t id) {
+ int r;
+
+ assert(u);
+
+ /* Set the invocation ID for this unit. If we cannot, this will not roll back, but reset the whole thing. */
+
+ if (sd_id128_equal(u->invocation_id, id))
+ return 0;
+
+ /* Drop the old hashmap registration before changing the key it hashes on. */
+ if (!sd_id128_is_null(u->invocation_id))
+ (void) hashmap_remove_value(u->manager->units_by_invocation_id, &u->invocation_id, u);
+
+ if (sd_id128_is_null(id)) {
+ r = 0;
+ goto reset;
+ }
+
+ r = hashmap_ensure_allocated(&u->manager->units_by_invocation_id, &id128_hash_ops);
+ if (r < 0)
+ goto reset;
+
+ u->invocation_id = id;
+ sd_id128_to_string(id, u->invocation_id_string);
+
+ /* The hashmap key points into the unit itself, hence assign the field first. */
+ r = hashmap_put(u->manager->units_by_invocation_id, &u->invocation_id, u);
+ if (r < 0)
+ goto reset;
+
+ return 0;
+
+ reset:
+ /* Shared exit path for both "clear on request" and "clear on failure". */
+ u->invocation_id = SD_ID128_NULL;
+ u->invocation_id_string[0] = 0;
+ return r;
+ }
+
+ /* Places @u in @slice by recording a UNIT_IN_SLICE dependency. Returns 1 if the
+ * slice was changed, 0 if it was already set to @slice, negative errno otherwise. */
+ int unit_set_slice(Unit *u, Unit *slice) {
+ int r;
+
+ assert(u);
+ assert(slice);
+
+ /* Sets the unit slice if it has not been set before. Is extra careful, to only allow this for units
+ * that actually have a cgroup context. Also, we don't allow to set this for slices (since the parent
+ * slice is derived from the name). Make sure the unit we set is actually a slice. */
+
+ if (!UNIT_HAS_CGROUP_CONTEXT(u))
+ return -EOPNOTSUPP;
+
+ if (u->type == UNIT_SLICE)
+ return -EINVAL;
+
+ if (unit_active_state(u) != UNIT_INACTIVE)
+ return -EBUSY;
+
+ if (slice->type != UNIT_SLICE)
+ return -EINVAL;
+
+ /* init.scope must stay in the root slice, never anywhere else. */
+ if (unit_has_name(u, SPECIAL_INIT_SCOPE) &&
+ !unit_has_name(slice, SPECIAL_ROOT_SLICE))
+ return -EPERM;
+
+ if (UNIT_GET_SLICE(u) == slice)
+ return 0;
+
+ /* Disallow slice changes if @u is already bound to cgroups */
+ if (UNIT_GET_SLICE(u) && u->cgroup_realized)
+ return -EBUSY;
+
+ /* Remove any slices assigned prior; we should only have one UNIT_IN_SLICE dependency */
+ if (UNIT_GET_SLICE(u))
+ unit_remove_dependencies(u, UNIT_DEPENDENCY_SLICE_PROPERTY);
+
+ r = unit_add_dependency(u, UNIT_IN_SLICE, slice, true, UNIT_DEPENDENCY_SLICE_PROPERTY);
+ if (r < 0)
+ return r;
+
+ return 1;
+ }
+
+ /* Assigns a default slice to @u if none is set yet: a per-template slice for
+ * instantiated units, the root slice for extrinsic units, otherwise system.slice
+ * or app.slice depending on manager scope. */
+ int unit_set_default_slice(Unit *u) {
+ const char *slice_name;
+ Unit *slice;
+ int r;
+
+ assert(u);
+
+ if (u->manager && FLAGS_SET(u->manager->test_run_flags, MANAGER_TEST_RUN_IGNORE_DEPENDENCIES))
+ return 0;
+
+ if (UNIT_GET_SLICE(u))
+ return 0;
+
+ if (u->instance) {
+ _cleanup_free_ char *prefix = NULL, *escaped = NULL;
+
+ /* Implicitly place all instantiated units in their
+ * own per-template slice */
+
+ r = unit_name_to_prefix(u->id, &prefix);
+ if (r < 0)
+ return r;
+
+ /* The prefix is already escaped, but it might include
+ * "-" which has a special meaning for slice units,
+ * hence escape it here extra. */
+ escaped = unit_name_escape(prefix);
+ if (!escaped)
+ return -ENOMEM;
+
+ /* NOTE: strjoina() allocates on the stack of this function, not the block,
+ * so slice_name remains valid after the block ends. */
+ if (MANAGER_IS_SYSTEM(u->manager))
+ slice_name = strjoina("system-", escaped, ".slice");
+ else
+ slice_name = strjoina("app-", escaped, ".slice");
+
+ } else if (unit_is_extrinsic(u))
+ /* Keep all extrinsic units (e.g. perpetual units and swap and mount units in user mode) in
+ * the root slice. They don't really belong in one of the subslices. */
+ slice_name = SPECIAL_ROOT_SLICE;
+
+ else if (MANAGER_IS_SYSTEM(u->manager))
+ slice_name = SPECIAL_SYSTEM_SLICE;
+ else
+ slice_name = SPECIAL_APP_SLICE;
+
+ r = manager_load_unit(u->manager, slice_name, NULL, NULL, &slice);
+ if (r < 0)
+ return r;
+
+ return unit_set_slice(u, slice);
+ }
+
+ /* Returns the id of the slice @u lives in, or NULL if no slice is assigned.
+ * The returned string is owned by the slice unit, do not free. */
+ const char *unit_slice_name(Unit *u) {
+ assert(u);
+
+ Unit *s = UNIT_GET_SLICE(u);
+ return s ? s->id : NULL;
+ }
+
+ /* Loads the unit that shares @u's name but carries suffix @type (e.g. the ".socket"
+ * sibling of a ".service"). Refuses (-EINVAL) if the result would be @u itself. */
+ int unit_load_related_unit(Unit *u, const char *type, Unit **_found) {
+ _cleanup_free_ char *t = NULL;
+ int r;
+
+ assert(u);
+ assert(type);
+ assert(_found);
+
+ r = unit_name_change_suffix(u->id, type, &t);
+ if (r < 0)
+ return r;
+ if (unit_has_name(u, t))
+ return -EINVAL;
+
+ r = manager_load_unit(u->manager, t, NULL, NULL, _found);
+ assert(r < 0 || *_found != u);
+ return r;
+ }
+
+ /* D-Bus match callback for NameOwnerChanged signals on a watched bus name:
+ * forwards the new owner (NULL when the name was released) to the unit's vtable. */
+ static int signal_name_owner_changed(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ const char *new_owner;
+ Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ /* Signature is (name, old_owner, new_owner); we only care about the last field. */
+ r = sd_bus_message_read(message, "sss", NULL, NULL, &new_owner);
+ if (r < 0) {
+ /* Malformed signal: log and ignore, don't propagate an error to the bus. */
+ bus_log_parse_error(r);
+ return 0;
+ }
+
+ if (UNIT_VTABLE(u)->bus_name_owner_change)
+ UNIT_VTABLE(u)->bus_name_owner_change(u, empty_to_null(new_owner));
+
+ return 0;
+ }
+
+ /* Async reply handler for the initial GetNameOwner() query issued by
+ * unit_install_bus_match(): reports the current owner (or NULL if the name is
+ * unowned) to the unit's vtable, then drops the one-shot call slot. */
+ static int get_name_owner_handler(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ const sd_bus_error *e;
+ const char *new_owner;
+ Unit *u = ASSERT_PTR(userdata);
+ int r;
+
+ assert(message);
+
+ /* One-shot call: release the slot regardless of the outcome. */
+ u->get_name_owner_slot = sd_bus_slot_unref(u->get_name_owner_slot);
+
+ e = sd_bus_message_get_error(message);
+ if (e) {
+ /* "No owner" is an expected answer; anything else is worth a log line. */
+ if (!sd_bus_error_has_name(e, SD_BUS_ERROR_NAME_HAS_NO_OWNER)) {
+ r = sd_bus_error_get_errno(e);
+ log_unit_error_errno(u, r,
+ "Unexpected error response from GetNameOwner(): %s",
+ bus_error_message(e, r));
+ }
+
+ new_owner = NULL;
+ } else {
+ r = sd_bus_message_read(message, "s", &new_owner);
+ if (r < 0)
+ return bus_log_parse_error(r);
+
+ assert(!isempty(new_owner));
+ }
+
+ if (UNIT_VTABLE(u)->bus_name_owner_change)
+ UNIT_VTABLE(u)->bus_name_owner_change(u, new_owner);
+
+ return 0;
+ }
+
+ /* Installs a NameOwnerChanged signal match for @name on @bus and asynchronously
+ * queries the current owner via GetNameOwner(). Returns -EBUSY if a watch is
+ * already installed. On failure of the async call the match is rolled back. */
+ int unit_install_bus_match(Unit *u, sd_bus *bus, const char *name) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
+ const char *match;
+ usec_t timeout_usec = 0;
+ int r;
+
+ assert(u);
+ assert(bus);
+ assert(name);
+
+ if (u->match_bus_slot || u->get_name_owner_slot)
+ return -EBUSY;
+
+ /* NameOwnerChanged and GetNameOwner is used to detect when a service finished starting up. The dbus
+ * call timeout shouldn't be earlier than that. If we couldn't get the start timeout, use the default
+ * value defined above. */
+ if (UNIT_VTABLE(u)->get_timeout_start_usec)
+ timeout_usec = UNIT_VTABLE(u)->get_timeout_start_usec(u);
+
+ match = strjoina("type='signal',"
+ "sender='org.freedesktop.DBus',"
+ "path='/org/freedesktop/DBus',"
+ "interface='org.freedesktop.DBus',"
+ "member='NameOwnerChanged',"
+ "arg0='", name, "'");
+
+ /* Install the signal match first, so no ownership change can slip through
+ * between the GetNameOwner() answer and the match becoming active. */
+ r = bus_add_match_full(
+ bus,
+ &u->match_bus_slot,
+ true,
+ match,
+ signal_name_owner_changed,
+ NULL,
+ u,
+ timeout_usec);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_new_method_call(
+ bus,
+ &m,
+ "org.freedesktop.DBus",
+ "/org/freedesktop/DBus",
+ "org.freedesktop.DBus",
+ "GetNameOwner");
+ if (r < 0)
+ return r;
+
+ r = sd_bus_message_append(m, "s", name);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_call_async(
+ bus,
+ &u->get_name_owner_slot,
+ m,
+ get_name_owner_handler,
+ u,
+ timeout_usec);
+
+ if (r < 0) {
+ /* Roll back the signal match so the unit is left without a half-installed watch. */
+ u->match_bus_slot = sd_bus_slot_unref(u->match_bus_slot);
+ return r;
+ }
+
+ log_unit_debug(u, "Watching D-Bus name '%s'.", name);
+ return 0;
+ }
+
+ /* Registers @u as the watcher of bus name @name: installs the bus match right away
+ * if the API bus is up, and records the name in the manager's watch_bus hashmap. */
+ int unit_watch_bus_name(Unit *u, const char *name) {
+ int r;
+
+ assert(u);
+ assert(name);
+
+ /* Watch a specific name on the bus. We only support one unit
+ * watching each name for now. */
+
+ if (u->manager->api_bus) {
+ /* If the bus is already available, install the match directly.
+ * Otherwise, just put the name in the list. bus_setup_api() will take care later. */
+ r = unit_install_bus_match(u, u->manager->api_bus, name);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to subscribe to NameOwnerChanged signal for '%s': %m", name);
+ }
+
+ /* NOTE(review): hashmap_put() presumably fails with -EEXIST if another unit already
+ * watches this name, in which case the just-installed slots are rolled back below. */
+ r = hashmap_put(u->manager->watch_bus, name, u);
+ if (r < 0) {
+ u->match_bus_slot = sd_bus_slot_unref(u->match_bus_slot);
+ u->get_name_owner_slot = sd_bus_slot_unref(u->get_name_owner_slot);
+ return log_warning_errno(r, "Failed to put bus name to hashmap: %m");
+ }
+
+ return 0;
+ }
+
+ /* Reverses unit_watch_bus_name(): drops the hashmap entry (only if it still points
+ * at @u) and releases both bus slots. */
+ void unit_unwatch_bus_name(Unit *u, const char *name) {
+ assert(u);
+ assert(name);
+
+ (void) hashmap_remove_value(u->manager->watch_bus, name, u);
+ u->match_bus_slot = sd_bus_slot_unref(u->match_bus_slot);
+ u->get_name_owner_slot = sd_bus_slot_unref(u->get_name_owner_slot);
+ }
+
+ /* Adds After= plus @dep (system manager) or Wants= (user manager) dependencies from
+ * @u on the .device unit corresponding to device node @what. A no-op for empty or
+ * non-device paths, or when device units are unsupported (e.g. containers). */
+ int unit_add_node_dependency(Unit *u, const char *what, UnitDependency dep, UnitDependencyMask mask) {
+ _cleanup_free_ char *e = NULL;
+ Unit *device;
+ int r;
+
+ assert(u);
+
+ /* Adds in links to the device node that this unit is based on */
+ if (isempty(what))
+ return 0;
+
+ if (!is_device_path(what))
+ return 0;
+
+ /* When device units aren't supported (such as in a container), don't create dependencies on them. */
+ if (!unit_type_supported(UNIT_DEVICE))
+ return 0;
+
+ r = unit_name_from_path(what, ".device", &e);
+ if (r < 0)
+ return r;
+
+ r = manager_load_unit(u->manager, e, NULL, NULL, &device);
+ if (r < 0)
+ return r;
+
+ /* Upgrade Requires= to BindsTo= if the device declares it wants to bind its units. */
+ if (dep == UNIT_REQUIRES && device_shall_be_bound_by(device, u))
+ dep = UNIT_BINDS_TO;
+
+ return unit_add_two_dependencies(u, UNIT_AFTER,
+ MANAGER_IS_SYSTEM(u->manager) ? dep : UNIT_WANTS,
+ device, true, mask);
+ }
+
+ /* Orders @u after the blockdev@.target instance for block device @what (a /dev/ path).
+ * A no-op for empty/non-/dev/ paths or when device units are unsupported. */
+ int unit_add_blockdev_dependency(Unit *u, const char *what, UnitDependencyMask mask) {
+ _cleanup_free_ char *escaped = NULL, *target = NULL;
+ int r;
+
+ assert(u);
+
+ if (isempty(what))
+ return 0;
+
+ if (!path_startswith(what, "/dev/"))
+ return 0;
+
+ /* If we don't support devices, then also don't bother with blockdev@.target */
+ if (!unit_type_supported(UNIT_DEVICE))
+ return 0;
+
+ r = unit_name_path_escape(what, &escaped);
+ if (r < 0)
+ return r;
+
+ r = unit_name_build("blockdev", escaped, ".target", &target);
+ if (r < 0)
+ return r;
+
+ return unit_add_dependency_by_name(u, UNIT_AFTER, target, true, mask);
+ }
+
+ /* Brings a freshly deserialized unit back to live state: re-adds bus track refs,
+ * invokes the type-specific coldplug hook, coldplugs queued jobs and re-applies
+ * NFT sets. Errors are gathered; the first failure is returned but later steps
+ * still run. */
+ int unit_coldplug(Unit *u) {
+ int r = 0;
+
+ assert(u);
+
+ /* Make sure we don't enter a loop, when coldplugging recursively. */
+ if (u->coldplugged)
+ return 0;
+
+ u->coldplugged = true;
+
+ STRV_FOREACH(i, u->deserialized_refs)
+ RET_GATHER(r, bus_unit_track_add_name(u, *i));
+
+ /* The refs are consumed; free the deserialized list. */
+ u->deserialized_refs = strv_free(u->deserialized_refs);
+
+ if (UNIT_VTABLE(u)->coldplug)
+ RET_GATHER(r, UNIT_VTABLE(u)->coldplug(u));
+
+ if (u->job)
+ RET_GATHER(r, job_coldplug(u->job));
+ if (u->nop_job)
+ RET_GATHER(r, job_coldplug(u->nop_job));
+
+ unit_modify_nft_set(u, /* add = */ true);
+ return r;
+ }
+
+ /* Lets the unit catch up with external state changes that happened while the
+ * manager was not running (type-specific hook plus cgroup state). */
+ void unit_catchup(Unit *u) {
+ assert(u);
+
+ if (UNIT_VTABLE(u)->catchup)
+ UNIT_VTABLE(u)->catchup(u);
+
+ unit_cgroup_catchup(u);
+ }
+
+ /* Returns true if the file at @path changed relative to the loaded configuration:
+ * its mtime is newer than @mtime, it became inaccessible, or (for @path_masked)
+ * it is no longer a mask. NULL @path and kernel virtual filesystems never count
+ * as out-of-date. */
+ static bool fragment_mtime_newer(const char *path, usec_t mtime, bool path_masked) {
+ struct stat st;
+
+ if (!path)
+ return false;
+
+ /* If the source is some virtual kernel file system, then we assume we watch it anyway, and hence pretend we
+ * are never out-of-date. */
+ if (PATH_STARTSWITH_SET(path, "/proc", "/sys"))
+ return false;
+
+ if (stat(path, &st) < 0)
+ /* What, cannot access this anymore? */
+ return true;
+
+ if (path_masked)
+ /* For masked files check if they are still so */
+ return !null_or_empty(&st);
+
+ /* For non-empty files check the mtime */
+ return timespec_load(&st.st_mtim) > mtime;
+ }
+
+ /* Returns true if any of the unit's on-disk sources (fragment, source path, drop-in
+ * set or drop-in contents) changed since loading, i.e. a daemon-reload is needed. */
+ bool unit_need_daemon_reload(Unit *u) {
+ _cleanup_strv_free_ char **dropins = NULL;
+
+ assert(u);
+ assert(u->manager);
+
+ /* Global flag: some unit file operation already invalidated the loaded state. */
+ if (u->manager->unit_file_state_outdated)
+ return true;
+
+ /* For unit files, we allow masking… */
+ if (fragment_mtime_newer(u->fragment_path, u->fragment_mtime,
+ u->load_state == UNIT_MASKED))
+ return true;
+
+ /* Source paths should not be masked… */
+ if (fragment_mtime_newer(u->source_path, u->source_mtime, false))
+ return true;
+
+ /* Compare the on-disk drop-in set with what we loaded. */
+ if (u->load_state == UNIT_LOADED)
+ (void) unit_find_dropin_paths(u, &dropins);
+ if (!strv_equal(u->dropin_paths, dropins))
+ return true;
+
+ /* … any drop-ins that are masked are simply omitted from the list. */
+ STRV_FOREACH(path, u->dropin_paths)
+ if (fragment_mtime_newer(*path, u->dropin_mtime, false))
+ return true;
+
+ return false;
+ }
+
+ /* Clears the unit's failure state: the type-specific failed state, the start rate
+ * limit counter, and the start-limit-hit flag. */
+ void unit_reset_failed(Unit *u) {
+ assert(u);
+
+ if (UNIT_VTABLE(u)->reset_failed)
+ UNIT_VTABLE(u)->reset_failed(u);
+
+ ratelimit_reset(&u->start_ratelimit);
+ u->start_limit_hit = false;
+ }
+
+ /* Returns the unit whose state @u mirrors (as reported by the type-specific
+ * "following" hook), or NULL if @u stands on its own. */
+ Unit *unit_following(Unit *u) {
+ assert(u);
+
+ if (!UNIT_VTABLE(u)->following)
+ return NULL;
+
+ return UNIT_VTABLE(u)->following(u);
+ }
+
+ /* Returns true if a stop job is queued for @u. */
+ bool unit_stop_pending(Unit *u) {
+ assert(u);
+
+ /* This call does check the current state of the unit. It's
+ * hence useful to be called from state change calls of the
+ * unit itself, where the state isn't updated yet. This is
+ * different from unit_inactive_or_pending() which checks both
+ * the current state and for a queued job. */
+
+ return unit_has_job_type(u, JOB_STOP);
+ }
+
+ /* Returns true if the unit is inactive or going down: either its current state is
+ * inactive/deactivating, or a stop job is queued for it. */
+ bool unit_inactive_or_pending(Unit *u) {
+ assert(u);
+
+ return UNIT_IS_INACTIVE_OR_DEACTIVATING(unit_active_state(u)) ||
+ unit_stop_pending(u);
+ }
+
+ /* Returns true if the unit is active or going up: either its current state is
+ * active/activating, or a job that will start it is queued. */
+ bool unit_active_or_pending(Unit *u) {
+ assert(u);
+
+ if (UNIT_IS_ACTIVE_OR_ACTIVATING(unit_active_state(u)))
+ return true;
+
+ return u->job &&
+ IN_SET(u->job->type, JOB_START, JOB_RELOAD_OR_START, JOB_RESTART);
+ }
+
+ /* Default implementation of the "will restart" check: a queued start job means
+ * the unit is about to come back. */
+ bool unit_will_restart_default(Unit *u) {
+ assert(u);
+
+ return unit_has_job_type(u, JOB_START);
+ }
+
+ /* Returns true if the unit type reports it is scheduled to restart; unit types
+ * without a will_restart hook never restart implicitly. */
+ bool unit_will_restart(Unit *u) {
+ assert(u);
+
+ return UNIT_VTABLE(u)->will_restart && UNIT_VTABLE(u)->will_restart(u);
+ }
+
+ /* Forwards a cgroup OOM event to the unit type's handler; @managed_oom indicates
+ * whether the kill came from systemd-oomd rather than the kernel. */
+ void unit_notify_cgroup_oom(Unit *u, bool managed_oom) {
+ assert(u);
+
+ if (UNIT_VTABLE(u)->notify_cgroup_oom)
+ UNIT_VTABLE(u)->notify_cgroup_oom(u, managed_oom);
+ }
+
+ /* Builds a set containing @main_pid and @control_pid (those > 0), used as the
+ * "don't kill these" exclusion set for cgroup-wide kills. Returns NULL on OOM. */
+ static Set *unit_pid_set(pid_t main_pid, pid_t control_pid) {
+ _cleanup_set_free_ Set *pid_set = NULL;
+ int r;
+
+ pid_set = set_new(NULL);
+ if (!pid_set)
+ return NULL;
+
+ /* Exclude the main/control pids from being killed via the cgroup */
+ if (main_pid > 0) {
+ r = set_put(pid_set, PID_TO_PTR(main_pid));
+ if (r < 0)
+ return NULL;
+ }
+
+ if (control_pid > 0) {
+ r = set_put(pid_set, PID_TO_PTR(control_pid));
+ if (r < 0)
+ return NULL;
+ }
+
+ return TAKE_PTR(pid_set);
+ }
+
+ /* Per-process logging callback for cg_kill_recursive(): logs which PID is about to
+ * receive the signal. Returns 1 so the kill proceeds. */
+ static int kill_common_log(const PidRef *pid, int signo, void *userdata) {
+ _cleanup_free_ char *comm = NULL;
+ Unit *u = ASSERT_PTR(userdata);
+
+ /* Best-effort: comm stays NULL if the process vanished already. */
+ (void) pidref_get_comm(pid, &comm);
+
+ log_unit_info(u, "Sending signal SIG%s to process " PID_FMT " (%s) on client request.",
+ signal_to_string(signo), pid->pid, strna(comm));
+
+ return 1;
+ }
+
+ /* Delivers @signo to @pidref either via plain kill() (SI_USER) or via sigqueue()
+ * with attached @value (SI_QUEUE). Only those two codes are accepted. */
+ static int kill_or_sigqueue(PidRef* pidref, int signo, int code, int value) {
+ assert(pidref_is_set(pidref));
+ assert(SIGNAL_VALID(signo));
+
+ switch (code) {
+
+ case SI_USER:
+ log_debug("Killing " PID_FMT " with signal SIG%s.", pidref->pid, signal_to_string(signo));
+ return pidref_kill(pidref, signo);
+
+ case SI_QUEUE:
+ log_debug("Enqueuing value %i to " PID_FMT " on signal SIG%s.", value, pidref->pid, signal_to_string(signo));
+ return pidref_sigqueue(pidref, signo, value);
+
+ default:
+ assert_not_reached();
+ }
+ }
+
+ /* Sends @signo (optionally with a queued @value if @code is SI_QUEUE) to the unit's
+ * processes selected by @who: main, control, all, or the *_FAIL variants that also
+ * fail if nothing was killed. The first error encountered is recorded in @error and
+ * returned, but delivery to the remaining targets is still attempted. */
+ int unit_kill(
+ Unit *u,
+ KillWho who,
+ int signo,
+ int code,
+ int value,
+ sd_bus_error *error) {
+
+ PidRef *main_pid, *control_pid;
+ bool killed = false;
+ int ret = 0, r;
+
+ /* This is the common implementation for explicit user-requested killing of unit processes, shared by
+ * various unit types. Do not confuse with unit_kill_context(), which is what we use when we want to
+ * stop a service ourselves. */
+
+ assert(u);
+ assert(who >= 0);
+ assert(who < _KILL_WHO_MAX);
+ assert(SIGNAL_VALID(signo));
+ assert(IN_SET(code, SI_USER, SI_QUEUE));
+
+ main_pid = unit_main_pid(u);
+ control_pid = unit_control_pid(u);
+
+ if (!UNIT_HAS_CGROUP_CONTEXT(u) && !main_pid && !control_pid)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_NOT_SUPPORTED, "Unit type does not support process killing.");
+
+ /* Validate the requested targets up front, before sending anything. */
+ if (IN_SET(who, KILL_MAIN, KILL_MAIN_FAIL)) {
+ if (!main_pid)
+ return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_PROCESS, "%s units have no main processes", unit_type_to_string(u->type));
+ if (!pidref_is_set(main_pid))
+ return sd_bus_error_set_const(error, BUS_ERROR_NO_SUCH_PROCESS, "No main process to kill");
+ }
+
+ if (IN_SET(who, KILL_CONTROL, KILL_CONTROL_FAIL)) {
+ if (!control_pid)
+ return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_PROCESS, "%s units have no control processes", unit_type_to_string(u->type));
+ if (!pidref_is_set(control_pid))
+ return sd_bus_error_set_const(error, BUS_ERROR_NO_SUCH_PROCESS, "No control process to kill");
+ }
+
+ /* Deliver to the control process if requested (directly or as part of "all"). */
+ if (pidref_is_set(control_pid) &&
+ IN_SET(who, KILL_CONTROL, KILL_CONTROL_FAIL, KILL_ALL, KILL_ALL_FAIL)) {
+ _cleanup_free_ char *comm = NULL;
+ (void) pidref_get_comm(control_pid, &comm);
+
+ r = kill_or_sigqueue(control_pid, signo, code, value);
+ if (r < 0) {
+ ret = r;
+
+ /* Report this failure both to the logs and to the client */
+ sd_bus_error_set_errnof(
+ error, r,
+ "Failed to send signal SIG%s to control process " PID_FMT " (%s): %m",
+ signal_to_string(signo), control_pid->pid, strna(comm));
+ log_unit_warning_errno(
+ u, r,
+ "Failed to send signal SIG%s to control process " PID_FMT " (%s) on client request: %m",
+ signal_to_string(signo), control_pid->pid, strna(comm));
+ } else {
+ log_unit_info(u, "Sent signal SIG%s to control process " PID_FMT " (%s) on client request.",
+ signal_to_string(signo), control_pid->pid, strna(comm));
+ killed = true;
+ }
+ }
+
+ /* Deliver to the main process if requested; only the first error is kept in @error. */
+ if (pidref_is_set(main_pid) &&
+ IN_SET(who, KILL_MAIN, KILL_MAIN_FAIL, KILL_ALL, KILL_ALL_FAIL)) {
+ _cleanup_free_ char *comm = NULL;
+ (void) pidref_get_comm(main_pid, &comm);
+
+ r = kill_or_sigqueue(main_pid, signo, code, value);
+ if (r < 0) {
+ if (ret == 0) {
+ ret = r;
+
+ sd_bus_error_set_errnof(
+ error, r,
+ "Failed to send signal SIG%s to main process " PID_FMT " (%s): %m",
+ signal_to_string(signo), main_pid->pid, strna(comm));
+ }
+
+ log_unit_warning_errno(
+ u, r,
+ "Failed to send signal SIG%s to main process " PID_FMT " (%s) on client request: %m",
+ signal_to_string(signo), main_pid->pid, strna(comm));
+
+ } else {
+ log_unit_info(u, "Sent signal SIG%s to main process " PID_FMT " (%s) on client request.",
+ signal_to_string(signo), main_pid->pid, strna(comm));
+ killed = true;
+ }
+ }
+
+ /* Note: if we shall enqueue rather than kill we won't do this via the cgroup mechanism, since it
+ * doesn't really make much sense (and given that enqueued values are a relatively expensive
+ * resource, and we shouldn't allow us to be subjects for such allocation sprees) */
+ if (IN_SET(who, KILL_ALL, KILL_ALL_FAIL) && u->cgroup_path && code == SI_USER) {
+ _cleanup_set_free_ Set *pid_set = NULL;
+
+ /* Exclude the main/control pids from being killed via the cgroup */
+ pid_set = unit_pid_set(main_pid ? main_pid->pid : 0, control_pid ? control_pid->pid : 0);
+ if (!pid_set)
+ return log_oom();
+
+ r = cg_kill_recursive(u->cgroup_path, signo, 0, pid_set, kill_common_log, u);
+ if (r < 0) {
+ /* An empty/vanished cgroup is not an error worth reporting. */
+ if (!IN_SET(r, -ESRCH, -ENOENT)) {
+ if (ret == 0) {
+ ret = r;
+
+ sd_bus_error_set_errnof(
+ error, r,
+ "Failed to send signal SIG%s to auxiliary processes: %m",
+ signal_to_string(signo));
+ }
+
+ log_unit_warning_errno(
+ u, r,
+ "Failed to send signal SIG%s to auxiliary processes on client request: %m",
+ signal_to_string(signo));
+ }
+ } else
+ killed = true;
+ }
+
+ /* If the "fail" versions of the operation are requested, then complain if the set of processes we killed is empty */
+ if (ret == 0 && !killed && IN_SET(who, KILL_ALL_FAIL, KILL_CONTROL_FAIL, KILL_MAIN_FAIL))
+ return sd_bus_error_set_const(error, BUS_ERROR_NO_SUCH_PROCESS, "No matching processes to kill");
+
+ return ret;
+ }
+
+ /* Collects the set of units following @u into *s via the type-specific hook;
+ * *s is set to NULL for unit types without one. */
+ int unit_following_set(Unit *u, Set **s) {
+ assert(u);
+ assert(s);
+
+ if (!UNIT_VTABLE(u)->following_set) {
+ *s = NULL;
+ return 0;
+ }
+
+ return UNIT_VTABLE(u)->following_set(u, s);
+ }
+
+ /* Returns the unit file enablement state, querying it lazily on first use and
+ * caching the result in u->unit_file_state (UNIT_FILE_BAD on lookup failure). */
+ UnitFileState unit_get_unit_file_state(Unit *u) {
+ int r;
+
+ assert(u);
+
+ if (u->unit_file_state < 0 && u->fragment_path) {
+ r = unit_file_get_state(
+ u->manager->runtime_scope,
+ NULL,
+ u->id,
+ &u->unit_file_state);
+ if (r < 0)
+ u->unit_file_state = UNIT_FILE_BAD;
+ }
+
+ return u->unit_file_state;
+ }
+
+ /* Returns the preset policy (enable/disable) for the unit file, querying it lazily
+ * on first use and caching the result (including errors) in u->unit_file_preset. */
+ PresetAction unit_get_unit_file_preset(Unit *u) {
+ int r;
+
+ assert(u);
+
+ if (u->unit_file_preset < 0 && u->fragment_path) {
+ _cleanup_free_ char *bn = NULL;
+
+ r = path_extract_filename(u->fragment_path, &bn);
+ if (r < 0)
+ return (u->unit_file_preset = r);
+
+ /* path_extract_filename() returns O_DIRECTORY when the path refers to a directory. */
+ if (r == O_DIRECTORY)
+ return (u->unit_file_preset = -EISDIR);
+
+ u->unit_file_preset = unit_file_query_preset(
+ u->manager->runtime_scope,
+ NULL,
+ bn,
+ NULL);
+ }
+
+ return u->unit_file_preset;
+ }
+
+ /* Points @ref from @source at @target, unsetting any previous target first, and
+ * links the ref into the target's refs_by_target list. Returns @target. */
+ Unit* unit_ref_set(UnitRef *ref, Unit *source, Unit *target) {
+ assert(ref);
+ assert(source);
+ assert(target);
+
+ if (ref->target)
+ unit_ref_unset(ref);
+
+ ref->source = source;
+ ref->target = target;
+ LIST_PREPEND(refs_by_target, target->refs_by_target, ref);
+ return target;
+ }
+
+ /* Clears @ref, unlinking it from the target's refs_by_target list. A no-op if the
+ * ref is already unset. */
+ void unit_ref_unset(UnitRef *ref) {
+ assert(ref);
+
+ if (!ref->target)
+ return;
+
+ /* We are about to drop a reference to the unit, make sure the garbage collection has a look at it as it might
+ * be unreferenced now. */
+ unit_add_to_gc_queue(ref->target);
+
+ LIST_REMOVE(refs_by_target, ref->target->refs_by_target, ref);
+ ref->source = ref->target = NULL;
+ }
+
+ /* Derives a DynamicUser= user name from the unit's name prefix. If the prefix is not
+ * a valid user name, a stable "_du" + siphash-of-prefix name is synthesized instead. */
+ static int user_from_unit_name(Unit *u, char **ret) {
+
+ /* Fixed key so the hashed fallback name is stable across invocations. */
+ static const uint8_t hash_key[] = {
+ 0x58, 0x1a, 0xaf, 0xe6, 0x28, 0x58, 0x4e, 0x96,
+ 0xb4, 0x4e, 0xf5, 0x3b, 0x8c, 0x92, 0x07, 0xec
+ };
+
+ _cleanup_free_ char *n = NULL;
+ int r;
+
+ r = unit_name_to_prefix(u->id, &n);
+ if (r < 0)
+ return r;
+
+ if (valid_user_group_name(n, 0)) {
+ *ret = TAKE_PTR(n);
+ return 0;
+ }
+
+ /* If we can't use the unit name as a user name, then let's hash it and use that */
+ if (asprintf(ret, "_du%016" PRIx64, siphash24(n, strlen(n), hash_key)) < 0)
+ return -ENOMEM;
+
+ return 0;
+ }
+
+ /* Applies manager defaults and derived policy to the unit's exec and cgroup
+ * contexts after all settings have been parsed: default rlimits, working
+ * directory for user managers, capability restrictions implied by Protect*=
+ * settings, the DynamicUser= sandbox, and device-allow entries implied by
+ * RootImage=/ProtectClock=/encrypted credentials. */
+ int unit_patch_contexts(Unit *u) {
+ CGroupContext *cc;
+ ExecContext *ec;
+ int r;
+
+ assert(u);
+
+ /* Patch in the manager defaults into the exec and cgroup
+ * contexts, _after_ the rest of the settings have been
+ * initialized */
+
+ ec = unit_get_exec_context(u);
+ if (ec) {
+ /* This only copies in the ones that need memory */
+ for (unsigned i = 0; i < _RLIMIT_MAX; i++)
+ if (u->manager->defaults.rlimit[i] && !ec->rlimit[i]) {
+ ec->rlimit[i] = newdup(struct rlimit, u->manager->defaults.rlimit[i], 1);
+ if (!ec->rlimit[i])
+ return -ENOMEM;
+ }
+
+ if (MANAGER_IS_USER(u->manager) &&
+ !ec->working_directory) {
+
+ r = get_home_dir(&ec->working_directory);
+ if (r < 0)
+ return r;
+
+ /* Allow user services to run, even if the
+ * home directory is missing */
+ ec->working_directory_missing_ok = true;
+ }
+
+ /* Drop capabilities that would defeat the respective protection settings. */
+ if (ec->private_devices)
+ ec->capability_bounding_set &= ~((UINT64_C(1) << CAP_MKNOD) | (UINT64_C(1) << CAP_SYS_RAWIO));
+
+ if (ec->protect_kernel_modules)
+ ec->capability_bounding_set &= ~(UINT64_C(1) << CAP_SYS_MODULE);
+
+ if (ec->protect_kernel_logs)
+ ec->capability_bounding_set &= ~(UINT64_C(1) << CAP_SYSLOG);
+
+ if (ec->protect_clock)
+ ec->capability_bounding_set &= ~((UINT64_C(1) << CAP_SYS_TIME) | (UINT64_C(1) << CAP_WAKE_ALARM));
+
+ if (ec->dynamic_user) {
+ if (!ec->user) {
+ r = user_from_unit_name(u, &ec->user);
+ if (r < 0)
+ return r;
+ }
+
+ if (!ec->group) {
+ ec->group = strdup(ec->user);
+ if (!ec->group)
+ return -ENOMEM;
+ }
+
+ /* If the dynamic user option is on, let's make sure that the unit can't leave its
+ * UID/GID around in the file system or on IPC objects. Hence enforce a strict
+ * sandbox. */
+
+ ec->private_tmp = true;
+ ec->remove_ipc = true;
+ ec->protect_system = PROTECT_SYSTEM_STRICT;
+ if (ec->protect_home == PROTECT_HOME_NO)
+ ec->protect_home = PROTECT_HOME_READ_ONLY;
+
+ /* Make sure this service can neither benefit from SUID/SGID binaries nor create
+ * them. */
+ ec->no_new_privileges = true;
+ ec->restrict_suid_sgid = true;
+ }
+
+ for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++)
+ exec_directory_sort(ec->directories + dt);
+ }
+
+ cc = unit_get_cgroup_context(u);
+ if (cc && ec) {
+
+ if (ec->private_devices &&
+ cc->device_policy == CGROUP_DEVICE_POLICY_AUTO)
+ cc->device_policy = CGROUP_DEVICE_POLICY_CLOSED;
+
+ /* Only add these if needed, as they imply that everything else is blocked. */
+ if (cc->device_policy != CGROUP_DEVICE_POLICY_AUTO || cc->device_allow) {
+ if (ec->root_image || ec->mount_images) {
+
+ /* When RootImage= or MountImages= is specified, the following devices are touched. */
+ FOREACH_STRING(p, "/dev/loop-control", "/dev/mapper/control") {
+ r = cgroup_context_add_device_allow(cc, p, CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE);
+ if (r < 0)
+ return r;
+ }
+ FOREACH_STRING(p, "block-loop", "block-blkext", "block-device-mapper") {
+ r = cgroup_context_add_device_allow(cc, p, CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE|CGROUP_DEVICE_MKNOD);
+ if (r < 0)
+ return r;
+ }
+
+ /* Make sure "block-loop" can be resolved, i.e. make sure "loop" shows up in /proc/devices.
+ * Same for mapper and verity. */
+ FOREACH_STRING(p, "modprobe@loop.service", "modprobe@dm_mod.service", "modprobe@dm_verity.service") {
+ r = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_WANTS, p, true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ if (ec->protect_clock) {
+ r = cgroup_context_add_device_allow(cc, "char-rtc", CGROUP_DEVICE_READ);
+ if (r < 0)
+ return r;
+ }
+
+ /* If there are encrypted credentials we might need to access the TPM. */
+ if (exec_context_has_encrypted_credentials(ec)) {
+ r = cgroup_context_add_device_allow(cc, "char-tpm", CGROUP_DEVICE_READ|CGROUP_DEVICE_WRITE);
+ if (r < 0)
+ return r;
+ }
+ }
+ }
+
+ return 0;
+ }
+
+ /* Returns the ExecContext embedded in @u's type-specific object, or NULL if the
+ * unit type carries none (zero offset in the vtable). */
+ ExecContext *unit_get_exec_context(const Unit *u) {
+ size_t offset;
+ assert(u);
+
+ if (u->type < 0)
+ return NULL;
+
+ offset = UNIT_VTABLE(u)->exec_context_offset;
+ if (offset <= 0)
+ return NULL;
+
+ return (ExecContext*) ((uint8_t*) u + offset);
+ }
+
+ /* Returns the KillContext embedded in @u's type-specific object, or NULL if the
+ * unit type carries none (zero offset in the vtable). */
+ KillContext *unit_get_kill_context(Unit *u) {
+ size_t offset;
+ assert(u);
+
+ if (u->type < 0)
+ return NULL;
+
+ offset = UNIT_VTABLE(u)->kill_context_offset;
+ if (offset <= 0)
+ return NULL;
+
+ return (KillContext*) ((uint8_t*) u + offset);
+ }
+
+ /* Returns the CGroupContext embedded in @u's type-specific object, or NULL if the
+ * unit type carries none (zero offset in the vtable). */
+ CGroupContext *unit_get_cgroup_context(Unit *u) {
+ size_t offset;
+
+ /* Assert before dereferencing, for consistency with unit_get_exec_context()
+ * and unit_get_kill_context(). */
+ assert(u);
+
+ if (u->type < 0)
+ return NULL;
+
+ offset = UNIT_VTABLE(u)->cgroup_context_offset;
+ if (offset <= 0)
+ return NULL;
+
+ return (CGroupContext*) ((uint8_t*) u + offset);
+ }
+
+ /* Returns the ExecRuntime pointer stored in @u's type-specific object, or NULL if
+ * the unit type carries none (zero offset in the vtable). Note this field is a
+ * pointer, hence the extra dereference compared to the context accessors. */
+ ExecRuntime *unit_get_exec_runtime(Unit *u) {
+ size_t offset;
+
+ /* Assert before dereferencing, for consistency with unit_get_exec_context()
+ * and unit_get_kill_context(). */
+ assert(u);
+
+ if (u->type < 0)
+ return NULL;
+
+ offset = UNIT_VTABLE(u)->exec_runtime_offset;
+ if (offset <= 0)
+ return NULL;
+
+ return *(ExecRuntime**) ((uint8_t*) u + offset);
+ }
+
+ /* Picks the directory a drop-in for @u should be written to, based on @flags:
+ * the transient dir for transient units, otherwise the persistent or runtime
+ * control dir. Returns NULL if nothing should be written. */
+ static const char* unit_drop_in_dir(Unit *u, UnitWriteFlags flags) {
+ assert(u);
+
+ if (UNIT_WRITE_FLAGS_NOOP(flags))
+ return NULL;
+
+ if (u->transient) /* Redirect drop-ins for transient units always into the transient directory. */
+ return u->manager->lookup_paths.transient;
+
+ if (flags & UNIT_PERSISTENT)
+ return u->manager->lookup_paths.persistent_control;
+
+ if (flags & UNIT_RUNTIME)
+ return u->manager->lookup_paths.runtime_control;
+
+ return NULL;
+ }
+
+ /* Escapes @s for inclusion in a unit file setting, per @flags: specifier escaping,
+ * then at most one of exec-syntax (shell + optional "$" doubling) or C escaping. */
+ const char* unit_escape_setting(const char *s, UnitWriteFlags flags, char **buf) {
+ assert(s);
+ assert(popcount(flags & (UNIT_ESCAPE_EXEC_SYNTAX_ENV | UNIT_ESCAPE_EXEC_SYNTAX | UNIT_ESCAPE_C)) <= 1);
+ assert(buf);
+
+ _cleanup_free_ char *t = NULL;
+
+ /* Returns a string with any escaping done. If no escaping was necessary, *buf is set to NULL, and
+ * the input pointer is returned as-is. If an allocation was needed, the return buffer pointer is
+ * written to *buf. This means the return value always contains a properly escaped version, but *buf
+ * only contains a pointer if an allocation was made. Callers can use this to optimize memory
+ * allocations. */
+
+ if (flags & UNIT_ESCAPE_SPECIFIERS) {
+ t = specifier_escape(s);
+ if (!t)
+ return NULL;
+
+ s = t;
+ }
+
+ /* We either do C-escaping or shell-escaping, to additionally escape characters that we parse for
+ * ExecStart= and friends, i.e. '$' and quotes. */
+
+ if (flags & (UNIT_ESCAPE_EXEC_SYNTAX_ENV | UNIT_ESCAPE_EXEC_SYNTAX)) {
+ char *t2;
+
+ /* _ENV additionally doubles '$' so the result survives environment expansion. */
+ if (flags & UNIT_ESCAPE_EXEC_SYNTAX_ENV) {
+ t2 = strreplace(s, "$", "$$");
+ if (!t2)
+ return NULL;
+ free_and_replace(t, t2);
+ }
+
+ t2 = shell_escape(t ?: s, "\"");
+ if (!t2)
+ return NULL;
+ free_and_replace(t, t2);
+
+ s = t;
+
+ } else if (flags & UNIT_ESCAPE_C) {
+ char *t2;
+
+ t2 = cescape(s);
+ if (!t2)
+ return NULL;
+ free_and_replace(t, t2);
+
+ s = t;
+ }
+
+ /* Hand ownership of the (possibly NULL) allocation to the caller. */
+ *buf = TAKE_PTR(t);
+ return s;
+ }
+
+ /* Escapes each entry of @l per @flags, wraps it in double quotes, and joins all
+ * entries with single spaces. Returns the new string, or NULL on OOM/escape failure. */
+ char* unit_concat_strv(char **l, UnitWriteFlags flags) {
+ _cleanup_free_ char *result = NULL;
+ size_t n = 0;
+
+ /* Takes a list of strings, escapes them, and concatenates them. This may be used to format command
+ * lines in a way suitable for ExecStart= stanzas. */
+
+ STRV_FOREACH(i, l) {
+ _cleanup_free_ char *buf = NULL;
+ const char *p;
+ size_t a;
+ char *q;
+
+ p = unit_escape_setting(*i, flags, &buf);
+ if (!p)
+ return NULL;
+
+ a = (n > 0) + 1 + strlen(p) + 1; /* separating space + " + entry + " */
+ if (!GREEDY_REALLOC(result, n + a + 1))
+ return NULL;
+
+ q = result + n;
+ if (n > 0)
+ *(q++) = ' ';
+
+ *(q++) = '"';
+ q = stpcpy(q, p);
+ *(q++) = '"';
+
+ n += a;
+ }
+
+ /* Ensure room for the terminating NUL even for an empty input list. */
+ if (!GREEDY_REALLOC(result, n + 1))
+ return NULL;
+
+ result[n] = 0;
+
+ return TAKE_PTR(result);
+ }
+
+ /* Persists one setting for @u: appended to the transient unit file while one is
+ * being created, otherwise written as a 50-<name>.conf drop-in in the directory
+ * selected by @flags. @data is escaped per @flags and prefixed with the proper
+ * section header. */
+ int unit_write_setting(Unit *u, UnitWriteFlags flags, const char *name, const char *data) {
+ _cleanup_free_ char *p = NULL, *q = NULL, *escaped = NULL;
+ const char *dir, *wrapped;
+ int r;
+
+ assert(u);
+ assert(name);
+ assert(data);
+
+ if (UNIT_WRITE_FLAGS_NOOP(flags))
+ return 0;
+
+ data = unit_escape_setting(data, flags, &escaped);
+ if (!data)
+ return -ENOMEM;
+
+ /* Prefix the section header. If we are writing this out as transient file, then let's suppress this if the
+ * previous section header is the same */
+
+ if (flags & UNIT_PRIVATE) {
+ if (!UNIT_VTABLE(u)->private_section)
+ return -EINVAL;
+
+ if (!u->transient_file || u->last_section_private < 0)
+ data = strjoina("[", UNIT_VTABLE(u)->private_section, "]\n", data);
+ else if (u->last_section_private == 0)
+ data = strjoina("\n[", UNIT_VTABLE(u)->private_section, "]\n", data);
+ } else {
+ if (!u->transient_file || u->last_section_private < 0)
+ data = strjoina("[Unit]\n", data);
+ else if (u->last_section_private > 0)
+ data = strjoina("\n[Unit]\n", data);
+ }
+
+ if (u->transient_file) {
+ /* When this is a transient unit file in creation, then let's not create a new drop-in but instead
+ * write to the transient unit file. */
+ fputs(data, u->transient_file);
+
+ if (!endswith(data, "\n"))
+ fputc('\n', u->transient_file);
+
+ /* Remember which section we wrote this entry to */
+ u->last_section_private = !!(flags & UNIT_PRIVATE);
+ return 0;
+ }
+
+ dir = unit_drop_in_dir(u, flags);
+ if (!dir)
+ return -EINVAL;
+
+ wrapped = strjoina("# This is a drop-in unit file extension, created via \"systemctl set-property\"\n"
+ "# or an equivalent operation. Do not edit.\n",
+ data,
+ "\n");
+
+ r = drop_in_file(dir, u->id, 50, name, &p, &q);
+ if (r < 0)
+ return r;
+
+ (void) mkdir_p_label(p, 0755);
+
+ /* Make sure the drop-in dir is registered in our path cache. This way we don't need to stupidly
+ * recreate the cache after every drop-in we write. */
+ if (u->manager->unit_path_cache) {
+ r = set_put_strdup(&u->manager->unit_path_cache, p);
+ if (r < 0)
+ return r;
+ }
+
+ r = write_string_file_atomic_label(q, wrapped);
+ if (r < 0)
+ return r;
+
+ /* Ownership of q moves into the strv on success, hence the manual NULLing below. */
+ r = strv_push(&u->dropin_paths, q);
+ if (r < 0)
+ return r;
+ q = NULL;
+
+ strv_uniq(u->dropin_paths);
+
+ u->dropin_mtime = now(CLOCK_REALTIME);
+
+ return 0;
+ }
+
+/* printf-style convenience wrapper around unit_write_setting(): formats the
+ * setting line, then hands it over. Returns 0 on success (or when the flags
+ * request a no-op), -ENOMEM on allocation failure, or whatever
+ * unit_write_setting() returns. */
+int unit_write_settingf(Unit *u, UnitWriteFlags flags, const char *name, const char *format, ...) {
+ _cleanup_free_ char *p = NULL;
+ va_list ap;
+ int r;
+
+ assert(u);
+ assert(name);
+ assert(format);
+
+ /* Check the no-op flags first so we don't format/allocate for nothing. */
+ if (UNIT_WRITE_FLAGS_NOOP(flags))
+ return 0;
+
+ va_start(ap, format);
+ r = vasprintf(&p, format, ap);
+ va_end(ap);
+
+ if (r < 0)
+ return -ENOMEM;
+
+ return unit_write_setting(u, flags, name, p);
+}
+
+/* Turns the unit into a transient unit: creates a fresh unit file in the
+ * transient lookup path, keeps it open in u->transient_file for subsequent
+ * unit_write_setting() calls, and resets any previously loaded on-disk state.
+ * Returns 0 on success, -EOPNOTSUPP if the unit type cannot be transient,
+ * -ENOMEM or -errno on failure. */
+int unit_make_transient(Unit *u) {
+ _cleanup_free_ char *path = NULL;
+ FILE *f;
+
+ assert(u);
+
+ if (!UNIT_VTABLE(u)->can_transient)
+ return -EOPNOTSUPP;
+
+ (void) mkdir_p_label(u->manager->lookup_paths.transient, 0755);
+
+ path = path_join(u->manager->lookup_paths.transient, u->id);
+ if (!path)
+ return -ENOMEM;
+
+ /* Let's open the file we'll write the transient settings into. This file is kept open as long as we are
+ * creating the transient, and is closed in unit_load(), as soon as we start loading the file. */
+
+ WITH_UMASK(0022) {
+ f = fopen(path, "we");
+ if (!f)
+ return -errno;
+ }
+
+ safe_fclose(u->transient_file);
+ u->transient_file = f;
+
+ free_and_replace(u->fragment_path, path);
+
+ /* Drop all previously loaded unit state; the transient file is now the only source. */
+ u->source_path = mfree(u->source_path);
+ u->dropin_paths = strv_free(u->dropin_paths);
+ u->fragment_mtime = u->source_mtime = u->dropin_mtime = 0;
+
+ u->load_state = UNIT_STUB;
+ u->load_error = 0;
+ u->transient = true;
+
+ unit_add_to_dbus_queue(u);
+ unit_add_to_gc_queue(u);
+
+ fputs("# This is a transient unit file, created programmatically via the systemd API. Do not edit.\n",
+ u->transient_file);
+
+ return 0;
+}
+
+/* cg_kill_log_func_t callback used during unit shutdown: logs which process is
+ * being sent which signal. Always returns 1 so the caller counts the process
+ * as killed, even when the log line is suppressed. userdata is the Unit. */
+static int log_kill(const PidRef *pid, int sig, void *userdata) {
+ _cleanup_free_ char *comm = NULL;
+
+ assert(pidref_is_set(pid));
+
+ (void) pidref_get_comm(pid, &comm);
+
+ /* Don't log about processes marked with brackets, under the assumption that these are temporary processes
+ only, like for example systemd's own PAM stub process. */
+ if (comm && comm[0] == '(')
+ /* Although we didn't log anything, as this callback is used in unit_kill_context we must return 1
+ * here to let the manager know that a process was killed. */
+ return 1;
+
+ log_unit_notice(userdata,
+ "Killing process " PID_FMT " (%s) with signal SIG%s.",
+ pid->pid,
+ strna(comm),
+ signal_to_string(sig));
+
+ return 1;
+}
+
+/* Maps a KillOperation to the signal configured for it in the KillContext.
+ * *ret_noteworthy is set to true for the "escalation" operations (final kill,
+ * watchdog) whose delivery should be logged per process. */
+static int operation_to_signal(
+ const KillContext *c,
+ KillOperation k,
+ bool *ret_noteworthy) {
+
+ assert(c);
+
+ switch (k) {
+
+ case KILL_TERMINATE:
+ case KILL_TERMINATE_AND_LOG:
+ *ret_noteworthy = false;
+ return c->kill_signal;
+
+ case KILL_RESTART:
+ *ret_noteworthy = false;
+ return restart_kill_signal(c);
+
+ case KILL_KILL:
+ *ret_noteworthy = true;
+ return c->final_kill_signal;
+
+ case KILL_WATCHDOG:
+ *ret_noteworthy = true;
+ return c->watchdog_signal;
+
+ default:
+ assert_not_reached();
+ }
+}
+
+/* Kills the unit's processes according to the configured KillMode=: first the
+ * main and control PIDs directly, then (for control-group/mixed modes) the
+ * remainder of the cgroup recursively, optionally following up with SIGHUP.
+ * Returns > 0 if something worth waiting for was killed, 0 otherwise,
+ * negative errno on hard failure (OOM). */
+int unit_kill_context(
+ Unit *u,
+ KillContext *c,
+ KillOperation k,
+ PidRef* main_pid,
+ PidRef* control_pid,
+ bool main_pid_alien) {
+
+ bool wait_for_exit = false, send_sighup;
+ cg_kill_log_func_t log_func = NULL;
+ int sig, r;
+
+ assert(u);
+ assert(c);
+
+ /* Kill the processes belonging to this unit, in preparation for shutting the unit down. Returns > 0
+ * if we killed something worth waiting for, 0 otherwise. Do not confuse with unit_kill_common()
+ * which is used for user-requested killing of unit processes. */
+
+ if (c->kill_mode == KILL_NONE)
+ return 0;
+
+ bool noteworthy;
+ sig = operation_to_signal(c, k, &noteworthy);
+ if (noteworthy)
+ log_func = log_kill;
+
+ /* SIGHUP follow-up only makes sense for the termination operations, and is
+ * redundant if the kill signal already is SIGHUP. */
+ send_sighup =
+ c->send_sighup &&
+ IN_SET(k, KILL_TERMINATE, KILL_TERMINATE_AND_LOG) &&
+ sig != SIGHUP;
+
+ if (pidref_is_set(main_pid)) {
+ if (log_func)
+ log_func(main_pid, sig, u);
+
+ r = pidref_kill_and_sigcont(main_pid, sig);
+ if (r < 0 && r != -ESRCH) {
+ _cleanup_free_ char *comm = NULL;
+ (void) pidref_get_comm(main_pid, &comm);
+
+ log_unit_warning_errno(u, r, "Failed to kill main process " PID_FMT " (%s), ignoring: %m", main_pid->pid, strna(comm));
+ } else {
+ /* An alien (reused/foreign) main PID is not ours to wait for. */
+ if (!main_pid_alien)
+ wait_for_exit = true;
+
+ if (r != -ESRCH && send_sighup)
+ (void) pidref_kill(main_pid, SIGHUP);
+ }
+ }
+
+ if (pidref_is_set(control_pid)) {
+ if (log_func)
+ log_func(control_pid, sig, u);
+
+ r = pidref_kill_and_sigcont(control_pid, sig);
+ if (r < 0 && r != -ESRCH) {
+ _cleanup_free_ char *comm = NULL;
+ (void) pidref_get_comm(control_pid, &comm);
+
+ log_unit_warning_errno(u, r, "Failed to kill control process " PID_FMT " (%s), ignoring: %m", control_pid->pid, strna(comm));
+ } else {
+ wait_for_exit = true;
+
+ if (r != -ESRCH && send_sighup)
+ (void) pidref_kill(control_pid, SIGHUP);
+ }
+ }
+
+ if (u->cgroup_path &&
+ (c->kill_mode == KILL_CONTROL_GROUP || (c->kill_mode == KILL_MIXED && k == KILL_KILL))) {
+ _cleanup_set_free_ Set *pid_set = NULL;
+
+ /* Exclude the main/control pids from being killed via the cgroup */
+ pid_set = unit_pid_set(main_pid ? main_pid->pid : 0, control_pid ? control_pid->pid : 0);
+ if (!pid_set)
+ return -ENOMEM;
+
+ r = cg_kill_recursive(
+ u->cgroup_path,
+ sig,
+ CGROUP_SIGCONT|CGROUP_IGNORE_SELF,
+ pid_set,
+ log_func, u);
+ if (r < 0) {
+ if (!IN_SET(r, -EAGAIN, -ESRCH, -ENOENT))
+ log_unit_warning_errno(u, r, "Failed to kill control group %s, ignoring: %m", empty_to_root(u->cgroup_path));
+
+ } else if (r > 0) {
+
+ /* FIXME: For now, on the legacy hierarchy, we will not wait for the cgroup members to die if
+ * we are running in a container or if this is a delegation unit, simply because cgroup
+ * notification is unreliable in these cases. It doesn't work at all in containers, and outside
+ * of containers it can be confused easily by left-over directories in the cgroup — which
+ * however should not exist in non-delegated units. On the unified hierarchy that's different,
+ * there we get proper events. Hence rely on them. */
+
+ if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0 ||
+ (detect_container() == 0 && !unit_cgroup_delegate(u)))
+ wait_for_exit = true;
+
+ if (send_sighup) {
+ /* Rebuild the exclusion set, since cg_kill_recursive() consumed it above. */
+ set_free(pid_set);
+
+ pid_set = unit_pid_set(main_pid ? main_pid->pid : 0, control_pid ? control_pid->pid : 0);
+ if (!pid_set)
+ return -ENOMEM;
+
+ (void) cg_kill_recursive(
+ u->cgroup_path,
+ SIGHUP,
+ CGROUP_IGNORE_SELF,
+ pid_set,
+ /* kill_log= */ NULL,
+ /* userdata= */ NULL);
+ }
+ }
+ }
+
+ return wait_for_exit;
+}
+
+/* Registers 'path' (and, via the manager-wide prefix table, all its parent
+ * directories) as a mount requirement of this unit, so that newly appearing
+ * mount units can wire up dependencies. Returns 0 on success (including when
+ * the path was already registered), -EINVAL for relative paths, -EPERM for
+ * non-normalizable paths, negative errno otherwise. */
+int unit_require_mounts_for(Unit *u, const char *path, UnitDependencyMask mask) {
+ int r;
+
+ assert(u);
+ assert(path);
+
+ /* Registers a unit for requiring a certain path and all its prefixes. We keep a hashtable of these
+ * paths in the unit (from the path to the UnitDependencyInfo structure indicating how to the
+ * dependency came to be). However, we build a prefix table for all possible prefixes so that new
+ * appearing mount units can easily determine which units to make themselves a dependency of. */
+
+ if (!path_is_absolute(path))
+ return -EINVAL;
+
+ if (hashmap_contains(u->requires_mounts_for, path)) /* Exit quickly if the path is already covered. */
+ return 0;
+
+ /* Use the canonical form of the path as the stored key. We call path_is_normalized()
+ * only after simplification, since path_is_normalized() rejects paths with '.'.
+ * path_is_normalized() also verifies that the path fits in PATH_MAX. */
+ _cleanup_free_ char *p = NULL;
+ r = path_simplify_alloc(path, &p);
+ if (r < 0)
+ return r;
+ path = p;
+
+ if (!path_is_normalized(path))
+ return -EPERM;
+
+ UnitDependencyInfo di = {
+ .origin_mask = mask
+ };
+
+ r = hashmap_ensure_put(&u->requires_mounts_for, &path_hash_ops, p, di.data)
+ if (r < 0)
+ return r;
+ assert(r > 0);
+ TAKE_PTR(p); /* path remains a valid pointer to the string stored in the hashmap */
+
+ /* Now enter the path and all its parents into the manager-wide prefix table. */
+ char prefix[strlen(path) + 1];
+ PATH_FOREACH_PREFIX_MORE(prefix, path) {
+ Set *x;
+
+ x = hashmap_get(u->manager->units_requiring_mounts_for, prefix);
+ if (!x) {
+ _cleanup_free_ char *q = NULL;
+
+ r = hashmap_ensure_allocated(&u->manager->units_requiring_mounts_for, &path_hash_ops);
+ if (r < 0)
+ return r;
+
+ q = strdup(prefix);
+ if (!q)
+ return -ENOMEM;
+
+ x = set_new(NULL);
+ if (!x)
+ return -ENOMEM;
+
+ r = hashmap_put(u->manager->units_requiring_mounts_for, q, x);
+ if (r < 0) {
+ set_free(x);
+ return r;
+ }
+ q = NULL;
+ }
+
+ r = set_put(x, u);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+/* Ensures the unit has an ExecRuntime: first tries to share the runtime of a
+ * unit we join the namespace of, otherwise creates a new one, plus dynamic
+ * credentials when DynamicUser= is on. Idempotent: returns 0 immediately if a
+ * runtime already exists. Returns negative errno on failure. */
+int unit_setup_exec_runtime(Unit *u) {
+ _cleanup_(exec_shared_runtime_unrefp) ExecSharedRuntime *esr = NULL;
+ _cleanup_(dynamic_creds_unrefp) DynamicCreds *dcreds = NULL;
+ _cleanup_set_free_ Set *units = NULL;
+ ExecRuntime **rt;
+ ExecContext *ec;
+ size_t offset;
+ Unit *other;
+ int r;
+
+ offset = UNIT_VTABLE(u)->exec_runtime_offset;
+ assert(offset > 0);
+
+ /* Check if there already is an ExecRuntime for this unit? */
+ rt = (ExecRuntime**) ((uint8_t*) u + offset);
+ if (*rt)
+ return 0;
+
+ ec = unit_get_exec_context(u);
+ assert(ec);
+
+ r = unit_get_transitive_dependency_set(u, UNIT_ATOM_JOINS_NAMESPACE_OF, &units);
+ if (r < 0)
+ return r;
+
+ /* Try to get it from somebody else */
+ SET_FOREACH(other, units) {
+ r = exec_shared_runtime_acquire(u->manager, NULL, other->id, false, &esr);
+ if (r < 0)
+ return r;
+ if (r > 0)
+ break;
+ }
+
+ if (!esr) {
+ /* Nobody to share with; create our own shared runtime. */
+ r = exec_shared_runtime_acquire(u->manager, ec, u->id, true, &esr);
+ if (r < 0)
+ return r;
+ }
+
+ if (ec->dynamic_user) {
+ r = dynamic_creds_make(u->manager, ec->user, ec->group, &dcreds);
+ if (r < 0)
+ return r;
+ }
+
+ r = exec_runtime_make(u, ec, esr, dcreds, rt);
+ if (r < 0)
+ return r;
+
+ /* Ownership moved into *rt; disarm the cleanup handlers. */
+ TAKE_PTR(esr);
+ TAKE_PTR(dcreds);
+
+ return r;
+}
+
+/* Reports whether the given unit type is supported on this system. Honors a
+ * $SYSTEMD_SUPPORT_<TYPE> environment override (cached per type after the
+ * first lookup), then defers to the per-type supported() vtable hook. */
+bool unit_type_supported(UnitType t) {
+ static int8_t cache[_UNIT_TYPE_MAX] = {}; /* -1: disabled, 1: enabled: 0: don't know */
+ int r;
+
+ assert(t >= 0 && t < _UNIT_TYPE_MAX);
+
+ if (cache[t] == 0) {
+ char *e;
+
+ e = strjoina("SYSTEMD_SUPPORT_", unit_type_to_string(t));
+
+ r = getenv_bool(ascii_strupper(e));
+ if (r < 0 && r != -ENXIO)
+ log_debug_errno(r, "Failed to parse $%s, ignoring: %m", e);
+
+ /* Only an explicit "false" disables the type; unset or unparsable counts as enabled. */
+ cache[t] = r == 0 ? -1 : 1;
+ }
+ if (cache[t] < 0)
+ return false;
+
+ if (!unit_vtable[t]->supported)
+ return true;
+
+ return unit_vtable[t]->supported();
+}
+
+/* Emits a structured notice if the directory we are about to mount over is not
+ * empty. Best-effort: does nothing if NOTICE logging is disabled for the unit,
+ * and only warns if the emptiness check itself fails. */
+void unit_warn_if_dir_nonempty(Unit *u, const char* where) {
+ int r;
+
+ assert(u);
+ assert(where);
+
+ if (!unit_log_level_test(u, LOG_NOTICE))
+ return;
+
+ r = dir_is_empty(where, /* ignore_hidden_or_backup= */ false);
+ if (r > 0 || r == -ENOTDIR)
+ return;
+ if (r < 0) {
+ log_unit_warning_errno(u, r, "Failed to check directory %s: %m", where);
+ return;
+ }
+
+ log_unit_struct(u, LOG_NOTICE,
+ "MESSAGE_ID=" SD_MESSAGE_OVERMOUNTING_STR,
+ LOG_UNIT_INVOCATION_ID(u),
+ LOG_UNIT_MESSAGE(u, "Directory %s to mount over is not empty, mounting anyway.", where),
+ "WHERE=%s", where);
+}
+
+/* Verifies that a mount path is canonical (contains no symlinks). Returns 0 if
+ * it is canonical or the check could not be performed, -ELOOP (after logging a
+ * structured error) if a symlink was found. */
+int unit_fail_if_noncanonical(Unit *u, const char* where) {
+ _cleanup_free_ char *canonical_where = NULL;
+ int r;
+
+ assert(u);
+ assert(where);
+
+ r = chase(where, NULL, CHASE_NONEXISTENT, &canonical_where, NULL);
+ if (r < 0) {
+ log_unit_debug_errno(u, r, "Failed to check %s for symlinks, ignoring: %m", where);
+ return 0;
+ }
+
+ /* We will happily ignore a trailing slash (or any redundant slashes) */
+ if (path_equal(where, canonical_where))
+ return 0;
+
+ /* No need to mention "." or "..", they would already have been rejected by unit_name_from_path() */
+ log_unit_struct(u, LOG_ERR,
+ "MESSAGE_ID=" SD_MESSAGE_OVERMOUNTING_STR,
+ LOG_UNIT_INVOCATION_ID(u),
+ LOG_UNIT_MESSAGE(u, "Mount path %s is not canonical (contains a symlink).", where),
+ "WHERE=%s", where);
+
+ return -ELOOP;
+}
+
+/* Returns true if the unit carries no configuration or runtime state yet, i.e.
+ * it is safe to (re)create it, e.g. as a transient unit. */
+bool unit_is_pristine(Unit *u) {
+ assert(u);
+
+ /* Check if the unit already exists or is already around, in a number of different ways. Note that to
+ * cater for unit types such as slice, we are generally fine with units that are marked UNIT_LOADED
+ * even though nothing was actually loaded, as those unit types don't require a file on disk.
+ *
+ * Note that we don't check for drop-ins here, because we allow drop-ins for transient units
+ * identically to non-transient units, both unit-specific and hierarchical. E.g. for a-b-c.service:
+ * service.d/….conf, a-.service.d/….conf, a-b-.service.d/….conf, a-b-c.service.d/….conf.
+ */
+
+ return IN_SET(u->load_state, UNIT_NOT_FOUND, UNIT_LOADED) &&
+ !u->fragment_path &&
+ !u->source_path &&
+ !u->job &&
+ !u->merged_into;
+}
+
+/* Returns the unit's current control process, or NULL if the unit type does
+ * not track a control process. */
+PidRef* unit_control_pid(Unit *u) {
+ assert(u);
+
+ return UNIT_VTABLE(u)->control_pid ? UNIT_VTABLE(u)->control_pid(u) : NULL;
+}
+
+/* Returns the unit's current main process, or NULL if the unit type does not
+ * track a main process. */
+PidRef* unit_main_pid(Unit *u) {
+ assert(u);
+
+ return UNIT_VTABLE(u)->main_pid ? UNIT_VTABLE(u)->main_pid(u) : NULL;
+}
+
+/* Adds or removes a UID/GID element in every NFT set of the unit's cgroup
+ * context that matches 'source'. System manager only; best-effort — failures
+ * are logged and otherwise ignored. */
+static void unit_modify_user_nft_set(Unit *u, bool add, NFTSetSource source, uint32_t element) {
+ int r;
+
+ assert(u);
+
+ if (!MANAGER_IS_SYSTEM(u->manager))
+ return;
+
+ CGroupContext *c;
+ c = unit_get_cgroup_context(u);
+ if (!c)
+ return;
+
+ /* Lazily set up the firewall context on first use. */
+ if (!u->manager->fw_ctx) {
+ r = fw_ctx_new_full(&u->manager->fw_ctx, /* init_tables= */ false);
+ if (r < 0)
+ return;
+
+ assert(u->manager->fw_ctx);
+ }
+
+ FOREACH_ARRAY(nft_set, c->nft_set_context.sets, c->nft_set_context.n_sets) {
+ if (nft_set->source != source)
+ continue;
+
+ r = nft_set_element_modify_any(u->manager->fw_ctx, add, nft_set->nfproto, nft_set->table, nft_set->set, &element, sizeof(element));
+ if (r < 0)
+ log_warning_errno(r, "Failed to %s NFT set: family %s, table %s, set %s, ID %u, ignoring: %m",
+ add? "add" : "delete", nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set, element);
+ else
+ log_debug("%s NFT set: family %s, table %s, set %s, ID %u",
+ add? "Added" : "Deleted", nfproto_to_string(nft_set->nfproto), nft_set->table, nft_set->set, element);
+ }
+}
+
+/* Shared helper for unit_unref_uid()/unit_unref_gid(): drops the unit's
+ * reference on *ref_uid (if valid) via the supplied manager callback, then
+ * invalidates the field. */
+static void unit_unref_uid_internal(
+ Unit *u,
+ uid_t *ref_uid,
+ bool destroy_now,
+ void (*_manager_unref_uid)(Manager *m, uid_t uid, bool destroy_now)) {
+
+ assert(u);
+ assert(ref_uid);
+ assert(_manager_unref_uid);
+
+ /* Generic implementation of both unit_unref_uid() and unit_unref_gid(), under the assumption that uid_t and
+ * gid_t are actually the same type, with the same validity rules.
+ *
+ * Drops a reference to UID/GID from a unit. */
+
+ assert_cc(sizeof(uid_t) == sizeof(gid_t));
+ assert_cc(UID_INVALID == (uid_t) GID_INVALID);
+
+ if (!uid_is_valid(*ref_uid))
+ return;
+
+ _manager_unref_uid(u->manager, *ref_uid, destroy_now);
+ *ref_uid = UID_INVALID;
+}
+
+/* Drops the unit's UID reference, removing the UID from any user-sourced NFT
+ * sets first. */
+static void unit_unref_uid(Unit *u, bool destroy_now) {
+ assert(u);
+
+ unit_modify_user_nft_set(u, /* add = */ false, NFT_SET_SOURCE_USER, u->ref_uid);
+
+ unit_unref_uid_internal(u, &u->ref_uid, destroy_now, manager_unref_uid);
+}
+
+/* Drops the unit's GID reference, removing the GID from any group-sourced NFT
+ * sets first. */
+static void unit_unref_gid(Unit *u, bool destroy_now) {
+ assert(u);
+
+ unit_modify_user_nft_set(u, /* add = */ false, NFT_SET_SOURCE_GROUP, u->ref_gid);
+
+ unit_unref_uid_internal(u, (uid_t*) &u->ref_gid, destroy_now, manager_unref_gid);
+}
+
+/* Drops both the UID and GID references of the unit in one go. */
+void unit_unref_uid_gid(Unit *u, bool destroy_now) {
+ assert(u);
+
+ unit_unref_uid(u, destroy_now);
+ unit_unref_gid(u, destroy_now);
+}
+
+/* Shared helper for unit_ref_uid()/unit_ref_gid(): records a reference on a
+ * UID/GID via the supplied manager callback. Returns 1 when a new reference
+ * was taken, 0 when the same UID was already referenced, -EBUSY when a
+ * different UID is already referenced, negative errno otherwise. */
+static int unit_ref_uid_internal(
+ Unit *u,
+ uid_t *ref_uid,
+ uid_t uid,
+ bool clean_ipc,
+ int (*_manager_ref_uid)(Manager *m, uid_t uid, bool clean_ipc)) {
+
+ int r;
+
+ assert(u);
+ assert(ref_uid);
+ assert(uid_is_valid(uid));
+ assert(_manager_ref_uid);
+
+ /* Generic implementation of both unit_ref_uid() and unit_ref_guid(), under the assumption that uid_t and gid_t
+ * are actually the same type, and have the same validity rules.
+ *
+ * Adds a reference on a specific UID/GID to this unit. Each unit referencing the same UID/GID maintains a
+ * reference so that we can destroy the UID/GID's IPC resources as soon as this is requested and the counter
+ * drops to zero. */
+
+ assert_cc(sizeof(uid_t) == sizeof(gid_t));
+ assert_cc(UID_INVALID == (uid_t) GID_INVALID);
+
+ if (*ref_uid == uid)
+ return 0;
+
+ if (uid_is_valid(*ref_uid)) /* Already set? */
+ return -EBUSY;
+
+ r = _manager_ref_uid(u->manager, uid, clean_ipc);
+ if (r < 0)
+ return r;
+
+ *ref_uid = uid;
+ return 1;
+}
+
+/* Takes a reference on 'uid' for this unit (see unit_ref_uid_internal()). */
+static int unit_ref_uid(Unit *u, uid_t uid, bool clean_ipc) {
+ return unit_ref_uid_internal(u, &u->ref_uid, uid, clean_ipc, manager_ref_uid);
+}
+
+/* Takes a reference on 'gid' for this unit (see unit_ref_uid_internal()). */
+static int unit_ref_gid(Unit *u, gid_t gid, bool clean_ipc) {
+ return unit_ref_uid_internal(u, (uid_t*) &u->ref_gid, (uid_t) gid, clean_ipc, manager_ref_gid);
+}
+
+/* References a UID and a GID atomically: on GID failure the freshly taken UID
+ * reference is rolled back. Returns > 0 if at least one new reference was
+ * taken, 0 if nothing changed, negative errno on failure. */
+static int unit_ref_uid_gid_internal(Unit *u, uid_t uid, gid_t gid, bool clean_ipc) {
+ int r = 0, q = 0;
+
+ assert(u);
+
+ /* Reference both a UID and a GID in one go. Either references both, or neither. */
+
+ if (uid_is_valid(uid)) {
+ r = unit_ref_uid(u, uid, clean_ipc);
+ if (r < 0)
+ return r;
+ }
+
+ if (gid_is_valid(gid)) {
+ q = unit_ref_gid(u, gid, clean_ipc);
+ if (q < 0) {
+ /* Roll back the UID reference we just took, to keep the all-or-nothing guarantee. */
+ if (r > 0)
+ unit_unref_uid(u, false);
+
+ return q;
+ }
+ }
+
+ return r > 0 || q > 0;
+}
+
+/* Public entry point: references the UID/GID pair for the unit (honoring
+ * RemoveIPC= from the exec context) and mirrors them into the configured NFT
+ * sets. Failures are logged as warnings and returned, but are non-fatal to the
+ * caller's flow. */
+int unit_ref_uid_gid(Unit *u, uid_t uid, gid_t gid) {
+ ExecContext *c;
+ int r;
+
+ assert(u);
+
+ c = unit_get_exec_context(u);
+
+ r = unit_ref_uid_gid_internal(u, uid, gid, c ? c->remove_ipc : false);
+ if (r < 0)
+ return log_unit_warning_errno(u, r, "Couldn't add UID/GID reference to unit, proceeding without: %m");
+
+ unit_modify_user_nft_set(u, /* add = */ true, NFT_SET_SOURCE_USER, uid);
+ unit_modify_user_nft_set(u, /* add = */ true, NFT_SET_SOURCE_GROUP, gid);
+
+ return r;
+}
+
+/* Invoked when a forked-off process reports the UID/GID its configured
+ * user/group names resolved to; records the reference and refreshes D-Bus
+ * state if anything changed. */
+void unit_notify_user_lookup(Unit *u, uid_t uid, gid_t gid) {
+ int r;
+
+ assert(u);
+
+ /* This is invoked whenever one of the forked off processes lets us know the UID/GID its user name/group names
+ * resolved to. We keep track of which UID/GID is currently assigned in order to be able to destroy its IPC
+ * objects when no service references the UID/GID anymore. */
+
+ r = unit_ref_uid_gid(u, uid, gid);
+ if (r > 0)
+ unit_add_to_dbus_queue(u);
+}
+
+/* Generates a fresh random invocation ID for the unit and installs it. Errors
+ * are logged at error level and returned. */
+int unit_acquire_invocation_id(Unit *u) {
+ sd_id128_t id;
+ int r;
+
+ assert(u);
+
+ r = sd_id128_randomize(&id);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to generate invocation ID for unit: %m");
+
+ r = unit_set_invocation_id(u, id);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to set invocation ID for unit: %m");
+
+ unit_add_to_dbus_queue(u);
+ return 0;
+}
+
+/* Populates an ExecParameters structure with everything process spawning
+ * needs, copied partly from the manager and partly from the unit itself.
+ * Returns 0 on success, negative errno (typically -ENOMEM) on failure. */
+int unit_set_exec_params(Unit *u, ExecParameters *p) {
+ const char *confirm_spawn;
+ int r;
+
+ assert(u);
+ assert(p);
+
+ /* Copy parameters from manager */
+ r = manager_get_effective_environment(u->manager, &p->environment);
+ if (r < 0)
+ return r;
+
+ p->runtime_scope = u->manager->runtime_scope;
+
+ confirm_spawn = manager_get_confirm_spawn(u->manager);
+ if (confirm_spawn) {
+ p->confirm_spawn = strdup(confirm_spawn);
+ if (!p->confirm_spawn)
+ return -ENOMEM;
+ }
+
+ p->cgroup_supported = u->manager->cgroup_supported;
+ p->prefix = u->manager->prefix;
+ SET_FLAG(p->flags, EXEC_PASS_LOG_UNIT|EXEC_CHOWN_DIRECTORIES, MANAGER_IS_SYSTEM(u->manager));
+
+ /* Copy parameters from unit */
+ p->cgroup_path = u->cgroup_path;
+ SET_FLAG(p->flags, EXEC_CGROUP_DELEGATE, unit_cgroup_delegate(u));
+
+ p->received_credentials_directory = u->manager->received_credentials_directory;
+ p->received_encrypted_credentials_directory = u->manager->received_encrypted_credentials_directory;
+
+ p->shall_confirm_spawn = u->manager->confirm_spawn;
+
+ p->fallback_smack_process_label = u->manager->defaults.smack_process_label;
+
+ /* Hand out the LSM-BPF outer map fd once, if restrict_fs is active. */
+ if (u->manager->restrict_fs && p->bpf_outer_map_fd < 0) {
+ int fd = lsm_bpf_map_restrict_fs_fd(u);
+ if (fd < 0)
+ return fd;
+
+ p->bpf_outer_map_fd = fd;
+ }
+
+ p->user_lookup_fd = u->manager->user_lookup_fds[1];
+
+ p->cgroup_id = u->cgroup_id;
+ p->invocation_id = u->invocation_id;
+ sd_id128_to_string(p->invocation_id, p->invocation_id_string);
+ p->unit_id = strdup(u->id);
+ if (!p->unit_id)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/* Forks off a helper process attached to the unit's cgroup. Returns 0 in the
+ * child, > 0 in the parent, negative errno on failure. In the parent, *ret is
+ * filled in with a pidref for the child. */
+int unit_fork_helper_process(Unit *u, const char *name, PidRef *ret) {
+ pid_t pid;
+ int r;
+
+ assert(u);
+ assert(ret);
+
+ /* Forks off a helper process and makes sure it is a member of the unit's cgroup. Returns == 0 in the child,
+ * and > 0 in the parent. In the parent, ret is filled in with the child's PID. */
+
+ (void) unit_realize_cgroup(u);
+
+ r = safe_fork(name, FORK_REOPEN_LOG|FORK_DEATHSIG_SIGTERM, &pid);
+ if (r < 0)
+ return r;
+ if (r > 0) {
+ _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+ int q;
+
+ /* Parent */
+
+ q = pidref_set_pid(&pidref, pid);
+ if (q < 0)
+ return q;
+
+ *ret = TAKE_PIDREF(pidref);
+ return r;
+ }
+
+ /* Child */
+
+ (void) default_signals(SIGNALS_CRASH_HANDLER, SIGNALS_IGNORE);
+ (void) ignore_signals(SIGPIPE);
+
+ if (u->cgroup_path) {
+ r = cg_attach_everywhere(u->manager->cgroup_supported, u->cgroup_path, 0, NULL, NULL);
+ if (r < 0) {
+ log_unit_error_errno(u, r, "Failed to join unit cgroup %s: %m", empty_to_root(u->cgroup_path));
+ _exit(EXIT_CGROUP);
+ }
+ }
+
+ return 0;
+}
+
+/* Forks a helper that recursively removes the given paths, and registers the
+ * child with the unit so its exit is watched. Returns 0 on success in the
+ * parent (never returns in the child), negative errno on failure. */
+int unit_fork_and_watch_rm_rf(Unit *u, char **paths, PidRef *ret_pid) {
+ _cleanup_(pidref_done) PidRef pid = PIDREF_NULL;
+ int r;
+
+ assert(u);
+ assert(ret_pid);
+
+ r = unit_fork_helper_process(u, "(sd-rmrf)", &pid);
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ /* Child: best-effort removal of each path; report failure via exit status. */
+ int ret = EXIT_SUCCESS;
+
+ STRV_FOREACH(i, paths) {
+ r = rm_rf(*i, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_MISSING_OK);
+ if (r < 0) {
+ log_error_errno(r, "Failed to remove '%s': %m", *i);
+ ret = EXIT_FAILURE;
+ }
+ }
+
+ _exit(ret);
+ }
+
+ r = unit_watch_pidref(u, &pid, /* exclusive= */ true);
+ if (r < 0)
+ return r;
+
+ *ret_pid = TAKE_PIDREF(pid);
+ return 0;
+}
+
+/* Writes back an updated UnitDependencyInfo for 'other' in a dependency
+ * hashmap, dropping the entry entirely when no mask bits remain. */
+static void unit_update_dependency_mask(Hashmap *deps, Unit *other, UnitDependencyInfo di) {
+ assert(deps);
+ assert(other);
+
+ if (di.origin_mask == 0 && di.destination_mask == 0)
+ /* No bit set anymore, let's drop the whole entry */
+ assert_se(hashmap_remove(deps, other));
+ else
+ /* Mask was reduced, let's update the entry */
+ assert_se(hashmap_update(deps, other, di.data) == 0);
+}
+
+/* Removes from 'u' every dependency whose origin is covered by 'mask', along
+ * with the corresponding reverse (destination) dependencies on the other
+ * units. Affected peers are re-queued for GC and stop-when-unneeded checks. */
+void unit_remove_dependencies(Unit *u, UnitDependencyMask mask) {
+ Hashmap *deps;
+ assert(u);
+
+ /* Removes all dependencies u has on other units marked for ownership by 'mask'. */
+
+ if (mask == 0)
+ return;
+
+ HASHMAP_FOREACH(deps, u->dependencies) {
+ bool done;
+
+ /* Restart the inner iteration after every modification, since
+ * unit_update_dependency_mask() may remove entries while we iterate. */
+ do {
+ UnitDependencyInfo di;
+ Unit *other;
+
+ done = true;
+
+ HASHMAP_FOREACH_KEY(di.data, other, deps) {
+ Hashmap *other_deps;
+
+ if (FLAGS_SET(~mask, di.origin_mask))
+ continue;
+
+ di.origin_mask &= ~mask;
+ unit_update_dependency_mask(deps, other, di);
+
+ /* We updated the dependency from our unit to the other unit now. But most
+ * dependencies imply a reverse dependency. Hence, let's delete that one
+ * too. For that we go through all dependency types on the other unit and
+ * delete all those which point to us and have the right mask set. */
+
+ HASHMAP_FOREACH(other_deps, other->dependencies) {
+ UnitDependencyInfo dj;
+
+ dj.data = hashmap_get(other_deps, u);
+ if (FLAGS_SET(~mask, dj.destination_mask))
+ continue;
+
+ dj.destination_mask &= ~mask;
+ unit_update_dependency_mask(other_deps, u, dj);
+ }
+
+ unit_add_to_gc_queue(other);
+
+ /* The unit 'other' may not be wanted by the unit 'u'. */
+ unit_submit_to_stop_when_unneeded_queue(other);
+
+ done = false;
+ break;
+ }
+
+ } while (!done);
+ }
+}
+
+/* Builds the path of the unit's invocation-ID symlink: under
+ * /run/systemd/units/ for the system manager, under the user runtime dir for
+ * user managers. Returns 0 and the path in *ret, negative errno on failure. */
+static int unit_get_invocation_path(Unit *u, char **ret) {
+ char *p;
+ int r;
+
+ assert(u);
+ assert(ret);
+
+ if (MANAGER_IS_SYSTEM(u->manager))
+ p = strjoin("/run/systemd/units/invocation:", u->id);
+ else {
+ _cleanup_free_ char *user_path = NULL;
+ r = xdg_user_runtime_dir(&user_path, "/systemd/units/invocation:");
+ if (r < 0)
+ return r;
+ p = strjoin(user_path, u->id);
+ }
+
+ if (!p)
+ return -ENOMEM;
+
+ *ret = p;
+ return 0;
+}
+
+/* Exports the unit's invocation ID as a symlink for journald to pick up.
+ * Idempotent; no-op while no invocation ID is set. Errors are logged at debug
+ * level and returned. */
+static int unit_export_invocation_id(Unit *u) {
+ _cleanup_free_ char *p = NULL;
+ int r;
+
+ assert(u);
+
+ if (u->exported_invocation_id)
+ return 0;
+
+ if (sd_id128_is_null(u->invocation_id))
+ return 0;
+
+ r = unit_get_invocation_path(u, &p);
+ if (r < 0)
+ return log_unit_debug_errno(u, r, "Failed to get invocation path: %m");
+
+ r = symlink_atomic_label(u->invocation_id_string, p);
+ if (r < 0)
+ return log_unit_debug_errno(u, r, "Failed to create invocation ID symlink %s: %m", p);
+
+ u->exported_invocation_id = true;
+ return 0;
+}
+
+/* Exports LogLevelMax= as a one-character symlink under /run/systemd/units/.
+ * Idempotent; no-op when the setting is unset. */
+static int unit_export_log_level_max(Unit *u, const ExecContext *c) {
+ const char *p;
+ char buf[2];
+ int r;
+
+ assert(u);
+ assert(c);
+
+ if (u->exported_log_level_max)
+ return 0;
+
+ if (c->log_level_max < 0)
+ return 0;
+
+ assert(c->log_level_max <= 7);
+
+ /* Encode the level as a single ASCII digit in the symlink target. */
+ buf[0] = '0' + c->log_level_max;
+ buf[1] = 0;
+
+ p = strjoina("/run/systemd/units/log-level-max:", u->id);
+ r = symlink_atomic(buf, p);
+ if (r < 0)
+ return log_unit_debug_errno(u, r, "Failed to create maximum log level symlink %s: %m", p);
+
+ u->exported_log_level_max = true;
+ return 0;
+}
+
+/* Exports LogExtraFields= to /run/systemd/units/log-extra-fields:<unit>, as a
+ * sequence of (little-endian 64-bit length, payload) pairs, written atomically
+ * via a temp file + rename(). Idempotent; no-op when no fields are configured.
+ * Errors are logged at debug level and returned. */
+static int unit_export_log_extra_fields(Unit *u, const ExecContext *c) {
+ _cleanup_close_ int fd = -EBADF;
+ struct iovec *iovec;
+ const char *p;
+ char *pattern;
+ le64_t *sizes;
+ ssize_t n;
+ int r;
+
+ /* Consistency fix: the sibling exporters all assert their arguments. */
+ assert(u);
+ assert(c);
+
+ if (u->exported_log_extra_fields)
+ return 0;
+
+ if (c->n_log_extra_fields <= 0)
+ return 0;
+
+ /* One length prefix per field, interleaved with the field payloads. */
+ sizes = newa(le64_t, c->n_log_extra_fields);
+ iovec = newa(struct iovec, c->n_log_extra_fields * 2);
+
+ for (size_t i = 0; i < c->n_log_extra_fields; i++) {
+ sizes[i] = htole64(c->log_extra_fields[i].iov_len);
+
+ iovec[i*2] = IOVEC_MAKE(sizes + i, sizeof(le64_t));
+ iovec[i*2+1] = c->log_extra_fields[i];
+ }
+
+ p = strjoina("/run/systemd/units/log-extra-fields:", u->id);
+ pattern = strjoina(p, ".XXXXXX");
+
+ fd = mkostemp_safe(pattern);
+ if (fd < 0)
+ return log_unit_debug_errno(u, fd, "Failed to create extra fields file %s: %m", p);
+
+ /* NOTE(review): a short writev() is not detected here, only a failed one —
+ * confirm whether partial writes to tmpfs are a concern. */
+ n = writev(fd, iovec, c->n_log_extra_fields*2);
+ if (n < 0) {
+ r = log_unit_debug_errno(u, errno, "Failed to write extra fields: %m");
+ goto fail;
+ }
+
+ (void) fchmod(fd, 0644);
+
+ if (rename(pattern, p) < 0) {
+ r = log_unit_debug_errno(u, errno, "Failed to rename extra fields file: %m");
+ goto fail;
+ }
+
+ u->exported_log_extra_fields = true;
+ return 0;
+
+fail:
+ (void) unlink(pattern);
+ return r;
+}
+
+/* Exports LogRateLimitIntervalSec= (in microseconds, as decimal text) as a
+ * symlink under /run/systemd/units/. Idempotent; no-op when unset. */
+static int unit_export_log_ratelimit_interval(Unit *u, const ExecContext *c) {
+ _cleanup_free_ char *buf = NULL;
+ const char *p;
+ int r;
+
+ assert(u);
+ assert(c);
+
+ if (u->exported_log_ratelimit_interval)
+ return 0;
+
+ if (c->log_ratelimit_interval_usec == 0)
+ return 0;
+
+ p = strjoina("/run/systemd/units/log-rate-limit-interval:", u->id);
+
+ if (asprintf(&buf, "%" PRIu64, c->log_ratelimit_interval_usec) < 0)
+ return log_oom();
+
+ r = symlink_atomic(buf, p);
+ if (r < 0)
+ return log_unit_debug_errno(u, r, "Failed to create log rate limit interval symlink %s: %m", p);
+
+ u->exported_log_ratelimit_interval = true;
+ return 0;
+}
+
+/* Exports LogRateLimitBurst= (as decimal text) as a symlink under
+ * /run/systemd/units/. Idempotent; no-op when unset. */
+static int unit_export_log_ratelimit_burst(Unit *u, const ExecContext *c) {
+ _cleanup_free_ char *buf = NULL;
+ const char *p;
+ int r;
+
+ assert(u);
+ assert(c);
+
+ if (u->exported_log_ratelimit_burst)
+ return 0;
+
+ if (c->log_ratelimit_burst == 0)
+ return 0;
+
+ p = strjoina("/run/systemd/units/log-rate-limit-burst:", u->id);
+
+ if (asprintf(&buf, "%u", c->log_ratelimit_burst) < 0)
+ return log_oom();
+
+ r = symlink_atomic(buf, p);
+ if (r < 0)
+ return log_unit_debug_errno(u, r, "Failed to create log rate limit burst symlink %s: %m", p);
+
+ u->exported_log_ratelimit_burst = true;
+ return 0;
+}
+
+/* Exports unit properties (invocation ID, log settings) to
+ * /run/systemd/units/ for journald to query. Best-effort; all sub-exports
+ * ignore failures. */
+void unit_export_state_files(Unit *u) {
+ const ExecContext *c;
+
+ assert(u);
+
+ if (!u->id)
+ return;
+
+ if (MANAGER_IS_TEST_RUN(u->manager))
+ return;
+
+ /* Exports a couple of unit properties to /run/systemd/units/, so that journald can quickly query this data
+ * from there. Ideally, journald would use IPC to query this, like everybody else, but that's hard, as long as
+ * the IPC system itself and PID 1 also log to the journal.
+ *
+ * Note that these files really shouldn't be considered API for anyone else, as using a runtime file system as
+ * an IPC replacement is not compatible with today's world of file system namespaces. However, this doesn't really
+ * apply to communication between the journal and systemd, as we assume that these two daemons live in the same
+ * namespace at least.
+ *
+ * Note that some of the "files" exported here are actually symlinks and not regular files. Symlinks work
+ * better for storing small bits of data, in particular as we can write them with two system calls, and read
+ * them with one. */
+
+ (void) unit_export_invocation_id(u);
+
+ /* The log-related exports are only maintained for the system manager. */
+ if (!MANAGER_IS_SYSTEM(u->manager))
+ return;
+
+ c = unit_get_exec_context(u);
+ if (c) {
+ (void) unit_export_log_level_max(u, c);
+ (void) unit_export_log_extra_fields(u, c);
+ (void) unit_export_log_ratelimit_interval(u, c);
+ (void) unit_export_log_ratelimit_burst(u, c);
+ }
+}
+
+/* Undoes unit_export_state_files(): removes the per-unit marker files and
+ * symlinks under /run/systemd/units/ and clears the corresponding exported_*
+ * flags. Best-effort; unlink failures are ignored. */
+void unit_unlink_state_files(Unit *u) {
+ const char *p;
+
+ assert(u);
+
+ if (!u->id)
+ return;
+
+ /* Undoes the effect of unit_export_state() */
+
+ if (u->exported_invocation_id) {
+ _cleanup_free_ char *invocation_path = NULL;
+ int r = unit_get_invocation_path(u, &invocation_path);
+ if (r >= 0) {
+ (void) unlink(invocation_path);
+ u->exported_invocation_id = false;
+ }
+ }
+
+ /* The remaining markers are only ever exported by the system manager. */
+ if (!MANAGER_IS_SYSTEM(u->manager))
+ return;
+
+ if (u->exported_log_level_max) {
+ p = strjoina("/run/systemd/units/log-level-max:", u->id);
+ (void) unlink(p);
+
+ u->exported_log_level_max = false;
+ }
+
+ if (u->exported_log_extra_fields) {
+ /* Fix: unit_export_log_extra_fields() writes under the
+ * "log-extra-fields:" prefix; the previous "extra-fields:" prefix
+ * here never matched, so the exported file was left behind. */
+ p = strjoina("/run/systemd/units/log-extra-fields:", u->id);
+ (void) unlink(p);
+
+ u->exported_log_extra_fields = false;
+ }
+
+ if (u->exported_log_ratelimit_interval) {
+ p = strjoina("/run/systemd/units/log-rate-limit-interval:", u->id);
+ (void) unlink(p);
+
+ u->exported_log_ratelimit_interval = false;
+ }
+
+ if (u->exported_log_ratelimit_burst) {
+ p = strjoina("/run/systemd/units/log-rate-limit-burst:", u->id);
+ (void) unlink(p);
+
+ u->exported_log_ratelimit_burst = false;
+ }
+}
+
+/* Prepares everything needed before forking off a process for this unit:
+ * validates custom BPF firewall programs, realizes the cgroup, resets
+ * accounting if requested, exports state files and sets up the exec runtime.
+ * Returns 0 on success, negative errno on failure. */
+int unit_prepare_exec(Unit *u) {
+ int r;
+
+ assert(u);
+
+ /* Load any custom firewall BPF programs here once to test if they are existing and actually loadable.
+ * Fail here early since later errors in the call chain unit_realize_cgroup to cgroup_context_apply are ignored. */
+ r = bpf_firewall_load_custom(u);
+ if (r < 0)
+ return r;
+
+ /* Prepares everything so that we can fork off a process for this unit */
+
+ (void) unit_realize_cgroup(u);
+
+ if (u->reset_accounting) {
+ (void) unit_reset_accounting(u);
+ u->reset_accounting = false;
+ }
+
+ unit_export_state_files(u);
+
+ r = unit_setup_exec_runtime(u);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+/* Processes whose comm name starts with '(' are transient helpers (e.g. our
+ * own PAM stub) and not worth reporting as leftovers. */
+static bool ignore_leftover_process(const char *comm) {
+ if (!comm)
+ return false;
+
+ return comm[0] == '(';
+}
+
+/* cg_kill_log_func_t callback used when starting a unit: warns about each
+ * left-over process found in the control group. Returns 1 for counted
+ * processes, 0 for ignored helpers. userdata is the Unit. */
+int unit_log_leftover_process_start(const PidRef *pid, int sig, void *userdata) {
+ _cleanup_free_ char *comm = NULL;
+
+ assert(pidref_is_set(pid));
+
+ (void) pidref_get_comm(pid, &comm);
+
+ if (ignore_leftover_process(comm))
+ return 0;
+
+ /* During start we print a warning */
+
+ log_unit_warning(userdata,
+ "Found left-over process " PID_FMT " (%s) in control group while starting unit. Ignoring.\n"
+ "This usually indicates unclean termination of a previous run, or service implementation deficiencies.",
+ pid->pid, strna(comm));
+
+ return 1;
+}
+
+/* cg_kill_log_func_t callback used when stopping a unit: informs (at info
+ * level only) about each process still running. Returns 1 for counted
+ * processes, 0 for ignored helpers. userdata is the Unit. */
+int unit_log_leftover_process_stop(const PidRef *pid, int sig, void *userdata) {
+ _cleanup_free_ char *comm = NULL;
+
+ assert(pidref_is_set(pid));
+
+ (void) pidref_get_comm(pid, &comm);
+
+ if (ignore_leftover_process(comm))
+ return 0;
+
+ /* During stop we only print an informational message */
+
+ log_unit_info(userdata,
+ "Unit process " PID_FMT " (%s) remains running after unit stopped.",
+ pid->pid, strna(comm));
+
+ return 1;
+}
+
+/* Walks the unit's cgroup and invokes log_func for every process found.
+ * sig is 0, so presumably no signal is actually delivered and this only
+ * enumerates/logs — confirm against cg_kill_recursive(). Returns the helper's
+ * result, or 0 when the unit has no cgroup. */
+int unit_warn_leftover_processes(Unit *u, cg_kill_log_func_t log_func) {
+ assert(u);
+
+ (void) unit_pick_cgroup_path(u);
+
+ if (!u->cgroup_path)
+ return 0;
+
+ return cg_kill_recursive(
+ u->cgroup_path,
+ /* sig= */ 0,
+ /* flags= */ 0,
+ /* set= */ NULL,
+ log_func,
+ u);
+}
+
+/* Returns true if the unit currently needs access to the console: inactive
+ * units never do; otherwise the unit type's needs_console() hook decides, with
+ * a generic fallback based on the exec context. */
+bool unit_needs_console(Unit *u) {
+ ExecContext *ec;
+ UnitActiveState state;
+
+ assert(u);
+
+ state = unit_active_state(u);
+
+ if (UNIT_IS_INACTIVE_OR_FAILED(state))
+ return false;
+
+ if (UNIT_VTABLE(u)->needs_console)
+ return UNIT_VTABLE(u)->needs_console(u);
+
+ /* If this unit type doesn't implement this call, let's use a generic fallback implementation: */
+ ec = unit_get_exec_context(u);
+ if (!ec)
+ return false;
+
+ return exec_context_may_touch_console(ec);
+}
+
+int unit_pid_attachable(Unit *u, PidRef *pid, sd_bus_error *error) {
+ int r;
+
+ assert(u);
+
+ /* Checks whether the specified PID is generally good for attaching, i.e. a valid PID, not our manager itself,
+ * and not a kernel thread either */
+
+ /* First, a simple range check */
+ if (!pidref_is_set(pid))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Process identifier is not valid.");
+
+ /* Some extra safety check */
+ if (pid->pid == 1 || pidref_is_self(pid))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Process " PID_FMT " is a manager process, refusing.", pid->pid);
+
+ /* Don't even begin to bother with kernel threads */
+ r = pidref_is_kernel_thread(pid);
+ if (r == -ESRCH)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_UNIX_PROCESS_ID_UNKNOWN, "Process with ID " PID_FMT " does not exist.", pid->pid);
+ if (r < 0)
+ return sd_bus_error_set_errnof(error, r, "Failed to determine whether process " PID_FMT " is a kernel thread: %m", pid->pid);
+ if (r > 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Process " PID_FMT " is a kernel thread, refusing.", pid->pid);
+
+ return 0;
+}
+
+void unit_log_success(Unit *u) {
+ assert(u);
+
+ /* Let's show message "Deactivated successfully" in debug mode (when manager is user) rather than in info mode.
+ * This message has low information value for regular users and it might be a bit overwhelming on a system with
+ * a lot of devices. */
+ log_unit_struct(u,
+ MANAGER_IS_USER(u->manager) ? LOG_DEBUG : LOG_INFO,
+ "MESSAGE_ID=" SD_MESSAGE_UNIT_SUCCESS_STR,
+ LOG_UNIT_INVOCATION_ID(u),
+ LOG_UNIT_MESSAGE(u, "Deactivated successfully."));
+}
+
+void unit_log_failure(Unit *u, const char *result) {
+ assert(u);
+ assert(result);
+
+ log_unit_struct(u, LOG_WARNING,
+ "MESSAGE_ID=" SD_MESSAGE_UNIT_FAILURE_RESULT_STR,
+ LOG_UNIT_INVOCATION_ID(u),
+ LOG_UNIT_MESSAGE(u, "Failed with result '%s'.", result),
+ "UNIT_RESULT=%s", result);
+}
+
+void unit_log_skip(Unit *u, const char *result) {
+ assert(u);
+ assert(result);
+
+ log_unit_struct(u, LOG_INFO,
+ "MESSAGE_ID=" SD_MESSAGE_UNIT_SKIPPED_STR,
+ LOG_UNIT_INVOCATION_ID(u),
+ LOG_UNIT_MESSAGE(u, "Skipped due to '%s'.", result),
+ "UNIT_RESULT=%s", result);
+}
+
+void unit_log_process_exit(
+ Unit *u,
+ const char *kind,
+ const char *command,
+ bool success,
+ int code,
+ int status) {
+
+ int level;
+
+ assert(u);
+ assert(kind);
+
+ /* If this is a successful exit, let's log about the exit code on DEBUG level. If this is a failure
+ * and the process exited on its own via exit(), then let's make this a NOTICE, under the assumption
+ * that the service already logged the reason at a higher log level on its own. Otherwise, make it a
+ * WARNING. */
+ if (success)
+ level = LOG_DEBUG;
+ else if (code == CLD_EXITED)
+ level = LOG_NOTICE;
+ else
+ level = LOG_WARNING;
+
+ log_unit_struct(u, level,
+ "MESSAGE_ID=" SD_MESSAGE_UNIT_PROCESS_EXIT_STR,
+ LOG_UNIT_MESSAGE(u, "%s exited, code=%s, status=%i/%s%s",
+ kind,
+ sigchld_code_to_string(code), status,
+ strna(code == CLD_EXITED
+ ? exit_status_to_string(status, EXIT_STATUS_FULL)
+ : signal_to_string(status)),
+ success ? " (success)" : ""),
+ "EXIT_CODE=%s", sigchld_code_to_string(code),
+ "EXIT_STATUS=%i", status,
+ "COMMAND=%s", strna(command),
+ LOG_UNIT_INVOCATION_ID(u));
+}
+
+int unit_exit_status(Unit *u) {
+ assert(u);
+
+ /* Returns the exit status to propagate for the most recent cycle of this unit. Returns a value in the range
+ * 0…255 if there's something to propagate. EOPNOTSUPP if the concept does not apply to this unit type, ENODATA
+ * if no data is currently known (for example because the unit hasn't deactivated yet) and EBADE if the main
+ * service process has exited abnormally (signal/coredump). */
+
+ if (!UNIT_VTABLE(u)->exit_status)
+ return -EOPNOTSUPP;
+
+ return UNIT_VTABLE(u)->exit_status(u);
+}
+
+int unit_failure_action_exit_status(Unit *u) {
+ int r;
+
+ assert(u);
+
+ /* Returns the exit status to propagate on failure, or an error if there's nothing to propagate */
+
+ if (u->failure_action_exit_status >= 0)
+ return u->failure_action_exit_status;
+
+ r = unit_exit_status(u);
+ if (r == -EBADE) /* Exited, but not cleanly (i.e. by signal or such) */
+ return 255;
+
+ return r;
+}
+
+int unit_success_action_exit_status(Unit *u) {
+ int r;
+
+ assert(u);
+
+ /* Returns the exit status to propagate on success, or an error if there's nothing to propagate */
+
+ if (u->success_action_exit_status >= 0)
+ return u->success_action_exit_status;
+
+ r = unit_exit_status(u);
+ if (r == -EBADE) /* Exited, but not cleanly (i.e. by signal or such) */
+ return 255;
+
+ return r;
+}
+
+int unit_test_trigger_loaded(Unit *u) {
+ Unit *trigger;
+
+ /* Tests whether the unit to trigger is loaded */
+
+ trigger = UNIT_TRIGGER(u);
+ if (!trigger)
+ return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOENT),
+ "Refusing to start, no unit to trigger.");
+ if (trigger->load_state != UNIT_LOADED)
+ return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOENT),
+ "Refusing to start, unit %s to trigger not loaded.", trigger->id);
+
+ return 0;
+}
+
+void unit_destroy_runtime_data(Unit *u, const ExecContext *context) {
+ assert(u);
+ assert(context);
+
+ /* EXEC_PRESERVE_RESTART is handled via unit_release_resources()! */
+ if (context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
+ exec_context_destroy_runtime_directory(context, u->manager->prefix[EXEC_DIRECTORY_RUNTIME]);
+
+ exec_context_destroy_credentials(u);
+ exec_context_destroy_mount_ns_dir(u);
+}
+
+int unit_clean(Unit *u, ExecCleanMask mask) {
+ UnitActiveState state;
+
+ assert(u);
+
+ /* Special return values:
+ *
+ * -EOPNOTSUPP → cleaning not supported for this unit type
+ * -EUNATCH → cleaning not defined for this resource type
+ * -EBUSY → unit currently can't be cleaned since it's running or not properly loaded, or has
+ * a job queued or similar
+ */
+
+ if (!UNIT_VTABLE(u)->clean)
+ return -EOPNOTSUPP;
+
+ if (mask == 0)
+ return -EUNATCH;
+
+ if (u->load_state != UNIT_LOADED)
+ return -EBUSY;
+
+ if (u->job)
+ return -EBUSY;
+
+ state = unit_active_state(u);
+ if (state != UNIT_INACTIVE)
+ return -EBUSY;
+
+ return UNIT_VTABLE(u)->clean(u, mask);
+}
+
+int unit_can_clean(Unit *u, ExecCleanMask *ret) {
+        assert(u);
+
+        /* Determines which resource types of this unit may be cleaned up, returning the mask in
+         * *ret. A zero mask (with return value 0) means cleaning is not supported for this unit,
+         * either because the unit type implements no clean() method or because the unit is not
+         * properly loaded. */
+
+        if (!UNIT_VTABLE(u)->clean ||
+            u->load_state != UNIT_LOADED) {
+                *ret = 0;
+                return 0;
+        }
+
+        /* When the clean() method is set, can_clean() really should be set too */
+        assert(UNIT_VTABLE(u)->can_clean);
+
+        return UNIT_VTABLE(u)->can_clean(u, ret);
+}
+
+bool unit_can_start_refuse_manual(Unit *u) {
+ return unit_can_start(u) && !u->refuse_manual_start;
+}
+
+bool unit_can_stop_refuse_manual(Unit *u) {
+ return unit_can_stop(u) && !u->refuse_manual_stop;
+}
+
+bool unit_can_isolate_refuse_manual(Unit *u) {
+ return unit_can_isolate(u) && !u->refuse_manual_start;
+}
+
+bool unit_can_freeze(Unit *u) {
+ assert(u);
+
+ if (UNIT_VTABLE(u)->can_freeze)
+ return UNIT_VTABLE(u)->can_freeze(u);
+
+ return UNIT_VTABLE(u)->freeze;
+}
+
+void unit_frozen(Unit *u) {
+ assert(u);
+
+ u->freezer_state = FREEZER_FROZEN;
+
+ bus_unit_send_pending_freezer_message(u, false);
+}
+
+void unit_thawed(Unit *u) {
+ assert(u);
+
+ u->freezer_state = FREEZER_RUNNING;
+
+ bus_unit_send_pending_freezer_message(u, false);
+}
+
+static int unit_freezer_action(Unit *u, FreezerAction action) {
+        UnitActiveState s;
+        int (*method)(Unit*);
+        int r;
+
+        assert(u);
+        assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW));
+
+        /* Common implementation backing unit_freeze()/unit_thaw(). Return values:
+         *
+         *   -EOPNOTSUPP → unit type implements no freeze/thaw method, or the kernel cgroup
+         *                 freezer is not available
+         *   -EBUSY      → a job is currently installed for the unit
+         *   -EHOSTDOWN  → unit is not loaded, or not in the active state
+         *   -EALREADY   → an equivalent freezer operation is already in progress
+         *   r <= 0      → whatever the type-specific method returned
+         *   1           → operation initiated, freezer state is now FREEZER_FREEZING or
+         *                 FREEZER_THAWING
+         */
+
+        method = action == FREEZER_FREEZE ? UNIT_VTABLE(u)->freeze : UNIT_VTABLE(u)->thaw;
+        if (!method || !cg_freezer_supported())
+                return -EOPNOTSUPP;
+
+        if (u->job)
+                return -EBUSY;
+
+        if (u->load_state != UNIT_LOADED)
+                return -EHOSTDOWN;
+
+        s = unit_active_state(u);
+        if (s != UNIT_ACTIVE)
+                return -EHOSTDOWN;
+
+        /* Refuse to freeze while a freeze or thaw is in flight; refuse to thaw while already
+         * thawing. */
+        if ((IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_THAWING) && action == FREEZER_FREEZE) ||
+            (u->freezer_state == FREEZER_THAWING && action == FREEZER_THAW))
+                return -EALREADY;
+
+        r = method(u);
+        if (r <= 0)
+                return r;
+
+        assert(IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_THAWING));
+
+        return 1;
+}
+
+int unit_freeze(Unit *u) {
+ return unit_freezer_action(u, FREEZER_FREEZE);
+}
+
+int unit_thaw(Unit *u) {
+ return unit_freezer_action(u, FREEZER_THAW);
+}
+
+/* Wrappers around low-level cgroup freezer operations common for service and scope units */
+int unit_freeze_vtable_common(Unit *u) {
+ return unit_cgroup_freezer_action(u, FREEZER_FREEZE);
+}
+
+int unit_thaw_vtable_common(Unit *u) {
+ return unit_cgroup_freezer_action(u, FREEZER_THAW);
+}
+
+Condition *unit_find_failed_condition(Unit *u) {
+        Condition *failed_trigger = NULL;
+        bool has_succeeded_trigger = false;
+
+        /* Returns the condition that caused the unit's condition check to fail, or NULL if the
+         * check succeeded. A failed non-trigger condition is returned immediately. Trigger
+         * conditions are effectively OR-ed: a failed trigger only counts as the cause if no
+         * trigger condition succeeded, and then the first failed one is returned. */
+
+        if (u->condition_result)
+                return NULL;
+
+        LIST_FOREACH(conditions, c, u->conditions)
+                if (c->trigger) {
+                        if (c->result == CONDITION_SUCCEEDED)
+                                has_succeeded_trigger = true;
+                        else if (!failed_trigger)
+                                failed_trigger = c;
+                } else if (c->result != CONDITION_SUCCEEDED)
+                        return c;
+
+        return failed_trigger && !has_succeeded_trigger ? failed_trigger : NULL;
+}
+
+static const char* const collect_mode_table[_COLLECT_MODE_MAX] = {
+ [COLLECT_INACTIVE] = "inactive",
+ [COLLECT_INACTIVE_OR_FAILED] = "inactive-or-failed",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(collect_mode, CollectMode);
+
+Unit* unit_has_dependency(const Unit *u, UnitDependencyAtom atom, Unit *other) {
+ Unit *i;
+
+ assert(u);
+
+ /* Checks if the unit has a dependency on 'other' with the specified dependency atom. If 'other' is
+ * NULL checks if the unit has *any* dependency of that atom. Returns 'other' if found (or if 'other'
+ * is NULL the first entry found), or NULL if not found. */
+
+ UNIT_FOREACH_DEPENDENCY(i, u, atom)
+ if (!other || other == i)
+ return i;
+
+ return NULL;
+}
+
+int unit_get_dependency_array(const Unit *u, UnitDependencyAtom atom, Unit ***ret_array) {
+ _cleanup_free_ Unit **array = NULL;
+ size_t n = 0;
+ Unit *other;
+
+ assert(u);
+ assert(ret_array);
+
+ /* Gets a list of units matching a specific atom as array. This is useful when iterating through
+ * dependencies while modifying them: the array is an "atomic snapshot" of sorts, that can be read
+ * while the dependency table is continuously updated. */
+
+ UNIT_FOREACH_DEPENDENCY(other, u, atom) {
+ if (!GREEDY_REALLOC(array, n + 1))
+ return -ENOMEM;
+
+ array[n++] = other;
+ }
+
+ *ret_array = TAKE_PTR(array);
+
+ assert(n <= INT_MAX);
+ return (int) n;
+}
+
+int unit_get_transitive_dependency_set(Unit *u, UnitDependencyAtom atom, Set **ret) {
+ _cleanup_set_free_ Set *units = NULL, *queue = NULL;
+ Unit *other;
+ int r;
+
+ assert(u);
+ assert(ret);
+
+ /* Similar to unit_get_dependency_array(), but also search the same dependency in other units. */
+
+ do {
+ UNIT_FOREACH_DEPENDENCY(other, u, atom) {
+ r = set_ensure_put(&units, NULL, other);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ continue;
+ r = set_ensure_put(&queue, NULL, other);
+ if (r < 0)
+ return r;
+ }
+ } while ((u = set_steal_first(queue)));
+
+ *ret = TAKE_PTR(units);
+ return 0;
+}
+
+int unit_arm_timer(
+                Unit *u,
+                sd_event_source **source,
+                bool relative,
+                usec_t usec,
+                sd_event_time_handler_t handler) {
+
+        int r;
+
+        assert(u);
+        assert(source);
+        assert(handler);
+
+        /* Arms (or disarms) a one-shot CLOCK_MONOTONIC timer event source for this unit.
+         * 'relative' selects whether 'usec' is an offset from now or an absolute timestamp.
+         * USEC_INFINITY disables an existing source, or is a no-op if none exists yet. If
+         * *source already exists it is merely re-armed, otherwise a new source is allocated
+         * and stored in *source. */
+
+        if (*source) {
+                if (usec == USEC_INFINITY)
+                        return sd_event_source_set_enabled(*source, SD_EVENT_OFF);
+
+                r = (relative ? sd_event_source_set_time_relative : sd_event_source_set_time)(*source, usec);
+                if (r < 0)
+                        return r;
+
+                return sd_event_source_set_enabled(*source, SD_EVENT_ONESHOT);
+        }
+
+        if (usec == USEC_INFINITY)
+                return 0;
+
+        r = (relative ? sd_event_add_time_relative : sd_event_add_time)(
+                        u->manager->event,
+                        source,
+                        CLOCK_MONOTONIC,
+                        usec, 0,
+                        handler,
+                        u);
+        if (r < 0)
+                return r;
+
+        /* Name the source after the unit type, e.g. "service-timer", for debugging. */
+        const char *d = strjoina(unit_type_to_string(u->type), "-timer");
+        (void) sd_event_source_set_description(*source, d);
+
+        return 0;
+}
+
+static int unit_get_nice(Unit *u) {
+ ExecContext *ec;
+
+ ec = unit_get_exec_context(u);
+ return ec ? ec->nice : 0;
+}
+
+static uint64_t unit_get_cpu_weight(Unit *u) {
+ CGroupContext *cc;
+
+ cc = unit_get_cgroup_context(u);
+ return cc ? cgroup_context_cpu_weight(cc, manager_state(u->manager)) : CGROUP_WEIGHT_DEFAULT;
+}
+
<![CDATA[
+int unit_compare_priority(Unit *a, Unit *b) {
+        int ret;
+
+        /* Total ordering of units. Note the negated CMP() results: units with a *larger* type
+         * enum value and with a *higher* CPU weight order first, while for nice the comparison
+         * is kept as-is, so a *lower* nice value (higher scheduling priority) orders first.
+         * Ties are broken by unit id to keep the ordering stable. */
+
+        ret = CMP(a->type, b->type);
+        if (ret != 0)
+                return -ret;
+
+        ret = CMP(unit_get_cpu_weight(a), unit_get_cpu_weight(b));
+        if (ret != 0)
+                return -ret;
+
+        ret = CMP(unit_get_nice(a), unit_get_nice(b));
+        if (ret != 0)
+                return ret;
+
+        return strcmp(a->id, b->id);
+}
]]>
+
+const ActivationDetailsVTable * const activation_details_vtable[_UNIT_TYPE_MAX] = {
+ [UNIT_PATH] = &activation_details_path_vtable,
+ [UNIT_TIMER] = &activation_details_timer_vtable,
+};
+
+ActivationDetails *activation_details_new(Unit *trigger_unit) {
+        _cleanup_free_ ActivationDetails *details = NULL;
+
+        /* Allocates a new ActivationDetails object for the given trigger unit, sized per the
+         * type-specific vtable, with an initial reference count of 1. Returns NULL on OOM.
+         * NOTE(review): only asserts the type is valid, not that a vtable entry exists —
+         * presumably callers only pass path/timer units (the only types with entries in
+         * activation_details_vtable[]); confirm, otherwise the object_size lookup would
+         * dereference NULL. */
+
+        assert(trigger_unit);
+        assert(trigger_unit->type != _UNIT_TYPE_INVALID);
+        assert(trigger_unit->id);
+
+        details = malloc0(activation_details_vtable[trigger_unit->type]->object_size);
+        if (!details)
+                return NULL;
+
+        *details = (ActivationDetails) {
+                .n_ref = 1,
+                .trigger_unit_type = trigger_unit->type,
+        };
+
+        /* OOM here releases 'details' via the _cleanup_free_ attribute. */
+        details->trigger_unit_name = strdup(trigger_unit->id);
+        if (!details->trigger_unit_name)
+                return NULL;
+
+        if (ACTIVATION_DETAILS_VTABLE(details)->init)
+                ACTIVATION_DETAILS_VTABLE(details)->init(details, trigger_unit);
+
+        return TAKE_PTR(details);
+}
+
+static ActivationDetails *activation_details_free(ActivationDetails *details) {
+ if (!details)
+ return NULL;
+
+ if (ACTIVATION_DETAILS_VTABLE(details)->done)
+ ACTIVATION_DETAILS_VTABLE(details)->done(details);
+
+ free(details->trigger_unit_name);
+
+ return mfree(details);
+}
+
+void activation_details_serialize(ActivationDetails *details, FILE *f) {
+ if (!details || details->trigger_unit_type == _UNIT_TYPE_INVALID)
+ return;
+
+ (void) serialize_item(f, "activation-details-unit-type", unit_type_to_string(details->trigger_unit_type));
+ if (details->trigger_unit_name)
+ (void) serialize_item(f, "activation-details-unit-name", details->trigger_unit_name);
+ if (ACTIVATION_DETAILS_VTABLE(details)->serialize)
+ ACTIVATION_DETAILS_VTABLE(details)->serialize(details, f);
+}
+
+int activation_details_deserialize(const char *key, const char *value, ActivationDetails **details) {
+ int r;
+
+ assert(key);
+ assert(value);
+ assert(details);
+
+ if (!*details) {
+ UnitType t;
+
+ if (!streq(key, "activation-details-unit-type"))
+ return -EINVAL;
+
+ t = unit_type_from_string(value);
+ if (t < 0)
+ return t;
+
+ /* The activation details vtable has defined ops only for path and timer units */
+ if (!activation_details_vtable[t])
+ return -EINVAL;
+
+ *details = malloc0(activation_details_vtable[t]->object_size);
+ if (!*details)
+ return -ENOMEM;
+
+ **details = (ActivationDetails) {
+ .n_ref = 1,
+ .trigger_unit_type = t,
+ };
+
+ return 0;
+ }
+
+ if (streq(key, "activation-details-unit-name")) {
+ r = free_and_strdup(&(*details)->trigger_unit_name, value);
+ if (r < 0)
+ return r;
+
+ return 0;
+ }
+
+ if (ACTIVATION_DETAILS_VTABLE(*details)->deserialize)
+ return ACTIVATION_DETAILS_VTABLE(*details)->deserialize(key, value, details);
+
+ return -EINVAL;
+}
+
+int activation_details_append_env(ActivationDetails *details, char ***strv) {
+ int r = 0;
+
+ assert(strv);
+
+ if (!details)
+ return 0;
+
+ if (!isempty(details->trigger_unit_name)) {
+ char *s = strjoin("TRIGGER_UNIT=", details->trigger_unit_name);
+ if (!s)
+ return -ENOMEM;
+
+ r = strv_consume(strv, TAKE_PTR(s));
+ if (r < 0)
+ return r;
+ }
+
+ if (ACTIVATION_DETAILS_VTABLE(details)->append_env) {
+ r = ACTIVATION_DETAILS_VTABLE(details)->append_env(details, strv);
+ if (r < 0)
+ return r;
+ }
+
+ return r + !isempty(details->trigger_unit_name); /* Return the number of variables added to the env block */
+}
+
+int activation_details_append_pair(ActivationDetails *details, char ***strv) {
+ int r = 0;
+
+ assert(strv);
+
+ if (!details)
+ return 0;
+
+ if (!isempty(details->trigger_unit_name)) {
+ r = strv_extend(strv, "trigger_unit");
+ if (r < 0)
+ return r;
+
+ r = strv_extend(strv, details->trigger_unit_name);
+ if (r < 0)
+ return r;
+ }
+
+ if (ACTIVATION_DETAILS_VTABLE(details)->append_pair) {
+ r = ACTIVATION_DETAILS_VTABLE(details)->append_pair(details, strv);
+ if (r < 0)
+ return r;
+ }
+
+ return r + !isempty(details->trigger_unit_name); /* Return the number of pairs added to the strv */
+}
+
+DEFINE_TRIVIAL_REF_UNREF_FUNC(ActivationDetails, activation_details, activation_details_free);
diff --git a/src/core/unit.h b/src/core/unit.h
new file mode 100644
index 0000000..60bc2e3
--- /dev/null
+++ b/src/core/unit.h
@@ -0,0 +1,1249 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+#include "sd-id128.h"
+
+#include "bpf-program.h"
+#include "cgroup.h"
+#include "condition.h"
+#include "emergency-action.h"
+#include "install.h"
+#include "list.h"
+#include "pidref.h"
+#include "set.h"
+#include "show-status.h"
+#include "unit-file.h"
+
+typedef struct UnitRef UnitRef;
+
+typedef enum KillOperation {
+ KILL_TERMINATE,
+ KILL_TERMINATE_AND_LOG,
+ KILL_RESTART,
+ KILL_KILL,
+ KILL_WATCHDOG,
+ _KILL_OPERATION_MAX,
+ _KILL_OPERATION_INVALID = -EINVAL,
+} KillOperation;
+
+typedef enum CollectMode {
+ COLLECT_INACTIVE,
+ COLLECT_INACTIVE_OR_FAILED,
+ _COLLECT_MODE_MAX,
+ _COLLECT_MODE_INVALID = -EINVAL,
+} CollectMode;
+
+static inline bool UNIT_IS_ACTIVE_OR_RELOADING(UnitActiveState t) {
+ return IN_SET(t, UNIT_ACTIVE, UNIT_RELOADING);
+}
+
+static inline bool UNIT_IS_ACTIVE_OR_ACTIVATING(UnitActiveState t) {
+ return IN_SET(t, UNIT_ACTIVE, UNIT_ACTIVATING, UNIT_RELOADING);
+}
+
+static inline bool UNIT_IS_INACTIVE_OR_DEACTIVATING(UnitActiveState t) {
+ return IN_SET(t, UNIT_INACTIVE, UNIT_FAILED, UNIT_DEACTIVATING);
+}
+
+static inline bool UNIT_IS_INACTIVE_OR_FAILED(UnitActiveState t) {
+ return IN_SET(t, UNIT_INACTIVE, UNIT_FAILED);
+}
+
+static inline bool UNIT_IS_LOAD_COMPLETE(UnitLoadState t) {
+ return t >= 0 && t < _UNIT_LOAD_STATE_MAX && t != UNIT_STUB && t != UNIT_MERGED;
+}
+
+/* Stores the 'reason' a dependency was created as a bit mask, i.e. due to which configuration source it came to be. We
+ * use this so that we can selectively flush out parts of dependencies again. Note that the same dependency might be
+ * created as a result of multiple "reasons", hence the bitmask. */
+typedef enum UnitDependencyMask {
+ /* Configured directly by the unit file, .wants/.requires symlink or drop-in, or as an immediate result of a
+ * non-dependency option configured that way. */
+ UNIT_DEPENDENCY_FILE = 1 << 0,
+
+ /* As unconditional implicit dependency (not affected by unit configuration — except by the unit name and
+ * type) */
+ UNIT_DEPENDENCY_IMPLICIT = 1 << 1,
+
+ /* A dependency effected by DefaultDependencies=yes. Note that dependencies marked this way are conceptually
+ * just a subset of UNIT_DEPENDENCY_FILE, as DefaultDependencies= is itself a unit file setting that can only
+ * be set in unit files. We make this two separate bits only to help debugging how dependencies came to be. */
+ UNIT_DEPENDENCY_DEFAULT = 1 << 2,
+
+ /* A dependency created from udev rules */
+ UNIT_DEPENDENCY_UDEV = 1 << 3,
+
+ /* A dependency created because of some unit's RequiresMountsFor= setting */
+ UNIT_DEPENDENCY_PATH = 1 << 4,
+
+ /* A dependency initially configured from the mount unit file however the dependency will be updated
+ * from /proc/self/mountinfo as soon as the kernel will make the entry for that mount available in
+ * the /proc file */
+ UNIT_DEPENDENCY_MOUNT_FILE = 1 << 5,
+
+ /* A dependency created or updated because of data read from /proc/self/mountinfo */
+ UNIT_DEPENDENCY_MOUNTINFO = 1 << 6,
+
+ /* A dependency created because of data read from /proc/swaps and no other configuration source */
+ UNIT_DEPENDENCY_PROC_SWAP = 1 << 7,
+
+ /* A dependency for units in slices assigned by directly setting Slice= */
+ UNIT_DEPENDENCY_SLICE_PROPERTY = 1 << 8,
+
+ _UNIT_DEPENDENCY_MASK_FULL = (1 << 9) - 1,
+} UnitDependencyMask;
+
+/* The Unit's dependencies[] hashmaps use this structure as value. It has the same size as a void pointer, and thus can
+ * be stored directly as hashmap value, without any indirection. Note that this stores two masks, as both the origin
+ * and the destination of a dependency might have created it. */
+typedef union UnitDependencyInfo {
+ void *data;
+ struct {
+ UnitDependencyMask origin_mask:16;
+ UnitDependencyMask destination_mask:16;
+ } _packed_;
+} UnitDependencyInfo;
+
+/* Store information about why a unit was activated.
+ * We start with trigger units (.path/.timer), eventually it will be expanded to include more metadata. */
+typedef struct ActivationDetails {
+ unsigned n_ref;
+ UnitType trigger_unit_type;
+ char *trigger_unit_name;
+} ActivationDetails;
+
+/* For casting an activation event into the various unit-specific types */
+#define DEFINE_ACTIVATION_DETAILS_CAST(UPPERCASE, MixedCase, UNIT_TYPE) \
+ static inline MixedCase* UPPERCASE(ActivationDetails *a) { \
+ if (_unlikely_(!a || a->trigger_unit_type != UNIT_##UNIT_TYPE)) \
+ return NULL; \
+ \
+ return (MixedCase*) a; \
+ }
+
+/* For casting the various unit types into a unit */
+#define ACTIVATION_DETAILS(u) \
+ ({ \
+ typeof(u) _u_ = (u); \
+ ActivationDetails *_w_ = _u_ ? &(_u_)->meta : NULL; \
+ _w_; \
+ })
+
+ActivationDetails *activation_details_new(Unit *trigger_unit);
+ActivationDetails *activation_details_ref(ActivationDetails *p);
+ActivationDetails *activation_details_unref(ActivationDetails *p);
+void activation_details_serialize(ActivationDetails *p, FILE *f);
+int activation_details_deserialize(const char *key, const char *value, ActivationDetails **info);
+int activation_details_append_env(ActivationDetails *info, char ***strv);
+int activation_details_append_pair(ActivationDetails *info, char ***strv);
+DEFINE_TRIVIAL_CLEANUP_FUNC(ActivationDetails*, activation_details_unref);
+
+typedef struct ActivationDetailsVTable {
+ /* How much memory does an object of this activation type need */
+ size_t object_size;
+
+ /* This should reset all type-specific variables. This should not allocate memory, and is called
+ * with zero-initialized data. It should hence only initialize variables that need to be set != 0. */
+ void (*init)(ActivationDetails *info, Unit *trigger_unit);
+
+ /* This should free all type-specific variables. It should be idempotent. */
+ void (*done)(ActivationDetails *info);
+
+ /* This should serialize all type-specific variables. */
+ void (*serialize)(ActivationDetails *info, FILE *f);
+
+ /* This should deserialize all type-specific variables, one at a time. */
+ int (*deserialize)(const char *key, const char *value, ActivationDetails **info);
+
+ /* This should format the type-specific variables for the env block of the spawned service,
+ * and return the number of added items. */
+ int (*append_env)(ActivationDetails *info, char ***strv);
+
+ /* This should append type-specific variables as key/value pairs for the D-Bus property of the job,
+ * and return the number of added pairs. */
+ int (*append_pair)(ActivationDetails *info, char ***strv);
+} ActivationDetailsVTable;
+
+extern const ActivationDetailsVTable * const activation_details_vtable[_UNIT_TYPE_MAX];
+
+static inline const ActivationDetailsVTable* ACTIVATION_DETAILS_VTABLE(const ActivationDetails *a) {
+ assert(a);
+ assert(a->trigger_unit_type < _UNIT_TYPE_MAX);
+
+ return activation_details_vtable[a->trigger_unit_type];
+}
+
+/* Newer LLVM versions don't like implicit casts from large pointer types to smaller enums, hence let's add
+ * explicit type-safe helpers for that. */
+static inline UnitDependency UNIT_DEPENDENCY_FROM_PTR(const void *p) {
+ return PTR_TO_INT(p);
+}
+
+static inline void* UNIT_DEPENDENCY_TO_PTR(UnitDependency d) {
+ return INT_TO_PTR(d);
+}
+
+#include "job.h"
+
+struct UnitRef {
+        /* Keeps track of references to a unit. This is useful so
+         * that we can merge two units if necessary and correct all
+         * references to them */
+
+        Unit *source, *target;
+        LIST_FIELDS(UnitRef, refs_by_target);
+};
+
+typedef struct Unit {
+ Manager *manager;
+
+ UnitType type;
+ UnitLoadState load_state;
+ Unit *merged_into;
+
+ char *id; /* The one special name that we use for identification */
+ char *instance;
+
+ Set *aliases; /* All the other names. */
+
+ /* For each dependency type we can look up another Hashmap with this, whose key is a Unit* object,
+ * and whose value encodes why the dependency exists, using the UnitDependencyInfo type. i.e. a
+ * Hashmap(UnitDependency → Hashmap(Unit* → UnitDependencyInfo)) */
+ Hashmap *dependencies;
+
+ /* Similar, for RequiresMountsFor= path dependencies. The key is the path, the value the
+ * UnitDependencyInfo type */
+ Hashmap *requires_mounts_for;
+
+ char *description;
+ char **documentation;
+
+ /* The SELinux context used for checking access to this unit read off the unit file at load time (do
+ * not confuse with the selinux_context field in ExecContext which is the SELinux context we'll set
+ * for processes) */
+ char *access_selinux_context;
+
+ char *fragment_path; /* if loaded from a config file this is the primary path to it */
+ char *source_path; /* if converted, the source file */
+ char **dropin_paths;
+
+ usec_t fragment_not_found_timestamp_hash;
+ usec_t fragment_mtime;
+ usec_t source_mtime;
+ usec_t dropin_mtime;
+
+ /* If this is a transient unit we are currently writing, this is where we are writing it to */
+ FILE *transient_file;
+
+ /* Freezer state */
+ sd_bus_message *pending_freezer_invocation;
+ FreezerState freezer_state;
+
+ /* Job timeout and action to take */
+ EmergencyAction job_timeout_action;
+ usec_t job_timeout;
+ usec_t job_running_timeout;
+ char *job_timeout_reboot_arg;
+
+ /* If there is something to do with this unit, then this is the installed job for it */
+ Job *job;
+
+ /* JOB_NOP jobs are special and can be installed without disturbing the real job. */
+ Job *nop_job;
+
+ /* The slot used for watching NameOwnerChanged signals */
+ sd_bus_slot *match_bus_slot;
+ sd_bus_slot *get_name_owner_slot;
+
+ /* References to this unit from clients */
+ sd_bus_track *bus_track;
+ char **deserialized_refs;
+
+ /* References to this */
+ LIST_HEAD(UnitRef, refs_by_target);
+
+ /* Conditions to check */
+ LIST_HEAD(Condition, conditions);
+ LIST_HEAD(Condition, asserts);
+
+ dual_timestamp condition_timestamp;
+ dual_timestamp assert_timestamp;
+
+ /* Updated whenever the low-level state changes */
+ dual_timestamp state_change_timestamp;
+
+ /* Updated whenever the (high-level) active state enters or leaves the active or inactive states */
+ dual_timestamp inactive_exit_timestamp;
+ dual_timestamp active_enter_timestamp;
+ dual_timestamp active_exit_timestamp;
+ dual_timestamp inactive_enter_timestamp;
+
+ /* Per type list */
+ LIST_FIELDS(Unit, units_by_type);
+
+ /* Load queue */
+ LIST_FIELDS(Unit, load_queue);
+
+ /* D-Bus queue */
+ LIST_FIELDS(Unit, dbus_queue);
+
+ /* Cleanup queue */
+ LIST_FIELDS(Unit, cleanup_queue);
+
+ /* GC queue */
+ LIST_FIELDS(Unit, gc_queue);
+
+ /* CGroup realize members queue */
+ LIST_FIELDS(Unit, cgroup_realize_queue);
+
+ /* cgroup empty queue */
+ LIST_FIELDS(Unit, cgroup_empty_queue);
+
+ /* cgroup OOM queue */
+ LIST_FIELDS(Unit, cgroup_oom_queue);
+
+ /* Target dependencies queue */
+ LIST_FIELDS(Unit, target_deps_queue);
+
+ /* Queue of units with StopWhenUnneeded= set that shall be checked for clean-up. */
+ LIST_FIELDS(Unit, stop_when_unneeded_queue);
+
+ /* Queue of units that have an Uphold= dependency from some other unit, and should be checked for starting */
+ LIST_FIELDS(Unit, start_when_upheld_queue);
+
+ /* Queue of units that have a BindTo= dependency on some other unit, and should possibly be shut down */
+ LIST_FIELDS(Unit, stop_when_bound_queue);
+
+ /* Queue of units that should be checked if they can release resources now */
+ LIST_FIELDS(Unit, release_resources_queue);
+
+ /* PIDs we keep an eye on. Note that a unit might have many more, but these are the ones we care
+ * enough about to process SIGCHLD for */
+ Set *pids; /* → PidRef* */
+
+ /* Used in SIGCHLD and sd_notify() message event invocation logic to avoid that we dispatch the same event
+ * multiple times on the same unit. */
+ unsigned sigchldgen;
+ unsigned notifygen;
+
+ /* Used during GC sweeps */
+ unsigned gc_marker;
+
+ /* Error code when we didn't manage to load the unit (negative) */
+ int load_error;
+
+ /* Put a ratelimit on unit starting */
+ RateLimit start_ratelimit;
+ EmergencyAction start_limit_action;
+
+ /* The unit has been marked for reload, restart, etc. Stored as 1u << marker1 | 1u << marker2. */
+ unsigned markers;
+
+ /* What to do on failure or success */
+ EmergencyAction success_action, failure_action;
+ int success_action_exit_status, failure_action_exit_status;
+ char *reboot_arg;
+
+ /* Make sure we never enter endless loops with the StopWhenUnneeded=, BindsTo=, Uphold= logic */
+ RateLimit auto_start_stop_ratelimit;
+ sd_event_source *auto_start_stop_event_source;
+
+ /* Reference to a specific UID/GID */
+ uid_t ref_uid;
+ gid_t ref_gid;
+
+ /* Cached unit file state and preset */
+ UnitFileState unit_file_state;
+ PresetAction unit_file_preset;
+
+ /* Where the cpu.stat or cpuacct.usage was at the time the unit was started */
+ nsec_t cpu_usage_base;
+ nsec_t cpu_usage_last; /* the most recently read value */
+
+ /* Most recently read value of memory accounting metrics */
+ uint64_t memory_accounting_last[_CGROUP_MEMORY_ACCOUNTING_METRIC_CACHED_LAST + 1];
+
+ /* The current counter of OOM kills initiated by systemd-oomd */
+ uint64_t managed_oom_kill_last;
+
+ /* The current counter of the oom_kill field in the memory.events cgroup attribute */
+ uint64_t oom_kill_last;
+
+ /* Where the io.stat data was at the time the unit was started */
+ uint64_t io_accounting_base[_CGROUP_IO_ACCOUNTING_METRIC_MAX];
+ uint64_t io_accounting_last[_CGROUP_IO_ACCOUNTING_METRIC_MAX]; /* the most recently read value */
+
+ /* Counterparts in the cgroup filesystem */
+ char *cgroup_path;
+ uint64_t cgroup_id;
+ CGroupMask cgroup_realized_mask; /* In which hierarchies does this unit's cgroup exist? (only relevant on cgroup v1) */
+ CGroupMask cgroup_enabled_mask; /* Which controllers are enabled (or more correctly: enabled for the children) for this unit's cgroup? (only relevant on cgroup v2) */
+ CGroupMask cgroup_invalidated_mask; /* A mask specifying controllers which shall be considered invalidated, and require re-realization */
+ CGroupMask cgroup_members_mask; /* A cache for the controllers required by all children of this cgroup (only relevant for slice units) */
+
+ /* Inotify watch descriptors for watching cgroup.events and memory.events on cgroupv2 */
+ int cgroup_control_inotify_wd;
+ int cgroup_memory_inotify_wd;
+
+ /* Device Controller BPF program */
+ BPFProgram *bpf_device_control_installed;
+
+ /* IP BPF Firewalling/accounting */
+ int ip_accounting_ingress_map_fd;
+ int ip_accounting_egress_map_fd;
+ uint64_t ip_accounting_extra[_CGROUP_IP_ACCOUNTING_METRIC_MAX];
+
+ int ipv4_allow_map_fd;
+ int ipv6_allow_map_fd;
+ int ipv4_deny_map_fd;
+ int ipv6_deny_map_fd;
+ BPFProgram *ip_bpf_ingress, *ip_bpf_ingress_installed;
+ BPFProgram *ip_bpf_egress, *ip_bpf_egress_installed;
+
+ Set *ip_bpf_custom_ingress;
+ Set *ip_bpf_custom_ingress_installed;
+ Set *ip_bpf_custom_egress;
+ Set *ip_bpf_custom_egress_installed;
+
+ /* BPF programs managed (e.g. loaded to kernel) by an entity external to systemd,
+ * attached to unit cgroup by provided program fd and attach type. */
+ Hashmap *bpf_foreign_by_key;
+
+ FDSet *initial_socket_bind_link_fds;
+#if BPF_FRAMEWORK
+ /* BPF links to BPF programs attached to cgroup/bind{4|6} hooks and
+ * responsible for allowing or denying a unit to bind(2) to a socket
+ * address. */
+ struct bpf_link *ipv4_socket_bind_link;
+ struct bpf_link *ipv6_socket_bind_link;
+#endif
+
+ FDSet *initial_restric_ifaces_link_fds;
+#if BPF_FRAMEWORK
+ struct bpf_link *restrict_ifaces_ingress_bpf_link;
+ struct bpf_link *restrict_ifaces_egress_bpf_link;
+#endif
+
+ /* Low-priority event source which is used to remove watched PIDs that have gone away, and subscribe to any new
+ * ones which might have appeared. */
+ sd_event_source *rewatch_pids_event_source;
+
+ /* How to start OnSuccess=/OnFailure= units */
+ JobMode on_success_job_mode;
+ JobMode on_failure_job_mode;
+
+ /* If the job had a specific trigger that needs to be advertised (eg: a path unit), store it. */
+ ActivationDetails *activation_details;
+
+ /* Tweaking the GC logic */
+ CollectMode collect_mode;
+
+ /* The current invocation ID */
+ sd_id128_t invocation_id;
+ char invocation_id_string[SD_ID128_STRING_MAX]; /* useful when logging */
+
+ /* Garbage collect us when nobody wants or requires us anymore */
+ bool stop_when_unneeded;
+
+ /* Create default dependencies */
+ bool default_dependencies;
+
+ /* Configure so that the unit survives a system transition without stopping/starting. */
+ bool survive_final_kill_signal;
+
+ /* Refuse manual starting, allow starting only indirectly via dependency. */
+ bool refuse_manual_start;
+
+ /* Don't allow the user to stop this unit manually, allow stopping only indirectly via dependency. */
+ bool refuse_manual_stop;
+
+ /* Allow isolation requests */
+ bool allow_isolate;
+
+ /* Ignore this unit when isolating */
+ bool ignore_on_isolate;
+
+ /* Did the last condition check succeed? */
+ bool condition_result;
+ bool assert_result;
+
+ /* Is this a transient unit? */
+ bool transient;
+
+ /* Is this a unit that is always running and cannot be stopped? */
+ bool perpetual;
+
+ /* Booleans indicating membership of this unit in the various queues */
+ bool in_load_queue:1;
+ bool in_dbus_queue:1;
+ bool in_cleanup_queue:1;
+ bool in_gc_queue:1;
+ bool in_cgroup_realize_queue:1;
+ bool in_cgroup_empty_queue:1;
+ bool in_cgroup_oom_queue:1;
+ bool in_target_deps_queue:1;
+ bool in_stop_when_unneeded_queue:1;
+ bool in_start_when_upheld_queue:1;
+ bool in_stop_when_bound_queue:1;
+ bool in_release_resources_queue:1;
+
+ bool sent_dbus_new_signal:1;
+
+ bool job_running_timeout_set:1;
+
+ bool in_audit:1;
+ bool on_console:1;
+
+ bool cgroup_realized:1;
+ bool cgroup_members_mask_valid:1;
+
+ /* Reset cgroup accounting next time we fork something off */
+ bool reset_accounting:1;
+
+ bool start_limit_hit:1;
+
+ /* Did we already invoke unit_coldplug() for this unit? */
+ bool coldplugged:1;
+
+ /* For transient units: whether to add a bus track reference after creating the unit */
+ bool bus_track_add:1;
+
+ /* Remember which unit state files we created */
+ bool exported_invocation_id:1;
+ bool exported_log_level_max:1;
+ bool exported_log_extra_fields:1;
+ bool exported_log_ratelimit_interval:1;
+ bool exported_log_ratelimit_burst:1;
+
+ /* Whether we warned about clamping the CPU quota period */
+ bool warned_clamping_cpu_quota_period:1;
+
+ /* When writing transient unit files, stores which section we stored last. If < 0, we didn't write any yet. If
+ * == 0 we are in the [Unit] section, if > 0 we are in the unit type-specific section. */
+ signed int last_section_private:2;
+} Unit;
+
+typedef struct UnitStatusMessageFormats {
+ /* Format strings for the console status lines shown while the unit starts/stops; presumably
+ * [0] = starting, [1] = stopping — confirm against users of this struct. */
+ const char *starting_stopping[2];
+ /* Per-JobResult format strings emitted when a start resp. stop job for the unit finishes */
+ const char *finished_start_job[_JOB_RESULT_MAX];
+ const char *finished_stop_job[_JOB_RESULT_MAX];
+ /* If this entry is present, it'll be called to provide a context-dependent format string,
+ * or NULL to fall back to finished_{start,stop}_job; if those are NULL too, fall back to generic. */
+ const char *(*finished_job)(Unit *u, JobType t, JobResult result);
+} UnitStatusMessageFormats;
+
+/* Flags used when writing drop-in files or transient unit files */
+typedef enum UnitWriteFlags {
+ /* These are bit flags and may be combined. The first two select the destination of the write
+ * (none set = check-only invocation, see UNIT_WRITE_FLAGS_NOOP() below); the rest control how
+ * the setting's value is escaped before it is written. */
+
+ /* Write a runtime unit file or drop-in (i.e. one below /run) */
+ UNIT_RUNTIME = 1 << 0,
+
+ /* Write a persistent drop-in (i.e. one below /etc) */
+ UNIT_PERSISTENT = 1 << 1,
+
+ /* Place this item in the per-unit-type private section, instead of [Unit] */
+ UNIT_PRIVATE = 1 << 2,
+
+ /* Apply specifier escaping */
+ UNIT_ESCAPE_SPECIFIERS = 1 << 3,
+
+ /* Escape elements of ExecStart= syntax, incl. prevention of variable expansion */
+ UNIT_ESCAPE_EXEC_SYNTAX_ENV = 1 << 4,
+
+ /* Escape elements of ExecStart=: syntax (no variable expansion) */
+ UNIT_ESCAPE_EXEC_SYNTAX = 1 << 5,
+
+ /* Apply C escaping before writing */
+ UNIT_ESCAPE_C = 1 << 6,
+} UnitWriteFlags;
+
+/* True when the flags request neither runtime (/run) nor persistent (/etc) storage, i.e. the caller
+ * only wants the write validated, not actually performed. */
+static inline bool UNIT_WRITE_FLAGS_NOOP(UnitWriteFlags flags) {
+        return !(flags & (UNIT_RUNTIME | UNIT_PERSISTENT));
+}
+
+#include "kill.h"
+
+typedef struct UnitVTable {
+ /* How much memory does an object of this unit type need */
+ size_t object_size;
+
+ /* If greater than 0, the offset into the object where
+ * ExecContext is found, if the unit type has that */
+ size_t exec_context_offset;
+
+ /* If greater than 0, the offset into the object where
+ * CGroupContext is found, if the unit type has that */
+ size_t cgroup_context_offset;
+
+ /* If greater than 0, the offset into the object where
+ * KillContext is found, if the unit type has that */
+ size_t kill_context_offset;
+
+ /* If greater than 0, the offset into the object where the
+ * pointer to ExecSharedRuntime is found, if the unit type has
+ * that */
+ size_t exec_runtime_offset;
+
+ /* The name of the configuration file section with the private settings of this unit */
+ const char *private_section;
+
+ /* Config file sections this unit type understands, separated
+ * by NUL chars */
+ const char *sections;
+
+ /* This should reset all type-specific variables. This should
+ * not allocate memory, and is called with zero-initialized
+ * data. It should hence only initialize variables that need
+ * to be set != 0. */
+ void (*init)(Unit *u);
+
+ /* This should free all type-specific variables. It should be
+ * idempotent. */
+ void (*done)(Unit *u);
+
+ /* Actually load data from disk. This may fail, and should set
+ * load_state to UNIT_LOADED, UNIT_MERGED or leave it at
+ * UNIT_STUB if no configuration could be found. */
+ int (*load)(Unit *u);
+
+ /* During deserialization we only record the intended state to return to. With coldplug() we actually put the
+ * deserialized state in effect. This is where unit_notify() should be called to start things up. Note that
+ * this callback is invoked *before* we leave the reloading state of the manager, i.e. *before* we consider the
+ * reloading to be complete. Thus, this callback should just restore the exact same state for any unit that was
+ * in effect before the reload, i.e. units should not catch up with changes happened during the reload. That's
+ * what catchup() below is for. */
+ int (*coldplug)(Unit *u);
+
+ /* This is called shortly after all units' coldplug() call was invoked, and *after* the manager left the
+ * reloading state. It's supposed to catch up with state changes due to external events we missed so far (for
+ * example because they took place while we were reloading/reexecing) */
+ void (*catchup)(Unit *u);
+
+ /* Dumps all state of this unit to the specified file, prefixing each line with the specified prefix */
+ void (*dump)(Unit *u, FILE *f, const char *prefix);
+
+ /* Perform the actual start/stop/reload operation for units of this type */
+ int (*start)(Unit *u);
+ int (*stop)(Unit *u);
+ int (*reload)(Unit *u);
+
+ /* Clear out the various runtime/state/cache/logs/configuration data */
+ int (*clean)(Unit *u, ExecCleanMask m);
+
+ /* Freeze the unit */
+ int (*freeze)(Unit *u);
+ int (*thaw)(Unit *u);
+ bool (*can_freeze)(Unit *u);
+
+ /* Return which kind of data can be cleaned */
+ int (*can_clean)(Unit *u, ExecCleanMask *ret);
+
+ /* Returns whether the unit currently supports reloading */
+ bool (*can_reload)(Unit *u);
+
+ /* Serialize state and file descriptors that should be carried over into the new
+ * instance after reexecution. */
+ int (*serialize)(Unit *u, FILE *f, FDSet *fds);
+
+ /* Restore one item from the serialization */
+ int (*deserialize_item)(Unit *u, const char *key, const char *data, FDSet *fds);
+
+ /* Try to match up fds with what we need for this unit */
+ void (*distribute_fds)(Unit *u, FDSet *fds);
+
+ /* Boils down the more complex internal state of this unit to
+ * a simpler one that the engine can understand */
+ UnitActiveState (*active_state)(Unit *u);
+
+ /* Returns the substate specific to this unit type as
+ * string. This is purely information so that we can give the
+ * user a more fine grained explanation in which actual state a
+ * unit is in. */
+ const char* (*sub_state_to_string)(Unit *u);
+
+ /* Additionally to UnitActiveState determine whether unit is to be restarted. */
+ bool (*will_restart)(Unit *u);
+
+ /* Return false when there is a reason to prevent this unit from being gc'ed
+ * even though nothing references it and it isn't active in any way. */
+ bool (*may_gc)(Unit *u);
+
+ /* Return true when the unit is not controlled by the manager (e.g. extrinsic mounts). */
+ bool (*is_extrinsic)(Unit *u);
+
+ /* When the unit is not running and no job for it queued we shall release its runtime resources */
+ void (*release_resources)(Unit *u);
+
+ /* Invoked on every child that died */
+ void (*sigchld_event)(Unit *u, pid_t pid, int code, int status);
+
+ /* Reset failed state if we are in failed state */
+ void (*reset_failed)(Unit *u);
+
+ /* Called whenever any of the cgroups this unit watches for ran empty */
+ void (*notify_cgroup_empty)(Unit *u);
+
+ /* Called whenever an OOM kill event on this unit was seen */
+ void (*notify_cgroup_oom)(Unit *u, bool managed_oom);
+
+ /* Called whenever a process of this unit sends us a message */
+ void (*notify_message)(Unit *u, const struct ucred *ucred, char * const *tags, FDSet *fds);
+
+ /* Called whenever a name this Unit registered for comes or goes away. */
+ void (*bus_name_owner_change)(Unit *u, const char *new_owner);
+
+ /* Called for each property that is being set */
+ int (*bus_set_property)(Unit *u, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+
+ /* Called after at least one property got changed to apply the necessary change */
+ int (*bus_commit_properties)(Unit *u);
+
+ /* Return the unit this unit is following */
+ Unit *(*following)(Unit *u);
+
+ /* Return the set of units that are following each other */
+ int (*following_set)(Unit *u, Set **s);
+
+ /* Invoked each time a unit this unit is triggering changes
+ * state or gains/loses a job */
+ void (*trigger_notify)(Unit *u, Unit *trigger);
+
+ /* Called whenever CLOCK_REALTIME made a jump */
+ void (*time_change)(Unit *u);
+
+ /* Called whenever /etc/localtime was modified */
+ void (*timezone_change)(Unit *u);
+
+ /* Returns the next timeout of a unit */
+ int (*get_timeout)(Unit *u, usec_t *timeout);
+
+ /* Returns the start timeout of a unit */
+ usec_t (*get_timeout_start_usec)(Unit *u);
+
+ /* Returns the main PID if there is any defined, or 0. */
+ PidRef* (*main_pid)(Unit *u);
+
+ /* Returns the control PID if there is any defined, or 0. */
+ PidRef* (*control_pid)(Unit *u);
+
+ /* Returns true if the unit currently needs access to the console */
+ bool (*needs_console)(Unit *u);
+
+ /* Returns the exit status to propagate in case of FailureAction=exit/SuccessAction=exit; usually returns the
+ * exit code of the "main" process of the service or similar. */
+ int (*exit_status)(Unit *u);
+
+ /* Return a copy of the status string pointer. */
+ const char* (*status_text)(Unit *u);
+
+ /* Like the enumerate() callback further down, but only enumerates the perpetual units, i.e. all units that
+ * unconditionally exist and are always active. The main reason to keep both enumeration functions separate is
+ * philosophical: the state of perpetual units should be put in place by coldplug(), while the state of those
+ * discovered through regular enumeration should be put in place by catchup(), see below. */
+ void (*enumerate_perpetual)(Manager *m);
+
+ /* This is called for each unit type and should be used to enumerate units already existing in the system
+ * internally and load them. However, everything that is loaded here should still stay in inactive state. It is
+ * the job of the catchup() call above to put the units into the discovered state. */
+ void (*enumerate)(Manager *m);
+
+ /* Type specific cleanups. */
+ void (*shutdown)(Manager *m);
+
+ /* If this function is set and returns false all jobs for units
+ * of this type will immediately fail. */
+ bool (*supported)(void);
+
+ /* If this function is set, it's invoked first as part of starting a unit to allow start rate
+ * limiting checks to occur before we do anything else. */
+ int (*can_start)(Unit *u);
+
+ /* Returns > 0 if the whole subsystem is ratelimited, and new start operations should not be started
+ * for this unit type right now. */
+ int (*subsystem_ratelimited)(Manager *m);
+
+ /* The strings to print in status messages */
+ UnitStatusMessageFormats status_message_formats;
+
+ /* True if transient units of this type are OK */
+ bool can_transient;
+
+ /* True if cgroup delegation is permissible */
+ bool can_delegate;
+
+ /* True if the unit type triggers other units, i.e. can have a UNIT_TRIGGERS dependency */
+ bool can_trigger;
+
+ /* True if the unit type knows a failure state, and thus can be source of an OnFailure= dependency */
+ bool can_fail;
+
+ /* True if units of this type shall be startable only once and then never again */
+ bool once_only;
+
+ /* Do not serialize this unit when preparing for root switch */
+ bool exclude_from_switch_root_serialization;
+
+ /* True if queued jobs of this type should be GC'ed if no other job needs them anymore */
+ bool gc_jobs;
+
+ /* True if systemd-oomd can monitor and act on this unit's recursive children's cgroups */
+ bool can_set_managed_oom;
+
+ /* If true, we'll notify plymouth about this unit */
+ bool notify_plymouth;
+
+ /* The audit events to generate on start + stop (or 0 if none shall be generated) */
+ int audit_start_message_type;
+ int audit_stop_message_type;
+} UnitVTable;
+
+extern const UnitVTable * const unit_vtable[_UNIT_TYPE_MAX];
+
+/* Looks up the type-specific vtable for the specified unit, based on its type field. */
+static inline const UnitVTable* UNIT_VTABLE(const Unit *u) {
+ return unit_vtable[u->type];
+}
+
+/* For casting a unit into the various unit types. The generated inline function returns NULL if the
+ * passed unit is NULL or not of the expected type, hence it combines the cast with a type check. */
+#define DEFINE_CAST(UPPERCASE, MixedCase) \
+ static inline MixedCase* UPPERCASE(Unit *u) { \
+ if (_unlikely_(!u || u->type != UNIT_##UPPERCASE)) \
+ return NULL; \
+ \
+ return (MixedCase*) u; \
+ }
+
+/* For casting the various unit types into a unit. Evaluates the argument only once and maps NULL to
+ * NULL, hence safe to use on expressions with side effects and on NULL pointers. */
+#define UNIT(u) \
+ ({ \
+ typeof(u) _u_ = (u); \
+ Unit *_w_ = _u_ ? &(_u_)->meta : NULL; \
+ _w_; \
+ })
+
+#define UNIT_HAS_EXEC_CONTEXT(u) (UNIT_VTABLE(u)->exec_context_offset > 0)
+#define UNIT_HAS_CGROUP_CONTEXT(u) (UNIT_VTABLE(u)->cgroup_context_offset > 0)
+#define UNIT_HAS_KILL_CONTEXT(u) (UNIT_VTABLE(u)->kill_context_offset > 0)
+
+Unit* unit_has_dependency(const Unit *u, UnitDependencyAtom atom, Unit *other);
+int unit_get_dependency_array(const Unit *u, UnitDependencyAtom atom, Unit ***ret_array);
+int unit_get_transitive_dependency_set(Unit *u, UnitDependencyAtom atom, Set **ret);
+
+/* Returns the hashmap of units this unit has a dependency of type "d" on, or NULL if there are none. */
+static inline Hashmap* unit_get_dependencies(Unit *u, UnitDependency d) {
+ return hashmap_get(u->dependencies, UNIT_DEPENDENCY_TO_PTR(d));
+}
+
+/* Returns the unit this unit is triggering (as per its UNIT_ATOM_TRIGGERS dependency), or NULL. */
+static inline Unit* UNIT_TRIGGER(Unit *u) {
+ return unit_has_dependency(u, UNIT_ATOM_TRIGGERS, NULL);
+}
+
+/* Returns the slice unit this unit is placed in (as per its UNIT_ATOM_IN_SLICE dependency), or NULL. */
+static inline Unit* UNIT_GET_SLICE(const Unit *u) {
+ return unit_has_dependency(u, UNIT_ATOM_IN_SLICE, NULL);
+}
+
+Unit* unit_new(Manager *m, size_t size);
+Unit* unit_free(Unit *u);
+DEFINE_TRIVIAL_CLEANUP_FUNC(Unit *, unit_free);
+
+int unit_new_for_name(Manager *m, size_t size, const char *name, Unit **ret);
+int unit_add_name(Unit *u, const char *name);
+
+int unit_add_dependency(Unit *u, UnitDependency d, Unit *other, bool add_reference, UnitDependencyMask mask);
+int unit_add_two_dependencies(Unit *u, UnitDependency d, UnitDependency e, Unit *other, bool add_reference, UnitDependencyMask mask);
+
+int unit_add_dependency_by_name(Unit *u, UnitDependency d, const char *name, bool add_reference, UnitDependencyMask mask);
+int unit_add_two_dependencies_by_name(Unit *u, UnitDependency d, UnitDependency e, const char *name, bool add_reference, UnitDependencyMask mask);
+
+int unit_add_exec_dependencies(Unit *u, ExecContext *c);
+
+int unit_choose_id(Unit *u, const char *name);
+int unit_set_description(Unit *u, const char *description);
+
+void unit_release_resources(Unit *u);
+
+bool unit_may_gc(Unit *u);
+
+/* A unit is "extrinsic" if it is perpetual, or if its type reports that its lifecycle is managed
+ * outside of the service manager (see the is_extrinsic() vtable callback). */
+static inline bool unit_is_extrinsic(Unit *u) {
+        if (u->perpetual)
+                return true;
+
+        return UNIT_VTABLE(u)->is_extrinsic && UNIT_VTABLE(u)->is_extrinsic(u);
+}
+
+/* Returns the unit type's current status string, or NULL if the unit is NULL or its type implements
+ * no status_text() callback. */
+static inline const char* unit_status_text(Unit *u) {
+        if (!u)
+                return NULL;
+
+        if (!UNIT_VTABLE(u)->status_text)
+                return NULL;
+
+        return UNIT_VTABLE(u)->status_text(u);
+}
+
+void unit_add_to_load_queue(Unit *u);
+void unit_add_to_dbus_queue(Unit *u);
+void unit_add_to_cleanup_queue(Unit *u);
+void unit_add_to_gc_queue(Unit *u);
+void unit_add_to_target_deps_queue(Unit *u);
+void unit_submit_to_stop_when_unneeded_queue(Unit *u);
+void unit_submit_to_start_when_upheld_queue(Unit *u);
+void unit_submit_to_stop_when_bound_queue(Unit *u);
+void unit_submit_to_release_resources_queue(Unit *u);
+
+int unit_merge(Unit *u, Unit *other);
+int unit_merge_by_name(Unit *u, const char *other);
+
+Unit *unit_follow_merge(Unit *u) _pure_;
+
+int unit_load_fragment_and_dropin(Unit *u, bool fragment_required);
+int unit_load(Unit *unit);
+
+int unit_set_slice(Unit *u, Unit *slice);
+int unit_set_default_slice(Unit *u);
+
+const char *unit_description(Unit *u) _pure_;
+const char *unit_status_string(Unit *u, char **combined);
+
+bool unit_has_name(const Unit *u, const char *name);
+
+UnitActiveState unit_active_state(Unit *u);
+FreezerState unit_freezer_state(Unit *u);
+int unit_freezer_state_kernel(Unit *u, FreezerState *ret);
+
+const char* unit_sub_state_to_string(Unit *u);
+
+bool unit_can_reload(Unit *u) _pure_;
+bool unit_can_start(Unit *u) _pure_;
+bool unit_can_stop(Unit *u) _pure_;
+bool unit_can_isolate(Unit *u) _pure_;
+
+int unit_start(Unit *u, ActivationDetails *details);
+int unit_stop(Unit *u);
+int unit_reload(Unit *u);
+
+int unit_kill(Unit *u, KillWho w, int signo, int code, int value, sd_bus_error *error);
+
+void unit_notify_cgroup_oom(Unit *u, bool managed_oom);
+
+void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, bool reload_success);
+
+int unit_watch_pidref(Unit *u, PidRef *pid, bool exclusive);
+int unit_watch_pid(Unit *u, pid_t pid, bool exclusive);
+void unit_unwatch_pidref(Unit *u, PidRef *pid);
+void unit_unwatch_pid(Unit *u, pid_t pid);
+void unit_unwatch_all_pids(Unit *u);
+
+int unit_enqueue_rewatch_pids(Unit *u);
+void unit_dequeue_rewatch_pids(Unit *u);
+
+int unit_install_bus_match(Unit *u, sd_bus *bus, const char *name);
+int unit_watch_bus_name(Unit *u, const char *name);
+void unit_unwatch_bus_name(Unit *u, const char *name);
+
+bool unit_job_is_applicable(Unit *u, JobType j);
+
+int set_unit_path(const char *p);
+
+char *unit_dbus_path(Unit *u);
+char *unit_dbus_path_invocation_id(Unit *u);
+
+int unit_load_related_unit(Unit *u, const char *type, Unit **_found);
+
+int unit_add_node_dependency(Unit *u, const char *what, UnitDependency d, UnitDependencyMask mask);
+int unit_add_blockdev_dependency(Unit *u, const char *what, UnitDependencyMask mask);
+
+int unit_coldplug(Unit *u);
+void unit_catchup(Unit *u);
+
+void unit_status_printf(Unit *u, StatusType status_type, const char *status, const char *format, const char *ident) _printf_(4, 0);
+
+bool unit_need_daemon_reload(Unit *u);
+
+void unit_reset_failed(Unit *u);
+
+Unit *unit_following(Unit *u);
+int unit_following_set(Unit *u, Set **s);
+
+const char *unit_slice_name(Unit *u);
+
+bool unit_stop_pending(Unit *u) _pure_;
+bool unit_inactive_or_pending(Unit *u) _pure_;
+bool unit_active_or_pending(Unit *u);
+bool unit_will_restart_default(Unit *u);
+bool unit_will_restart(Unit *u);
+
+int unit_add_default_target_dependency(Unit *u, Unit *target);
+
+void unit_start_on_failure(Unit *u, const char *dependency_name, UnitDependencyAtom atom, JobMode job_mode);
+void unit_trigger_notify(Unit *u);
+
+UnitFileState unit_get_unit_file_state(Unit *u);
+PresetAction unit_get_unit_file_preset(Unit *u);
+
+Unit* unit_ref_set(UnitRef *ref, Unit *source, Unit *target);
+void unit_ref_unset(UnitRef *ref);
+
+#define UNIT_DEREF(ref) ((ref).target)
+#define UNIT_ISSET(ref) (!!(ref).target)
+
+int unit_patch_contexts(Unit *u);
+
+ExecContext *unit_get_exec_context(const Unit *u) _pure_;
+KillContext *unit_get_kill_context(Unit *u) _pure_;
+CGroupContext *unit_get_cgroup_context(Unit *u) _pure_;
+
+ExecRuntime *unit_get_exec_runtime(Unit *u) _pure_;
+
+int unit_setup_exec_runtime(Unit *u);
+
+const char* unit_escape_setting(const char *s, UnitWriteFlags flags, char **buf);
+char* unit_concat_strv(char **l, UnitWriteFlags flags);
+
+int unit_write_setting(Unit *u, UnitWriteFlags flags, const char *name, const char *data);
+int unit_write_settingf(Unit *u, UnitWriteFlags mode, const char *name, const char *format, ...) _printf_(4,5);
+
+int unit_kill_context(Unit *u, KillContext *c, KillOperation k, PidRef *main_pid, PidRef *control_pid, bool main_pid_alien);
+
+int unit_make_transient(Unit *u);
+
+int unit_require_mounts_for(Unit *u, const char *path, UnitDependencyMask mask);
+
+bool unit_type_supported(UnitType t);
+
+bool unit_is_pristine(Unit *u);
+
+bool unit_is_unneeded(Unit *u);
+bool unit_is_upheld_by_active(Unit *u, Unit **ret_culprit);
+bool unit_is_bound_by_inactive(Unit *u, Unit **ret_culprit);
+
+PidRef* unit_control_pid(Unit *u);
+PidRef* unit_main_pid(Unit *u);
+
+void unit_warn_if_dir_nonempty(Unit *u, const char* where);
+int unit_fail_if_noncanonical(Unit *u, const char* where);
+
+int unit_test_start_limit(Unit *u);
+
+int unit_ref_uid_gid(Unit *u, uid_t uid, gid_t gid);
+void unit_unref_uid_gid(Unit *u, bool destroy_now);
+
+void unit_notify_user_lookup(Unit *u, uid_t uid, gid_t gid);
+
+int unit_set_invocation_id(Unit *u, sd_id128_t id);
+int unit_acquire_invocation_id(Unit *u);
+
+int unit_set_exec_params(Unit *s, ExecParameters *p);
+
+int unit_fork_helper_process(Unit *u, const char *name, PidRef *ret);
+int unit_fork_and_watch_rm_rf(Unit *u, char **paths, PidRef *ret);
+
+void unit_remove_dependencies(Unit *u, UnitDependencyMask mask);
+
+void unit_export_state_files(Unit *u);
+void unit_unlink_state_files(Unit *u);
+
+int unit_prepare_exec(Unit *u);
+
+int unit_log_leftover_process_start(const PidRef* pid, int sig, void *userdata);
+int unit_log_leftover_process_stop(const PidRef* pid, int sig, void *userdata);
+
+int unit_warn_leftover_processes(Unit *u, cg_kill_log_func_t log_func);
+
+bool unit_needs_console(Unit *u);
+
+int unit_pid_attachable(Unit *unit, PidRef *pid, sd_bus_error *error);
+
+/* True if a job of the specified type is currently installed for the unit. */
+static inline bool unit_has_job_type(Unit *u, JobType type) {
+        if (!u || !u->job)
+                return false;
+
+        return u->job->type == type;
+}
+
+/* Determines whether a message of the specified level shall be logged for this unit, honouring a
+ * per-unit maximum log level if one is configured in the unit's ExecContext. Messages pass when no
+ * ExecContext exists, when no maximum is set (< 0), or when the level is within the maximum. */
+static inline bool unit_log_level_test(const Unit *u, int level) {
+        ExecContext *ec = unit_get_exec_context(u);
+        if (!ec)
+                return true;
+
+        if (ec->log_level_max < 0)
+                return true;
+
+        return ec->log_level_max >= LOG_PRI(level);
+}
+
+/* unit_log_skip is for cases like ExecCondition= where a unit is considered "done"
+ * after some execution, rather than succeeded or failed. */
+void unit_log_skip(Unit *u, const char *result);
+void unit_log_success(Unit *u);
+void unit_log_failure(Unit *u, const char *result);
+/* Logs either the success or the failure (with the specified result string) of the unit,
+ * depending on the outcome. */
+static inline void unit_log_result(Unit *u, bool success, const char *result) {
+        if (!success)
+                unit_log_failure(u, result);
+        else
+                unit_log_success(u);
+}
+
+void unit_log_process_exit(Unit *u, const char *kind, const char *command, bool success, int code, int status);
+
+int unit_exit_status(Unit *u);
+int unit_success_action_exit_status(Unit *u);
+int unit_failure_action_exit_status(Unit *u);
+
+int unit_test_trigger_loaded(Unit *u);
+
+void unit_destroy_runtime_data(Unit *u, const ExecContext *context);
+int unit_clean(Unit *u, ExecCleanMask mask);
+int unit_can_clean(Unit *u, ExecCleanMask *ret_mask);
+
+bool unit_can_start_refuse_manual(Unit *u);
+bool unit_can_stop_refuse_manual(Unit *u);
+bool unit_can_isolate_refuse_manual(Unit *u);
+
+bool unit_can_freeze(Unit *u);
+int unit_freeze(Unit *u);
+void unit_frozen(Unit *u);
+
+int unit_thaw(Unit *u);
+void unit_thawed(Unit *u);
+
+int unit_freeze_vtable_common(Unit *u);
+int unit_thaw_vtable_common(Unit *u);
+
+Condition *unit_find_failed_condition(Unit *u);
+
+int unit_arm_timer(Unit *u, sd_event_source **source, bool relative, usec_t usec, sd_event_time_handler_t handler);
+
+int unit_compare_priority(Unit *a, Unit *b);
+
+/* Macros which append UNIT= or USER_UNIT= to the message */
+
+/* Logs a message for the specified unit at the specified level; the errno-style "error" argument may
+ * be zero. Checks both the global maximum log level and the unit's own level (unit_log_level_test())
+ * before formatting anything; when suppressed, the expression just evaluates to -ERRNO_VALUE(error).
+ * Any extra log fields configured in the unit's ExecContext are pushed onto the log context for the
+ * duration of the call. */
+#define log_unit_full_errno_zerook(unit, level, error, ...) \
+ ({ \
+ const Unit *_u = (unit); \
+ const int _l = (level); \
+ bool _do_log = !(log_get_max_level() < LOG_PRI(_l) || \
+ (_u && !unit_log_level_test(_u, _l))); \
+ const ExecContext *_c = _do_log && _u ? \
+ unit_get_exec_context(_u) : NULL; \
+ LOG_CONTEXT_PUSH_IOV(_c ? _c->log_extra_fields : NULL, \
+ _c ? _c->n_log_extra_fields : 0); \
+ !_do_log ? -ERRNO_VALUE(error) : \
+ _u ? log_object_internal(_l, error, PROJECT_FILE, __LINE__, __func__, _u->manager->unit_log_field, _u->id, _u->manager->invocation_log_field, _u->invocation_id_string, ##__VA_ARGS__) : \
+ log_internal(_l, error, PROJECT_FILE, __LINE__, __func__, ##__VA_ARGS__); \
+ })
+
+#define log_unit_full_errno(unit, level, error, ...) \
+ ({ \
+ int _error = (error); \
+ ASSERT_NON_ZERO(_error); \
+ log_unit_full_errno_zerook(unit, level, _error, ##__VA_ARGS__); \
+ })
+
+#define log_unit_full(unit, level, ...) (void) log_unit_full_errno_zerook(unit, level, 0, __VA_ARGS__)
+
+#define log_unit_debug(unit, ...) log_unit_full(unit, LOG_DEBUG, __VA_ARGS__)
+#define log_unit_info(unit, ...) log_unit_full(unit, LOG_INFO, __VA_ARGS__)
+#define log_unit_notice(unit, ...) log_unit_full(unit, LOG_NOTICE, __VA_ARGS__)
+#define log_unit_warning(unit, ...) log_unit_full(unit, LOG_WARNING, __VA_ARGS__)
+#define log_unit_error(unit, ...) log_unit_full(unit, LOG_ERR, __VA_ARGS__)
+
+#define log_unit_debug_errno(unit, error, ...) log_unit_full_errno(unit, LOG_DEBUG, error, __VA_ARGS__)
+#define log_unit_info_errno(unit, error, ...) log_unit_full_errno(unit, LOG_INFO, error, __VA_ARGS__)
+#define log_unit_notice_errno(unit, error, ...) log_unit_full_errno(unit, LOG_NOTICE, error, __VA_ARGS__)
+#define log_unit_warning_errno(unit, error, ...) log_unit_full_errno(unit, LOG_WARNING, error, __VA_ARGS__)
+#define log_unit_error_errno(unit, error, ...) log_unit_full_errno(unit, LOG_ERR, error, __VA_ARGS__)
+
+#if LOG_TRACE
+# define log_unit_trace(...) log_unit_debug(__VA_ARGS__)
+# define log_unit_trace_errno(...) log_unit_debug_errno(__VA_ARGS__)
+#else
+# define log_unit_trace(...) do {} while (0)
+# define log_unit_trace_errno(e, ...) (-ERRNO_VALUE(e))
+#endif
+
+#define log_unit_struct_errno(unit, level, error, ...) \
+ ({ \
+ const Unit *_u = (unit); \
+ const int _l = (level); \
+ bool _do_log = unit_log_level_test(_u, _l); \
+ const ExecContext *_c = _do_log && _u ? \
+ unit_get_exec_context(_u) : NULL; \
+ LOG_CONTEXT_PUSH_IOV(_c ? _c->log_extra_fields : NULL, \
+ _c ? _c->n_log_extra_fields : 0); \
+ _do_log ? \
+ log_struct_errno(_l, error, __VA_ARGS__, LOG_UNIT_ID(_u)) : \
+ -ERRNO_VALUE(error); \
+ })
+
+#define log_unit_struct(unit, level, ...) log_unit_struct_errno(unit, level, 0, __VA_ARGS__)
+
+#define log_unit_struct_iovec_errno(unit, level, error, iovec, n_iovec) \
+ ({ \
+ const Unit *_u = (unit); \
+ const int _l = (level); \
+ bool _do_log = unit_log_level_test(_u, _l); \
+ const ExecContext *_c = _do_log && _u ? \
+ unit_get_exec_context(_u) : NULL; \
+ LOG_CONTEXT_PUSH_IOV(_c ? _c->log_extra_fields : NULL, \
+ _c ? _c->n_log_extra_fields : 0); \
+ _do_log ? \
+ log_struct_iovec_errno(_l, error, iovec, n_iovec) : \
+ -ERRNO_VALUE(error); \
+ })
+
+#define log_unit_struct_iovec(unit, level, iovec, n_iovec) log_unit_struct_iovec_errno(unit, level, 0, iovec, n_iovec)
+
+/* Like LOG_MESSAGE(), but with the unit name prefixed. */
+#define LOG_UNIT_MESSAGE(unit, fmt, ...) LOG_MESSAGE("%s: " fmt, (unit)->id, ##__VA_ARGS__)
+#define LOG_UNIT_ID(unit) (unit)->manager->unit_log_format_string, (unit)->id
+#define LOG_UNIT_INVOCATION_ID(unit) (unit)->manager->invocation_log_format_string, (unit)->invocation_id_string
+
+const char* collect_mode_to_string(CollectMode m) _const_;
+CollectMode collect_mode_from_string(const char *s) _pure_;
+
+typedef struct UnitForEachDependencyData {
+ /* Stores state for the FOREACH macro below for iterating through all deps that have any of the
+ * specified dependency atom bits set */
+ UnitDependencyAtom match_atom;
+ /* Outer map (dependency type → unit map) and the inner per-type unit map currently iterated */
+ Hashmap *by_type, *by_unit;
+ /* The UNIT_DEPENDENCY_TO_PTR() key of the dependency type currently being iterated */
+ void *current_type;
+ Iterator by_type_iterator, by_unit_iterator;
+ /* Points at the loop variable of the FOREACH macro, where each iterated unit is stored */
+ Unit **current_unit;
+} UnitForEachDependencyData;
+
+/* Iterates through all dependencies that have a specific atom in the dependency type set. This tries to be
+ * smart: if the atom is unique, we'll go directly to the right entry. Otherwise we'll iterate through the
+ * per-dependency type hashmap and match all deps that have the right atom set. */
+/* Note: "data" must be a unique identifier, as it names the iteration-state variable declared in the
+ * for-loop initializer; the public UNIT_FOREACH_DEPENDENCY wrapper generates one via UNIQ_T. */
+#define _UNIT_FOREACH_DEPENDENCY(other, u, ma, data) \
+ for (UnitForEachDependencyData data = { \
+ .match_atom = (ma), \
+ .by_type = (u)->dependencies, \
+ .by_type_iterator = ITERATOR_FIRST, \
+ .current_unit = &(other), \
+ }; \
+ ({ \
+ UnitDependency _dt = _UNIT_DEPENDENCY_INVALID; \
+ bool _found; \
+ \
+ if (data.by_type && ITERATOR_IS_FIRST(data.by_type_iterator)) { \
+ _dt = unit_dependency_from_unique_atom(data.match_atom); \
+ if (_dt >= 0) { \
+ data.by_unit = hashmap_get(data.by_type, UNIT_DEPENDENCY_TO_PTR(_dt)); \
+ data.current_type = UNIT_DEPENDENCY_TO_PTR(_dt); \
+ data.by_type = NULL; \
+ _found = !!data.by_unit; \
+ } \
+ } \
+ if (_dt < 0) \
+ _found = hashmap_iterate(data.by_type, \
+ &data.by_type_iterator, \
+ (void**)&(data.by_unit), \
+ (const void**) &(data.current_type)); \
+ _found; \
+ }); ) \
+ if ((unit_dependency_to_atom(UNIT_DEPENDENCY_FROM_PTR(data.current_type)) & data.match_atom) != 0) \
+ for (data.by_unit_iterator = ITERATOR_FIRST; \
+ hashmap_iterate(data.by_unit, \
+ &data.by_unit_iterator, \
+ NULL, \
+ (const void**) data.current_unit); )
+
+/* Note: this matches deps that have *any* of the atoms specified in match_atom set */
+#define UNIT_FOREACH_DEPENDENCY(other, u, match_atom) \
+ _UNIT_FOREACH_DEPENDENCY(other, u, match_atom, UNIQ_T(data, UNIQ))
+
+/* Pushes the unit's identity (unit name + invocation ID log fields) and any extra log fields from its
+ * ExecContext onto the log context for the remainder of the current scope. */
+#define _LOG_CONTEXT_PUSH_UNIT(unit, u, c) \
+ const Unit *u = (unit); \
+ const ExecContext *c = unit_get_exec_context(u); \
+ LOG_CONTEXT_PUSH_KEY_VALUE(u->manager->unit_log_field, u->id); \
+ LOG_CONTEXT_PUSH_KEY_VALUE(u->manager->invocation_log_field, u->invocation_id_string); \
+ LOG_CONTEXT_PUSH_IOV(c ? c->log_extra_fields : NULL, c ? c->n_log_extra_fields : 0)
+
+#define LOG_CONTEXT_PUSH_UNIT(unit) \
+ _LOG_CONTEXT_PUSH_UNIT(unit, UNIQ_T(u, UNIQ), UNIQ_T(c, UNIQ))
diff --git a/src/core/user.conf.in b/src/core/user.conf.in
new file mode 100644
index 0000000..14f0eae
--- /dev/null
+++ b/src/core/user.conf.in
@@ -0,0 +1,59 @@
+# This file is part of systemd.
+#
+# systemd is free software; you can redistribute it and/or modify it under the
+# terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation; either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# Entries in this file show the compile time defaults. Local configuration
+# should be created by either modifying this file (or a copy of it placed in
+# /etc/ if the original file is shipped in /usr/), or by creating "drop-ins" in
+# the /etc/systemd/user.conf.d/ directory. The latter is generally recommended.
+# Defaults can be restored by simply deleting the main configuration file and
+# all drop-ins located in /etc/.
+#
+# Use 'systemd-analyze cat-config systemd/user.conf' to display the full config.
+#
+# See systemd-user.conf(5) for details.
+
+[Manager]
+#LogLevel=info
+#LogTarget=auto
+#LogColor=yes
+#LogLocation=no
+#LogTime=no
+#SystemCallArchitectures=
+#TimerSlackNSec=
+#StatusUnitFormat={{STATUS_UNIT_FORMAT_DEFAULT_STR}}
+#DefaultTimerAccuracySec=1min
+#DefaultStandardOutput=inherit
+#DefaultStandardError=inherit
+#DefaultTimeoutStartSec={{DEFAULT_USER_TIMEOUT_SEC}}s
+#DefaultTimeoutStopSec={{DEFAULT_USER_TIMEOUT_SEC}}s
+#DefaultTimeoutAbortSec=
+#DefaultDeviceTimeoutSec={{DEFAULT_USER_TIMEOUT_SEC}}s
+#DefaultRestartSec=100ms
+#DefaultStartLimitIntervalSec=10s
+#DefaultStartLimitBurst=5
+#DefaultEnvironment=
+#DefaultLimitCPU=
+#DefaultLimitFSIZE=
+#DefaultLimitDATA=
+#DefaultLimitSTACK=
+#DefaultLimitCORE=
+#DefaultLimitRSS=
+#DefaultLimitNOFILE=
+#DefaultLimitAS=
+#DefaultLimitNPROC=
+#DefaultLimitMEMLOCK=
+#DefaultLimitLOCKS=
+#DefaultLimitSIGPENDING=
+#DefaultLimitMSGQUEUE=
+#DefaultLimitNICE=
+#DefaultLimitRTPRIO=
+#DefaultLimitRTTIME=
+#DefaultMemoryPressureThresholdSec=200ms
+#DefaultMemoryPressureWatch=auto
+#DefaultSmackProcessLabel=
+#ReloadLimitIntervalSec=
+#ReloadLimitBurst=